diff --git a/.circleci/config.yml b/.circleci/config.yml
index 9ddcb5ba83..0330939153 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -5,189 +5,8 @@ executors:
     docker:
       - image: continuumio/miniconda3
     resource_class: large
-  linux-x86_64-gpu:
-    environment:
-      CONDA_ARCH: Linux-x86_64
-    machine:
-      image: linux-cuda-11:default
-    resource_class: gpu.nvidia.medium
-  linux-arm64-cpu:
-    environment:
-      CONDA_ARCH: Linux-aarch64
-    machine:
-      image: ubuntu-2004:current
-    resource_class: arm.medium
-  macosx-arm64-cpu:
-    environment:
-      CONDA_ARCH: MacOSX-arm64
-    macos:
-      xcode: 14.2.0 # minimum supported for M1
-    resource_class: macos.m1.large.gen1
-  windows-x86_64-cpu:
-    machine:
-      image: windows-server-2019-vs2019:stable
-      shell: bash.exe
-    resource_class: windows.medium
 
 jobs:
-  format:
-    docker:
-      - image: ubuntu:22.04
-    steps:
-      - checkout
-      - run:
-          name: Install clang-format
-          command: |
-            apt-get update
-            apt-get install -y git-core clang-format-11
-      - run:
-          name: Verify clang-format
-          command: |
-             git ls-files | grep -E  '\.(cpp|h|cu|cuh)$' | xargs clang-format-11 -i
-             if git diff --quiet; then
-               echo "Formatting OK!"
-             else
-               echo "Formatting not OK!"
-               echo "------------------"
-               git --no-pager diff --color
-               exit 1
-             fi
-
-  build_conda:
-    parameters:
-      label:
-        type: string
-        default: ""
-      cuda:
-        type: string
-        default: ""
-      raft:
-        type: string
-        default: ""
-      cuda_archs:
-        type: string
-        default: ""
-      compiler_version:
-        type: string
-        default: ""
-      exec:
-        type: executor
-    executor: << parameters.exec >>
-    environment:
-      OMP_NUM_THREADS: 10
-      PACKAGE_TYPE: <<parameters.label>>
-      CUDA_ARCHS: <<parameters.cuda_archs>>
-    steps:
-      - checkout
-      - run:
-          name: Install conda
-          command: |
-            if [ -n "${CONDA_ARCH}" ]
-            then
-              curl https://repo.anaconda.com/miniconda/Miniconda3-latest-${CONDA_ARCH}.sh --output miniconda.sh
-              bash miniconda.sh -b -p $HOME/miniconda
-              ~/miniconda/bin/conda init
-            fi
-      - run:
-          name: Install conda build tools
-          command: |
-            # conda config --set solver libmamba
-            # conda config --set verbosity 3
-            conda update -y -q conda
-            conda install -y -q conda-build
-      - when:
-          condition: << parameters.label >>
-          steps:
-            - run:
-                name: Enable anaconda uploads
-                command: |
-                  conda install -y -q anaconda-client
-                  conda config --set anaconda_upload yes
-      - when:
-          condition:
-            and:
-              - not: << parameters.label >>
-              - not: << parameters.cuda >>
-          steps:
-            - run:
-                name: Conda build (CPU)
-                no_output_timeout: 30m
-                command: |
-                  cd conda
-                  conda build faiss --python 3.11 -c pytorch
-      - when:
-          condition:
-            and:
-              - << parameters.label >>
-              - not: << parameters.cuda >>
-          steps:
-            - run:
-                name: Conda build (CPU) w/ anaconda upload
-                no_output_timeout: 30m
-                command: |
-                  cd conda
-                  conda build faiss --user pytorch --label <<parameters.label>> -c pytorch
-      - when:
-          condition:
-            and:
-              - not: << parameters.label >>
-              - << parameters.cuda >>
-              - not: << parameters.raft >>
-          steps:
-            - run:
-                name: Conda build (GPU)
-                no_output_timeout: 60m
-                command: |
-                  sudo update-alternatives --set cuda /usr/local/cuda-<<parameters.cuda>>
-                  cd conda
-                  conda build faiss-gpu --variants '{ "cudatoolkit": "<<parameters.cuda>>", "c_compiler_version": "<<parameters.compiler_version>>", "cxx_compiler_version": "<<parameters.compiler_version>>" }' \
-                      -c pytorch -c nvidia
-      - when:
-          condition:
-            and:
-              - << parameters.label >>
-              - << parameters.cuda >>
-              - not: << parameters.raft >>
-          steps:
-            - run:
-                name: Conda build (GPU) w/ anaconda upload
-                no_output_timeout: 60m
-                command: |
-                  sudo update-alternatives --set cuda /usr/local/cuda-<<parameters.cuda>>
-                  cd conda
-                  conda build faiss-gpu --variants '{ "cudatoolkit": "<<parameters.cuda>>", "c_compiler_version": "<<parameters.compiler_version>>", "cxx_compiler_version": "<<parameters.compiler_version>>" }' \
-                      --user pytorch --label <<parameters.label>> -c pytorch -c nvidia
-      - when:
-          condition:
-            and:
-              - not: << parameters.label >>
-              - << parameters.cuda >>
-              - << parameters.raft >>
-          steps:
-            - run:
-                name: Conda build (GPU w/ RAFT)
-                no_output_timeout: 60m
-                command: |
-                  sudo update-alternatives --set cuda /usr/local/cuda-<<parameters.cuda>>
-                  cd conda
-                  conda build faiss-gpu-raft --variants '{ "cudatoolkit": "<<parameters.cuda>>", "c_compiler_version": "<<parameters.compiler_version>>", "cxx_compiler_version": "<<parameters.compiler_version>>" }' \
-                      -c pytorch -c nvidia -c rapidsai-nightly -c conda-forge
-      - when:
-          condition:
-            and:
-              - << parameters.label >>
-              - << parameters.cuda >>
-              - << parameters.raft >>
-          steps:
-            - run:
-                name: Conda build (GPU w/ RAFT) w/ anaconda upload
-                no_output_timeout: 60m
-                command: |
-                  sudo update-alternatives --set cuda /usr/local/cuda-<<parameters.cuda>>
-                  cd conda
-                  conda build faiss-gpu-raft --variants '{ "cudatoolkit": "<<parameters.cuda>>", "c_compiler_version": "<<parameters.compiler_version>>", "cxx_compiler_version": "<<parameters.compiler_version>>" }' \
-                      --user pytorch --label <<parameters.label>> -c pytorch -c nvidia -c rapidsai-nightly -c conda-forge
-
   build_cmake:
     parameters:
       exec:
@@ -195,12 +14,6 @@ jobs:
       opt_level:
         type: string
         default: generic
-      gpu:
-        type: string
-        default: "OFF"
-      raft:
-        type: string
-        default: "OFF"
     executor: << parameters.exec >>
     environment:
       OMP_NUM_THREADS: 10
@@ -216,27 +29,15 @@ jobs:
               bash miniconda.sh -b -p $HOME/miniconda
               ~/miniconda/bin/conda init
             fi
-      - when:
-          condition:
-            equal: [ "ON", << parameters.gpu >> ]
-          steps:
-            - run:
-                name: Configure CUDA
-                command: sudo update-alternatives --set cuda /usr/local/cuda-11.4
       - run:
           name: Set up environment
           command: |
             conda config --set solver libmamba
             conda update -y -q conda
-            conda install -y -q python=3.11 cmake make swig mkl=2023 mkl-devel=2023 numpy scipy pytest gxx_linux-64 sysroot_linux-64
-      - when:
-          condition:
-            equal: [ "ON", << parameters.raft >> ]
-          steps:
-            - run:
-                name: Install libraft
-                command: |
-                  conda install -y -q libraft cuda-version=11.4 -c rapidsai-nightly -c nvidia -c pkgs/main -c conda-forge
+      - run:
+          name: Install env using main channel
+          command: |
+            conda install -y -q python=3.11 cmake make swig mkl=2023 mkl-devel=2023 numpy scipy pytest gxx_linux-64=11.2 sysroot_linux-64
       - run:
           name: Build all targets
           no_output_timeout: 30m
@@ -245,9 +46,9 @@ jobs:
             conda activate
             cmake -B build \
                   -DBUILD_TESTING=ON \
-                  -DBUILD_SHARED_LIBS=OFF \
-                  -DFAISS_ENABLE_GPU=<< parameters.gpu >> \
-                  -DFAISS_ENABLE_RAFT=<< parameters.raft >> \
+                  -DBUILD_SHARED_LIBS=ON \
+                  -DFAISS_ENABLE_GPU=OFF \
+                  -DFAISS_ENABLE_RAFT=OFF \
                   -DFAISS_OPT_LEVEL=<< parameters.opt_level >> \
                   -DFAISS_ENABLE_C_API=ON \
                   -DPYTHON_EXECUTABLE=$(which python) \
@@ -266,38 +67,12 @@ jobs:
           command: |
             cd build/faiss/python
             python setup.py install
-      - when:
-          condition:
-            equal: [ "OFF", << parameters.gpu >> ]
-          steps:
-            - run:
-                name: Python tests (CPU only)
-                command: |
-                  conda install -y -q pytorch -c pytorch
-                  pytest --junitxml=test-results/pytest/results.xml tests/test_*.py
-                  pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py
-      - when:
-          condition:
-            equal: [ "ON", << parameters.gpu >> ]
-          steps:
-            - run:
-                name: Python tests (CPU + GPU)
-                command: |
-                  conda install -y -q pytorch pytorch-cuda=11 -c pytorch -c nvidia
-                  pytest --junitxml=test-results/pytest/results.xml tests/test_*.py
-                  pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py
-                  cp tests/common_faiss_tests.py faiss/gpu/test
-                  pytest --junitxml=test-results/pytest/results-gpu.xml faiss/gpu/test/test_*.py
-                  pytest --junitxml=test-results/pytest/results-gpu-torch.xml faiss/gpu/test/torch_*.py
-      - when:
-          condition:
-            equal: [ "avx2", << parameters.opt_level >> ]
-          steps:
-            - run:
-                name: Test avx2 loading
-                command: |
-                  FAISS_DISABLE_CPU_FEATURES=AVX2 LD_DEBUG=libs python -c "import faiss" 2>&1 | grep faiss.so
-                  LD_DEBUG=libs python -c "import faiss" 2>&1 | grep faiss_avx2.so
+      - run:
+          name: Python tests (CPU only)
+          command: |
+            conda install -y -q pytorch -c pytorch
+            pytest --junitxml=test-results/pytest/results.xml tests/test_*.py
+            pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py
       - store_test_results:
           path: test-results
 
@@ -305,136 +80,7 @@ workflows:
   version: 2
   build:
     jobs:
-      - format:
-          name: Format
       - build_cmake:
-          name: Linux x86_64 (cmake)
-          exec: linux-x86_64-cpu
-      - build_cmake:
-          name: Linux x86_64 AVX2 (cmake)
-          exec: linux-x86_64-cpu
-          opt_level: "avx2"
-      - build_cmake:
-          name: Linux x86_64 GPU (cmake)
-          exec: linux-x86_64-gpu
-          gpu: "ON"
-          requires:
-            - Linux x86_64 AVX2 (cmake)
-      - build_cmake:
-          name: Linux x86_64 GPU w/ RAFT (cmake)
-          exec: linux-x86_64-gpu
-          gpu: "ON"
-          raft: "ON"
-          requires:
-            - Linux x86_64 GPU (cmake)
-      - build_conda:
-          name: Linux x86_64 (conda)
-          exec: linux-x86_64-cpu
-      - build_conda:
-          name: Windows x86_64 (conda)
-          exec: windows-x86_64-cpu
-      - build_conda:
-          name: Linux arm64 (conda)
-          exec: linux-arm64-cpu
-      - build_conda:
-          name: Linux x86_64 packages
-          exec: linux-x86_64-cpu
-          label: main
-          filters:
-            tags:
-              only: /^v.*/
-            branches:
-              ignore: /.*/
-      - build_conda:
-          name: Linux x86_64 GPU packages (CUDA 11.4)
-          exec: linux-x86_64-gpu
-          label: main
-          cuda: "11.4"
-          cuda_archs: "60-real;61-real;62-real;70-real;72-real;75-real;80;86-real"
-          compiler_version: "11.2"
-          filters:
-            tags:
-              only: /^v.*/
-            branches:
-              ignore: /.*/
-      - build_conda:
-          name: Linux x86_64 GPU w/ RAFT packages (CUDA 11.4)
-          exec: linux-x86_64-gpu
-          label: main
-          raft: "ON"
-          cuda: "11.4"
-          cuda_archs: "60-real;61-real;62-real;70-real;72-real;75-real;80;86-real"
-          compiler_version: "11.2"
-          filters:
-            tags:
-              only: /^v.*/
-            branches:
-              ignore: /.*/
-      - build_conda:
-          name: Windows x86_64 packages
-          exec: windows-x86_64-cpu
-          label: main
-          filters:
-            tags:
-              only: /^v.*/
-            branches:
-              ignore: /.*/
-      - build_conda:
-          name: OSX arm64 packages
-          exec: macosx-arm64-cpu
-          label: main
-          filters:
-            tags:
-              only: /^v.*/
-            branches:
-              ignore: /.*/
-      - build_conda:
-          name: Linux arm64 packages
-          exec: linux-arm64-cpu
-          label: main
-          filters:
-            tags:
-              only: /^v.*/
-            branches:
-              ignore: /.*/
-
-  nightly:
-    triggers:
-      - schedule:
-          cron: "0 0 * * *"
-          filters:
-            branches:
-              only:
-                - main
-    jobs:
-      - build_conda:
-          name: Linux x86_64 nightlies
+          name: Linux x86_64 AVX512 (cmake)
           exec: linux-x86_64-cpu
-          label: nightly
-      - build_conda:
-          name: Linux x86_64 GPU nightlies (CUDA 11.4)
-          exec: linux-x86_64-gpu
-          cuda: "11.4"
-          cuda_archs: "60-real;61-real;62-real;70-real;72-real;75-real;80;86-real"
-          compiler_version: "11.2"
-          label: nightly
-      - build_conda:
-          name: Linux x86_64 GPU w/ RAFT nightlies (CUDA 11.4)
-          exec: linux-x86_64-gpu
-          raft: "ON"
-          cuda: "11.4"
-          cuda_archs: "60-real;61-real;62-real;70-real;72-real;75-real;80;86-real"
-          compiler_version: "11.2"
-          label: nightly
-      - build_conda:
-          name: Windows x86_64 nightlies
-          exec: windows-x86_64-cpu
-          label: nightly
-      - build_conda:
-          name: OSX arm64 nightlies
-          exec: macosx-arm64-cpu
-          label: nightly
-      - build_conda:
-          name: Linux arm64 nightlies
-          exec: linux-arm64-cpu
-          label: nightly
+          opt_level: "avx512"
diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
new file mode 100644
index 0000000000..2bc476add5
--- /dev/null
+++ b/.github/actions/build_cmake/action.yml
@@ -0,0 +1,108 @@
+name: Build cmake
+description: Builds FAISS with CMake and runs the C++ and Python test suites.
+inputs:
+  opt_level:
+    description: 'Compile options / optimization level.'
+    required: false
+    default: generic
+  # ON/OFF are quoted so every YAML parser reads them as strings, not booleans.
+  gpu:
+    description: 'Enable GPU support.'
+    required: false
+    default: 'OFF'
+  raft:
+    description: 'Enable RAFT support.'
+    required: false
+    default: 'OFF'
+runs:
+  using: composite
+  steps:
+    - name: Setup miniconda
+      uses: conda-incubator/setup-miniconda@v3
+      with:
+        python-version: '3.11'
+        miniconda-version: latest
+    - name: Configure build environment
+      shell: bash
+      run: |
+        # initialize Conda
+        conda config --set solver libmamba
+        conda update -y -q conda
+        echo "$CONDA/bin" >> $GITHUB_PATH
+
+        # install base packages
+        conda install -y -q -c conda-forge gxx_linux-64=11.2 sysroot_linux-64=2.28
+        conda install -y -q python=3.11 cmake make swig mkl=2023 mkl-devel=2023 numpy scipy pytest
+
+        # install CUDA packages
+        if [ "${{ inputs.gpu }}" = "ON" ] && [ "${{ inputs.raft }}" = "OFF" ]; then
+          conda install -y -q cuda-toolkit -c "nvidia/label/cuda-11.8.0"
+        fi
+
+        # install RAFT packages
+        if [ "${{ inputs.raft }}" = "ON" ]; then
+          conda install -y -q libraft cuda-version=11.8 cuda-toolkit -c rapidsai-nightly -c "nvidia/label/cuda-11.8.0" -c conda-forge
+        fi
+
+        # install test packages (pytest is already part of the base packages)
+        if [ "${{ inputs.gpu }}" = "ON" ]; then
+          conda install -y -q pytorch pytorch-cuda=11.8 -c pytorch -c nvidia/label/cuda-11.8.0
+        else
+          conda install -y -q pytorch -c pytorch
+        fi
+    - name: Build all targets
+      shell: bash
+      run: |
+        eval "$(conda shell.bash hook)"
+        conda activate
+        cmake -B build \
+              -DBUILD_TESTING=ON \
+              -DBUILD_SHARED_LIBS=ON \
+              -DFAISS_ENABLE_GPU=${{ inputs.gpu }} \
+              -DFAISS_ENABLE_RAFT=${{ inputs.raft }} \
+              -DFAISS_OPT_LEVEL=${{ inputs.opt_level }} \
+              -DFAISS_ENABLE_C_API=ON \
+              -DPYTHON_EXECUTABLE=$CONDA/bin/python \
+              -DCMAKE_BUILD_TYPE=Release \
+              -DBLA_VENDOR=Intel10_64_dyn \
+              -DCMAKE_CUDA_FLAGS="-gencode arch=compute_75,code=sm_75" \
+              .
+        make -k -C build -j$(nproc)
+    - name: C++ tests
+      shell: bash
+      run: |
+        export GTEST_OUTPUT="xml:$(realpath .)/test-results/googletest/"
+        make -C build test
+    - name: Install Python extension
+      shell: bash
+      working-directory: build/faiss/python
+      run: |
+        $CONDA/bin/python setup.py install
+    - name: Python tests (CPU only)
+      if: inputs.gpu == 'OFF'
+      shell: bash
+      run: |
+        pytest --junitxml=test-results/pytest/results.xml tests/test_*.py
+        pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py
+    - name: Python tests (CPU + GPU)
+      if: inputs.gpu == 'ON'
+      shell: bash
+      run: |
+        pytest --junitxml=test-results/pytest/results.xml tests/test_*.py
+        pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py
+        cp tests/common_faiss_tests.py faiss/gpu/test
+        pytest --junitxml=test-results/pytest/results-gpu.xml faiss/gpu/test/test_*.py
+        pytest --junitxml=test-results/pytest/results-gpu-torch.xml faiss/gpu/test/torch_*.py
+    - name: Test avx2 loading
+      if: inputs.opt_level == 'avx2'
+      shell: bash
+      run: |
+        FAISS_DISABLE_CPU_FEATURES=AVX2 LD_DEBUG=libs $CONDA/bin/python -c "import faiss" 2>&1 | grep faiss.so
+        LD_DEBUG=libs $CONDA/bin/python -c "import faiss" 2>&1 | grep faiss_avx2.so
+    # Upload even when an earlier step failed: that is when the reports matter most.
+    - name: Upload test results
+      if: always()
+      uses: actions/upload-artifact@v4
+      with:
+        name: test-results-${{ inputs.opt_level }}-${{ inputs.gpu }}-${{ inputs.raft }}
+        path: test-results
diff --git a/.github/actions/build_conda/action.yml b/.github/actions/build_conda/action.yml
new file mode 100644
index 0000000000..982430c351
--- /dev/null
+++ b/.github/actions/build_conda/action.yml
@@ -0,0 +1,96 @@
+name: Conda build
+description: Builds FAISS inside a Conda environment and uploads to repository when label is provided.
+inputs:
+  label:
+    description: "The label to be used for uploads to Conda."
+    default: ""  # empty: build only, the upload steps are skipped
+    required: false
+  cuda:
+    description: "CUDA toolkit version to use."
+    default: ""  # empty: the CPU-only faiss recipe is built
+    required: false
+  raft:
+    description: "Enable RAFT support."
+    default: ""  # any non-empty value selects the faiss-gpu-raft recipe
+    required: false
+  compiler_version:
+    description: "Compiler version for C/C++/CUDA."
+    default: ""
+    required: false
+runs:
+  using: composite
+  steps:
+    - name: Choose shell
+      id: choose_shell
+      shell: bash
+      run: |
+        # Windows runners build under pwsh; every other OS keeps bash
+        if [ "${{ runner.os }}" = "Windows" ]; then
+          echo "shell=pwsh" >> "$GITHUB_OUTPUT"
+        else
+          echo "shell=bash" >> "$GITHUB_OUTPUT"
+        fi
+    - name: Setup miniconda
+      uses: conda-incubator/setup-miniconda@v3
+      with:
+        python-version: '3.11'
+        miniconda-version:  latest
+    - name: Install conda build tools
+      shell: ${{ steps.choose_shell.outputs.shell }}
+      run: |
+        conda update -y -q conda
+        conda install -y -q conda-build
+    - name: Enable anaconda uploads
+      if: inputs.label != ''
+      shell: ${{ steps.choose_shell.outputs.shell }}
+      env:
+        PACKAGE_TYPE: ${{ inputs.label }}
+      run: |
+        conda install -y -q anaconda-client
+        conda config --set anaconda_upload yes
+    - name: Conda build (CPU)
+      if: inputs.label == '' && inputs.cuda == ''
+      shell: ${{ steps.choose_shell.outputs.shell }}
+      working-directory: conda
+      run: |
+        conda build faiss --python 3.11 -c pytorch
+    - name: Conda build (CPU) w/ anaconda upload
+      if: inputs.label != '' && inputs.cuda == ''
+      shell: ${{ steps.choose_shell.outputs.shell }}
+      working-directory: conda
+      env:
+        PACKAGE_TYPE: ${{ inputs.label }}
+      run: |
+        conda build faiss --user pytorch --label ${{ inputs.label }} -c pytorch
+    - name: Conda build (GPU)
+      if: inputs.label == '' && inputs.cuda != '' && inputs.raft == ''
+      shell: ${{ steps.choose_shell.outputs.shell }}
+      working-directory: conda
+      run: |
+        conda build faiss-gpu --variants '{ "cudatoolkit": "${{ inputs.cuda }}", "c_compiler_version": "${{ inputs.compiler_version }}", "cxx_compiler_version": "${{ inputs.compiler_version }}" }' \
+            -c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia
+    - name: Conda build (GPU) w/ anaconda upload
+      if: inputs.label != '' && inputs.cuda != '' && inputs.raft == ''
+      shell: ${{ steps.choose_shell.outputs.shell }}
+      working-directory: conda
+      env:
+        PACKAGE_TYPE: ${{ inputs.label }}
+      run: |
+        conda build faiss-gpu --variants '{ "cudatoolkit": "${{ inputs.cuda }}", "c_compiler_version": "${{ inputs.compiler_version }}", "cxx_compiler_version": "${{ inputs.compiler_version }}" }' \
+            --user pytorch --label ${{ inputs.label }} -c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia
+    - name: Conda build (GPU w/ RAFT)
+      if: inputs.label == '' && inputs.cuda != '' && inputs.raft != ''
+      shell: ${{ steps.choose_shell.outputs.shell }}
+      working-directory: conda
+      run: |
+        conda build faiss-gpu-raft --variants '{ "cudatoolkit": "${{ inputs.cuda }}", "c_compiler_version": "${{ inputs.compiler_version }}", "cxx_compiler_version": "${{ inputs.compiler_version }}" }' \
+            -c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia -c rapidsai -c rapidsai-nightly -c conda-forge
+    - name: Conda build (GPU w/ RAFT) w/ anaconda upload
+      if: inputs.label != '' && inputs.cuda != '' && inputs.raft != ''
+      shell: ${{ steps.choose_shell.outputs.shell }}
+      working-directory: conda
+      env:
+        PACKAGE_TYPE: ${{ inputs.label }}
+      run: |
+        conda build faiss-gpu-raft --variants '{ "cudatoolkit": "${{ inputs.cuda }}", "c_compiler_version": "${{ inputs.compiler_version }}", "cxx_compiler_version": "${{ inputs.compiler_version }}" }' \
+            --user pytorch --label ${{ inputs.label }} -c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia -c rapidsai -c rapidsai-nightly -c conda-forge
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 0000000000..bd415dfce8
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,260 @@
+name: Build
+on:
+  workflow_dispatch:
+  pull_request:
+    branches:
+      - main
+  push:
+    tags:
+      - 'v*'
+env:
+  OMP_NUM_THREADS: '10'
+  MKL_THREADING_LAYER: GNU
+jobs:
+  format:
+    name: Format
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Install clang-format
+        run: |
+            sudo apt-get update -y
+            sudo apt-get install -y lsb-release wget software-properties-common gnupg
+            wget https://apt.llvm.org/llvm.sh
+            chmod u+x llvm.sh
+            sudo ./llvm.sh 18
+            sudo apt-get install -y git-core clang-format-18
+      - name: Verify clang-format
+        run: |
+            git ls-files | grep -E  '\.(cpp|h|cu|cuh)$' | xargs clang-format-18 -i
+            if git diff --quiet; then
+              echo "Formatting OK!"
+            else
+              echo "Formatting not OK!"
+              echo "------------------"
+              git --no-pager diff --color
+              exit 1
+            fi
+  linux-x86_64-cmake:
+    name: Linux x86_64 (cmake)
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - uses: ./.github/actions/build_cmake
+  linux-x86_64-AVX2-cmake:
+    name: Linux x86_64 AVX2 (cmake)
+    needs: linux-x86_64-cmake
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - uses: ./.github/actions/build_cmake
+        with:
+          opt_level: avx2
+  linux-x86_64-AVX512-cmake:
+    name: Linux x86_64 AVX512 (cmake)
+    if: false # TODO: enable when GitHub Actions adds AVX-512 hosts
+    needs: linux-x86_64-cmake
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - uses: ./.github/actions/build_cmake
+        with:
+          opt_level: avx512
+  linux-x86_64-GPU-cmake:
+    name: Linux x86_64 GPU (cmake)
+    needs: linux-x86_64-cmake
+    runs-on: 4-core-ubuntu-gpu-t4
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - uses: ./.github/actions/build_cmake
+        with:
+          gpu: "ON"
+  linux-x86_64-GPU-w-RAFT-cmake:
+    name: Linux x86_64 GPU w/ RAFT (cmake)
+    needs: linux-x86_64-cmake
+    runs-on: 4-core-ubuntu-gpu-t4
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - uses: ./.github/actions/build_cmake
+        with:
+          gpu: "ON"
+          raft: "ON"
+  linux-x86_64-conda:
+    name: Linux x86_64 (conda)
+    needs: linux-x86_64-cmake
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          fetch-tags: true
+      - uses: ./.github/actions/build_conda
+  windows-x86_64-conda:
+    name: Windows x86_64 (conda)
+    needs: linux-x86_64-cmake
+    runs-on: windows-2019
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          fetch-tags: true
+      - uses: ./.github/actions/build_conda
+  linux-arm64-conda:
+    name: Linux arm64 (conda)
+    needs: linux-x86_64-cmake
+    runs-on: 2-core-ubuntu-arm
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          fetch-tags: true
+      - uses: ./.github/actions/build_conda
+  # Release packages: run only for pushed v* tags; the upload needs ANACONDA_API_TOKEN.
+  linux-x86_64-packages:
+    name: Linux x86_64 packages
+    if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          fetch-tags: true
+      - uses: ./.github/actions/build_conda
+        env:
+          ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
+        with:
+          label: main
+  linux-x86_64-GPU-packages-CUDA-11-4-4:
+    name: Linux x86_64 GPU packages (CUDA 11.4.4)
+    if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
+    runs-on: 4-core-ubuntu-gpu-t4
+    env:
+      CUDA_ARCHS: "60-real;61-real;62-real;70-real;72-real;75-real;80;86-real"
+      FAISS_FLATTEN_CONDA_INCLUDES: "1"
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          fetch-tags: true
+      - uses: ./.github/actions/build_conda
+        env:
+          ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
+        with:
+          label: main
+          cuda: "11.4.4"
+          compiler_version: "11.2"
+  linux-x86_64-GPU-RAFT-packages-CUDA11-8-0:
+    name: Linux x86_64 GPU w/ RAFT packages (CUDA 11.8.0)
+    if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
+    runs-on: 4-core-ubuntu-gpu-t4
+    env:
+      CUDA_ARCHS: "70-real;72-real;75-real;80;86-real"
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          fetch-tags: true
+      - uses: ./.github/actions/build_conda
+        env:
+          ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
+        with:
+          label: main
+          raft: "ON"
+          cuda: "11.8.0"
+          compiler_version: "11.2"
+  linux-x86_64-GPU-packages-CUDA-12-1-1:
+    name: Linux x86_64 GPU packages (CUDA 12.1.1)
+    if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
+    runs-on: 4-core-ubuntu-gpu-t4
+    env:
+      CUDA_ARCHS: "70-real;72-real;75-real;80;86-real"
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          fetch-tags: true
+      - uses: ./.github/actions/build_conda
+        env:
+          ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
+        with:
+          label: main
+          cuda: "12.1.1"
+          compiler_version: "11.2"
+  linux-x86_64-GPU-RAFT-packages-CUDA12-1-1:
+    name: Linux x86_64 GPU w/ RAFT packages (CUDA 12.1.1)
+    if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
+    runs-on: 4-core-ubuntu-gpu-t4
+    env:
+      CUDA_ARCHS: "70-real;72-real;75-real;80;86-real"
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          fetch-tags: true
+      - uses: ./.github/actions/build_conda
+        env:
+          ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
+        with:
+          label: main
+          raft: "ON"
+          cuda: "12.1.1"
+          compiler_version: "11.2"
+  windows-x86_64-packages:
+    name: Windows x86_64 packages
+    if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
+    runs-on: windows-2019
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          fetch-tags: true
+      - uses: ./.github/actions/build_conda
+        env:
+          ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
+        with:
+          label: main
+  osx-arm64-packages:
+    name: OSX arm64 packages
+    if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
+    runs-on: macos-14
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          fetch-tags: true
+      - uses: ./.github/actions/build_conda
+        env:
+          ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
+        with:
+          label: main
+  linux-arm64-packages:
+    name: Linux arm64 packages
+    if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
+    runs-on: 2-core-ubuntu-arm
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          fetch-tags: true
+      - uses: ./.github/actions/build_conda
+        env:
+          ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
+        with:
+          label: main
diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
new file mode 100644
index 0000000000..eabee07744
--- /dev/null
+++ b/.github/workflows/nightly.yml
@@ -0,0 +1,139 @@
+name: Nightly  # workflow that builds the nightly conda packages on a schedule
+on:
+  schedule:
+    - cron:  '10 1 * * *'  # minute 10, hour 1, every day (GitHub evaluates schedules in UTC)
+env:  # workflow-level variables, inherited by every job below
+  OMP_NUM_THREADS: '10'  # quoted: environment values are strings
+  MKL_THREADING_LAYER: GNU
+jobs:
+  linux-x86_64-nightly:  # nightly build on ubuntu-latest; no cuda/raft inputs are passed
+    name: Linux x86_64 nightlies
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # 0 = full history instead of a shallow clone
+          fetch-tags: true
+      - uses: ./.github/actions/build_conda  # repo-local action; its definition is outside this workflow file
+        env:  # step-scoped, so the secret is exposed to this step only
+          ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
+        with:
+          label: nightly  # presumably the Anaconda label the package is published under — confirm in build_conda
+  linux-x86_64-GPU-CUDA-11-4-4-nightly:  # nightly GPU build with cuda "11.4.4"; no raft input
+    name: Linux x86_64 GPU nightlies (CUDA 11.4.4)
+    runs-on: 4-core-ubuntu-gpu-t4
+    env:
+      CUDA_ARCHS: "60-real;61-real;62-real;70-real;72-real;75-real;80;86-real"  # adds 60/61/62 compared with the other GPU jobs in this file
+      FAISS_FLATTEN_CONDA_INCLUDES: "1"  # only this nightly job sets it; consumer is outside this view — TODO confirm purpose
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # 0 = full history instead of a shallow clone
+          fetch-tags: true
+      - uses: ./.github/actions/build_conda  # repo-local action; its definition is outside this workflow file
+        env:  # step-scoped, so the secret is exposed to this step only
+          ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
+        with:
+          label: nightly
+          cuda: "11.4.4"
+          compiler_version: "11.2"  # quoted so the version stays a string, not a float
+  linux-x86_64-GPU-RAFT-CUDA11-8-0-nightly:  # nightly GPU build with raft "ON" and cuda "11.8.0"
+    name: Linux x86_64 GPU w/ RAFT nightlies (CUDA 11.8.0)
+    runs-on: 4-core-ubuntu-gpu-t4
+    env:
+      CUDA_ARCHS: "70-real;72-real;75-real;80;86-real"  # presumably read by the build_conda action (not visible here) — confirm
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # 0 = full history instead of a shallow clone
+          fetch-tags: true
+      - uses: ./.github/actions/build_conda  # repo-local action; its definition is outside this workflow file
+        env:  # step-scoped, so the secret is exposed to this step only
+          ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
+        with:
+          label: nightly
+          raft: "ON"  # quoted so YAML keeps the string "ON" rather than a boolean
+          cuda: "11.8.0"
+          compiler_version: "11.2"  # quoted so the version stays a string, not a float
+  linux-x86_64-GPU-CUDA-12-1-1-nightly:  # nightly GPU build with cuda "12.1.1"; no raft input
+    name: Linux x86_64 GPU nightlies (CUDA 12.1.1)
+    runs-on: 4-core-ubuntu-gpu-t4
+    env:
+      CUDA_ARCHS: "70-real;72-real;75-real;80;86-real"  # presumably read by the build_conda action (not visible here) — confirm
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # 0 = full history instead of a shallow clone
+          fetch-tags: true
+      - uses: ./.github/actions/build_conda  # repo-local action; its definition is outside this workflow file
+        env:  # step-scoped, so the secret is exposed to this step only
+          ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
+        with:
+          label: nightly
+          cuda: "12.1.1"
+          compiler_version: "11.2"  # quoted so the version stays a string, not a float
+  linux-x86_64-GPU-RAFT-CUDA12-1-1-nightly:  # nightly GPU build with raft "ON" and cuda "12.1.1"
+    name: Linux x86_64 GPU w/ RAFT nightlies (CUDA 12.1.1)
+    runs-on: 4-core-ubuntu-gpu-t4
+    env:
+      CUDA_ARCHS: "70-real;72-real;75-real;80;86-real"  # presumably read by the build_conda action (not visible here) — confirm
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # 0 = full history instead of a shallow clone
+          fetch-tags: true
+      - uses: ./.github/actions/build_conda  # repo-local action; its definition is outside this workflow file
+        env:  # step-scoped, so the secret is exposed to this step only
+          ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
+        with:
+          label: nightly
+          raft: "ON"  # quoted so YAML keeps the string "ON" rather than a boolean
+          cuda: "12.1.1"
+          compiler_version: "11.2"  # quoted so the version stays a string, not a float
+  windows-x86_64-nightly:  # nightly build on a Windows runner; no cuda/raft inputs are passed
+    name: Windows x86_64 nightlies
+    runs-on: windows-2019
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # 0 = full history instead of a shallow clone
+          fetch-tags: true
+      - uses: ./.github/actions/build_conda  # repo-local action; its definition is outside this workflow file
+        env:  # step-scoped, so the secret is exposed to this step only
+          ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
+        with:
+          label: nightly
+  osx-arm64-nightly:  # nightly build on a macOS runner; no cuda/raft inputs are passed
+    name: OSX arm64 nightlies
+    runs-on: macos-14
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # 0 = full history instead of a shallow clone
+          fetch-tags: true
+      - uses: ./.github/actions/build_conda  # repo-local action; its definition is outside this workflow file
+        env:  # step-scoped, so the secret is exposed to this step only
+          ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
+        with:
+          label: nightly
+  linux-arm64-nightly:  # nightly build on an arm Linux runner; no cuda/raft inputs are passed
+    name: Linux arm64 nightlies
+    runs-on: 2-core-ubuntu-arm
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # 0 = full history instead of a shallow clone
+          fetch-tags: true
+      - uses: ./.github/actions/build_conda  # repo-local action; its definition is outside this workflow file
+        env:  # step-scoped, so the secret is exposed to this step only
+          ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
+        with:
+          label: nightly
diff --git a/.gitignore b/.gitignore
index 183e2066e1..d6df432fa5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,7 @@
 *.dylib
 *.pyc
 *~
+/build/
 /config.*
 /aclocal.m4
 /autom4te.cache/
@@ -17,3 +18,4 @@
 /tests/test
 /tests/gtest/
 faiss/python/swigfaiss_avx2.swig
+faiss/python/swigfaiss_avx512.swig
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9a38f4e2d5..8d289ec2f6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,36 @@ We try to indicate most contributions here with the contributor names who are no
 the Facebook Faiss team.  Feel free to add entries here if you submit a PR.
 
 ## [Unreleased]
+### Changed
+- Previously, when moving indices to GPU with coarse quantizers that were not implemented on GPU, the cloner would silently fall back to CPU. This version now throws an exception instead, and the calling code must explicitly allow fallback to CPU by setting a flag in the cloner config.
+
+## [1.8.0] - 2024-02-27
+### Added
+- Added a new conda package faiss-gpu-raft alongside faiss-cpu and faiss-gpu
+- Integrated IVF-Flat and IVF-PQ implementations in faiss-gpu-raft from RAFT by Nvidia [thanks Corey Nolet and Tarang Jain]
+- Added a context parameter to InvertedLists and InvertedListsIterator
+- Added Faiss on Rocksdb demo to show how inverted lists can be persisted in a key-value store
+- Introduced Offline IVF framework powered by Faiss big batch search
+- Added SIMD NEON Optimization for QT_FP16 in Scalar Quantizer. [thanks Naveen Tatikonda]
+- Generalized ResultHandler and supported range search for HNSW and FastScan
+- Introduced avx512 optimization mode and FAISS_OPT_LEVEL env variable [thanks Alexandr Guzhva]
+- Added search parameters for IndexRefine::search() and IndexRefineFlat::search()
+- Supported large two-level clustering
+- Added support for Python 3.11 and 3.12
+- Added support for CUDA 12
+
+### Changed
+- Used the benchmark to find Pareto optimal indices. Intentionally limited to IVF(Flat|HNSW),PQ|SQ indices
+- Split off RQ encoding steps to another file
+- Supported better NaN handling
+- HNSW speedup + Distance 4 points [thanks Alexandr Guzhva]
+
+### Fixed
+- Fixed DeviceVector reallocations in Faiss GPU
+- Used efSearch from params if provided in HNSW search
+- Fixed warp synchronous behavior in Faiss GPU CUDA 12
+
+
 ## [1.7.4] - 2023-04-12
 ### Added
 - Added big batch IVF search for conducting efficient search with big batches of queries
@@ -258,7 +288,8 @@ by conda install -c pytorch faiss-gpu cudatoolkit=10.0.
 - C bindings.
 - Extended tutorial to GPU indices.
 
-[Unreleased]: https://github.com/facebookresearch/faiss/compare/v1.7.4...HEAD
+[Unreleased]: https://github.com/facebookresearch/faiss/compare/v1.8.0...HEAD
+[1.8.0]: https://github.com/facebookresearch/faiss/compare/v1.7.4...v1.8.0
 [1.7.4]: https://github.com/facebookresearch/faiss/compare/v1.7.3...v1.7.4
 [1.7.3]: https://github.com/facebookresearch/faiss/compare/v1.7.2...v1.7.3
 [1.7.2]: https://github.com/facebookresearch/faiss/compare/v1.7.1...v1.7.2
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 85c8a820bc..cedee9c456 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -18,7 +18,7 @@
 # the License.
 # =============================================================================
 
-cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR)
+cmake_minimum_required(VERSION 3.24.0 FATAL_ERROR)
 
 set(FAISS_LANGUAGES CXX)
 
@@ -40,7 +40,7 @@ rapids_cuda_init_architectures(faiss_c_library)
 endif()
 
 project(faiss
-  VERSION 1.7.4
+  VERSION 1.8.0
   DESCRIPTION "A library for efficient similarity search and clustering of dense vectors."
   HOMEPAGE_URL "https://github.com/facebookresearch/faiss"
   LANGUAGES ${FAISS_LANGUAGES})
@@ -62,9 +62,9 @@ if(FAISS_ENABLE_GPU)
   enable_language(CUDA)
 endif()
 
-if(FAISS_ENABLE_RAFT)
-  find_package(raft COMPONENTS compiled distributed)
-endif()
+if(FAISS_ENABLE_RAFT AND NOT TARGET raft::raft)  # skip the lookup when a raft::raft target is already defined
+  find_package(raft COMPONENTS compiled distributed)
+endif()
 
 add_subdirectory(faiss)
 
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 5ef204b946..10fc8152f6 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -42,7 +42,7 @@ outlined on that page and do not file a public issue.
 
 ## Coding Style
 
-* 4 or 2 spaces for indentation in C++ (no tabs)
+* 4 spaces for indentation in C++ (no tabs)
 * 80 character line length (both for C++ and Python)
 * C++ language level: C++17
 
diff --git a/INSTALL.md b/INSTALL.md
index dd04511bd2..5bd4f6d448 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -1,47 +1,50 @@
 # Installing Faiss via conda
 
-The recommended way to install Faiss is through [conda](https://docs.conda.io).
+The supported way to install Faiss is through [conda](https://docs.conda.io).
 Stable releases are pushed regularly to the pytorch conda channel, as well as
 pre-release nightly builds.
 
-The CPU-only `faiss-cpu` conda package is currently available on Linux, OSX, and
-Windows. The `faiss-gpu`, containing both CPU and GPU indices, is available on
-Linux systems, for CUDA 11.4. Packages are built for Python versions 3.8-3.10.
+- The CPU-only faiss-cpu conda package is currently available on Linux (x86_64 and arm64), OSX (arm64 only), and Windows (x86_64)
+- faiss-gpu, containing both CPU and GPU indices, is available on Linux (x86_64 only) for CUDA 11.4 and 12.1
+- NEW: faiss-gpu-raft, containing both CPU and GPU indices provided by NVIDIA RAFT, is available on Linux (x86_64 only) for CUDA 11.8 and 12.1.
 
 To install the latest stable release:
 
 ``` shell
 # CPU-only version
-$ conda install -c pytorch faiss-cpu=1.7.4 mkl=2021 blas=1.0=mkl
+$ conda install -c pytorch faiss-cpu=1.8.0
 
 # GPU(+CPU) version
-$ conda install -c pytorch -c nvidia faiss-gpu=1.7.4 mkl=2021 blas=1.0=mkl
+$ conda install -c pytorch -c nvidia faiss-gpu=1.8.0
+
+# GPU(+CPU) version with NVIDIA RAFT
+$ conda install -c pytorch -c nvidia -c rapidsai -c conda-forge faiss-gpu-raft=1.8.0
 ```
 
-For faiss-gpu, the nvidia channel is required for cudatoolkit=11.4, which is not
+For faiss-gpu, the nvidia channel is required for CUDA, which is not
 published in the main anaconda channel.
 
-NOTE: due to a bug in the latest 1.7.4 release, Intel MKL 2021 needs to be installed
-separately where applicable. Remove the MKL reference when installing on
-non-Intel platforms.
+For faiss-gpu-raft, the nvidia, rapidsai and conda-forge channels are required.
 
-Nightly pre-release packages can be installed as follows. There is no need to
-install MKL separately, the correct package is automatically installed as a
-dependency where necessary:
+Nightly pre-release packages can be installed as follows:
 
 ``` shell
 # CPU-only version
 $ conda install -c pytorch/label/nightly faiss-cpu
 
 # GPU(+CPU) version
-$ conda install -c pytorch/label/nightly -c nvidia faiss-gpu=1.7.4
+$ conda install -c pytorch/label/nightly -c nvidia faiss-gpu=1.8.0
+
+# GPU(+CPU) version with NVIDIA RAFT
+$ conda install -c pytorch -c nvidia -c rapidsai -c conda-forge faiss-gpu-raft=1.8.0 pytorch pytorch-cuda numpy
 ```
+In the above commands, pytorch-cuda=11 or pytorch-cuda=12 would select a specific CUDA version, if it’s required.
 
-A combination of versions that installs GPU Faiss with CUDA 11.4 and Pytorch (as of 2023-06-19):
+A combination of versions that installs GPU Faiss with CUDA and Pytorch (as of 2024-05-15):
 ```
-conda create --name faiss_1.7.4 python=3.10
-conda activate faiss_1.7.4
-conda install faiss-gpu=1.7.4 mkl=2021 pytorch pytorch-cuda numpy -c pytorch -c nvidia
+conda create --name faiss_1.8.0
+conda activate faiss_1.8.0
+conda install -c pytorch -c nvidia faiss-gpu=1.8.0 pytorch=*=*cuda* pytorch-cuda=11 numpy
 ```
 
 ## Installing from conda-forge
@@ -77,7 +80,7 @@ found to run on other platforms as well, see
 
 The basic requirements are:
 - a C++17 compiler (with support for OpenMP support version 2 or higher),
-- a BLAS implementation (we strongly recommend using Intel MKL for best
+- a BLAS implementation (on Intel machines we strongly recommend using Intel MKL for best
 performance).
 
 The optional requirements are:
diff --git a/README.md b/README.md
index 0db380b807..bd1cf33a68 100644
--- a/README.md
+++ b/README.md
@@ -49,11 +49,22 @@ The main authors of Faiss are:
 - [Lucas Hosseini](https://github.com/beauby) implemented the binary indexes and the build system
 - [Chengqi Deng](https://github.com/KinglittleQ) implemented NSG, NNdescent and much of the additive quantization code.
 - [Alexandr Guzhva](https://github.com/alexanderguzhva) many optimizations: SIMD, memory allocation and layout, fast decoding kernels for vector codecs, etc.
+- [Gergely Szilvasy](https://github.com/algoriddle) build system, benchmarking framework.
 
 ## Reference
 
-Reference to cite when you use Faiss in a research paper:
-
+References to cite when you use Faiss in a research paper:
+```
+@article{douze2024faiss,
+      title={The Faiss library},
+      author={Matthijs Douze and Alexandr Guzhva and Chengqi Deng and Jeff Johnson and Gergely Szilvasy and Pierre-Emmanuel Mazaré and Maria Lomeli and Lucas Hosseini and Hervé Jégou},
+      year={2024},
+      eprint={2401.08281},
+      archivePrefix={arXiv},
+      primaryClass={cs.LG}
+}
+```
+For the GPU version of Faiss, please cite:
 ```
 @article{johnson2019billion,
   title={Billion-scale similarity search with {GPUs}},
diff --git a/benchs/bench_cppcontrib_sa_decode.cpp b/benchs/bench_cppcontrib_sa_decode.cpp
index c5c6b0bf18..b960fb7c6a 100644
--- a/benchs/bench_cppcontrib_sa_decode.cpp
+++ b/benchs/bench_cppcontrib_sa_decode.cpp
@@ -213,10 +213,9 @@ static void verifyIndex2LevelDecoder(
         // evaluate the error
         double error = getError(n, d, outputFaiss, outputKernel1);
 
-        std::cout << description << "\t" << n << "\t" << d << "\t"
-                  << "store_seq"
-                  << "\t" << nIterations << "\t" << timeFaiss << "\t"
-                  << timeKernel << "\t" << error << std::endl;
+        std::cout << description << "\t" << n << "\t" << d << "\tstore_seq\t"
+                  << nIterations << "\t" << timeFaiss << "\t" << timeKernel
+                  << "\t" << error << std::endl;
     }
 
     //////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -262,11 +261,9 @@ static void verifyIndex2LevelDecoder(
 
         // evaluate the error
         const double error = getError(n, d, outputFaiss, outputKernel1);
-
-        std::cout << description << "\t" << n << "\t" << d << "\t"
-                  << "store_rnd"
-                  << "\t" << nIterations << "\t" << timeFaiss << "\t"
-                  << timeKernel << "\t" << error << std::endl;
+        std::cout << description << "\t" << n << "\t" << d << "\tstore_rnd\t"
+                  << nIterations << "\t" << timeFaiss << "\t" << timeKernel
+                  << "\t" << error << std::endl;
     }
 
     //////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -326,10 +323,9 @@ static void verifyIndex2LevelDecoder(
         // evaluate the error
         const double error1 = getError(n, d, outputFaiss, outputKernel1);
 
-        std::cout << description << "\t" << n << "\t" << d << "\t"
-                  << "accum_rnd"
-                  << "\t" << nIterations << "\t" << timeFaiss << "\t"
-                  << timeKernel1 << "\t" << error1 << std::endl;
+        std::cout << description << "\t" << n << "\t" << d << "\taccum_rnd\t"
+                  << nIterations << "\t" << timeFaiss << "\t" << timeKernel1
+                  << "\t" << error1 << std::endl;
 
         // kernels: accum 2 points, shared centroids
         StopWatch swKernel2;
@@ -356,10 +352,9 @@ static void verifyIndex2LevelDecoder(
         // evaluate the error
         const double error2 = getError(n, d, outputFaiss, outputKernel2);
 
-        std::cout << description << "\t" << n << "\t" << d << "\t"
-                  << "accum2_rnd"
-                  << "\t" << nIterations << "\t" << timeFaiss << "\t"
-                  << timeKernel2 << "\t" << error2 << std::endl;
+        std::cout << description << "\t" << n << "\t" << d << "\taccum2_rnd\t"
+                  << nIterations << "\t" << timeFaiss << "\t" << timeKernel2
+                  << "\t" << error2 << std::endl;
 
         // kernels: accum 2 points, unique centroids
         StopWatch swKernel2u;
@@ -388,10 +383,9 @@ static void verifyIndex2LevelDecoder(
         // evaluate the error
         const double error2u = getError(n, d, outputFaiss, outputKernel2u);
 
-        std::cout << description << "\t" << n << "\t" << d << "\t"
-                  << "accum2u_rnd"
-                  << "\t" << nIterations << "\t" << timeFaiss << "\t"
-                  << timeKernel2u << "\t" << error2u << std::endl;
+        std::cout << description << "\t" << n << "\t" << d << "\taccum2u_rnd\t"
+                  << nIterations << "\t" << timeFaiss << "\t" << timeKernel2u
+                  << "\t" << error2u << std::endl;
 
         // kernels: accum 3 points, shared centroids
         StopWatch swKernel3;
@@ -423,10 +417,9 @@ static void verifyIndex2LevelDecoder(
         // evaluate the error
         const double error3 = getError(n, d, outputFaiss, outputKernel3);
 
-        std::cout << description << "\t" << n << "\t" << d << "\t"
-                  << "accum3_rnd"
-                  << "\t" << nIterations << "\t" << timeFaiss << "\t"
-                  << timeKernel3 << "\t" << error3 << std::endl;
+        std::cout << description << "\t" << n << "\t" << d << "\taccum3_rnd\t"
+                  << nIterations << "\t" << timeFaiss << "\t" << timeKernel3
+                  << "\t" << error3 << std::endl;
 
         // kernels: accum 3 points, unique centroids
         StopWatch swKernel3u;
@@ -462,10 +455,9 @@ static void verifyIndex2LevelDecoder(
         // evaluate the error
         const double error3u = getError(n, d, outputFaiss, outputKernel3u);
 
-        std::cout << description << "\t" << n << "\t" << d << "\t"
-                  << "accum3u_rnd"
-                  << "\t" << nIterations << "\t" << timeFaiss << "\t"
-                  << timeKernel3u << "\t" << error3u << std::endl;
+        std::cout << description << "\t" << n << "\t" << d << "\taccum3u_rnd\t"
+                  << nIterations << "\t" << timeFaiss << "\t" << timeKernel3u
+                  << "\t" << error3u << std::endl;
     }
 }
 
@@ -531,10 +523,9 @@ static void verifyMinMaxIndex2LevelDecoder(
         // evaluate the error
         double error = getError(n, d, outputFaiss, outputKernel1);
 
-        std::cout << description << "\t" << n << "\t" << d << "\t"
-                  << "store_seq"
-                  << "\t" << nIterations << "\t" << timeFaiss << "\t"
-                  << timeKernel << "\t" << error << std::endl;
+        std::cout << description << "\t" << n << "\t" << d << "\tstore_seq\t"
+                  << nIterations << "\t" << timeFaiss << "\t" << timeKernel
+                  << "\t" << error << std::endl;
     }
 
     //////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -581,10 +572,9 @@ static void verifyMinMaxIndex2LevelDecoder(
         // evaluate the error
         const double error = getError(n, d, outputFaiss, outputKernel1);
 
-        std::cout << description << "\t" << n << "\t" << d << "\t"
-                  << "store_rnd"
-                  << "\t" << nIterations << "\t" << timeFaiss << "\t"
-                  << timeKernel << "\t" << error << std::endl;
+        std::cout << description << "\t" << n << "\t" << d << "\tstore_rnd\t"
+                  << nIterations << "\t" << timeFaiss << "\t" << timeKernel
+                  << "\t" << error << std::endl;
     }
 
     //////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -650,10 +640,9 @@ static void verifyMinMaxIndex2LevelDecoder(
         // evaluate the error
         const double error1 = getError(n, d, outputFaiss, outputKernel1);
 
-        std::cout << description << "\t" << n << "\t" << d << "\t"
-                  << "accum_rnd"
-                  << "\t" << nIterations << "\t" << timeFaiss << "\t"
-                  << timeKernel1 << "\t" << error1 << std::endl;
+        std::cout << description << "\t" << n << "\t" << d << "\taccum_rnd\t"
+                  << nIterations << "\t" << timeFaiss << "\t" << timeKernel1
+                  << "\t" << error1 << std::endl;
 
         // kernels: accum 2 points, shared centroids
         StopWatch swKernel2;
@@ -685,10 +674,9 @@ static void verifyMinMaxIndex2LevelDecoder(
         // evaluate the error
         const double error2 = getError(n, d, outputFaiss, outputKernel2);
 
-        std::cout << description << "\t" << n << "\t" << d << "\t"
-                  << "accum2_rnd"
-                  << "\t" << nIterations << "\t" << timeFaiss << "\t"
-                  << timeKernel2 << "\t" << error2 << std::endl;
+        std::cout << description << "\t" << n << "\t" << d << "\taccum2_rnd\t"
+                  << nIterations << "\t" << timeFaiss << "\t" << timeKernel2
+                  << "\t" << error2 << std::endl;
 
         // kernels: accum 2 points, unique centroids
         StopWatch swKernel2u;
@@ -722,10 +710,9 @@ static void verifyMinMaxIndex2LevelDecoder(
         // evaluate the error
         const double error2u = getError(n, d, outputFaiss, outputKernel2u);
 
-        std::cout << description << "\t" << n << "\t" << d << "\t"
-                  << "accum2u_rnd"
-                  << "\t" << nIterations << "\t" << timeFaiss << "\t"
-                  << timeKernel2u << "\t" << error2u << std::endl;
+        std::cout << description << "\t" << n << "\t" << d << "\taccum2u_rnd\t"
+                  << nIterations << "\t" << timeFaiss << "\t" << timeKernel2u
+                  << "\t" << error2u << std::endl;
 
         // kernels: accum 3 points, shared centroids
         StopWatch swKernel3;
@@ -762,10 +749,9 @@ static void verifyMinMaxIndex2LevelDecoder(
         // evaluate the error
         const double error3 = getError(n, d, outputFaiss, outputKernel3);
 
-        std::cout << description << "\t" << n << "\t" << d << "\t"
-                  << "accum3_rnd"
-                  << "\t" << nIterations << "\t" << timeFaiss << "\t"
-                  << timeKernel3 << "\t" << error3 << std::endl;
+        std::cout << description << "\t" << n << "\t" << d << "\taccum3_rnd\t"
+                  << nIterations << "\t" << timeFaiss << "\t" << timeKernel3
+                  << "\t" << error3 << std::endl;
 
         // kernels: accum 3 points, unique centroids
         StopWatch swKernel3u;
@@ -806,10 +792,9 @@ static void verifyMinMaxIndex2LevelDecoder(
         // evaluate the error
         const double error3u = getError(n, d, outputFaiss, outputKernel3u);
 
-        std::cout << description << "\t" << n << "\t" << d << "\t"
-                  << "accum3u_rnd"
-                  << "\t" << nIterations << "\t" << timeFaiss << "\t"
-                  << timeKernel3u << "\t" << error3u << std::endl;
+        std::cout << description << "\t" << n << "\t" << d << "\taccum3u_rnd\t"
+                  << nIterations << "\t" << timeFaiss << "\t" << timeKernel3u
+                  << "\t" << error3u << std::endl;
     }
 }
 
@@ -865,10 +850,9 @@ static void verifyIndexPQDecoder(
         // evaluate the error
         double error = getError(n, d, outputFaiss, outputKernel1);
 
-        std::cout << description << "\t" << n << "\t" << d << "\t"
-                  << "store_seq"
-                  << "\t" << nIterations << "\t" << timeFaiss << "\t"
-                  << timeKernel << "\t" << error << std::endl;
+        std::cout << description << "\t" << n << "\t" << d << "\tstore_seq\t"
+                  << nIterations << "\t" << timeFaiss << "\t" << timeKernel
+                  << "\t" << error << std::endl;
     }
 
     //////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -914,10 +898,9 @@ static void verifyIndexPQDecoder(
         // evaluate the error
         const double error = getError(n, d, outputFaiss, outputKernel1);
 
-        std::cout << description << "\t" << n << "\t" << d << "\t"
-                  << "store_rnd"
-                  << "\t" << nIterations << "\t" << timeFaiss << "\t"
-                  << timeKernel << "\t" << error << std::endl;
+        std::cout << description << "\t" << n << "\t" << d << "\tstore_rnd\t"
+                  << nIterations << "\t" << timeFaiss << "\t" << timeKernel
+                  << "\t" << error << std::endl;
     }
 
     //////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -977,10 +960,9 @@ static void verifyIndexPQDecoder(
         // evaluate the error
         const double error1 = getError(n, d, outputFaiss, outputKernel1);
 
-        std::cout << description << "\t" << n << "\t" << d << "\t"
-                  << "accum_rnd"
-                  << "\t" << nIterations << "\t" << timeFaiss << "\t"
-                  << timeKernel1 << "\t" << error1 << std::endl;
+        std::cout << description << "\t" << n << "\t" << d << "\taccum_rnd\t"
+                  << nIterations << "\t" << timeFaiss << "\t" << timeKernel1
+                  << "\t" << error1 << std::endl;
 
         // kernels: accum 2 points, shared centroids
         StopWatch swKernel2;
@@ -1006,10 +988,9 @@ static void verifyIndexPQDecoder(
         // evaluate the error
         const double error2 = getError(n, d, outputFaiss, outputKernel2);
 
-        std::cout << description << "\t" << n << "\t" << d << "\t"
-                  << "accum2_rnd"
-                  << "\t" << nIterations << "\t" << timeFaiss << "\t"
-                  << timeKernel2 << "\t" << error2 << std::endl;
+        std::cout << description << "\t" << n << "\t" << d << "\taccum2_rnd\t"
+                  << nIterations << "\t" << timeFaiss << "\t" << timeKernel2
+                  << "\t" << error2 << std::endl;
 
         // kernels: accum 2 points, unique centroids
         StopWatch swKernel2u;
@@ -1036,10 +1017,9 @@ static void verifyIndexPQDecoder(
         // evaluate the error
         const double error2u = getError(n, d, outputFaiss, outputKernel2u);
 
-        std::cout << description << "\t" << n << "\t" << d << "\t"
-                  << "accum2u_rnd"
-                  << "\t" << nIterations << "\t" << timeFaiss << "\t"
-                  << timeKernel2u << "\t" << error2u << std::endl;
+        std::cout << description << "\t" << n << "\t" << d << "\taccum2u_rnd\t"
+                  << nIterations << "\t" << timeFaiss << "\t" << timeKernel2u
+                  << "\t" << error2u << std::endl;
 
         // kernels: accum 3 points, shared centroids
         StopWatch swKernel3;
@@ -1070,10 +1050,9 @@ static void verifyIndexPQDecoder(
         // evaluate the error
         const double error3 = getError(n, d, outputFaiss, outputKernel3);
 
-        std::cout << description << "\t" << n << "\t" << d << "\t"
-                  << "accum3_rnd"
-                  << "\t" << nIterations << "\t" << timeFaiss << "\t"
-                  << timeKernel3 << "\t" << error3 << std::endl;
+        std::cout << description << "\t" << n << "\t" << d << "\taccum3_rnd\t"
+                  << nIterations << "\t" << timeFaiss << "\t" << timeKernel3
+                  << "\t" << error3 << std::endl;
 
         // kernels: accum 3 points, unique centroids
         StopWatch swKernel3u;
@@ -1106,10 +1085,9 @@ static void verifyIndexPQDecoder(
         // evaluate the error
         const double error3u = getError(n, d, outputFaiss, outputKernel3u);
 
-        std::cout << description << "\t" << n << "\t" << d << "\t"
-                  << "accum3u_rnd"
-                  << "\t" << nIterations << "\t" << timeFaiss << "\t"
-                  << timeKernel3u << "\t" << error3u << std::endl;
+        std::cout << description << "\t" << n << "\t" << d << "\taccum3u_rnd\t"
+                  << nIterations << "\t" << timeFaiss << "\t" << timeKernel3u
+                  << "\t" << error3u << std::endl;
     }
 }
 
@@ -1170,10 +1148,9 @@ static void verifyMinMaxIndexPQDecoder(
         // evaluate the error
         double error = getError(n, d, outputFaiss, outputKernel1);
 
-        std::cout << description << "\t" << n << "\t" << d << "\t"
-                  << "store_seq"
-                  << "\t" << nIterations << "\t" << timeFaiss << "\t"
-                  << timeKernel << "\t" << error << std::endl;
+        std::cout << description << "\t" << n << "\t" << d << "\tstore_seq\t"
+                  << nIterations << "\t" << timeFaiss << "\t" << timeKernel
+                  << "\t" << error << std::endl;
     }
 
     //////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -1219,10 +1196,9 @@ static void verifyMinMaxIndexPQDecoder(
         // evaluate the error
         const double error = getError(n, d, outputFaiss, outputKernel1);
 
-        std::cout << description << "\t" << n << "\t" << d << "\t"
-                  << "store_rnd"
-                  << "\t" << nIterations << "\t" << timeFaiss << "\t"
-                  << timeKernel << "\t" << error << std::endl;
+        std::cout << description << "\t" << n << "\t" << d << "\tstore_rnd\t"
+                  << nIterations << "\t" << timeFaiss << "\t" << timeKernel
+                  << "\t" << error << std::endl;
     }
 
     //////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -1287,10 +1263,9 @@ static void verifyMinMaxIndexPQDecoder(
         // evaluate the error
         const double error1 = getError(n, d, outputFaiss, outputKernel1);
 
-        std::cout << description << "\t" << n << "\t" << d << "\t"
-                  << "accum_rnd"
-                  << "\t" << nIterations << "\t" << timeFaiss << "\t"
-                  << timeKernel1 << "\t" << error1 << std::endl;
+        std::cout << description << "\t" << n << "\t" << d << "\taccum_rnd\t"
+                  << nIterations << "\t" << timeFaiss << "\t" << timeKernel1
+                  << "\t" << error1 << std::endl;
 
         // kernels: accum 2 points, shared centroids
         StopWatch swKernel2;
@@ -1321,10 +1296,9 @@ static void verifyMinMaxIndexPQDecoder(
         // evaluate the error
         const double error2 = getError(n, d, outputFaiss, outputKernel2);
 
-        std::cout << description << "\t" << n << "\t" << d << "\t"
-                  << "accum2_rnd"
-                  << "\t" << nIterations << "\t" << timeFaiss << "\t"
-                  << timeKernel2 << "\t" << error2 << std::endl;
+        std::cout << description << "\t" << n << "\t" << d << "\taccum2_rnd\t"
+                  << nIterations << "\t" << timeFaiss << "\t" << timeKernel2
+                  << "\t" << error2 << std::endl;
 
         // kernels: accum 2 points, unique centroids
         StopWatch swKernel2u;
@@ -1356,10 +1330,9 @@ static void verifyMinMaxIndexPQDecoder(
         // evaluate the error
         const double error2u = getError(n, d, outputFaiss, outputKernel2u);
 
-        std::cout << description << "\t" << n << "\t" << d << "\t"
-                  << "accum2u_rnd"
-                  << "\t" << nIterations << "\t" << timeFaiss << "\t"
-                  << timeKernel2u << "\t" << error2u << std::endl;
+        std::cout << description << "\t" << n << "\t" << d << "\taccum2u_rnd\t"
+                  << nIterations << "\t" << timeFaiss << "\t" << timeKernel2u
+                  << "\t" << error2u << std::endl;
 
         // kernels: accum 3 points, shared centroids
         StopWatch swKernel3;
@@ -1395,10 +1368,9 @@ static void verifyMinMaxIndexPQDecoder(
         // evaluate the error
         const double error3 = getError(n, d, outputFaiss, outputKernel3);
 
-        std::cout << description << "\t" << n << "\t" << d << "\t"
-                  << "accum3_rnd"
-                  << "\t" << nIterations << "\t" << timeFaiss << "\t"
-                  << timeKernel3 << "\t" << error3 << std::endl;
+        std::cout << description << "\t" << n << "\t" << d << "\taccum3_rnd\t"
+                  << nIterations << "\t" << timeFaiss << "\t" << timeKernel3
+                  << "\t" << error3 << std::endl;
 
         // kernels: accum 3 points, unique centroids
         StopWatch swKernel3u;
@@ -1436,10 +1408,9 @@ static void verifyMinMaxIndexPQDecoder(
         // evaluate the error
         const double error3u = getError(n, d, outputFaiss, outputKernel3u);
 
-        std::cout << description << "\t" << n << "\t" << d << "\t"
-                  << "accum3u_rnd"
-                  << "\t" << nIterations << "\t" << timeFaiss << "\t"
-                  << timeKernel3u << "\t" << error3u << std::endl;
+        std::cout << description << "\t" << n << "\t" << d << "\taccum3u_rnd\t"
+                  << nIterations << "\t" << timeFaiss << "\t" << timeKernel3u
+                  << "\t" << error3u << std::endl;
     }
 }
 
@@ -1512,14 +1483,11 @@ int main(int argc, char** argv) {
             (N_ITERATIONS % 6) == 0, "Number of iterations should be 6*x");
 
     // print the header
-    std::cout << "Codec\t"
-              << "n\t"
-              << "d\t"
-              << "Experiment\t"
-              << "Iterations\t"
-              << "Faiss time\t"
-              << "SADecodeKernel time\t"
-              << "Error" << std::endl;
+    auto delim = "\t";
+    std::cout << "Codec" << delim << "n" << delim << "d" << delim
+              << "Experiment" << delim << "Iterations" << delim << "Faiss time"
+              << delim << "SADecodeKernel time" << delim << "Error"
+              << std::endl;
 
     // The following experiment types are available:
     // * store_seq - decode a contiguous block of codes into vectors, one by one
diff --git a/benchs/bench_fw/benchmark.py b/benchs/bench_fw/benchmark.py
index 8ee53103e5..8ca68c4cd8 100644
--- a/benchs/bench_fw/benchmark.py
+++ b/benchs/bench_fw/benchmark.py
@@ -4,23 +4,23 @@
 # LICENSE file in the root directory of this source tree.
 
 import logging
+from copy import copy
 from dataclasses import dataclass
 from operator import itemgetter
-from statistics import median, mean
+from statistics import mean, median
 from typing import Any, Dict, List, Optional
 
-from .index import Index, IndexFromCodec, IndexFromFactory
-from .descriptors import DatasetDescriptor, IndexDescriptor
-
 import faiss  # @manual=//faiss/python:pyfaiss_gpu
-from faiss.contrib.evaluation import (  # @manual=//faiss/contrib:faiss_contrib_gpu
-    knn_intersection_measure,
-)
 
 import numpy as np
 
 from scipy.optimize import curve_fit
 
+from .descriptors import DatasetDescriptor, IndexDescriptor
+from .index import Index, IndexFromCodec, IndexFromFactory
+
+from .utils import dict_merge
+
 logger = logging.getLogger(__name__)
 
 
@@ -90,26 +90,18 @@ def optimizer(op, search, cost_metric, perf_metric):
             continue
 
         logger.info(f"{cno=:4d} {str(parameters):50}: RUN")
-        cost, perf = search(
+        cost, perf, requires = search(
             parameters,
             cost_metric,
             perf_metric,
         )
+        if requires is not None:
+            return requires
         logger.info(
             f"{cno=:4d} {str(parameters):50}: DONE, {cost=:.3f} {perf=:.3f}"
         )
         op.add_operating_point(key, perf, cost)
-
-
-def distance_ratio_measure(I, R, D_GT, metric):
-    sum_of_R = np.sum(np.where(I >= 0, R, 0))
-    sum_of_D_GT = np.sum(np.where(I >= 0, D_GT, 0))
-    if metric == faiss.METRIC_INNER_PRODUCT:
-        return (sum_of_R / sum_of_D_GT).item()
-    elif metric == faiss.METRIC_L2:
-        return (sum_of_D_GT / sum_of_R).item()
-    else:
-        raise RuntimeError(f"unknown metric {metric}")
+    return None
 
 
 # range_metric possible values:
@@ -194,6 +186,7 @@ def sigmoid(x, a, b, c):
 
 @dataclass
 class Benchmark:
+    num_threads: int
     training_vectors: Optional[DatasetDescriptor] = None
     database_vectors: Optional[DatasetDescriptor] = None
     query_vectors: Optional[DatasetDescriptor] = None
@@ -215,9 +208,11 @@ def set_io(self, benchmark_io):
         self.io.distance_metric = self.distance_metric
         self.io.distance_metric_type = self.distance_metric_type
 
-    def get_index_desc(self, factory: str) -> Optional[IndexDescriptor]:
+    def get_index_desc(self, factory_or_codec: str) -> Optional[IndexDescriptor]:
         for desc in self.index_descs:
-            if desc.factory == factory:
+            if desc.factory == factory_or_codec:
+                return desc
+            if desc.codec_alias == factory_or_codec:
                 return desc
         return None
 
@@ -233,12 +228,13 @@ def range_search_reference(self, index, parameters, range_metric):
         else:
             m_radius = range_metric
 
-        lims, D, I, R, P = self.range_search(
+        lims, D, I, R, P, _ = self.range_search(
+            False,
             index,
             parameters,
             radius=m_radius,
         )
-        flat = index.factory == "Flat"
+        flat = index.is_flat_index()
         (
             gt_radius,
             range_search_metric_function,
@@ -258,7 +254,8 @@ def range_search_reference(self, index, parameters, range_metric):
         )
 
     def estimate_range(self, index, parameters, range_scoring_radius):
-        D, I, R, P = index.knn_search(
+        D, I, R, P, _ = index.knn_search(
+            False,
             parameters,
             self.query_vectors,
             self.k,
@@ -275,10 +272,13 @@ def estimate_range(self, index, parameters, range_scoring_radius):
 
     def range_search(
         self,
+        dry_run,
         index: Index,
         search_parameters: Optional[Dict[str, int]],
         radius: Optional[float] = None,
         gt_radius: Optional[float] = None,
+        range_search_metric_function=None,
+        gt_rsm=None,
     ):
         logger.info("range_search: begin")
         if radius is None:
@@ -293,16 +293,32 @@ def range_search(
                 )
             )
         logger.info(f"Radius={radius}")
-        return index.range_search(
+        lims, D, I, R, P, requires = index.range_search(
+            dry_run=dry_run,
             search_parameters=search_parameters,
             query_vectors=self.query_vectors,
             radius=radius,
         )
+        if requires is not None:
+            return None, None, None, None, None, requires
+        if range_search_metric_function is not None:
+            range_search_metric = range_search_metric_function(R)
+            range_search_pr = range_search_pr_curve(
+                D, range_search_metric, gt_rsm
+            )
+            range_score_sum = np.sum(range_search_metric).item()
+            P |= {
+                "range_score_sum": range_score_sum,
+                "range_score_max_recall": range_score_sum / gt_rsm,
+                "range_search_pr": range_search_pr,
+            }
+        return lims, D, I, R, P, requires
 
     def range_ground_truth(self, gt_radius, range_search_metric_function):
         logger.info("range_ground_truth: begin")
         flat_desc = self.get_index_desc("Flat")
-        lims, D, I, R, P = self.range_search(
+        lims, D, I, R, P, _ = self.range_search(
+            False,
             flat_desc.index,
             search_parameters=None,
             radius=gt_radius,
@@ -311,166 +327,320 @@ def range_ground_truth(self, gt_radius, range_search_metric_function):
         logger.info("range_ground_truth: end")
         return gt_rsm
 
-    def range_search_benchmark(
+    def knn_ground_truth(self):
+        """Run k-NN search on the "Flat" index descriptor's wrapper.
+
+        The first two outputs of the search are stored on self as
+        gt_knn_D and gt_knn_I for later benchmark runs to compare against.
+        """
+        logger.info("knn_ground_truth: begin")
+        flat_desc = self.get_index_desc("Flat")
+        self.build_index_wrapper(flat_desc)
+        gt_D, gt_I, _, _, requires = flat_desc.index.knn_search(
+            dry_run=False,
+            search_parameters=None,
+            query_vectors=self.query_vectors,
+            k=self.k,
+        )
+        self.gt_knn_D, self.gt_knn_I = gt_D, gt_I
+        assert requires is None
+        logger.info("knn_ground_truth: end")
+
+    def search_benchmark(
         self,
+        name,
+        search_func,
+        key_func,
+        cost_metrics,
+        perf_metrics,
         results: Dict[str, Any],
         index: Index,
-        metric_key: str,
-        radius: float,
-        gt_radius: float,
-        range_search_metric_function,
-        gt_rsm: float,
     ):
-        logger.info(f"range_search_benchmark: begin {index.get_index_name()}")
+        index_name = index.get_index_name()
+        logger.info(f"{name}_benchmark: begin {index_name}")
 
         def experiment(parameters, cost_metric, perf_metric):
             nonlocal results
-            key = index.get_range_search_name(
-                search_parameters=parameters,
-                query_vectors=self.query_vectors,
-                radius=radius,
-            )
-            key += metric_key
+            key = key_func(parameters)
             if key in results["experiments"]:
                 metrics = results["experiments"][key]
             else:
-                lims, D, I, R, P = self.range_search(
-                    index,
-                    parameters,
-                    radius=radius,
-                    gt_radius=gt_radius,
-                )
-                range_search_metric = range_search_metric_function(R)
-                range_search_pr = range_search_pr_curve(
-                    D, range_search_metric, gt_rsm
-                )
-                range_score_sum = np.sum(range_search_metric).item()
-                metrics = P | {
-                    "range_score_sum": range_score_sum,
-                    "range_score_max_recall": range_score_sum / gt_rsm,
-                    "range_search_pr": range_search_pr,
-                }
+                metrics, requires = search_func(parameters)
+                if requires is not None:
+                    return None, None, requires
                 results["experiments"][key] = metrics
-            return metrics[cost_metric], metrics[perf_metric]
+            return metrics[cost_metric], metrics[perf_metric], None
 
-        for cost_metric in ["time"]:
-            for perf_metric in ["range_score_max_recall"]:
+        for cost_metric in cost_metrics:
+            for perf_metric in perf_metrics:
                 op = index.get_operating_points()
-                optimizer(
+                requires = optimizer(
                     op,
                     experiment,
                     cost_metric,
                     perf_metric,
                 )
-        logger.info("range_search_benchmark: end")
-        return results
+                if requires is not None:
+                    break
+        logger.info(f"{name}_benchmark: end")
+        return results, requires
 
-    def knn_ground_truth(self):
-        logger.info("knn_ground_truth: begin")
-        flat_desc = self.get_index_desc("Flat")
-        self.gt_knn_D, self.gt_knn_I, _, _ = flat_desc.index.knn_search(
-            search_parameters=None,
-            query_vectors=self.query_vectors,
-            k=self.k,
+    def knn_search_benchmark(
+        self, dry_run, results: Dict[str, Any], index: Index
+    ):
+        return self.search_benchmark(
+            name="knn_search",
+            search_func=lambda parameters: index.knn_search(
+                dry_run,
+                parameters,
+                self.query_vectors,
+                self.k,
+                self.gt_knn_I,
+                self.gt_knn_D,
+            )[3:],
+            key_func=lambda parameters: index.get_knn_search_name(
+                search_parameters=parameters,
+                query_vectors=self.query_vectors,
+                k=self.k,
+                reconstruct=False,
+            ),
+            cost_metrics=["time"],
+            perf_metrics=["knn_intersection", "distance_ratio"],
+            results=results,
+            index=index,
         )
-        logger.info("knn_ground_truth: end")
-
-    def knn_search_benchmark(self, results: Dict[str, Any], index: Index):
-        index_name = index.get_index_name()
-        logger.info(f"knn_search_benchmark: begin {index_name}")
 
-        def experiment(parameters, cost_metric, perf_metric):
-            nonlocal results
-            key = index.get_knn_search_name(
+    def reconstruct_benchmark(
+        self, dry_run, results: Dict[str, Any], index: Index
+    ):
+        return self.search_benchmark(
+            name="reconstruct",
+            search_func=lambda parameters: index.reconstruct(
+                dry_run,
                 parameters,
                 self.query_vectors,
                 self.k,
+                self.gt_knn_I,
+            ),
+            key_func=lambda parameters: index.get_knn_search_name(
+                search_parameters=parameters,
+                query_vectors=self.query_vectors,
+                k=self.k,
+                reconstruct=True,
+            ),
+            cost_metrics=["encode_time"],
+            perf_metrics=["sym_recall"],
+            results=results,
+            index=index,
+        )
+
+    def range_search_benchmark(
+        self,
+        dry_run,
+        results: Dict[str, Any],
+        index: Index,
+        metric_key: str,
+        radius: float,
+        gt_radius: float,
+        range_search_metric_function,
+        gt_rsm: float,
+    ):
+        return self.search_benchmark(
+            name="range_search",
+            search_func=lambda parameters: self.range_search(
+                dry_run=dry_run,
+                index=index,
+                search_parameters=parameters,
+                radius=radius,
+                gt_radius=gt_radius,
+                range_search_metric_function=range_search_metric_function,
+                gt_rsm=gt_rsm,
+            )[4:],
+            key_func=lambda parameters: index.get_range_search_name(
+                search_parameters=parameters,
+                query_vectors=self.query_vectors,
+                radius=radius,
             )
-            key += "knn"
-            if key in results["experiments"]:
-                metrics = results["experiments"][key]
-            else:
-                D, I, R, P = index.knn_search(
-                    parameters, self.query_vectors, self.k
-                )
-                metrics = P | {
-                    "knn_intersection": knn_intersection_measure(
-                        I, self.gt_knn_I
-                    ),
-                    "distance_ratio": distance_ratio_measure(
-                        I, R, self.gt_knn_D, self.distance_metric_type
-                    ),
-                }
-                results["experiments"][key] = metrics
-            return metrics[cost_metric], metrics[perf_metric]
+            + metric_key,
+            cost_metrics=["time"],
+            perf_metrics=["range_score_max_recall"],
+            results=results,
+            index=index,
+        )
 
-        for cost_metric in ["time"]:
-            for perf_metric in ["knn_intersection", "distance_ratio"]:
-                op = index.get_operating_points()
-                optimizer(
-                    op,
-                    experiment,
-                    cost_metric,
-                    perf_metric,
-                )
-        logger.info("knn_search_benchmark: end")
-        return results
+    def build_index_wrapper(self, index_desc: IndexDescriptor):
+        """Create index_desc.index (an Index wrapper) if it is not set yet."""
+        if hasattr(index_desc, "index"):
+            return
+        # keyword arguments shared by both wrapper constructors
+        shared = dict(
+            num_threads=self.num_threads,
+            d=self.d,
+            metric=self.distance_metric,
+            database_vectors=self.database_vectors,
+            search_params=index_desc.search_params,
+            construction_params=index_desc.construction_params,
+        )
+        if index_desc.factory is None:
+            wrapper = IndexFromCodec(
+                path=index_desc.path,
+                bucket=index_desc.bucket,
+                **shared,
+            )
+        else:
+            train_desc = copy(self.training_vectors)  # leave self's copy intact
+            if index_desc.training_size is not None:
+                train_desc.num_vectors = index_desc.training_size
+            wrapper = IndexFromFactory(
+                factory=index_desc.factory,
+                training_vectors=train_desc,
+                **shared,
+            )
+        wrapper.set_io(self.io)
+        index_desc.index = wrapper
+
+    def clone_one(self, index_desc):  # Benchmark restricted to Flat + index_desc
+        benchmark = Benchmark(
+            num_threads=self.num_threads,
+            training_vectors=self.training_vectors,
+            database_vectors=self.database_vectors,
+            query_vectors=self.query_vectors,
+            index_descs=[self.get_index_desc("Flat"), index_desc],
+            range_ref_index_desc=self.range_ref_index_desc,
+            k=self.k,
+            distance_metric=self.distance_metric,
+        )
+        benchmark.set_io(self.io.clone())  # the copy gets its own io object
+        return benchmark
 
-    def train(self, results):
-        xq = self.io.get_dataset(self.query_vectors)
-        self.d = xq.shape[1]
-        if self.get_index_desc("Flat") is None:
-            self.index_descs.append(IndexDescriptor(factory="Flat"))
-        for index_desc in self.index_descs:
-            if index_desc.factory is not None:
-                index = IndexFromFactory(
-                    d=self.d,
-                    metric=self.distance_metric,
-                    database_vectors=self.database_vectors,
-                    search_params=index_desc.search_params,
-                    construction_params=index_desc.construction_params,
-                    factory=index_desc.factory,
-                    training_vectors=self.training_vectors,
-                )
-                index.set_io(self.io)
-                index.train()
-                index_desc.index = index
-                results["indices"][index.get_codec_name()] = {
-                    "code_size": index.get_code_size()
-                }
+    def benchmark_one(
+        self,
+        dry_run,
+        results: Dict[str, Any],
+        index_desc: IndexDescriptor,
+        train,
+        reconstruct,
+        knn,
+        range,
+    ):
+        faiss.omp_set_num_threads(self.num_threads)
+        if not dry_run:
+            self.knn_ground_truth()
+        self.build_index_wrapper(index_desc)
+        meta, requires = index_desc.index.fetch_meta(dry_run=dry_run)
+        if requires is not None:
+            return results, (requires if train else None)
+        results["indices"][index_desc.index.get_codec_name()] = meta
+
+        # results, requires = self.reconstruct_benchmark(
+        #     dry_run=True,
+        #     results=results,
+        #     index=index_desc.index,
+        # )
+        # if reconstruct and requires is not None:
+        #     if dry_run:
+        #         return results, requires
+        #     else:
+        #         results, requires = self.reconstruct_benchmark(
+        #             dry_run=False,
+        #             results=results,
+        #             index=index_desc.index,
+        #         )
+        #         assert requires is None
+
+        results, requires = self.knn_search_benchmark(
+            dry_run=True,
+            results=results,
+            index=index_desc.index,
+        )
+        if knn and requires is not None:
+            if dry_run:
+                return results, requires
             else:
-                index = IndexFromCodec(
-                    d=self.d,
-                    metric=self.distance_metric,
-                    database_vectors=self.database_vectors,
-                    search_params=index_desc.search_params,
-                    construction_params=index_desc.construction_params,
-                    path=index_desc.path,
-                    bucket=index_desc.bucket,
+                results, requires = self.knn_search_benchmark(
+                    dry_run=False,
+                    results=results,
+                    index=index_desc.index,
                 )
-                index.set_io(self.io)
-                index_desc.index = index
-                results["indices"][index.get_codec_name()] = {
-                    "code_size": index.get_code_size()
-                }
-        return results
+                assert requires is None
+
+        if (
+            self.range_ref_index_desc is None
+            or not index_desc.index.supports_range_search()
+        ):
+            return results, None
+
+        ref_index_desc = self.get_index_desc(self.range_ref_index_desc)
+        if ref_index_desc is None:
+            raise ValueError(
+                f"Unknown range index {self.range_ref_index_desc}"
+            )
+        if ref_index_desc.range_metrics is None:
+            raise ValueError(
+                f"Range index {ref_index_desc.factory} has no radius_score"
+            )
+        for metric_key, range_metric in ref_index_desc.range_metrics.items():
+            (
+                gt_radius,
+                range_search_metric_function,
+                coefficients,
+                coefficients_training_data,
+            ) = self.range_search_reference(
+                ref_index_desc.index,
+                ref_index_desc.search_params,
+                range_metric,
+            )
+            gt_rsm = self.range_ground_truth(
+                gt_radius, range_search_metric_function
+            )
+            results, requires = self.range_search_benchmark(
+                dry_run=True,
+                results=results,
+                index=index_desc.index,
+                metric_key=metric_key,
+                radius=index_desc.radius,
+                gt_radius=gt_radius,
+                range_search_metric_function=range_search_metric_function,
+                gt_rsm=gt_rsm,
+            )
+            if range and requires is not None:
+                if dry_run:
+                    return results, requires
+                else:
+                    results, requires = self.range_search_benchmark(
+                        dry_run=False,
+                        results=results,
+                        index=index_desc.index,
+                        metric_key=metric_key,
+                        radius=index_desc.radius,
+                        gt_radius=gt_radius,
+                        range_search_metric_function=range_search_metric_function,
+                        gt_rsm=gt_rsm,
+                    )
+                    assert requires is None
 
-    def benchmark(self, result_file=None):
+        return results, None
+
+    def benchmark(
+        self,
+        result_file=None,
+        local=False,
+        train=False,
+        reconstruct=False,
+        knn=False,
+        range=False,
+    ):
         logger.info("begin evaluate")
 
-        faiss.omp_set_num_threads(24)
+        faiss.omp_set_num_threads(self.num_threads)
         results = {"indices": {}, "experiments": {}}
-        results = self.train(results)
+        xq = self.io.get_dataset(self.query_vectors)
+        self.d = xq.shape[1]
+        if self.get_index_desc("Flat") is None:
+            self.index_descs.append(IndexDescriptor(factory="Flat"))
 
-        # knn search
         self.knn_ground_truth()
-        for index_desc in self.index_descs:
-            results = self.knn_search_benchmark(
-                results=results,
-                index=index_desc.index,
-            )
 
-        # range search
         if self.range_ref_index_desc is not None:
             index_desc = self.get_index_desc(self.range_ref_index_desc)
             if index_desc is None:
@@ -482,6 +652,7 @@ def benchmark(self, result_file=None):
                     f"Range index {index_desc.factory} has no radius_score"
                 )
             results["metrics"] = {}
+            self.build_index_wrapper(index_desc)
             for metric_key, range_metric in index_desc.range_metrics.items():
                 (
                     gt_radius,
@@ -498,19 +669,77 @@ def benchmark(self, result_file=None):
                 gt_rsm = self.range_ground_truth(
                     gt_radius, range_search_metric_function
                 )
-                for index_desc in self.index_descs:
-                    if not index_desc.index.supports_range_search():
-                        continue
-                    results = self.range_search_benchmark(
-                        results=results,
-                        index=index_desc.index,
-                        metric_key=metric_key,
-                        radius=index_desc.radius,
-                        gt_radius=gt_radius,
-                        range_search_metric_function=range_search_metric_function,
-                        gt_rsm=gt_rsm,
+
+        self.index_descs = list(dict.fromkeys(self.index_descs))
+
+        todo = self.index_descs
+        for index_desc in self.index_descs:
+            index_desc.requires = None
+
+        queued = set()
+        while todo:
+            current_todo = []
+            next_todo = []
+            for index_desc in todo:
+                results, requires = self.benchmark_one(
+                    dry_run=True,
+                    results=results,
+                    index_desc=index_desc,
+                    train=train,
+                    reconstruct=reconstruct,
+                    knn=knn,
+                    range=range,
+                )
+                if requires is None:
+                    continue
+                if requires in queued:
+                    if index_desc.requires != requires:
+                        index_desc.requires = requires
+                        next_todo.append(index_desc)
+                else:
+                    queued.add(requires)
+                    index_desc.requires = requires
+                    current_todo.append(index_desc)
+
+            if current_todo:
+                results_one = {"indices": {}, "experiments": {}}
+                params = [
+                    (
+                        index_desc,
+                        self.clone_one(index_desc),
+                        results_one,
+                        train,
+                        reconstruct,
+                        knn,
+                        range,
                     )
+                    for index_desc in current_todo
+                ]
+                for result in self.io.launch_jobs(
+                    run_benchmark_one, params, local=local
+                ):
+                    dict_merge(results, result)
+
+            todo = next_todo
+
         if result_file is not None:
             self.io.write_json(results, result_file, overwrite=True)
         logger.info("end evaluate")
         return results
+
+
+def run_benchmark_one(params):
+    """Unpack one job's parameter tuple and run benchmark_one for real."""
+    logger.info(params)
+    desc, bench, partial, do_train, do_reconstruct, do_knn, do_range = params
+    partial, requires = bench.benchmark_one(
+        dry_run=False,
+        results=partial,
+        index_desc=desc,
+        train=do_train,
+        reconstruct=do_reconstruct,
+        knn=do_knn,
+        range=do_range,
+    )
+    assert requires is None and partial is not None
+    return partial
diff --git a/benchs/bench_fw/benchmark_io.py b/benchs/bench_fw/benchmark_io.py
index 370efffce5..b39bb60290 100644
--- a/benchs/bench_fw/benchmark_io.py
+++ b/benchs/bench_fw/benchmark_io.py
@@ -16,6 +16,7 @@
 import faiss  # @manual=//faiss/python:pyfaiss_gpu
 
 import numpy as np
+import submitit
 from faiss.contrib.datasets import (  # @manual=//faiss/contrib:faiss_contrib_gpu
     dataset_from_name,
 )
@@ -46,6 +47,9 @@ def merge_rcq_itq(
 class BenchmarkIO:
     path: str
 
+    def clone(self):  # new BenchmarkIO built from path only
+        return BenchmarkIO(path=self.path)
+
     def __post_init__(self):
         self.cached_ds = {}
 
@@ -106,7 +110,7 @@ def write_file(
         fn = self.get_local_filename(filename)
         with ZipFile(fn, "w") as zip_file:
             for key, value in zip(keys, values, strict=True):
-                with zip_file.open(key, "w") as f:
+                with zip_file.open(key, "w", force_zip64=True) as f:
                     if key in ["D", "I", "R", "lims"]:
                         np.save(f, value)
                     elif key in ["P"]:
@@ -117,22 +121,31 @@ def write_file(
         self.upload_file_to_blobstore(filename, overwrite=overwrite)
 
     def get_dataset(self, dataset):
-        if dataset.namespace is not None and dataset.namespace[:4] == "std_":
-            if dataset.tablename not in self.cached_ds:
-                self.cached_ds[dataset.tablename] = dataset_from_name(
-                    dataset.tablename,
-                )
-            p = dataset.namespace[4]
-            if p == "t":
-                return self.cached_ds[dataset.tablename].get_train()
-            elif p == "d":
-                return self.cached_ds[dataset.tablename].get_database()
-            elif p == "q":
-                return self.cached_ds[dataset.tablename].get_queries()
-            else:
-                raise ValueError
-        elif dataset not in self.cached_ds:
-            if dataset.namespace == "syn":
+        if dataset not in self.cached_ds:
+            if (
+                dataset.namespace is not None
+                and dataset.namespace[:4] == "std_"
+            ):
+                if dataset.tablename not in self.cached_ds:
+                    self.cached_ds[dataset.tablename] = dataset_from_name(
+                        dataset.tablename,
+                    )
+                p = dataset.namespace[4]
+                if p == "t":
+                    self.cached_ds[dataset] = self.cached_ds[
+                        dataset.tablename
+                    ].get_train(dataset.num_vectors)
+                elif p == "d":
+                    self.cached_ds[dataset] = self.cached_ds[
+                        dataset.tablename
+                    ].get_database()
+                elif p == "q":
+                    self.cached_ds[dataset] = self.cached_ds[
+                        dataset.tablename
+                    ].get_queries()
+                else:
+                    raise ValueError
+            elif dataset.namespace == "syn":
                 d, seed = dataset.tablename.split("_")
                 d = int(d)
                 seed = int(seed)
@@ -225,3 +238,31 @@ def write_index(
         logger.info(f"Saving index to {fn}")
         faiss.write_index(index, fn)
         self.upload_file_to_blobstore(filename)
+        assert os.path.exists(fn)
+        return os.path.getsize(fn)
+
+    def launch_jobs(self, func, params, local=True):
+        """Run func on each item of params, in-process or as a submitit array."""
+        if local:
+            return [func(p) for p in params]
+        logger.info(f"launching {len(params)} jobs")
+        executor = submitit.AutoExecutor(folder="/checkpoint/gsz/jobs")
+        executor.update_parameters(
+            nodes=1,
+            gpus_per_node=8,
+            cpus_per_task=80,
+            # mem_gb=640,
+            tasks_per_node=1,
+            name="faiss_benchmark",
+            slurm_array_parallelism=512,
+            slurm_partition="scavenge",
+            slurm_time=4 * 60,
+            slurm_constraint="bldg1",
+        )
+        jobs = executor.map_array(func, params)
+        logger.info(f"launched {len(jobs)} jobs")
+        for job, param in zip(jobs, params):
+            logger.info(f"{job.job_id=} {param[0]=}")
+        results = [job.result() for job in jobs]
+        logger.info(f"received {len(results)} results")
+        return results
diff --git a/benchs/bench_fw/descriptors.py b/benchs/bench_fw/descriptors.py
index 15e5b9330b..173b07ce16 100644
--- a/benchs/bench_fw/descriptors.py
+++ b/benchs/bench_fw/descriptors.py
@@ -4,8 +4,14 @@
 # LICENSE file in the root directory of this source tree.
 
 from dataclasses import dataclass
+import logging
 from typing import Any, Dict, List, Optional
 
+import faiss  # @manual=//faiss/python:pyfaiss_gpu
+from .utils import timer
+
+logger = logging.getLogger(__name__)
+
 
 @dataclass
 class IndexDescriptor:
@@ -14,6 +20,7 @@ class IndexDescriptor:
     # but not both at the same time.
     path: Optional[str] = None
     factory: Optional[str] = None
+    codec_alias: Optional[str] = None
     construction_params: Optional[List[Dict[str, int]]] = None
     search_params: Optional[Dict[str, int]] = None
     # range metric definitions
@@ -33,6 +40,10 @@ class IndexDescriptor:
     #    [radius2_from, radius2_to) -> score2
     range_metrics: Optional[Dict[str, Any]] = None
     radius: Optional[float] = None
+    training_size: Optional[int] = None
+
+    def __hash__(self):
+        return hash(str(self))
 
 
 @dataclass
@@ -85,3 +96,23 @@ def get_filename(
             filename += f"_{self.num_vectors}"
         filename += "."
         return filename
+
+    def k_means(self, io, k, dry_run):
+        """Train k-means centroids for this dataset, or reuse cached ones."""
+        logger.info(f"k_means {k} {self}")
+        centroids = DatasetDescriptor(
+            tablename=f"{self.get_filename()}kmeans_{k}.npy"
+        )
+        timing_file = centroids.tablename + ".json"
+        if io.file_exist(centroids.tablename) and io.file_exist(timing_file):
+            elapsed = io.read_json(timing_file)["k_means_time"]
+        elif dry_run:
+            # nothing cached: report the file a real run would produce
+            return None, None, centroids.tablename
+        else:
+            data = io.get_dataset(self)
+            kmeans = faiss.Kmeans(d=data.shape[1], k=k, gpu=True)
+            _, elapsed, _ = timer("k_means", lambda: kmeans.train(data))
+            io.write_nparray(kmeans.centroids, centroids.tablename)
+            io.write_json({"k_means_time": elapsed}, timing_file)
+        return centroids, elapsed, None
diff --git a/benchs/bench_fw/index.py b/benchs/bench_fw/index.py
index 3405f59561..3deaa4afcf 100644
--- a/benchs/bench_fw/index.py
+++ b/benchs/bench_fw/index.py
@@ -6,18 +6,18 @@
 
 import logging
 import os
+from collections import OrderedDict
+from copy import copy
 from dataclasses import dataclass
-from multiprocessing.pool import ThreadPool
-from time import perf_counter
 from typing import ClassVar, Dict, List, Optional
 
 import faiss  # @manual=//faiss/python:pyfaiss_gpu
-
 import numpy as np
+
 from faiss.contrib.evaluation import (  # @manual=//faiss/contrib:faiss_contrib_gpu
+    knn_intersection_measure,
     OperatingPointsWithRanges,
 )
-
 from faiss.contrib.factory_tools import (  # @manual=//faiss/contrib:faiss_contrib_gpu
     reverse_index_factory,
 )
@@ -27,67 +27,17 @@
 )
 
 from .descriptors import DatasetDescriptor
+from .utils import (
+    distance_ratio_measure,
+    get_cpu_info,
+    refine_distances_knn,
+    refine_distances_range,
+    timer,
+)
 
 logger = logging.getLogger(__name__)
 
 
-def timer(name, func, once=False) -> float:
-    logger.info(f"Measuring {name}")
-    t1 = perf_counter()
-    res = func()
-    t2 = perf_counter()
-    t = t2 - t1
-    repeat = 1
-    if not once and t < 1.0:
-        repeat = int(2.0 // t)
-        logger.info(
-            f"Time for {name}: {t:.3f} seconds, repeating {repeat} times"
-        )
-        t1 = perf_counter()
-        for _ in range(repeat):
-            res = func()
-        t2 = perf_counter()
-        t = (t2 - t1) / repeat
-    logger.info(f"Time for {name}: {t:.3f} seconds")
-    return res, t, repeat
-
-
-def refine_distances_knn(
-    D: np.ndarray, I: np.ndarray, xq: np.ndarray, xb: np.ndarray, metric
-):
-    return np.where(
-        I >= 0,
-        np.square(np.linalg.norm(xq[:, None] - xb[I], axis=2))
-        if metric == faiss.METRIC_L2
-        else np.einsum("qd,qkd->qk", xq, xb[I]),
-        D,
-    )
-
-
-def refine_distances_range(
-    lims: np.ndarray,
-    D: np.ndarray,
-    I: np.ndarray,
-    xq: np.ndarray,
-    xb: np.ndarray,
-    metric,
-):
-    with ThreadPool(32) as pool:
-        R = pool.map(
-            lambda i: (
-                np.sum(np.square(xq[i] - xb[I[lims[i]:lims[i + 1]]]), axis=1)
-                if metric == faiss.METRIC_L2
-                else np.tensordot(
-                    xq[i], xb[I[lims[i]:lims[i + 1]]], axes=(0, 1)
-                )
-            )
-            if lims[i + 1] > lims[i]
-            else [],
-            range(len(lims) - 1),
-        )
-    return np.hstack(R)
-
-
 # The classes below are wrappers around Faiss indices, with different
 # implementations for the case when we start with an already trained
 # index (IndexFromCodec) vs factory strings (IndexFromFactory).
@@ -107,6 +57,7 @@ def param_dict_list_to_name(param_dict_list):
         n = ""
         for param_dict in param_dict_list:
             n += IndexBase.param_dict_to_name(param_dict, f"cp{l}")
+            l += 1
         return n
 
     @staticmethod
@@ -115,65 +66,133 @@ def param_dict_to_name(param_dict, prefix="sp"):
             return ""
         n = prefix
         for name, val in param_dict.items():
-            if name != "noop":
-                n += f"_{name}_{val}"
+            if name == "snap":
+                continue
+            if name == "lsq_gpu" and val == 0:
+                continue
+            if name == "use_beam_LUT" and val == 0:
+                continue
+            n += f"_{name}_{val}"
         if n == prefix:
             return ""
         n += "."
         return n
 
     @staticmethod
-    def set_index_param_dict_list(index, param_dict_list):
+    def set_index_param_dict_list(index, param_dict_list, assert_same=False):
         if not param_dict_list:
             return
         index = faiss.downcast_index(index)
         for param_dict in param_dict_list:
             assert index is not None
-            IndexBase.set_index_param_dict(index, param_dict)
+            IndexBase.set_index_param_dict(index, param_dict, assert_same)
             index = faiss.try_extract_index_ivf(index)
+            if index is not None:
+                index = index.quantizer
 
     @staticmethod
-    def set_index_param_dict(index, param_dict):
+    def set_index_param_dict(index, param_dict, assert_same=False):
         if not param_dict:
             return
         for name, val in param_dict.items():
-            IndexBase.set_index_param(index, name, val)
+            IndexBase.set_index_param(index, name, val, assert_same)
 
     @staticmethod
-    def set_index_param(index, name, val):
+    def set_index_param(index, name, val, assert_same=False):
         index = faiss.downcast_index(index)
-
+        val = int(val)
         if isinstance(index, faiss.IndexPreTransform):
             Index.set_index_param(index.index, name, val)
-        elif name == "efSearch":
-            index.hnsw.efSearch
-            index.hnsw.efSearch = int(val)
-        elif name == "efConstruction":
-            index.hnsw.efConstruction
-            index.hnsw.efConstruction = int(val)
-        elif name == "nprobe":
-            index_ivf = faiss.extract_index_ivf(index)
-            index_ivf.nprobe
-            index_ivf.nprobe = int(val)
-        elif name == "k_factor":
-            index.k_factor
-            index.k_factor = int(val)
-        elif name == "parallel_mode":
-            index_ivf = faiss.extract_index_ivf(index)
-            index_ivf.parallel_mode
-            index_ivf.parallel_mode = int(val)
-        elif name == "noop":
-            pass
+            return
+        elif name == "snap":
+            return
+        elif name == "lsq_gpu":
+            if val == 1:
+                ngpus = faiss.get_num_gpus()
+                icm_encoder_factory = faiss.GpuIcmEncoderFactory(ngpus)
+                if isinstance(index, faiss.IndexProductLocalSearchQuantizer):
+                    for i in range(index.plsq.nsplits):
+                        lsq = faiss.downcast_Quantizer(
+                            index.plsq.subquantizer(i)
+                        )
+                        if lsq.icm_encoder_factory is None:
+                            lsq.icm_encoder_factory = icm_encoder_factory
+                else:
+                    if index.lsq.icm_encoder_factory is None:
+                        index.lsq.icm_encoder_factory = icm_encoder_factory
+            return
+        elif name in ["efSearch", "efConstruction"]:
+            obj = index.hnsw
+        elif name in ["nprobe", "parallel_mode"]:
+            obj = faiss.extract_index_ivf(index)
+        elif name in ["use_beam_LUT", "max_beam_size"]:
+            if isinstance(index, faiss.IndexProductResidualQuantizer):
+                obj = [
+                    faiss.downcast_Quantizer(index.prq.subquantizer(i))
+                    for i in range(index.prq.nsplits)
+                ]
+            else:
+                obj = index.rq
+        elif name == "encode_ils_iters":
+            if isinstance(index, faiss.IndexProductLocalSearchQuantizer):
+                obj = [
+                    faiss.downcast_Quantizer(index.plsq.subquantizer(i))
+                    for i in range(index.plsq.nsplits)
+                ]
+            else:
+                obj = index.lsq
         else:
-            raise RuntimeError(f"could not set param {name} on {index}")
+            obj = index
+
+        if not isinstance(obj, list):
+            obj = [obj]
+        for o in obj:
+            test = getattr(o, name)
+            if assert_same and not name == "use_beam_LUT":
+                assert test == val
+            else:
+                setattr(o, name, val)
+
+    @staticmethod
+    def filter_index_param_dict_list(param_dict_list):
+        """Return param_dict_list without "k_factor" in its first dict.
+
+        Neither the input list nor its dicts are mutated: the first dict
+        is rebuilt, since copy() of the list would still share it.
+        """
+        first = param_dict_list[0] if param_dict_list else None
+        if first is None or "k_factor" not in first:
+            return param_dict_list
+        first = {k: v for k, v in first.items() if k != "k_factor"}
+        return [first, *param_dict_list[1:]]
 
     def is_flat(self):
-        codec = faiss.downcast_index(self.get_model())
-        return isinstance(codec, faiss.IndexFlat)
+        model = faiss.downcast_index(self.get_model())
+        return isinstance(model, faiss.IndexFlat)
 
     def is_ivf(self):
-        codec = self.get_model()
-        return faiss.try_extract_index_ivf(codec) is not None
+        return False  # NOTE(review): makes the lines below unreachable — confirm
+        model = self.get_model()
+        return faiss.try_extract_index_ivf(model) is not None
+
+    def is_2layer(self):
+        def is_2layer_(index):
+            index = faiss.downcast_index(index)
+            if isinstance(index, faiss.IndexPreTransform):
+                return is_2layer_(index.index)
+            return isinstance(index, faiss.Index2Layer)
+
+        model = self.get_model()
+        return is_2layer_(model)
+
+    def is_decode_supported(self):
+        model = self.get_model()
+        if isinstance(model, faiss.IndexPreTransform):
+            for i in range(model.chain.size()):
+                vt = faiss.downcast_VectorTransform(model.chain.at(i))
+                if isinstance(vt, faiss.ITQTransform):
+                    return False
+        return True
 
     def is_pretransform(self):
         codec = self.get_model()
@@ -208,12 +227,15 @@ def get_model_name(self):
     def get_model(self):
         raise NotImplementedError
 
+    def get_construction_params(self):
+        raise NotImplementedError
+
     def transform(self, vectors):
         transformed_vectors = DatasetDescriptor(
             tablename=f"{vectors.get_filename()}{self.get_codec_name()}transform.npy"
         )
         if not self.io.file_exist(transformed_vectors.tablename):
-            codec = self.fetch_codec()
+            codec = self.get_codec()
             assert isinstance(codec, faiss.IndexPreTransform)
             transform = faiss.downcast_VectorTransform(codec.chain.at(0))
             x = self.io.get_dataset(vectors)
@@ -221,7 +243,18 @@ def transform(self, vectors):
             self.io.write_nparray(xt, transformed_vectors.tablename)
         return transformed_vectors
 
-    def knn_search_quantizer(self, index, query_vectors, k):
+    def snap(self, vectors):
+        transformed_vectors = DatasetDescriptor(
+            tablename=f"{vectors.get_filename()}{self.get_codec_name()}snap.npy"
+        )
+        if not self.io.file_exist(transformed_vectors.tablename):
+            codec = self.get_codec()
+            x = self.io.get_dataset(vectors)
+            xt = codec.sa_decode(codec.sa_encode(x))
+            self.io.write_nparray(xt, transformed_vectors.tablename)
+        return transformed_vectors
+
+    def knn_search_quantizer(self, query_vectors, k):
         if self.is_pretransform():
             pretransform = self.get_pretransform()
             quantizer_query_vectors = pretransform.transform(query_vectors)
@@ -229,7 +262,11 @@ def knn_search_quantizer(self, index, query_vectors, k):
             pretransform = None
             quantizer_query_vectors = query_vectors
 
-        QD, QI, _, QP = self.get_quantizer(pretransform).knn_search(
+        quantizer, _, _ = self.get_quantizer(
+            dry_run=False, pretransform=pretransform
+        )
+        QD, QI, _, QP, _ = quantizer.knn_search(
+            dry_run=False,
             search_parameters=None,
             query_vectors=quantizer_query_vectors,
             k=k,
@@ -242,20 +279,31 @@ def get_knn_search_name(
         search_parameters: Optional[Dict[str, int]],
         query_vectors: DatasetDescriptor,
         k: int,
+        reconstruct: bool = False,
     ):
         name = self.get_index_name()
         name += Index.param_dict_to_name(search_parameters)
         name += query_vectors.get_filename("q")
         name += f"k_{k}."
+        name += f"t_{self.num_threads}."
+        if reconstruct:
+            name += "rec."
+        else:
+            name += "knn."
         return name
 
     def knn_search(
         self,
+        dry_run,
         search_parameters: Optional[Dict[str, int]],
         query_vectors: DatasetDescriptor,
         k: int,
+        I_gt=None,
+        D_gt=None,
     ):
-        logger.info("knn_seach: begin")
+        logger.info("knn_search: begin")
+        if search_parameters is not None and search_parameters.get("snap") == 1:
+            query_vectors = self.snap(query_vectors)
         filename = (
             self.get_knn_search_name(search_parameters, query_vectors, k)
             + "zip"
@@ -264,15 +312,30 @@ def knn_search(
             logger.info(f"Using cached results for {filename}")
             D, I, R, P = self.io.read_file(filename, ["D", "I", "R", "P"])
         else:
-            xq = self.io.get_dataset(query_vectors)
+            if dry_run:  # nothing cached: only report the missing file
+                return None, None, None, None, filename
             index = self.get_index()
             Index.set_index_param_dict(index, search_parameters)
 
-            if self.is_ivf():
-                xqt, QD, QI, QP = self.knn_search_quantizer(
-                    index, query_vectors, search_parameters["nprobe"]
+            if self.is_2layer():
+                # Index2Layer doesn't support search
+                xq = self.io.get_dataset(query_vectors)
+                xb = index.reconstruct_n(0, index.ntotal)
+                (D, I), t, _ = timer(
+                    "knn_search 2layer", lambda: faiss.knn(xq, xb, k)
                 )
+            elif self.is_ivf() and not isinstance(index, faiss.IndexRefine):
                 index_ivf = faiss.extract_index_ivf(index)
+                nprobe = (
+                    search_parameters["nprobe"]
+                    if search_parameters is not None
+                    and "nprobe" in search_parameters
+                    else index_ivf.nprobe
+                )
+                xqt, QD, QI, QP = self.knn_search_quantizer(
+                    query_vectors=query_vectors,
+                    k=nprobe,
+                )
                 if index_ivf.parallel_mode != 2:
                     logger.info("Setting IVF parallel mode")
                     index_ivf.parallel_mode = 2
@@ -281,22 +344,23 @@ def knn_search(
                     "knn_search_preassigned",
                     lambda: index_ivf.search_preassigned(xqt, k, QI, QD),
                 )
+                # Dref, Iref = index.search(xq, k)
+                # np.testing.assert_array_equal(I, Iref)
+                # np.testing.assert_allclose(D, Dref)
             else:
+                xq = self.io.get_dataset(query_vectors)
                 (D, I), t, _ = timer("knn_search", lambda: index.search(xq, k))
             if self.is_flat() or not hasattr(self, "database_vectors"):  # TODO
                 R = D
             else:
+                xq = self.io.get_dataset(query_vectors)
                 xb = self.io.get_dataset(self.database_vectors)
-                R = refine_distances_knn(D, I, xq, xb, self.metric_type)
+                R = refine_distances_knn(xq, xb, I, self.metric_type)
             P = {
                 "time": t,
-                "index": self.get_index_name(),
-                "codec": self.get_codec_name(),
-                "factory": self.factory if hasattr(self, "factory") else "",
-                "search_params": search_parameters,
                 "k": k,
             }
-            if self.is_ivf():
+            if self.is_ivf() and not isinstance(index, faiss.IndexRefine):
                 stats = faiss.cvar.indexIVF_stats
                 P |= {
                     "quantizer": QP,
@@ -310,16 +374,129 @@ def knn_search(
                     "search_time": int(stats.search_time // repeat),
                 }
             self.io.write_file(filename, ["D", "I", "R", "P"], [D, I, R, P])
-        logger.info("knn_seach: end")
-        return D, I, R, P
+        P |= {
+            "index": self.get_index_name(),
+            "codec": self.get_codec_name(),
+            "factory": self.get_model_name(),
+            "construction_params": self.get_construction_params(),
+            "search_params": search_parameters,
+            "knn_intersection": knn_intersection_measure(
+                I,
+                I_gt,
+            )
+            if I_gt is not None
+            else None,
+            "distance_ratio": distance_ratio_measure(
+                I,
+                R,
+                D_gt,
+                self.metric_type,
+            )
+            if D_gt is not None
+            else None,
+        }
+        logger.info("knn_search: end")
+        return D, I, R, P, None
+
+    def reconstruct(
+        self,
+        dry_run,
+        parameters: Optional[Dict[str, int]],
+        query_vectors: DatasetDescriptor,
+        k: int,
+        I_gt,
+    ):
+        logger.info("reconstruct: begin")
+        filename = (
+            self.get_knn_search_name(
+                parameters, query_vectors, k, reconstruct=True
+            )
+            + "zip"
+        )
+        if self.io.file_exist(filename):
+            logger.info(f"Using cached results for {filename}")
+            (P,) = self.io.read_file(filename, ["P"])
+            P["index"] = self.get_index_name()
+            P["codec"] = self.get_codec_name()
+            P["factory"] = self.get_model_name()
+            P["reconstruct_params"] = parameters
+            P["construction_params"] = self.get_construction_params()
+        else:
+            if dry_run:
+                return None, filename
+            codec = self.get_codec()
+            codec_meta = self.fetch_meta()
+            Index.set_index_param_dict(codec, parameters)
+            xb = self.io.get_dataset(self.database_vectors)
+            xb_encoded, encode_t, _ = timer(
+                "sa_encode", lambda: codec.sa_encode(xb)
+            )
+            xq = self.io.get_dataset(query_vectors)
+            if self.is_decode_supported():
+                xb_decoded, decode_t, _ = timer(
+                    "sa_decode", lambda: codec.sa_decode(xb_encoded)
+                )
+                mse = np.square(xb_decoded - xb).sum(axis=1).mean().item()
+                _, I = faiss.knn(xq, xb_decoded, k, metric=self.metric_type)
+                asym_recall = knn_intersection_measure(I, I_gt)
+                xq_decoded = codec.sa_decode(codec.sa_encode(xq))
+                _, I = faiss.knn(
+                    xq_decoded, xb_decoded, k, metric=self.metric_type
+                )
+            else:
+                mse = None
+                asym_recall = None
+                decode_t = None
+                # assume hamming for sym
+                xq_encoded = codec.sa_encode(xq)
+                bin = faiss.IndexBinaryFlat(xq_encoded.shape[1] * 8)
+                bin.add(xb_encoded)
+                _, I = bin.search(xq_encoded, k)
+            sym_recall = knn_intersection_measure(I, I_gt)
+            P = {
+                "encode_time": encode_t,
+                "decode_time": decode_t,
+                "mse": mse,
+                "sym_recall": sym_recall,
+                "asym_recall": asym_recall,
+                "cpu": get_cpu_info(),
+                "num_threads": self.num_threads,
+                "index": self.get_index_name(),
+                "codec": self.get_codec_name(),
+                "factory": self.get_model_name(),
+                "reconstruct_params": parameters,
+                "construction_params": self.get_construction_params(),
+                "codec_meta": codec_meta,
+            }
+            self.io.write_file(filename, ["P"], [P])
+        logger.info("reconstruct: end")
+        return P, None
+
+    def get_range_search_name(
+        self,
+        search_parameters: Optional[Dict[str, int]],
+        query_vectors: DatasetDescriptor,
+        radius: Optional[float] = None,
+    ):
+        name = self.get_index_name()
+        name += Index.param_dict_to_name(search_parameters)
+        name += query_vectors.get_filename("q")
+        if radius is not None:
+            name += f"r_{int(radius * 1000)}."
+        else:
+            name += "r_auto."
+        return name
 
     def range_search(
         self,
+        dry_run,
         search_parameters: Optional[Dict[str, int]],
         query_vectors: DatasetDescriptor,
         radius: Optional[float] = None,
     ):
         logger.info("range_search: begin")
+        if search_parameters is not None and search_parameters.get("snap") == 1:
+            query_vectors = self.snap(query_vectors)
         filename = (
             self.get_range_search_name(
                 search_parameters, query_vectors, radius
@@ -332,13 +509,15 @@ def range_search(
                 filename, ["lims", "D", "I", "R", "P"]
             )
         else:
+            if dry_run:
+                return None, None, None, None, None, filename
             xq = self.io.get_dataset(query_vectors)
             index = self.get_index()
             Index.set_index_param_dict(index, search_parameters)
 
             if self.is_ivf():
                 xqt, QD, QI, QP = self.knn_search_quantizer(
-                    index, query_vectors, search_parameters["nprobe"]
+                    query_vectors, search_parameters["nprobe"]
                 )
                 index_ivf = faiss.extract_index_ivf(index)
                 if index_ivf.parallel_mode != 2:
@@ -364,9 +543,6 @@ def range_search(
                 )
             P = {
                 "time": t,
-                "index": self.get_codec_name(),
-                "codec": self.get_codec_name(),
-                "search_params": search_parameters,
                 "radius": radius,
                 "count": len(I),
             }
@@ -386,8 +562,15 @@ def range_search(
             self.io.write_file(
                 filename, ["lims", "D", "I", "R", "P"], [lims, D, I, R, P]
             )
+        P |= {
+            "index": self.get_index_name(),
+            "codec": self.get_codec_name(),
+            "factory": self.get_model_name(),
+            "construction_params": self.get_construction_params(),
+            "search_params": search_parameters,
+        }
         logger.info("range_seach: end")
-        return lims, D, I, R, P
+        return lims, D, I, R, P, None
 
 
 # Common base for IndexFromCodec and IndexFromFactory,
@@ -396,16 +579,15 @@ def range_search(
 # they share the configuration of their parent IndexFromCodec
 @dataclass
 class Index(IndexBase):
+    num_threads: int
     d: int
     metric: str
     database_vectors: DatasetDescriptor
     construction_params: List[Dict[str, int]]
     search_params: Dict[str, int]
 
-    cached_codec_name: ClassVar[str] = None
-    cached_codec: ClassVar[faiss.Index] = None
-    cached_index_name: ClassVar[str] = None
-    cached_index: ClassVar[faiss.Index] = None
+    cached_codec: ClassVar[OrderedDict[str, faiss.Index]] = OrderedDict()
+    cached_index: ClassVar[OrderedDict[str, faiss.Index]] = OrderedDict()
 
     def __post_init__(self):
         if isinstance(self.metric, str):
@@ -438,16 +620,13 @@ def supports_range_search(self):
     def fetch_codec(self):
         raise NotImplementedError
 
-    def train(self):
-        # get triggers a train, if necessary
-        self.get_codec()
-
     def get_codec(self):
         codec_name = self.get_codec_name()
-        if Index.cached_codec_name != codec_name:
-            Index.cached_codec = self.fetch_codec()
-            Index.cached_codec_name = codec_name
-        return Index.cached_codec
+        if codec_name not in Index.cached_codec:
+            Index.cached_codec[codec_name], _, _ = self.fetch_codec()
+            if len(Index.cached_codec) > 1:
+                Index.cached_codec.popitem(last=False)
+        return Index.cached_codec[codec_name]
 
     def get_index_name(self):
         name = self.get_codec_name()
@@ -456,14 +635,16 @@ def get_index_name(self):
         return name
 
     def fetch_index(self):
-        index = faiss.clone_index(self.get_codec())
+        index = self.get_codec()
+        index.reset()
         assert index.ntotal == 0
         logger.info("Adding vectors to index")
         xb = self.io.get_dataset(self.database_vectors)
 
-        if self.is_ivf():
+        if self.is_ivf() and not isinstance(index, faiss.IndexRefine):
             xbt, QD, QI, QP = self.knn_search_quantizer(
-                index, self.database_vectors, 1
+                query_vectors=self.database_vectors,
+                k=1,
             )
             index_ivf = faiss.extract_index_ivf(index)
             if index_ivf.parallel_mode != 2:
@@ -483,32 +664,43 @@ def fetch_index(self):
             )
         assert index.ntotal == xb.shape[0] or index_ivf.ntotal == xb.shape[0]
         logger.info("Added vectors to index")
-        return index
+        return index, t
 
     def get_index(self):
         index_name = self.get_index_name()
-        if Index.cached_index_name != index_name:
-            Index.cached_index = self.fetch_index()
-            Index.cached_index_name = index_name
-        return Index.cached_index
+        if index_name not in Index.cached_index:
+            Index.cached_index[index_name], _ = self.fetch_index()
+            if len(Index.cached_index) > 3:
+                Index.cached_index.popitem(last=False)
+        return Index.cached_index[index_name]
+
+    def get_construction_params(self):
+        return self.construction_params
 
-    def get_code_size(self):
+    def get_code_size(self, codec=None):
         def get_index_code_size(index):
             index = faiss.downcast_index(index)
             if isinstance(index, faiss.IndexPreTransform):
                 return get_index_code_size(index.index)
-            elif isinstance(index, faiss.IndexHNSWFlat):
-                return index.d * 4  # TODO
             elif type(index) in [faiss.IndexRefine, faiss.IndexRefineFlat]:
                 return get_index_code_size(
                     index.base_index
                 ) + get_index_code_size(index.refine_index)
             else:
-                return index.code_size
+                return index.code_size if hasattr(index, "code_size") else 0
 
-        codec = self.get_codec()
+        if codec is None:
+            codec = self.get_codec()
         return get_index_code_size(codec)
 
+    def get_sa_code_size(self, codec=None):
+        """Return codec.sa_code_size(), or None when the call fails."""
+        codec = self.get_codec() if codec is None else codec
+        try:
+            return codec.sa_code_size()
+        except Exception:  # not bare: let KeyboardInterrupt/SystemExit through
+            return None
+
     def get_operating_points(self):
         op = OperatingPointsWithRanges()
 
@@ -520,44 +712,72 @@ def add_range_or_val(name, range):
                 else range,
             )
 
-        op.add_range("noop", [0])
-        codec = faiss.downcast_index(self.get_codec())
-        codec_ivf = faiss.try_extract_index_ivf(codec)
-        if codec_ivf is not None:
+        add_range_or_val("snap", [0])
+        model = self.get_model()
+        model_ivf = faiss.try_extract_index_ivf(model)
+        if model_ivf is not None:
             add_range_or_val(
                 "nprobe",
-                [
-                    2**i
-                    for i in range(12)
-                    if 2**i <= codec_ivf.nlist * 0.25
-                ],
+                [2**i for i in range(12) if 2**i <= model_ivf.nlist * 0.5],
+                # [1, 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28] + [
+                #     i
+                #     for i in range(32, 64, 8)
+                #     if i <= model_ivf.nlist * 0.1
+                # ] + [
+                #     i
+                #     for i in range(64, 128, 16)
+                #     if i <= model_ivf.nlist * 0.1
+                # ] + [
+                #     i
+                #     for i in range(128, 256, 32)
+                #     if i <= model_ivf.nlist * 0.1
+                # ] + [
+                #     i
+                #     for i in range(256, 512, 64)
+                #     if i <= model_ivf.nlist * 0.1
+                # ] + [
+                #     2**i
+                #     for i in range(9, 12)
+                #     if 2**i <= model_ivf.nlist * 0.1
+                # ],
             )
-        if isinstance(codec, faiss.IndexRefine):
+        model = faiss.downcast_index(model)
+        if isinstance(model, faiss.IndexRefine):
             add_range_or_val(
                 "k_factor",
-                [2**i for i in range(11)],
+                [2**i for i in range(13)],
             )
-        if isinstance(codec, faiss.IndexHNSWFlat):
+        elif isinstance(model, faiss.IndexHNSWFlat):
             add_range_or_val(
                 "efSearch",
                 [2**i for i in range(3, 11)],
             )
+        elif isinstance(model, faiss.IndexResidualQuantizer) or isinstance(
+            model, faiss.IndexProductResidualQuantizer
+        ):
+            add_range_or_val(
+                "max_beam_size",
+                [1, 2, 4, 8, 16, 32],
+            )
+            add_range_or_val(
+                "use_beam_LUT",
+                [1],
+            )
+        elif isinstance(model, faiss.IndexLocalSearchQuantizer) or isinstance(
+            model, faiss.IndexProductLocalSearchQuantizer
+        ):
+            add_range_or_val(
+                "encode_ils_iters",
+                [2, 4, 8, 16],
+            )
+            add_range_or_val(
+                "lsq_gpu",
+                [1],
+            )
         return op
 
-    def get_range_search_name(
-        self,
-        search_parameters: Optional[Dict[str, int]],
-        query_vectors: DatasetDescriptor,
-        radius: Optional[float] = None,
-    ):
-        name = self.get_index_name()
-        name += Index.param_dict_to_name(search_parameters)
-        name += query_vectors.get_filename("q")
-        if radius is not None:
-            name += f"r_{int(radius * 1000)}."
-        else:
-            name += "r_auto."
-        return name
+    def is_flat_index(self):
+        return self.get_index_name().startswith("Flat")
 
 
 # IndexFromCodec, IndexFromQuantizer and IndexFromPreTransform
@@ -581,12 +801,18 @@ def get_pretransform(self):
         quantizer.set_io(self.io)
         return quantizer
 
+    def get_model_name(self):
+        """Return the base name (last path component) of ``self.path``."""
+        return os.path.basename(self.path)
+
     def get_codec_name(self):
         assert self.path is not None
         name = os.path.basename(self.path)
         name += Index.param_dict_list_to_name(self.construction_params)
         return name
 
+    def fetch_meta(self, dry_run=False):
+        """Return ``(None, None)``; ``dry_run`` is accepted but not used here."""
+        return None, None
+
     def fetch_codec(self):
         codec = self.io.read_index(
             os.path.basename(self.path),
@@ -596,7 +822,7 @@ def fetch_codec(self):
         assert self.d == codec.d
         assert self.metric_type == codec.metric_type
         Index.set_index_param_dict_list(codec, self.construction_params)
-        return codec
+        return codec, None, None
 
     def get_model(self):
         return self.get_codec()
@@ -609,6 +835,9 @@ def __init__(self, ivf_index: Index):
         self.ivf_index = ivf_index
         super().__init__()
 
+    def get_model_name(self):
+        """Return the same value as ``get_index_name()``."""
+        return self.get_index_name()
+
     def get_codec_name(self):
         return self.get_index_name()
 
@@ -657,17 +886,52 @@ def get_codec_name(self):
         name += Index.param_dict_list_to_name(self.construction_params)
         return name
 
-    def fetch_codec(self):
+    def fetch_meta(self, dry_run=False):
+        """Return ``(meta, report)`` for this codec.
+
+        When the ``<codec name>json`` file exists it is read and returned
+        with a ``None`` report.  Otherwise the work is delegated to
+        ``fetch_codec(dry_run=dry_run)`` and its second and third return
+        values are passed through.
+        """
+        meta_filename = self.get_codec_name() + "json"
+        if not self.io.file_exist(meta_filename):
+            # No stored metadata: let fetch_codec produce (or report) it.
+            _, meta, report = self.fetch_codec(dry_run=dry_run)
+            return meta, report
+        return self.io.read_json(meta_filename), None
+
+    def fetch_codec(self, dry_run=False):
+        """Load the codec and its metadata, assembling them if not stored.
+
+        Returns a 3-tuple:
+          * ``(codec, meta, None)`` when the codec was read from storage or
+            freshly assembled;
+          * ``(None, None, name)`` when ``assemble`` reported a requirement
+            (asserted to happen only with ``dry_run``): ``name`` is this
+            codec's own file name if the requirement is the empty string,
+            otherwise the requirement reported by ``assemble``.
+        """
         codec_filename = self.get_codec_name() + "codec"
-        if self.io.file_exist(codec_filename):
+        meta_filename = self.get_codec_name() + "json"
+        # Both the codec and its metadata file must exist to reuse them.
+        if self.io.file_exist(codec_filename) and self.io.file_exist(
+            meta_filename
+        ):
             codec = self.io.read_index(codec_filename)
             assert self.d == codec.d
             assert self.metric_type == codec.metric_type
+            meta = self.io.read_json(meta_filename)
         else:
-            codec = self.assemble()
-            if self.factory != "Flat":
-                self.io.write_index(codec, codec_filename)
-        return codec
+            codec, training_time, requires = self.assemble(dry_run=dry_run)
+            if requires is not None:
+                assert dry_run
+                # "" means nothing else is missing: report this codec itself.
+                if requires == "":
+                    return None, None, codec_filename
+                else:
+                    return None, None, requires
+            codec_size = self.io.write_index(codec, codec_filename)
+            assert codec_size is not None
+            meta = {
+                "training_time": training_time,
+                "training_size": self.training_vectors.num_vectors if self.training_vectors else 0,
+                "codec_size": codec_size,
+                "sa_code_size": self.get_sa_code_size(codec),
+                "code_size": self.get_code_size(codec),
+                "cpu": get_cpu_info(),
+            }
+            self.io.write_json(meta, meta_filename, overwrite=True)
+
+        Index.set_index_param_dict_list(
+            codec, self.construction_params, assert_same=True
+        )
+        return codec, meta, None
+
+    def get_model_name(self):
+        """Return the factory string (``self.factory``) as the model name."""
+        return self.factory
 
     def get_model(self):
         model = faiss.index_factory(self.d, self.factory, self.metric_type)
@@ -675,27 +939,27 @@ def get_model(self):
         return model
 
     def get_pretransform(self):
+        """Return an index wrapper for the pre-transform part of the model.
+
+        The model must be a ``faiss.IndexPreTransform``.  If its sub-index
+        is already an ``IndexFlat``, ``self`` is returned.  Otherwise the
+        sub-index is replaced with an ``IndexFlat`` of the same dimension
+        and metric, and a new ``IndexFromFactory`` built from the resulting
+        factory string (sharing this object's vectors, construction params
+        and io) is returned.
+        """
-        model = faiss.index_factory(self.d, self.factory, self.metric_type)
+        model = self.get_model()
         assert isinstance(model, faiss.IndexPreTransform)
         sub_index = faiss.downcast_index(model.index)
         if isinstance(sub_index, faiss.IndexFlat):
             return self
         # replace the sub-index with Flat
-        codec = faiss.clone_index(model)
-        codec.index = faiss.IndexFlat(codec.index.d, codec.index.metric_type)
+        model.index = faiss.IndexFlat(model.index.d, model.index.metric_type)
         pretransform = IndexFromFactory(
-            d=codec.d,
-            metric=codec.metric_type,
+            num_threads=self.num_threads,
+            d=model.d,
+            metric=model.metric_type,
             database_vectors=self.database_vectors,
             construction_params=self.construction_params,
-            search_params=self.search_params,
-            factory=reverse_index_factory(codec),
+            search_params=None,
+            factory=reverse_index_factory(model),
             training_vectors=self.training_vectors,
         )
         pretransform.set_io(self.io)
         return pretransform
 
-    def get_quantizer(self, pretransform=None):
+    def get_quantizer(self, dry_run, pretransform=None):
         model = self.get_model()
         model_ivf = faiss.extract_index_ivf(model)
         assert isinstance(model_ivf, faiss.IndexIVF)
@@ -704,82 +968,124 @@ def get_quantizer(self, pretransform=None):
             training_vectors = self.training_vectors
         else:
             training_vectors = pretransform.transform(self.training_vectors)
-        centroids = self.k_means(training_vectors, model_ivf.nlist)
+        centroids, t, requires = training_vectors.k_means(
+            self.io, model_ivf.nlist, dry_run
+        )
+        if requires is not None:
+            return None, None, requires
         quantizer = IndexFromFactory(
+            num_threads=self.num_threads,
             d=model_ivf.quantizer.d,
             metric=model_ivf.quantizer.metric_type,
             database_vectors=centroids,
-            construction_params=None,  # self.construction_params[1:],
-            search_params=None,  # self.construction_params[0],  # TODO: verify
+            construction_params=self.construction_params[1:]
+            if self.construction_params is not None
+            else None,
+            search_params=None,
             factory=reverse_index_factory(model_ivf.quantizer),
             training_vectors=centroids,
         )
         quantizer.set_io(self.io)
-        return quantizer
+        return quantizer, t, None
 
-    def k_means(self, vectors, k):
-        kmeans_vectors = DatasetDescriptor(
-            tablename=f"{vectors.get_filename()}kmeans_{k}.npy"
-        )
-        if not self.io.file_exist(kmeans_vectors.tablename):
-            x = self.io.get_dataset(vectors)
-            kmeans = faiss.Kmeans(d=x.shape[1], k=k, gpu=True)
-            kmeans.train(x)
-            self.io.write_nparray(kmeans.centroids, kmeans_vectors.tablename)
-        return kmeans_vectors
-
-    def assemble(self):
+    def assemble(self, dry_run):
+        """Build and train the codec for ``self.factory``.
+
+        Returns a 3-tuple:
+          * ``(codec, t_aggregate, None)`` on success, where ``t_aggregate``
+            accumulates the training times measured or collected here;
+          * ``(None, None, requires)`` when a sub-component reported a
+            requirement, or ``(None, None, "")`` when ``dry_run`` is set
+            and the factory is not "Flat" (training is skipped).
+        """
+        logger.info(f"assemble {self.factory}")
         model = self.get_model()
-        codec = faiss.clone_index(model)
-        if isinstance(model, faiss.IndexPreTransform):
-            sub_index = faiss.downcast_index(model.index)
-            if not isinstance(sub_index, faiss.IndexFlat):
-                # replace the sub-index with Flat and fetch pre-trained
-                pretransform = self.get_pretransform()
-                codec = pretransform.fetch_codec()
-                assert codec.is_trained
-                transformed_training_vectors = pretransform.transform(
-                    self.training_vectors
-                )
-                transformed_database_vectors = pretransform.transform(
-                    self.database_vectors
-                )
-                # replace the Flat index with the required sub-index
+        # NOTE: opaque is hard-coded to True (the detection below is
+        # commented out), so the component-wise `else` branch is currently
+        # never executed and the model is always trained as a whole.
+        opaque = True
+        t_aggregate = 0
+        # try:
+        #     reverse_index_factory(model)
+        #     opaque = False
+        # except NotImplementedError:
+        #     opaque = True
+        if opaque:
+            codec = model
+        else:
+            if isinstance(model, faiss.IndexPreTransform):
+                logger.info(f"assemble: pretransform {self.factory}")
+                sub_index = faiss.downcast_index(model.index)
+                if not isinstance(sub_index, faiss.IndexFlat):
+                    # replace the sub-index with Flat and fetch pre-trained
+                    pretransform = self.get_pretransform()
+                    codec, meta, report = pretransform.fetch_codec(
+                        dry_run=dry_run
+                    )
+                    if report is not None:
+                        return None, None, report
+                    t_aggregate += meta["training_time"]
+                    assert codec.is_trained
+                    transformed_training_vectors = pretransform.transform(
+                        self.training_vectors
+                    )
+                    # replace the Flat index with the required sub-index
+                    wrapper = IndexFromFactory(
+                        num_threads=self.num_threads,
+                        d=sub_index.d,
+                        metric=sub_index.metric_type,
+                        database_vectors=None,
+                        construction_params=self.construction_params,
+                        search_params=None,
+                        factory=reverse_index_factory(sub_index),
+                        training_vectors=transformed_training_vectors,
+                    )
+                    wrapper.set_io(self.io)
+                    codec.index, meta, report = wrapper.fetch_codec(
+                        dry_run=dry_run
+                    )
+                    if report is not None:
+                        return None, None, report
+                    t_aggregate += meta["training_time"]
+                    assert codec.index.is_trained
+                else:
+                    codec = model
+            elif isinstance(model, faiss.IndexIVF):
+                logger.info(f"assemble: ivf {self.factory}")
+                # replace the quantizer
+                quantizer, t, requires = self.get_quantizer(dry_run=dry_run)
+                if requires is not None:
+                    return None, None, requires
+                t_aggregate += t
+                codec = faiss.clone_index(model)
+                quantizer_index, t = quantizer.fetch_index()
+                t_aggregate += t
+                replace_ivf_quantizer(codec, quantizer_index)
+                assert codec.quantizer.is_trained
+                assert codec.nlist == codec.quantizer.ntotal
+            elif isinstance(model, faiss.IndexRefine) or isinstance(
+                model, faiss.IndexRefineFlat
+            ):
+                logger.info(f"assemble: refine {self.factory}")
+                # replace base_index
                 wrapper = IndexFromFactory(
-                    d=sub_index.d,
-                    metric=sub_index.metric_type,
-                    database_vectors=transformed_database_vectors,
-                    construction_params=self.construction_params,
-                    search_params=self.search_params,
-                    factory=reverse_index_factory(sub_index),
-                    training_vectors=transformed_training_vectors,
+                    num_threads=self.num_threads,
+                    d=model.base_index.d,
+                    metric=model.base_index.metric_type,
+                    database_vectors=self.database_vectors,
+                    construction_params=IndexBase.filter_index_param_dict_list(
+                        self.construction_params
+                    ),
+                    search_params=None,
+                    factory=reverse_index_factory(model.base_index),
+                    training_vectors=self.training_vectors,
                 )
                 wrapper.set_io(self.io)
-                codec.index = wrapper.fetch_codec()
-                assert codec.index.is_trained
-        elif isinstance(model, faiss.IndexIVF):
-            # replace the quantizer
-            quantizer = self.get_quantizer()
-            replace_ivf_quantizer(codec, quantizer.fetch_index())
-            assert codec.quantizer.is_trained
-            assert codec.nlist == codec.quantizer.ntotal
-        elif isinstance(model, faiss.IndexRefine) or isinstance(
-            model, faiss.IndexRefineFlat
-        ):
-            # replace base_index
-            wrapper = IndexFromFactory(
-                d=model.base_index.d,
-                metric=model.base_index.metric_type,
-                database_vectors=self.database_vectors,
-                construction_params=self.construction_params,
-                search_params=self.search_params,
-                factory=reverse_index_factory(model.base_index),
-                training_vectors=self.training_vectors,
-            )
-            wrapper.set_io(self.io)
-            codec.base_index = wrapper.fetch_codec()
-            assert codec.base_index.is_trained
+                codec = faiss.clone_index(model)
+                codec.base_index, meta, requires = wrapper.fetch_codec(
+                    dry_run=dry_run
+                )
+                if requires is not None:
+                    return None, None, requires
+                t_aggregate += meta["training_time"]
+                assert codec.base_index.is_trained
+            else:
+                codec = model
 
-        xt = self.io.get_dataset(self.training_vectors)
-        codec.train(xt)
-        return codec
+        # "Flat" is returned without a train() call; everything else is
+        # trained on the training vectors unless this is a dry run.
+        if self.factory != "Flat":
+            if dry_run:
+                return None, None, ""
+            logger.info(f"assemble, train {self.factory}")
+            xt = self.io.get_dataset(self.training_vectors)
+            _, t, _ = timer("train", lambda: codec.train(xt), once=True)
+            t_aggregate += t
+
+        return codec, t_aggregate, None
diff --git a/benchs/bench_fw/optimize.py b/benchs/bench_fw/optimize.py
new file mode 100644
index 0000000000..a2653b7144
--- /dev/null
+++ b/benchs/bench_fw/optimize.py
@@ -0,0 +1,334 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from dataclasses import dataclass
+from typing import Dict, List, Tuple
+
+import faiss  # @manual=//faiss/python:pyfaiss_gpu
+
+# from faiss.contrib.evaluation import (  # @manual=//faiss/contrib:faiss_contrib_gpu
+#     OperatingPoints,
+# )
+
+from .benchmark import Benchmark
+from .descriptors import DatasetDescriptor, IndexDescriptor
+from .utils import dict_merge, filter_results, ParetoMetric, ParetoMode
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class Optimizer:
+    """Runs a sequence of benchmarks to narrow down IVF index candidates.
+
+    Dataclass fields:
+        distance_metric: "L2" or "IP"; translated to the faiss metric
+            constant in ``__post_init__``.
+        num_threads: forwarded to every ``Benchmark`` created here.
+        run_local: forwarded as ``local`` to ``Benchmark.benchmark``.
+
+    ``set_io`` has to be called before any benchmarking method, because
+    they all read ``self.io``.
+    """
+
+    distance_metric: str = "L2"
+    num_threads: int = 32
+    run_local: bool = True
+
+    def __post_init__(self):
+        """Derive ``distance_metric_type`` from ``distance_metric``.
+
+        Raises:
+            ValueError: if ``distance_metric`` is neither "L2" nor "IP".
+        """
+        self.cached_benchmark = None
+        if self.distance_metric == "IP":
+            self.distance_metric_type = faiss.METRIC_INNER_PRODUCT
+        elif self.distance_metric == "L2":
+            self.distance_metric_type = faiss.METRIC_L2
+        else:
+            raise ValueError(
+                f"unsupported distance_metric {self.distance_metric!r}; "
+                'expected "L2" or "IP"'
+            )
+
+    def set_io(self, benchmark_io):
+        """Store the I/O helper and copy the metric settings onto it."""
+        self.io = benchmark_io
+        self.io.distance_metric = self.distance_metric
+        self.io.distance_metric_type = self.distance_metric_type
+
+    def benchmark_and_filter_candidates(
+        self,
+        index_descs,
+        training_vectors,
+        database_vectors,
+        query_vectors,
+        result_file,
+        include_flat,
+        min_accuracy,
+        pareto_metric,
+    ):
+        """Benchmark ``index_descs`` and keep the Pareto-optimal experiments.
+
+        A ``Benchmark`` (k=10) is run with training and knn enabled; its knn
+        experiments are filtered on ``knn_intersection >= min_accuracy`` and
+        reduced to the global Pareto optima for ``pareto_metric``.  When
+        ``include_flat`` is false, indices whose name starts with "Flat"
+        are dropped.
+
+        Returns:
+            ``(index_descs, filtered)``: one ``IndexDescriptor`` per kept
+            experiment, and the kept entries as returned by
+            ``filter_results``.
+        """
+        benchmark = Benchmark(
+            num_threads=self.num_threads,
+            training_vectors=training_vectors,
+            database_vectors=database_vectors,
+            query_vectors=query_vectors,
+            index_descs=index_descs,
+            k=10,
+            distance_metric=self.distance_metric,
+        )
+        benchmark.set_io(self.io)
+        results = benchmark.benchmark(
+            result_file=result_file, local=self.run_local, train=True, knn=True
+        )
+        assert results
+        filtered = filter_results(
+            results=results,
+            evaluation="knn",
+            accuracy_metric="knn_intersection",
+            min_accuracy=min_accuracy,
+            name_filter=None
+            if include_flat
+            else (lambda n: not n.startswith("Flat")),
+            pareto_mode=ParetoMode.GLOBAL,
+            pareto_metric=pareto_metric,
+        )
+        assert filtered
+        index_descs = [
+            IndexDescriptor(
+                factory=v["factory"],
+                construction_params=v["construction_params"],
+                search_params=v["search_params"],
+            )
+            for _, _, _, _, v in filtered
+        ]
+        return index_descs, filtered
+
+    def optimize_quantizer(
+        self,
+        training_vectors: DatasetDescriptor,
+        query_vectors: DatasetDescriptor,
+        nlists: List[int],
+        min_accuracy: float,
+    ):
+        """Select coarse-quantizer candidates for each ``nlist``.
+
+        For every ``nlist`` the training vectors are clustered with
+        k-means, then "Flat" and "HNSW32" (efConstruction 64..1024) are
+        benchmarked over the centroids and the time-Pareto-optimal
+        candidates reaching ``min_accuracy`` are kept.
+
+        Returns:
+            dict mapping each ``nlist`` to its list of ``IndexDescriptor``.
+        """
+        quantizer_descs = {}
+        for nlist in nlists:
+            # cluster
+            centroids, _, _ = training_vectors.k_means(
+                self.io,
+                nlist,
+                dry_run=False,
+            )
+
+            descs = [IndexDescriptor(factory="Flat"),] + [
+                IndexDescriptor(
+                    factory="HNSW32",
+                    construction_params=[{"efConstruction": 2**i}],
+                )
+                for i in range(6, 11)
+            ]
+
+            descs, _ = self.benchmark_and_filter_candidates(
+                descs,
+                training_vectors=centroids,
+                database_vectors=centroids,
+                query_vectors=query_vectors,
+                result_file=f"result_{centroids.get_filename()}json",
+                include_flat=True,
+                min_accuracy=min_accuracy,
+                pareto_metric=ParetoMetric.TIME,
+            )
+            quantizer_descs[nlist] = descs
+
+        return quantizer_descs
+
+    def optimize_ivf(
+        self,
+        result_file: str,
+        training_vectors: DatasetDescriptor,
+        database_vectors: DatasetDescriptor,
+        query_vectors: DatasetDescriptor,
+        quantizers: Dict[int, List[IndexDescriptor]],
+        codecs: List[Tuple[str, str]],
+        min_accuracy: float,
+    ):
+        """Benchmark every quantizer/codec combination as an IVF index.
+
+        Each factory string has the form
+        ``[<pretransform>,]IVF<nlist>(<quantizer factory>),<fine codec>``.
+        The quantizer's construction parameters are shifted by one position
+        (slot 0 is ``None``) and its search parameters are merged into slot
+        1.  The descriptors passed in ``quantizers`` are not modified.
+
+        Returns:
+            the ``(index_descs, filtered)`` pair produced by
+            ``benchmark_and_filter_candidates`` with the TIME_SPACE metric.
+        """
+        import copy  # local import: only needed for the defensive copy
+
+        ivf_descs = []
+        for nlist, quantizer_descs in quantizers.items():
+            # build IVF index
+            for quantizer_desc in quantizer_descs:
+                for pretransform, fine_ivf in codecs:
+                    prefix = "" if pretransform is None else pretransform + ","
+                    if quantizer_desc.construction_params is None:
+                        construction_params = [
+                            None,
+                            quantizer_desc.search_params,
+                        ]
+                    else:
+                        # Deep-copy first: dict_merge works in place and
+                        # would otherwise write the search parameters into
+                        # the caller's quantizer descriptor.
+                        construction_params = [None] + copy.deepcopy(
+                            quantizer_desc.construction_params
+                        )
+                        if quantizer_desc.search_params is not None:
+                            dict_merge(
+                                construction_params[1],
+                                quantizer_desc.search_params,
+                            )
+                    ivf_descs.append(
+                        IndexDescriptor(
+                            factory=f"{prefix}IVF{nlist}({quantizer_desc.factory}),{fine_ivf}",
+                            construction_params=construction_params,
+                        )
+                    )
+        return self.benchmark_and_filter_candidates(
+            ivf_descs,
+            training_vectors,
+            database_vectors,
+            query_vectors,
+            result_file,
+            include_flat=False,
+            min_accuracy=min_accuracy,
+            pareto_metric=ParetoMetric.TIME_SPACE,
+        )
+
+    # train an IVFFlat index
+    # find the nprobe required for the given accuracy
+    def ivf_flat_nprobe_required_for_accuracy(
+        self,
+        result_file: str,
+        training_vectors: DatasetDescriptor,
+        database_vectors: DatasetDescriptor,
+        query_vectors: DatasetDescriptor,
+        nlist,
+        accuracy,
+    ):
+        """Return the smallest benchmarked nprobe reaching ``accuracy``.
+
+        Benchmarks ``IVF<nlist>(Flat),Flat`` and scans the kept knn
+        experiments; the result never exceeds ``nlist // 2``, which is also
+        the value returned when no experiment qualifies.
+        """
+        _, results = self.benchmark_and_filter_candidates(
+            index_descs=[
+                IndexDescriptor(factory=f"IVF{nlist}(Flat),Flat"),
+            ],
+            training_vectors=training_vectors,
+            database_vectors=database_vectors,
+            query_vectors=query_vectors,
+            result_file=result_file,
+            include_flat=False,
+            min_accuracy=accuracy,
+            pareto_metric=ParetoMetric.TIME,
+        )
+        nprobe = nlist // 2
+        for _, _, _, k, v in results:
+            if (
+                ".knn" in k
+                and "nprobe" in v["search_params"]
+                and v["knn_intersection"] >= accuracy
+            ):
+                nprobe = min(nprobe, v["search_params"]["nprobe"])
+        return nprobe
+
+    # train candidate IVF codecs
+    # benchmark them at the same nprobe
+    # keep only the space _and_ time Pareto optimal
+    def optimize_codec(
+        self,
+        result_file: str,
+        d: int,
+        training_vectors: DatasetDescriptor,
+        database_vectors: DatasetDescriptor,
+        query_vectors: DatasetDescriptor,
+        nlist: int,
+        nprobe: int,
+        min_accuracy: float,
+    ):
+        """Select the (pretransform, fine codec) pairs worth keeping.
+
+        Candidate codecs (Flat, scalar quantizers, PQ and OPQ+PQ variants
+        compatible with dimension ``d``) are benchmarked inside
+        ``IVF<nlist>`` at a fixed ``nprobe``; only the time/space Pareto
+        optima reaching ``min_accuracy`` survive.
+
+        Returns:
+            list of ``(opq, pq)`` tuples (``opq`` is ``None`` when there is
+            no pre-transform), each pair once, in the order in which the
+            filtered results first mention them.
+        """
+        codecs = (
+            [
+                (None, "Flat"),
+                (None, "SQfp16"),
+                (None, "SQbf16"),
+                (None, "SQ8"),
+            ] + [
+                (f"OPQ{M}_{M * dim}", f"PQ{M}x{b}")
+                for M in [8, 12, 16, 32, 48, 64, 96, 128, 192, 256]
+                if d % M == 0
+                for dim in range(2, 18, 2)
+                if M * dim <= d
+                for b in range(4, 14, 2)
+                if M * b < d * 8  # smaller than SQ8
+            ] + [
+                (None, f"PQ{M}x{b}")
+                for M in [8, 12, 16, 32, 48, 64, 96, 128, 192, 256]
+                if d % M == 0
+                for b in range(8, 14, 2)
+                if M * b < d * 8  # smaller than SQ8
+            ]
+        )
+
+        def factory_string(opq, pq):
+            # Single place that defines how a codec pair is spelled.
+            return (
+                f"IVF{nlist},{pq}" if opq is None else f"{opq},IVF{nlist},{pq}"
+            )
+
+        # factory string -> codec pair, used to map results back to pairs
+        factory = {factory_string(opq, pq): (opq, pq) for opq, pq in codecs}
+
+        _, filtered = self.benchmark_and_filter_candidates(
+            index_descs=[
+                IndexDescriptor(
+                    factory=factory_string(opq, pq),
+                    search_params={
+                        "nprobe": nprobe,
+                    },
+                )
+                for opq, pq in codecs
+            ],
+            training_vectors=training_vectors,
+            database_vectors=database_vectors,
+            query_vectors=query_vectors,
+            result_file=result_file,
+            include_flat=False,
+            min_accuracy=min_accuracy,
+            pareto_metric=ParetoMetric.TIME_SPACE,
+        )
+        # dict.fromkeys de-duplicates while keeping a deterministic order
+        # (a plain set would return the pairs in arbitrary order).
+        surviving = dict.fromkeys(v["factory"] for _, _, _, _, v in filtered)
+        results = [factory[name] for name in surviving]
+        return results
+
+    def optimize(
+        self,
+        d: int,
+        training_vectors: DatasetDescriptor,
+        database_vectors_list: List[DatasetDescriptor],
+        query_vectors: DatasetDescriptor,
+        min_accuracy: float,
+    ):
+        """Run the full pipeline: nprobe, codecs, quantizers, IVF.
+
+        The first entry of ``database_vectors_list`` is used for the nprobe
+        and codec selection; the final IVF benchmark is repeated for every
+        entry.  Results are written to the result files named below;
+        nothing is returned.
+        """
+        # train an IVFFlat index
+        # find the nprobe required for 0.95 knn intersection
+        nlist = 4096
+        nprobe_at_95 = self.ivf_flat_nprobe_required_for_accuracy(
+            result_file=f"result_ivf{nlist}_flat.json",
+            training_vectors=training_vectors,
+            database_vectors=database_vectors_list[0],
+            query_vectors=query_vectors,
+            nlist=nlist,
+            accuracy=0.95,
+        )
+
+        # train candidate IVF codecs
+        # benchmark them at the same nprobe
+        # keep only the space and time Pareto optima
+        codecs = self.optimize_codec(
+            result_file=f"result_ivf{nlist}_codec.json",
+            d=d,
+            training_vectors=training_vectors,
+            database_vectors=database_vectors_list[0],
+            query_vectors=query_vectors,
+            nlist=nlist,
+            nprobe=nprobe_at_95,
+            min_accuracy=min_accuracy,
+        )
+
+        # optimize coarse quantizers
+        quantizers = self.optimize_quantizer(
+            training_vectors=training_vectors,
+            query_vectors=query_vectors,
+            nlists=[4096, 8192, 16384, 32768],
+            min_accuracy=0.7,
+        )
+
+        # combine them with the codecs
+        # test them at different scales
+        for database_vectors in database_vectors_list:
+            self.optimize_ivf(
+                result_file=f"result_{database_vectors.get_filename()}json",
+                training_vectors=training_vectors,
+                database_vectors=database_vectors,
+                query_vectors=query_vectors,
+                quantizers=quantizers,
+                codecs=codecs,
+                min_accuracy=min_accuracy,
+            )
diff --git a/benchs/bench_fw/utils.py b/benchs/bench_fw/utils.py
new file mode 100644
index 0000000000..3151c0c2da
--- /dev/null
+++ b/benchs/bench_fw/utils.py
@@ -0,0 +1,248 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import functools
+import logging
+from enum import Enum
+from multiprocessing.pool import ThreadPool
+from time import perf_counter
+
+import faiss  # @manual=//faiss/python:pyfaiss_gpu
+import numpy as np
+
+from faiss.contrib.evaluation import (  # @manual=//faiss/contrib:faiss_contrib_gpu
+    OperatingPoints,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def timer(name, func, once=False) -> tuple:
+    """Time ``func`` and return its result together with the measurement.
+
+    ``func`` is always called once.  Unless ``once`` is set, a call that
+    took less than one second is then repeated ``int(2.0 // t)`` times and
+    the average duration of those repeated calls is reported instead.
+
+    Args:
+        name: label used in the log messages.
+        func: zero-argument callable to measure.
+        once: when true, never repeat the call.
+
+    Returns:
+        ``(res, t, repeat)``: the value returned by the last call of
+        ``func``, the (average) duration of one call in seconds, and the
+        number of calls the duration was averaged over.
+    """
+    logger.info(f"Measuring {name}")
+    t1 = perf_counter()
+    res = func()
+    t2 = perf_counter()
+    t = t2 - t1
+    repeat = 1
+    if not once and t < 1.0:
+        # The clock can report a zero interval for a very fast call; clamp
+        # the divisor so the floor division cannot raise ZeroDivisionError.
+        repeat = int(2.0 // max(t, 1e-7))
+        logger.info(
+            f"Time for {name}: {t:.3f} seconds, repeating {repeat} times"
+        )
+        t1 = perf_counter()
+        for _ in range(repeat):
+            res = func()
+        t2 = perf_counter()
+        t = (t2 - t1) / repeat
+    logger.info(f"Time for {name}: {t:.3f} seconds")
+    return res, t, repeat
+
+
+def refine_distances_knn(
+    xq: np.ndarray,
+    xb: np.ndarray,
+    I: np.ndarray,
+    metric,
+):
+    """Recompute distances between xq[i] and xb[I[i, :]]
+
+    The inputs are converted to contiguous float32 (vectors) and int64
+    (ids) arrays.  ``faiss.fvec_L2sqr_by_idx`` is used for METRIC_L2 and
+    ``faiss.fvec_inner_products_by_idx`` for any other metric.  The
+    returned float32 array has the shape of ``I`` and is pre-filled with
+    ``inf`` before the faiss routine writes into it.
+    """
+    nq, k = I.shape
+    xq = np.ascontiguousarray(xq, dtype="float32")
+    nq2, d = xq.shape
+    xb = np.ascontiguousarray(xb, dtype="float32")
+    _, d2 = xb.shape
+    I = np.ascontiguousarray(I, dtype="int64")
+    assert nq2 == nq
+    assert d2 == d
+    D = np.full(I.shape, np.inf, dtype="float32")
+    # Both routines share one signature, so pick one and call it once.
+    compute_by_idx = (
+        faiss.fvec_L2sqr_by_idx
+        if metric == faiss.METRIC_L2
+        else faiss.fvec_inner_products_by_idx
+    )
+    compute_by_idx(
+        faiss.swig_ptr(D),
+        faiss.swig_ptr(xq),
+        faiss.swig_ptr(xb),
+        faiss.swig_ptr(I),
+        d,
+        nq,
+        k,
+    )
+    return D
+
+
+def refine_distances_range(
+    lims: np.ndarray,
+    D: np.ndarray,
+    I: np.ndarray,
+    xq: np.ndarray,
+    xb: np.ndarray,
+    metric,
+):
+    """Recompute the distances of a range-search result.
+
+    For query ``i`` the result ids are ``I[lims[i]:lims[i + 1]]``.  With
+    METRIC_L2 the squared differences to ``xq[i]`` are summed per
+    neighbour; any other metric uses a dot product.  Queries are processed
+    on a pool of 32 threads and the per-query results are concatenated.
+    ``D`` is accepted but not read.
+    """
+
+    def distances_for_query(i):
+        begin, end = lims[i], lims[i + 1]
+        if not end > begin:
+            # query without results contributes nothing
+            return []
+        neighbours = xb[I[begin:end]]
+        if metric == faiss.METRIC_L2:
+            return np.sum(np.square(xq[i] - neighbours), axis=1)
+        return np.tensordot(xq[i], neighbours, axes=(0, 1))
+
+    with ThreadPool(32) as pool:
+        R = pool.map(distances_for_query, range(len(lims) - 1))
+    return np.hstack(R)
+
+
+def distance_ratio_measure(I, R, D_GT, metric):
+    """Return the ratio between summed result and ground-truth distances.
+
+    Only positions where ``I >= 0`` contribute to the sums.  For
+    METRIC_INNER_PRODUCT the ratio is ``sum(R) / sum(D_GT)``, for METRIC_L2
+    it is ``sum(D_GT) / sum(R)``; any other metric raises RuntimeError.
+    """
+    valid = I >= 0
+    sum_of_R = np.sum(np.where(valid, R, 0))
+    sum_of_D_GT = np.sum(np.where(valid, D_GT, 0))
+    if metric == faiss.METRIC_INNER_PRODUCT:
+        ratio = sum_of_R / sum_of_D_GT
+        return ratio.item()
+    if metric == faiss.METRIC_L2:
+        ratio = sum_of_D_GT / sum_of_R
+        return ratio.item()
+    raise RuntimeError(f"unknown metric {metric}")
+
+
+@functools.cache
+def get_cpu_info():
+    """Return a short description of the CPU (cached after the first call).
+
+    The value of the first "model name" line of /proc/cpuinfo is used when
+    available.  If the file cannot be read or has no such line, the result
+    falls back to ``platform.processor()`` and finally to "unknown", so
+    collecting this piece of metadata never aborts a benchmark.
+    """
+    import platform  # local import: only needed for the fallback
+
+    try:
+        # context manager so the file handle is always closed
+        with open("/proc/cpuinfo", "r") as cpuinfo:
+            for line in cpuinfo:
+                if "model name" in line:
+                    # "model name\t: <value>" -> "<value>"
+                    return line.partition(":")[2].strip()
+    except OSError:
+        # no readable /proc/cpuinfo on this platform: use the fallback
+        pass
+    return platform.processor() or "unknown"
+
+
+def dict_merge(target, source):
+    """Recursively merge ``source`` into ``target`` in place.
+
+    When both sides hold a dict under the same key the two dicts are merged
+    key by key; in every other case the value from ``source`` replaces (or
+    creates) the entry in ``target``.  Nothing is returned.
+    """
+    for k, v in source.items():
+        # Only recurse when the existing value is a dict as well; a scalar
+        # in target is simply overwritten by the dict from source.
+        if isinstance(v, dict) and isinstance(target.get(k), dict):
+            dict_merge(target[k], v)
+        else:
+            target[k] = v
+
+
+class Cost:
+    """Multi-component cost compared component-wise.
+
+    ``a <= b`` (``a < b``) holds only when every component of ``a`` is
+    ``<=`` (``<``) the matching component of ``b``.  The components are
+    paired with ``zip(..., strict=True)``.
+    """
+
+    def __init__(self, values):
+        self.values = values
+
+    def __le__(self, other):
+        for mine, theirs in zip(self.values, other.values, strict=True):
+            if not mine <= theirs:
+                return False
+        return True
+
+    def __lt__(self, other):
+        for mine, theirs in zip(self.values, other.values, strict=True):
+            if not mine < theirs:
+                return False
+        return True
+
+
+class ParetoMode(Enum):
+    """Selects how ``filter_results`` applies Pareto filtering."""
+
+    DISABLE = 1  # no Pareto filtering
+    INDEX = 2  # index-local optima
+    GLOBAL = 3  # global optima
+
+
+class ParetoMetric(Enum):
+    """Cost that ``filter_results`` trades off against accuracy."""
+
+    TIME = 0  # time vs accuracy
+    SPACE = 1  # space vs accuracy
+    TIME_SPACE = 2  # (time, space) vs accuracy
+
+
+def range_search_recall_at_precision(experiment, precision):
+    """Return the best recall among points whose precision exceeds a bound.
+
+    Reads the parallel ``recall`` / ``precision`` lists stored under
+    ``experiment["range_search_pr"]`` and returns the largest recall whose
+    precision is strictly greater than ``precision``, rounded to 6 digits.
+    Raises ValueError (from ``max``) when no point qualifies.
+    """
+    pr_curve = experiment["range_search_pr"]
+    qualifying = [
+        recall
+        for recall, prec in zip(pr_curve["recall"], pr_curve["precision"])
+        if prec > precision
+    ]
+    return round(max(qualifying), 6)
+
+
+def filter_results(
+    results,
+    evaluation,
+    accuracy_metric,  # str or func
+    time_metric=None,  # func or None -> use default
+    space_metric=None,  # func or None -> use default
+    min_accuracy=0,
+    max_space=0,
+    max_time=0,
+    scaling_factor=1.0,
+    name_filter=None,  # func
+    pareto_mode=ParetoMode.DISABLE,
+    pareto_metric=ParetoMetric.TIME,
+):
+    """Select experiments from ``results`` and optionally Pareto-filter them.
+
+    Only experiments whose key contains ``.<evaluation>`` are considered.
+    An experiment is dropped when its accuracy is below ``min_accuracy``,
+    its space exceeds ``max_space``, its time exceeds ``max_time`` (each
+    bound is ignored when 0), or ``name_filter`` rejects its index name.
+    The index name is ``v["index"]`` with "snap" appended when the
+    experiment's search parameters contain ``snap == 1``.
+
+    Args:
+        accuracy_metric: key into the experiment dict, or a callable.
+        time_metric: callable; defaults to ``v["time"] * scaling_factor``
+            plus the quantizer time when present.
+        space_metric: callable; defaults to the ``code_size`` recorded in
+            ``results["indices"]`` for the experiment's codec.
+        pareto_mode: no filtering, optima per index name, or global optima.
+        pareto_metric: cost traded off against accuracy.
+
+    Returns:
+        sorted list of ``(accuracy, space, time, key, experiment)`` tuples.
+    """
+    if isinstance(accuracy_metric, str):
+        accuracy_key = accuracy_metric
+        accuracy_metric = lambda v: v[accuracy_key]
+
+    if time_metric is None:
+        time_metric = lambda v: v["time"] * scaling_factor + (
+            v["quantizer"]["time"] if "quantizer" in v else 0
+        )
+
+    if space_metric is None:
+        space_metric = lambda v: results["indices"][v["codec"]]["code_size"]
+
+    fe = []
+    ops = {}
+    if pareto_mode == ParetoMode.GLOBAL:
+        op = OperatingPoints()
+        ops["global"] = op
+    for k, v in results["experiments"].items():
+        if f".{evaluation}" in k:
+            accuracy = accuracy_metric(v)
+            if min_accuracy > 0 and accuracy < min_accuracy:
+                continue
+            space = space_metric(v)
+            if space is None:
+                space = 0
+            if max_space > 0 and space > max_space:
+                continue
+            time = time_metric(v)
+            if max_time > 0 and time > max_time:
+                continue
+            # search_params may be missing, None, or lack the "snap" key
+            # (e.g. non-IVF indices); all of these mean "no snap suffix".
+            search_params = v.get("search_params") or {}
+            idx_name = v["index"] + (
+                "snap" if search_params.get("snap") == 1 else ""
+            )
+            if name_filter is not None and not name_filter(idx_name):
+                continue
+            experiment = (accuracy, space, time, k, v)
+            if pareto_mode == ParetoMode.DISABLE:
+                fe.append(experiment)
+                continue
+            if pareto_mode == ParetoMode.INDEX:
+                if idx_name not in ops:
+                    ops[idx_name] = OperatingPoints()
+                op = ops[idx_name]
+            if pareto_metric == ParetoMetric.TIME:
+                op.add_operating_point(experiment, accuracy, time)
+            elif pareto_metric == ParetoMetric.SPACE:
+                op.add_operating_point(experiment, accuracy, space)
+            else:
+                op.add_operating_point(
+                    experiment, accuracy, Cost([time, space])
+                )
+
+    if ops:
+        for op in ops.values():
+            for v, _, _ in op.operating_points:
+                fe.append(v)
+
+    fe.sort()
+    return fe
diff --git a/benchs/bench_fw_codecs.py b/benchs/bench_fw_codecs.py
new file mode 100644
index 0000000000..80741e23f7
--- /dev/null
+++ b/benchs/bench_fw_codecs.py
@@ -0,0 +1,146 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import argparse
+import os
+
+from bench_fw.benchmark import Benchmark
+from bench_fw.benchmark_io import BenchmarkIO
+from bench_fw.descriptors import DatasetDescriptor, IndexDescriptor
+from bench_fw.index import IndexFromFactory
+
+logging.basicConfig(level=logging.INFO)
+
+def factory_factory(d):  # returns (factory, construction_params, training_size, search_params) tuples; d bounds the sweep (callers pass 128/96/768, presumably the vector dimension - confirm)
+    return [  # fixed SQ and ITQ/LSH entries without construction or search params
+        ("SQ4", None, 256 * (2 ** 10), None),
+        ("SQ8", None, 256 * (2 ** 10), None),
+        ("SQfp16", None, 256 * (2 ** 10), None),
+        ("ITQ64,LSH", None, 256 * (2 ** 10), None),
+        ("Pad128,ITQ128,LSH", None, 256 * (2 ** 10), None),
+        ("Pad256,ITQ256,LSH", None, 256 * (2 ** 10), None),
+    ] + [  # OPQ32_128,Residual2x14,PQ32x{b} for b = 8, 10, 12, 14
+        (f"OPQ32_128,Residual2x14,PQ32x{b}", None, 256 * (2 ** 14), None)
+        for b in range(8, 16, 2)
+    ] + [  # PCAR{n},SQ4 and PCAR{n},SQ8
+        (f"PCAR{2 ** d_out},SQ{b}", None, 256 * (2 ** 10), None)
+        for d_out in range(6, 11)  # 2 ** d_out spans 64 .. 1024
+        if 2 ** d_out <= d  # skip sizes larger than d
+        for b in [4, 8]
+    ] + [  # OPQ{M}_{M * dim},PQ{M}x{b} grid; training_size grows with 2 ** b
+        (f"OPQ{M}_{M * dim},PQ{M}x{b}", None, 256 * (2 ** b), None)
+        for M in [8, 12, 16, 32, 64, 128]
+        for dim in [2, 4, 6, 8, 12, 16]
+        if M * dim <= d  # keep only M * dim values that do not exceed d
+        for b in range(8, 16, 2)
+    ] + [  # RQ: construction_params fix max_beam_size=32; one entry per (bs, bl) search setting
+        (f"RQ{cs // b}x{b}", [{"max_beam_size": 32}], 256 * (2 ** b), {"max_beam_size": bs, "use_beam_LUT": bl})  # search_params vary with bs and bl
+        for cs in [64, 128, 256, 512]
+        for b in [6, 8, 10, 12]
+        for bs in [1, 2, 4, 8, 16, 32]
+        for bl in [0, 1]
+        if cs // b > 1  # cs // b is the N in RQ{N}x{b}; together with the next filter: 2 <= N <= 64
+        if cs // b < 65
+        if cs < d * 8 * 2  # presumably cs is a code size in bits capped relative to d - TODO confirm
+    ] + [  # LSQ: construction_params fix encode_ils_iters=16; one entry per (eii, lg) search setting
+        (f"LSQ{cs // b}x{b}", [{"encode_ils_iters": 16}], 256 * (2 ** b), {"encode_ils_iters": eii, "lsq_gpu": lg})  # search_params vary with eii and lg
+        for cs in [64, 128, 256, 512]
+        for b in [6, 8, 10, 12]
+        for eii in [2, 4, 8, 16]
+        for lg in [0, 1]
+        if cs // b > 1
+        if cs // b < 65
+        if cs < d * 8 * 2
+    ] + [  # PRQ{sub}x{N}x{b} with N = cs // sub // b; same search settings as RQ
+        (f"PRQ{sub}x{cs // sub // b}x{b}", [{"max_beam_size": 32}], 256 * (2 ** b), {"max_beam_size": bs, "use_beam_LUT": bl})
+        for sub in [2, 3, 4, 8, 16, 32]
+        for cs in [64, 96, 128, 192, 256, 384, 512, 768, 1024, 2048]
+        for b in [6, 8, 10, 12]
+        for bs in [1, 2, 4, 8, 16, 32]
+        for bl in [0, 1]
+        if cs // sub // b > 1  # together with the next filter: 2 <= N <= 64
+        if cs // sub // b < 65
+        if cs < d * 8 * 2
+        if d % sub == 0  # sub must divide d evenly
+    ] + [  # PLSQ{sub}x{N}x{b}; same search settings as LSQ
+        (f"PLSQ{sub}x{cs // sub // b}x{b}", [{"encode_ils_iters": 16}], 256 * (2 ** b), {"encode_ils_iters": eii, "lsq_gpu": lg})  # search_params vary with eii and lg
+        for sub in [2, 3, 4, 8, 16, 32]
+        for cs in [64, 128, 256, 512, 1024, 2048]
+        for b in [6, 8, 10, 12]
+        for eii in [2, 4, 8, 16]
+        for lg in [0, 1]
+        if cs // sub // b > 1
+        if cs // sub // b < 65
+        if cs < d * 8 * 2
+        if d % sub == 0  # sub must divide d evenly
+    ]
+
+def run_local(rp):
+    bio, d, tablename, distance_metric = rp
+    if tablename == "contriever":
+        training_vectors=DatasetDescriptor(
+            tablename="training_set.npy"
+        )
+        database_vectors=DatasetDescriptor(
+            tablename="database1M.npy",
+        )
+        query_vectors=DatasetDescriptor(
+            tablename="queries.npy",
+        )
+    else:
+        training_vectors=DatasetDescriptor(
+            namespace="std_t", tablename=tablename,
+        )
+        database_vectors=DatasetDescriptor(
+            namespace="std_d", tablename=tablename,
+        )
+        query_vectors=DatasetDescriptor(
+            namespace="std_q", tablename=tablename,
+        )
+
+    benchmark = Benchmark(
+        num_threads=32,
+        training_vectors=training_vectors,
+        database_vectors=database_vectors,
+        query_vectors=query_vectors,
+        index_descs=[
+            IndexDescriptor(
+                factory=factory,
+                construction_params=construction_params,
+                training_size=training_size,
+                search_params=search_params,
+            )
+            for factory, construction_params, training_size, search_params in factory_factory(d)
+        ],
+        k=1,
+        distance_metric=distance_metric,
+    )
+    benchmark.set_io(bio)
+    benchmark.benchmark(result_file="result.json", train=False, reconstruct=False, knn=False, range=False)
+
+def run(bio, d, tablename, distance_metric):  # hands one run_local job to bio.launch_jobs with local=True; the four arguments travel as a single tuple
+    bio.launch_jobs(run_local, [(bio, d, tablename, distance_metric)], local=True)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('experiment')
+    parser.add_argument('path')
+    args = parser.parse_args()
+    assert os.path.exists(args.path)
+    path = os.path.join(args.path, args.experiment)
+    if not os.path.exists(path):
+        os.mkdir(path)
+    bio = BenchmarkIO(
+        path=path,
+    )
+    if args.experiment == "sift1M":
+        run(bio, 128, "sift1M", "L2")
+    elif args.experiment == "bigann":
+        run(bio, 128, "bigann1M", "L2")
+    elif args.experiment == "deep1b":
+        run(bio, 96, "deep1M", "L2")
+    elif args.experiment == "contriever":
+        run(bio, 768, "contriever", "IP")
diff --git a/benchs/bench_fw_ivf.py b/benchs/bench_fw_ivf.py
new file mode 100644
index 0000000000..e9e144c569
--- /dev/null
+++ b/benchs/bench_fw_ivf.py
@@ -0,0 +1,125 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import logging
+import os
+
+from faiss.benchs.bench_fw.benchmark import Benchmark
+from faiss.benchs.bench_fw.benchmark_io import BenchmarkIO
+from faiss.benchs.bench_fw.descriptors import (
+    DatasetDescriptor,
+    IndexDescriptor,
+)
+
+logging.basicConfig(level=logging.INFO)
+
+
+def sift1M(bio):
+    benchmark = Benchmark(
+        num_threads=32,
+        training_vectors=DatasetDescriptor(
+            namespace="std_d", tablename="sift1M"
+        ),
+        database_vectors=DatasetDescriptor(
+            namespace="std_d", tablename="sift1M"
+        ),
+        query_vectors=DatasetDescriptor(
+            namespace="std_q", tablename="sift1M"
+        ),
+        index_descs=[
+            IndexDescriptor(
+                factory=f"IVF{2 ** nlist},Flat",
+            )
+            for nlist in range(8, 15)
+        ],
+        k=1,
+        distance_metric="L2",
+    )
+    benchmark.set_io(bio)
+    benchmark.benchmark(result_file="result.json", local=False, train=True, reconstruct=False, knn=True, range=False)
+
+
+def bigann(bio):
+    for scale in [1, 2, 5, 10, 20, 50]:
+        benchmark = Benchmark(
+            num_threads=32,
+            training_vectors=DatasetDescriptor(
+                namespace="std_t", tablename="bigann1M"
+            ),
+            database_vectors=DatasetDescriptor(
+                namespace="std_d", tablename=f"bigann{scale}M"
+            ),
+            query_vectors=DatasetDescriptor(
+                namespace="std_q", tablename="bigann1M"
+            ),
+            index_descs=[
+                IndexDescriptor(
+                    factory=f"IVF{2 ** nlist},Flat",
+                ) for nlist in range(11, 19)
+            ] + [
+                IndexDescriptor(
+                    factory=f"IVF{2 ** nlist}_HNSW32,Flat",
+                    construction_params=[None, {"efConstruction": 200, "efSearch": 40}],
+                ) for nlist in range(11, 19)
+            ],
+            k=1,
+            distance_metric="L2",
+        )
+        benchmark.set_io(bio)
+        benchmark.benchmark(f"result{scale}.json", local=False, train=True, reconstruct=False, knn=True, range=False)
+
+def ssnpp(bio):
+    benchmark = Benchmark(
+        num_threads=32,
+        training_vectors=DatasetDescriptor(
+            tablename="ssnpp_training_5M.npy"
+        ),
+        database_vectors=DatasetDescriptor(
+            tablename="ssnpp_database_5M.npy"
+        ),
+        query_vectors=DatasetDescriptor(
+            tablename="ssnpp_queries_10K.npy"
+        ),
+        index_descs=[
+            IndexDescriptor(
+                factory=f"IVF{2 ** nlist},PQ256x4fs,Refine(SQfp16)",
+            ) for nlist in range(9, 16)
+        ] + [
+            IndexDescriptor(
+                factory=f"IVF{2 ** nlist},Flat",
+            ) for nlist in range(9, 16)
+        ] + [
+            IndexDescriptor(
+                factory=f"PQ256x4fs,Refine(SQfp16)",
+            ),
+            IndexDescriptor(
+                factory=f"HNSW32",
+            ),
+        ],
+        k=1,
+        distance_metric="L2",
+    )
+    benchmark.set_io(bio)
+    benchmark.benchmark("result.json", local=False, train=True, reconstruct=False, knn=True, range=False)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('experiment')
+    parser.add_argument('path')
+    args = parser.parse_args()
+    assert os.path.exists(args.path)
+    path = os.path.join(args.path, args.experiment)
+    if not os.path.exists(path):
+        os.mkdir(path)
+    bio = BenchmarkIO(
+        path=path,
+    )
+    if args.experiment == "sift1M":
+        sift1M(bio)
+    elif args.experiment == "bigann":
+        bigann(bio)
+    elif args.experiment == "ssnpp":
+        ssnpp(bio)
diff --git a/benchs/bench_fw_ivf_flat.py b/benchs/bench_fw_ivf_flat.py
deleted file mode 100644
index 37b4bd7862..0000000000
--- a/benchs/bench_fw_ivf_flat.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-
-import logging
-
-from bench_fw.benchmark import Benchmark
-from bench_fw.benchmark_io import BenchmarkIO
-from bench_fw.descriptors import DatasetDescriptor, IndexDescriptor
-
-logging.basicConfig(level=logging.INFO)
-
-benchmark = Benchmark(
-    training_vectors=DatasetDescriptor(
-        namespace="std_d", tablename="sift1M"
-    ),
-    database_vectors=DatasetDescriptor(
-        namespace="std_d", tablename="sift1M"
-    ),
-    query_vectors=DatasetDescriptor(
-        namespace="std_q", tablename="sift1M"
-    ),
-    index_descs=[
-        IndexDescriptor(
-            factory=f"IVF{2 ** nlist},Flat",
-        )
-        for nlist in range(8, 15)
-    ],
-    k=1,
-    distance_metric="L2",
-)
-io = BenchmarkIO(
-    path="/checkpoint",
-)
-benchmark.set_io(io)
-print(benchmark.benchmark("result.json"))
diff --git a/benchs/bench_fw_notebook.ipynb b/benchs/bench_fw_notebook.ipynb
index 7cc39ea2cb..5752aaf5fb 100644
--- a/benchs/bench_fw_notebook.ipynb
+++ b/benchs/bench_fw_notebook.ipynb
@@ -1,289 +1,529 @@
 {
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "be081589-e1b2-4569-acb7-44203e273899",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import matplotlib.pyplot as plt\n",
-    "import itertools\n",
-    "from faiss.contrib.evaluation import OperatingPoints\n",
-    "from enum import Enum\n",
-    "from bench_fw.benchmark_io import BenchmarkIO as BIO"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a6492e95-24c7-4425-bf0a-27e10e879ca6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "root = \"/checkpoint\"\n",
-    "results = BIO(root).read_json(\"result.json\")\n",
-    "results.keys()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0875d269-aef4-426d-83dd-866970f43777",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "results['indices']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a7ff7078-29c7-407c-a079-201877b764ad",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "class Cost:\n",
-    "    def __init__(self, values):\n",
-    "        self.values = values\n",
-    "\n",
-    "    def __le__(self, other):\n",
-    "        return all(v1 <= v2 for v1, v2 in zip(self.values, other.values, strict=True))\n",
-    "\n",
-    "    def __lt__(self, other):\n",
-    "        return all(v1 < v2 for v1, v2 in zip(self.values, other.values, strict=True))\n",
-    "\n",
-    "class ParetoMode(Enum):\n",
-    "    DISABLE = 1  # no Pareto filtering\n",
-    "    INDEX = 2    # index-local optima\n",
-    "    GLOBAL = 3   # global optima\n",
-    "\n",
-    "\n",
-    "class ParetoMetric(Enum):\n",
-    "    TIME = 0        # time vs accuracy\n",
-    "    SPACE = 1       # space vs accuracy\n",
-    "    TIME_SPACE = 2  # (time, space) vs accuracy\n",
-    "\n",
-    "def range_search_recall_at_precision(experiment, precision):\n",
-    "    return round(max(r for r, p in zip(experiment['range_search_pr']['recall'], experiment['range_search_pr']['precision']) if p > precision), 6)\n",
-    "\n",
-    "def filter_results(\n",
-    "    results,\n",
-    "    evaluation,\n",
-    "    accuracy_metric, # str or func\n",
-    "    time_metric=None, # func or None -> use default\n",
-    "    space_metric=None, # func or None -> use default\n",
-    "    min_accuracy=0,\n",
-    "    max_space=0,\n",
-    "    max_time=0,\n",
-    "    scaling_factor=1.0,\n",
-    "    \n",
-    "    pareto_mode=ParetoMode.DISABLE,\n",
-    "    pareto_metric=ParetoMetric.TIME,\n",
-    "):\n",
-    "    if isinstance(accuracy_metric, str):\n",
-    "        accuracy_key = accuracy_metric\n",
-    "        accuracy_metric = lambda v: v[accuracy_key]\n",
-    "\n",
-    "    if time_metric is None:\n",
-    "        time_metric = lambda v: v['time'] * scaling_factor + (v['quantizer']['time'] if 'quantizer' in v else 0)\n",
-    "\n",
-    "    if space_metric is None:\n",
-    "        space_metric = lambda v: results['indices'][v['codec']]['code_size']\n",
-    "    \n",
-    "    fe = []\n",
-    "    ops = {}\n",
-    "    if pareto_mode == ParetoMode.GLOBAL:\n",
-    "        op = OperatingPoints()\n",
-    "        ops[\"global\"] = op\n",
-    "    for k, v in results['experiments'].items():\n",
-    "        if f\".{evaluation}\" in k:\n",
-    "            accuracy = accuracy_metric(v)\n",
-    "            if min_accuracy > 0 and accuracy < min_accuracy:\n",
-    "                continue\n",
-    "            space = space_metric(v)\n",
-    "            if max_space > 0 and space > max_space:\n",
-    "                continue\n",
-    "            time = time_metric(v)\n",
-    "            if max_time > 0 and time > max_time:\n",
-    "                continue\n",
-    "            idx_name = v['index']\n",
-    "            experiment = (accuracy, space, time, k, v)\n",
-    "            if pareto_mode == ParetoMode.DISABLE:\n",
-    "                fe.append(experiment)\n",
-    "                continue\n",
-    "            if pareto_mode == ParetoMode.INDEX:\n",
-    "                if idx_name not in ops:\n",
-    "                    ops[idx_name] = OperatingPoints()\n",
-    "                op = ops[idx_name]\n",
-    "            if pareto_metric == ParetoMetric.TIME:\n",
-    "                op.add_operating_point(experiment, accuracy, time)\n",
-    "            elif pareto_metric == ParetoMetric.SPACE:\n",
-    "                op.add_operating_point(experiment, accuracy, space)\n",
-    "            else:\n",
-    "                op.add_operating_point(experiment, accuracy, Cost([time, space]))\n",
-    "\n",
-    "    if ops:\n",
-    "        for op in ops.values():\n",
-    "            for v, _, _ in op.operating_points:\n",
-    "                fe.append(v)\n",
-    "\n",
-    "    fe.sort()\n",
-    "    return fe"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f080a6e2-1565-418b-8732-4adeff03a099",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def plot_metric(experiments, accuracy_title, cost_title, plot_space=False):\n",
-    "    x = {}\n",
-    "    y = {}\n",
-    "    for accuracy, space, time, k, v in experiments:\n",
-    "        idx_name = v['index']\n",
-    "        if idx_name not in x:\n",
-    "            x[idx_name] = []\n",
-    "            y[idx_name] = []\n",
-    "        x[idx_name].append(accuracy)\n",
-    "        if plot_space:\n",
-    "            y[idx_name].append(space)\n",
-    "        else:\n",
-    "            y[idx_name].append(time)\n",
-    "\n",
-    "    #plt.figure(figsize=(10,6))\n",
-    "    plt.yscale(\"log\")\n",
-    "    plt.title(accuracy_title)\n",
-    "    plt.xlabel(accuracy_title)\n",
-    "    plt.ylabel(cost_title)\n",
-    "    marker = itertools.cycle((\"o\", \"v\", \"^\", \"<\", \">\", \"s\", \"p\", \"P\", \"*\", \"h\", \"X\", \"D\"))    \n",
-    "    for index in x.keys():\n",
-    "        plt.plot(x[index], y[index], marker=next(marker), label=index)\n",
-    "    plt.legend(bbox_to_anchor=(1, 1), loc='upper left')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "61007155-5edc-449e-835e-c141a01a2ae5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "accuracy_metric = \"knn_intersection\"\n",
-    "fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
-    "plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"time (seconds, 16 cores)\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "36e82084-18f6-4546-a717-163eb0224ee8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "precision = 0.8\n",
-    "accuracy_metric = lambda exp: range_search_recall_at_precision(exp, precision)\n",
-    "fr = filter_results(results, evaluation=\"weighted\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
-    "plot_metric(fr, accuracy_title=f\"range recall @ precision {precision}\", cost_title=\"time (seconds, 16 cores)\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "aff79376-39f7-47c0-8b83-1efe5192bb7e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# index local optima\n",
-    "precision = 0.2\n",
-    "accuracy_metric = lambda exp: range_search_recall_at_precision(exp, precision)\n",
-    "fr = filter_results(results, evaluation=\"weighted\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
-    "plot_metric(fr, accuracy_title=f\"range recall @ precision {precision}\", cost_title=\"time (seconds, 16 cores)\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b4834f1f-bbbe-4cae-9aa0-a459b0c842d1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# global optima\n",
-    "precision = 0.8\n",
-    "accuracy_metric = lambda exp: range_search_recall_at_precision(exp, precision)\n",
-    "fr = filter_results(results, evaluation=\"weighted\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
-    "plot_metric(fr, accuracy_title=f\"range recall @ precision {precision}\", cost_title=\"time (seconds, 16 cores)\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9aead830-6209-4956-b7ea-4a5e0029d616",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def plot_range_search_pr_curves(experiments):\n",
-    "    x = {}\n",
-    "    y = {}\n",
-    "    show = {\n",
-    "        'Flat': None,\n",
-    "    }\n",
-    "    for _, _, _, k, v in fr:\n",
-    "        if \".weighted\" in k: # and v['index'] in show:\n",
-    "            x[k] = v['range_search_pr']['recall']\n",
-    "            y[k] = v['range_search_pr']['precision']\n",
-    "    \n",
-    "    plt.title(\"range search recall\")\n",
-    "    plt.xlabel(\"recall\")\n",
-    "    plt.ylabel(\"precision\")\n",
-    "    for index in x.keys():\n",
-    "        plt.plot(x[index], y[index], '.', label=index)\n",
-    "    plt.legend(bbox_to_anchor=(1.0, 1.0), loc='upper left')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "92e45502-7a31-4a15-90df-fa3032d7d350",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "precision = 0.8\n",
-    "accuracy_metric = lambda exp: range_search_recall_at_precision(exp, precision)\n",
-    "fr = filter_results(results, evaluation=\"weighted\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME_SPACE, scaling_factor=1)\n",
-    "plot_range_search_pr_curves(fr)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "fdf8148a-0da6-4c5e-8d60-f8f85314574c",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python [conda env:faiss_cpu_from_source] *",
-   "language": "python",
-   "name": "conda-env-faiss_cpu_from_source-py"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.5"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
+    "cells": [
+     {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "be081589-e1b2-4569-acb7-44203e273899",
+      "metadata": {
+       "tags": []
+      },
+      "outputs": [],
+      "source": [
+       "import matplotlib.pyplot as plt\n",
+       "import itertools\n",
+       "from faiss.contrib.evaluation import OperatingPoints\n",
+       "from enum import Enum\n",
+       "from bench_fw.benchmark_io import BenchmarkIO as BIO\n",
+       "from bench_fw.utils import filter_results, ParetoMode, ParetoMetric\n",
+       "from copy import copy\n",
+       "import numpy as np\n",
+       "import datetime\n",
+       "import glob\n",
+       "import io\n",
+       "import json\n",
+       "from zipfile import ZipFile\n",
+       "import tabulate"
+      ]
+     },
+     {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "a6492e95-24c7-4425-bf0a-27e10e879ca6",
+      "metadata": {
+       "tags": []
+      },
+      "outputs": [],
+      "source": [
+       "root = \"/checkpoint/gsz/bench_fw/optimize/bigann\"\n",
+       "results = BIO(root).read_json(\"result_std_d_bigann10M.json\")\n",
+       "results.keys()"
+      ]
+     },
+     {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "0875d269-aef4-426d-83dd-866970f43777",
+      "metadata": {
+       "tags": []
+      },
+      "outputs": [],
+      "source": [
+       "results['experiments']"
+      ]
+     },
+     {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "f080a6e2-1565-418b-8732-4adeff03a099",
+      "metadata": {
+       "tags": []
+      },
+      "outputs": [],
+      "source": [
+       "def plot_metric(experiments, accuracy_title, cost_title, plot_space=False, plot=None):\n",
+       "    if plot is None:\n",
+       "        plot = plt.subplot()\n",
+       "    x = {}\n",
+       "    y = {}\n",
+       "    for accuracy, space, time, k, v in experiments:\n",
+       "        idx_name = v['index'] + (\"snap\" if 'search_params' in v and v['search_params'][\"snap\"] == 1 else \"\")\n",
+       "        if idx_name not in x:\n",
+       "            x[idx_name] = []\n",
+       "            y[idx_name] = []\n",
+       "        x[idx_name].append(accuracy)\n",
+       "        if plot_space:\n",
+       "            y[idx_name].append(space)\n",
+       "        else:\n",
+       "            y[idx_name].append(time)\n",
+       "\n",
+       "    #plt.figure(figsize=(10,6))\n",
+       "    #plt.title(accuracy_title)\n",
+       "    plot.set_xlabel(accuracy_title)\n",
+       "    plot.set_ylabel(cost_title)\n",
+       "    plot.set_yscale(\"log\")\n",
+       "    marker = itertools.cycle((\"o\", \"v\", \"^\", \"<\", \">\", \"s\", \"p\", \"P\", \"*\", \"h\", \"X\", \"D\"))    \n",
+       "    for index in x.keys():\n",
+       "        plot.plot(x[index], y[index], marker=next(marker), label=index, linewidth=0)\n",
+       "    plot.legend(bbox_to_anchor=(1, 1), loc='upper left')"
+      ]
+     },
+     {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "61007155-5edc-449e-835e-c141a01a2ae5",
+      "metadata": {
+       "tags": []
+      },
+      "outputs": [],
+      "source": [
+       "# index local optima\n",
+       "accuracy_metric = \"knn_intersection\"\n",
+       "fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1, min_accuracy=0.95)\n",
+       "plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"time (seconds, 32 cores)\", plot_space=False)"
+      ]
+     },
+     {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "f9f94dcc-5abe-4cad-9619-f5d1d24fb8c1",
+      "metadata": {
+       "tags": []
+      },
+      "outputs": [],
+      "source": [
+       "# global optima\n",
+       "accuracy_metric = \"knn_intersection\"\n",
+       "fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, min_accuracy=0.90, max_space=64, max_time=0, name_filter=lambda n: not n.startswith(\"Flat\"), pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
+       "plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"time (seconds, 32 cores)\", plot_space=False)"
+      ]
+     },
+     {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "0c10f587-26ef-49ec-83a9-88f6a2a433e8",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+       "def pretty_params(p):\n",
+       "    p = copy(p)\n",
+       "    if 'snap' in p and p['snap'] == 0:\n",
+       "        del p['snap']\n",
+       "    return p\n",
+       "    \n",
+       "tabulate.tabulate([(accuracy, space, time, v['factory'], pretty_params(v['construction_params'][1]), pretty_params(v['search_params'])) \n",
+       "                for accuracy, space, time, k, v in fr],\n",
+       "                tablefmt=\"html\",\n",
+       "                headers=[\"accuracy\",\"space\", \"time\", \"factory\", \"quantizer cfg\", \"search cfg\"])"
+      ]
+     },
+     {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "36e82084-18f6-4546-a717-163eb0224ee8",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+       "# index local optima @ precision 0.8\n",
+       "precision = 0.8\n",
+       "accuracy_metric = lambda exp: range_search_recall_at_precision(exp, precision)\n",
+       "fr = filter_results(results, evaluation=\"weighted\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
+       "plot_metric(fr, accuracy_title=f\"range recall @ precision {precision}\", cost_title=\"time (seconds, 16 cores)\")"
+      ]
+     },
+     {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "aff79376-39f7-47c0-8b83-1efe5192bb7e",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+       "# index local optima @ precision 0.2\n",
+       "precision = 0.2\n",
+       "accuracy_metric = lambda exp: range_search_recall_at_precision(exp, precision)\n",
+       "fr = filter_results(results, evaluation=\"weighted\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
+       "plot_metric(fr, accuracy_title=f\"range recall @ precision {precision}\", cost_title=\"time (seconds, 16 cores)\")"
+      ]
+     },
+     {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "b4834f1f-bbbe-4cae-9aa0-a459b0c842d1",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+       "# global optima @ precision 0.8\n",
+       "precision = 0.8\n",
+       "accuracy_metric = lambda exp: range_search_recall_at_precision(exp, precision)\n",
+       "fr = filter_results(results, evaluation=\"weighted\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
+       "plot_metric(fr, accuracy_title=f\"range recall @ precision {precision}\", cost_title=\"time (seconds, 16 cores)\")"
+      ]
+     },
+     {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "9aead830-6209-4956-b7ea-4a5e0029d616",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+       "def plot_range_search_pr_curves(experiments):\n",
+       "    # experiments: (accuracy, space, time, key, result) tuples as returned\n",
+       "    # by filter_results; one precision/recall point set is drawn per key\n",
+       "    # that contains '.weighted'.\n",
+       "    x = {}\n",
+       "    y = {}\n",
+       "    for _, _, _, k, v in experiments:\n",
+       "        if \".weighted\" in k:\n",
+       "            x[k] = v['range_search_pr']['recall']\n",
+       "            y[k] = v['range_search_pr']['precision']\n",
+       "    \n",
+       "    plt.title(\"range search recall\")\n",
+       "    plt.xlabel(\"recall\")\n",
+       "    plt.ylabel(\"precision\")\n",
+       "    for index in x.keys():\n",
+       "        plt.plot(x[index], y[index], '.', label=index)\n",
+       "    plt.legend(bbox_to_anchor=(1.0, 1.0), loc='upper left')"
+      ]
+     },
+     {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "92e45502-7a31-4a15-90df-fa3032d7d350",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+       "precision = 0.8\n",
+       "accuracy_metric = lambda exp: range_search_recall_at_precision(exp, precision)\n",
+       "fr = filter_results(results, evaluation=\"weighted\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME_SPACE, scaling_factor=1)\n",
+       "plot_range_search_pr_curves(fr)"
+      ]
+     },
+     {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "fdf8148a-0da6-4c5e-8d60-f8f85314574c",
+      "metadata": {
+       "tags": []
+      },
+      "outputs": [],
+      "source": [
+       "root = \"/checkpoint/gsz/bench_fw/ivf/bigann\"\n",
+       "scales = [1, 2, 5, 10, 20, 50]\n",
+       "fig, plots = plt.subplots(len(scales), sharex=True, figsize=(5,25))\n",
+       "fig.tight_layout()\n",
+       "for plot, scale in zip(plots, scales, strict=True):\n",
+       "    results = BIO(root).read_json(f\"result{scale}.json\")\n",
+       "    accuracy_metric = \"knn_intersection\"\n",
+       "    fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, min_accuracy=0.9, pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
+       "    plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"time (seconds, 64 cores)\", plot=plot)"
+      ]
+     },
+     {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "e503828c-ee61-45f7-814b-cce6461109bc",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+       "x = {}  # factory string -> database sizes (x values of that factory's curve)\n",
+       "y = {}  # factory string -> times matching x[factory] entry by entry\n",
+       "accuracy=0.9\n",
+       "root = \"/checkpoint/gsz/bench_fw/ivf/bigann\"\n",
+       "scales = [1, 2, 5, 10, 20, 50]\n",
+       "#fig, plots = plt.subplots(len(scales), sharex=True, figsize=(5,25))\n",
+       "#fig.tight_layout()\n",
+       "for scale in scales:\n",
+       "    results = BIO(root).read_json(f\"result{scale}.json\")\n",
+       "    scale *= 1_000_000  # file names use the scale in millions; the plot uses the absolute size\n",
+       "    accuracy_metric = \"knn_intersection\"\n",
+       "    fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, min_accuracy=accuracy, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
+       "    seen = set()  # factories already recorded for this scale: only the first entry of fr is kept\n",
+       "    print(scale)\n",
+       "    for _, _, _, _, exp in fr:\n",
+       "        fact = exp[\"factory\"]\n",
+       "        # \"HNSW\" in fact or \n",
+       "        if fact in seen or fact in [\"Flat\", \"IVF512,Flat\", \"IVF1024,Flat\", \"IVF2048,Flat\"]:\n",
+       "            continue\n",
+       "        seen.add(fact)\n",
+       "        if fact not in x:\n",
+       "            x[fact] = []\n",
+       "            y[fact] = []\n",
+       "        x[fact].append(scale)\n",
+       "        y[fact].append(exp[\"time\"] + exp[\"quantizer\"][\"time\"])  # experiment time plus its quantizer time\n",
+       "        if (exp[\"knn_intersection\"] > 0.92):\n",
+       "            print(fact)\n",
+       "            print(exp[\"search_params\"])\n",
+       "            print(exp[\"knn_intersection\"])\n",
+       "\n",
+       "        #plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"time (seconds, 64 cores)\", plot=plot)\n",
+       "    \n",
+       "plt.title(f\"recall @ 1 = {accuracy*100}%\")\n",
+       "plt.xlabel(\"database size\")\n",
+       "plt.ylabel(\"time\")\n",
+       "plt.xscale(\"log\")\n",
+       "plt.yscale(\"log\")\n",
+       "\n",
+       "marker = itertools.cycle((\"o\", \"v\", \"^\", \"<\", \">\", \"s\", \"p\", \"P\", \"*\", \"h\", \"X\", \"D\"))    \n",
+       "for index in x.keys():\n",
+       "    if \"HNSW\" in index:  # HNSW factories are drawn dashed, everything else solid\n",
+       "        plt.plot(x[index], y[index], label=index, linewidth=1, marker=next(marker), linestyle=\"dashed\")\n",
+       "    else:\n",
+       "        plt.plot(x[index], y[index], label=index, linewidth=1, marker=next(marker))\n",
+       "plt.legend(bbox_to_anchor=(1.0, 1.0), loc='upper left')"
+      ]
+     },
+     {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "37a99bb2-f998-461b-a345-7cc6e702cb3a",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+       "# global optima (reads the `results` variable left behind by a previously executed cell)\n",
+       "accuracy_metric = \"sym_recall\"  # NOTE(review): the plot below is titled 'knn intersection' although this metric is sym_recall - confirm the label\n",
+       "fr = filter_results(results, evaluation=\"rec\", accuracy_metric=accuracy_metric, time_metric=lambda e:e['encode_time'], min_accuracy=0.9, pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.SPACE, scaling_factor=1)\n",
+       "plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"space\", plot_space=True)"
+      ]
+     },
+     {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "c973ce4e-3566-4f02-bd93-f113e3e0c791",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+       "def pretty_time(s):\n",
+       "    # Format a duration given in seconds as \"<d>d <h>h <m>m <s.sss>s\".\n",
+       "    # Units whose value is zero are left out; None is rendered as \"None\".\n",
+       "    if s is None:\n",
+       "        return \"None\"\n",
+       "    seconds = int(s * 1000) / 1000  # truncate to millisecond precision\n",
+       "    minutes, seconds = divmod(seconds, 60)\n",
+       "    hours, minutes = divmod(minutes, 60)\n",
+       "    days, hours = divmod(hours, 24)\n",
+       "    parts = [\n",
+       "        f\"{int(amount)}{unit} \"\n",
+       "        for amount, unit in ((days, \"d\"), (hours, \"h\"), (minutes, \"m\"))\n",
+       "        if amount > 0\n",
+       "    ]\n",
+       "    if seconds > 0 or not parts:\n",
+       "        parts.append(f\"{seconds:.3f}s\")\n",
+       "    return \"\".join(parts)\n",
+       "\n",
+       "def pretty_size(s):  # \"<n>MB\" / \"<n>KB\" (1024-based, at most one decimal), else the bare number\n",
+       "    for threshold, suffix in ((1024 * 1024, \"MB\"), (1024, \"KB\")):\n",
+       "        if s > threshold:\n",
+       "            scaled = f\"{s / threshold:.1f}\"\n",
+       "            return scaled.rstrip('0').rstrip('.') + suffix\n",
+       "    return f\"{s}\"\n",
+       "\n",
+       "def pretty_mse(m):\n",
+       "    # Six-decimal rendering of a mean squared error; None is rendered as \"None\".\n",
+       "    if m is None:\n",
+       "        return \"None\"\n",
+       "    return f\"{m:.6f}\""
+      ]
+     },
+     {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "1ddcf226-fb97-4a59-9fc3-3ed8f7d5e703",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+       "data = {}  # \"<scale>M\" -> {factory string -> list of {nprobe, recall, time} records}\n",
+       "root = \"/checkpoint/gsz/bench_fw/bigann\"\n",
+       "scales = [1, 2, 5, 10, 20, 50]\n",
+       "for scale in scales:\n",
+       "    results = BIO(root).read_json(f\"result{scale}.json\")\n",
+       "    accuracy_metric = \"knn_intersection\"\n",
+       "    fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, min_accuracy=0, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
+       "    d = {}  # records of this scale, grouped by factory string\n",
+       "    data[f\"{scale}M\"] = d\n",
+       "    for _, _, _, _, exp in fr:\n",
+       "        fact = exp[\"factory\"]\n",
+       "        # \"HNSW\" in fact or \n",
+       "        if fact in [\"Flat\", \"IVF512,Flat\", \"IVF1024,Flat\", \"IVF2048,Flat\"]:\n",
+       "            continue\n",
+       "        if fact not in d:\n",
+       "            d[fact] = []\n",
+       "        d[fact].append({\n",
+       "            \"nprobe\": exp[\"search_params\"][\"nprobe\"],\n",
+       "            \"recall\": exp[\"knn_intersection\"],\n",
+       "            \"time\": exp[\"time\"] + exp[\"quantizer\"][\"time\"],\n",
+       "        })\n",
+       "data  # last expression of the cell, so the notebook displays it\n",
+       "# with open(\"/checkpoint/gsz/bench_fw/codecs.json\", \"w\") as f:\n",
+       "#    json.dump(data, f)"
+      ]
+     },
+     {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "e54eebb6-0a9f-4a72-84d2-f12c5bd44510",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+       "ds = \"deep1b\"\n",
+       "data = []  # (code_size, markdown table row) pairs; sorted by code size before printing\n",
+       "jss = []  # the same experiments as plain dicts, dumped to JSON at the end\n",
+       "root = f\"/checkpoint/gsz/bench_fw/codecs/{ds}\"\n",
+       "results = BIO(root).read_json(\"result.json\")\n",
+       "for k, e in results[\"experiments\"].items():\n",
+       "    if \"rec\" in k and e['factory'] != 'Flat': # and e['sym_recall'] > 0.0: # and \"PRQ\" in e['factory'] and e['sym_recall'] > 0.0:\n",
+       "        code_size = results['indices'][e['codec']]['sa_code_size']\n",
+       "        codec_size = results['indices'][e['codec']]['codec_size']\n",
+       "        training_time = results['indices'][e['codec']]['training_time']\n",
+       "        training_size = results['indices'][e['codec']].get('training_size')  # None when the codec entry has no training_size\n",
+       "        cpu = e['cpu'] if 'cpu' in e else \"\"\n",
+       "        ps = ', '.join([f\"{k}={v}\" for k,v in e['construction_params'][0].items()]) if e['construction_params'] else \" \"\n",
+       "        eps = ', '.join([f\"{k}={v}\" for k,v in e['reconstruct_params'].items() if k != \"snap\"]) if e['reconstruct_params'] else \" \"\n",
+       "        data.append((code_size, f\"|{e['factory']}|{ps}|{eps}|{code_size}|{pretty_size(codec_size)}|{pretty_time(training_time)}|{training_size}|{pretty_mse(e['mse'])}|{e['sym_recall']}|{e['asym_recall']}|{pretty_time(e['encode_time'])}|{pretty_time(e['decode_time'])}|{cpu}|\"))\n",
+       "        jss.append({\n",
+       "            'factory': e['factory'],\n",
+       "            'parameters': e['construction_params'][0] if e['construction_params'] else \"\",\n",
+       "            'evaluation_params': e['reconstruct_params'],\n",
+       "            'code_size': code_size,\n",
+       "            'codec_size': codec_size,\n",
+       "            'training_time': training_time,\n",
+       "            'training_size': training_size,\n",
+       "            'mse': e['mse'],\n",
+       "            'sym_recall': e['sym_recall'],\n",
+       "            'asym_recall': e['asym_recall'],\n",
+       "            'encode_time': e['encode_time'],\n",
+       "            'decode_time': e['decode_time'],\n",
+       "            'cpu': cpu,\n",
+       "        })\n",
+       "\n",
+       "print(\"|factory key|construction parameters|evaluation parameters|code size|codec size|training time|training size|mean squared error|sym recall @ 1|asym recall @ 1|encode time|decode time|cpu|\")\n",
+       "print(\"|-|-|-|-|-|-|-|-|-|-|-|-|-|\")  # one delimiter cell per header column (13), as markdown tables require\n",
+       "data.sort()\n",
+       "for d in data:\n",
+       "    print(d[1])\n",
+       "\n",
+       "with open(f\"/checkpoint/gsz/bench_fw/codecs_{ds}_test.json\", \"w\") as f:\n",
+       "    json.dump(jss, f)"
+      ]
+     },
+     {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "d1216733-9670-407c-b3d2-5f87bce0321c",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+       "def read_file(filename: str, keys):\n",
+       "    # Load zip members in `keys` order: D/I/R/lims via np.load, P as JSON; other keys are rejected.\n",
+       "    loaded = []\n",
+       "    with ZipFile(filename, \"r\") as archive:\n",
+       "        for key in keys:\n",
+       "            with archive.open(key, \"r\") as member:\n",
+       "                if key in (\"D\", \"I\", \"R\", \"lims\"):\n",
+       "                    loaded.append(np.load(member))\n",
+       "                elif key == \"P\":\n",
+       "                    loaded.append(json.load(io.TextIOWrapper(member)))\n",
+       "                else:\n",
+       "                    raise AssertionError()\n",
+       "    return loaded"
+      ]
+     },
+     {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "56de051e-22db-4bef-b242-1ddabc9e0bb9",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+       "ds = \"contriever\"\n",
+       "data = []  # (code_size, markdown table row) pairs\n",
+       "jss = []  # one dict per experiment, dumped to JSON at the end\n",
+       "root = f\"/checkpoint/gsz/bench_fw/codecs/{ds}\"\n",
+       "for lf in glob.glob(root + '/*rec*.zip'):\n",
+       "    e, = read_file(lf, ['P'])\n",
+       "    if e['factory'] != 'Flat': # and e['sym_recall'] > 0.0: # and \"PRQ\" in e['factory'] and e['sym_recall'] > 0.0:\n",
+       "        code_size = e['codec_meta']['sa_code_size']\n",
+       "        codec_size = e['codec_meta']['codec_size']\n",
+       "        training_time = e['codec_meta']['training_time']\n",
+       "        training_size = None # e['codec_meta']['training_size']\n",
+       "        cpu = e['cpu'] if 'cpu' in e else \"\"\n",
+       "        ps = ', '.join([f\"{k}={v}\" for k,v in e['construction_params'][0].items()]) if e['construction_params'] else \" \"\n",
+       "        eps = ', '.join([f\"{k}={v}\" for k,v in e['reconstruct_params'].items() if k != \"snap\"]) if e['reconstruct_params'] else \" \"\n",
+       "        if eps in ps and eps != \"encode_ils_iters=16\" and eps != \"max_beam_size=32\":\n",
+       "            eps = \" \"\n",
+       "        data.append((code_size, f\"|{e['factory']}|{ps}|{eps}|{code_size}|{pretty_size(codec_size)}|{pretty_time(training_time)}|{pretty_mse(e['mse'])}|{e['sym_recall']}|{e['asym_recall']}|{pretty_time(e['encode_time'])}|{pretty_time(e['decode_time'])}|{cpu}|\"))\n",
+       "        # reconstruct_params may be empty/None (see the guard above) and need not contain 'snap'\n",
+       "        eps = e['reconstruct_params'] or {}\n",
+       "        eps.pop('snap', None)\n",
+       "        params = copy(e['construction_params'][0]) if e['construction_params'] else {}\n",
+       "        params.update(eps)\n",
+       "        jss.append({\n",
+       "            'factory': e['factory'],\n",
+       "            'params': params,\n",
+       "            'construction_params': e['construction_params'][0] if e['construction_params'] else {},\n",
+       "            'evaluation_params': e['reconstruct_params'],\n",
+       "            'code_size': code_size,\n",
+       "            'codec_size': codec_size,\n",
+       "            'training_time': training_time,\n",
+       "            # 'training_size': training_size,\n",
+       "            'mse': e['mse'],\n",
+       "            'sym_recall': e['sym_recall'],\n",
+       "            'asym_recall': e['asym_recall'],\n",
+       "            'encode_time': e['encode_time'],\n",
+       "            'decode_time': e['decode_time'],\n",
+       "            'cpu': cpu,\n",
+       "        })\n",
+       "\n",
+       "print(\"|factory key|construction parameters|encode/decode parameters|code size|codec size|training time|mean squared error|sym recall @ 1|asym recall @ 1|encode time|decode time|cpu|\")\n",
+       "print(\"|-|-|-|-|-|-|-|-|-|-|-|-|\")  # one delimiter cell per header column (12), as markdown tables require\n",
+       "data.sort()\n",
+       "# for d in data:\n",
+       "#   print(d[1])\n",
+       "\n",
+       "print(len(data))\n",
+       "\n",
+       "with open(f\"/checkpoint/gsz/bench_fw/codecs_{ds}_5.json\", \"w\") as f:\n",
+       "    json.dump(jss, f)"
+      ]
+     }
+    ],
+    "metadata": {
+     "kernelspec": {
+      "display_name": "Python [conda env:.conda-faiss_from_source] *",
+      "language": "python",
+      "name": "conda-env-.conda-faiss_from_source-py"
+     },
+     "language_info": {
+      "codemirror_mode": {
+       "name": "ipython",
+       "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.11.5"
+     }
+    },
+    "nbformat": 4,
+    "nbformat_minor": 5
+   }
diff --git a/benchs/bench_fw_optimize.py b/benchs/bench_fw_optimize.py
new file mode 100644
index 0000000000..31b56f9f51
--- /dev/null
+++ b/benchs/bench_fw_optimize.py
@@ -0,0 +1,58 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import logging
+import os
+
+from bench_fw.benchmark_io import BenchmarkIO
+from bench_fw.descriptors import DatasetDescriptor
+from bench_fw.optimize import Optimizer
+
+logging.basicConfig(level=logging.INFO)
+
+
+def bigann(bio):
+    """Run the Optimizer on the bigann tables (1M and 10M database descriptors).
+
+    `bio` is attached to the optimizer and is also used to load the query
+    set, whose second dimension is passed to optimize() as `d`.
+    """
+    optimizer = Optimizer(distance_metric="L2", num_threads=32, run_local=False)
+    optimizer.set_io(bio)
+
+    # The query descriptor is used twice: to read `d` and as the query set.
+    queries_desc = DatasetDescriptor(namespace="std_q", tablename="bigann1M")
+    dim = bio.get_dataset(queries_desc).shape[1]
+    training_desc = DatasetDescriptor(
+        namespace="std_t", tablename="bigann1M", num_vectors=2_000_000
+    )
+    database_descs = [
+        DatasetDescriptor(namespace="std_d", tablename=table)
+        for table in ("bigann1M", "bigann10M")
+    ]
+    optimizer.optimize(
+        d=dim,
+        training_vectors=training_desc,
+        database_vectors_list=database_descs,
+        query_vectors=queries_desc,
+        min_accuracy=0.85,
+    )
+
+
+if __name__ == "__main__":
+    # Maps each accepted `experiment` argument to the function that runs it.
+    experiments = {"bigann": bigann}
+    parser = argparse.ArgumentParser()
+    # `choices` makes an unknown experiment an error instead of a silent no-op.
+    parser.add_argument("experiment", choices=sorted(experiments))
+    parser.add_argument("path")
+    args = parser.parse_args()
+    if not os.path.exists(args.path):  # explicit check: `assert` is stripped under -O
+        parser.error(f"path does not exist: {args.path}")
+    path = os.path.join(args.path, args.experiment)
+    os.makedirs(path, exist_ok=True)  # idempotent; no exists()/mkdir() race
+    bio = BenchmarkIO(path=path)
+    experiments[args.experiment](bio)
diff --git a/benchs/bench_fw_range.py b/benchs/bench_fw_range.py
new file mode 100644
index 0000000000..f38de114f9
--- /dev/null
+++ b/benchs/bench_fw_range.py
@@ -0,0 +1,83 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import argparse
+import os
+
+from bench_fw.benchmark import Benchmark
+from bench_fw.benchmark_io import BenchmarkIO
+from bench_fw.descriptors import DatasetDescriptor, IndexDescriptor
+
+logging.basicConfig(level=logging.INFO)
+
+def ssnpp(bio):
+    """Range-search benchmark on the SSNPP dataset files.
+
+    Builds a Benchmark over a Flat index (also named as the range reference
+    index) and an IVF262144(PQ256x4fs),PQ32 index, attaches `bio`, and calls
+    benchmark() with train and range enabled, reconstruct and knn disabled.
+    """
+    # Pairs passed as the "weighted" range metric of the Flat descriptor.
+    weighted_pairs = [
+        [0.05, 0.971],
+        [0.1, 0.956],
+        [0.15, 0.923],
+        [0.2, 0.887],
+        [0.25, 0.801],
+        [0.3, 0.729],
+        [0.35, 0.651],
+        [0.4, 0.55],
+        [0.45, 0.459],
+        [0.5, 0.372],
+        [0.55, 0.283],
+        [0.6, 0.189],
+        [0.65, 0.143],
+        [0.7, 0.106],
+        [0.75, 0.116],
+        [0.8, 0.088],
+        [0.85, 0.064],
+        [0.9, 0.05],
+        [0.95, 0.04],
+        [1.0, 0.028],
+        [1.05, 0.02],
+        [1.1, 0.013],
+        [1.15, 0.007],
+        [1.2, 0.004],
+        [1.3, 0],
+    ]
+    benchmark = Benchmark(
+        num_threads=32,
+        training_vectors=DatasetDescriptor(tablename="ssnpp_training_5M.npy"),
+        database_vectors=DatasetDescriptor(tablename="ssnpp_xb_range_filtered_119201.npy"),
+        query_vectors=DatasetDescriptor(tablename="ssnpp_xq_range_filtered_33615.npy"),
+        index_descs=[
+            IndexDescriptor(
+                factory="Flat", range_metrics={"weighted": weighted_pairs}
+            ),
+            IndexDescriptor(factory="IVF262144(PQ256x4fs),PQ32"),
+        ],
+        k=10,
+        distance_metric="L2",
+        range_ref_index_desc="Flat",
+    )
+    benchmark.set_io(bio)
+    benchmark.benchmark("result.json", local=False, train=True,
+                        reconstruct=False, knn=False, range=True)
+
+if __name__ == "__main__":
+    # Maps each accepted `experiment` argument to the function that runs it.
+    experiments = {"ssnpp": ssnpp}
+    parser = argparse.ArgumentParser()
+    # `choices` makes an unknown experiment an error instead of a silent no-op.
+    parser.add_argument('experiment', choices=sorted(experiments))
+    parser.add_argument('path')
+    args = parser.parse_args()
+    if not os.path.exists(args.path):  # explicit check: `assert` is stripped under -O
+        parser.error(f"path does not exist: {args.path}")
+    path = os.path.join(args.path, args.experiment)
+    os.makedirs(path, exist_ok=True)  # idempotent; no exists()/mkdir() race
+    bio = BenchmarkIO(path=path)
+    experiments[args.experiment](bio)
diff --git a/benchs/bench_fw_test.py b/benchs/bench_fw_test.py
deleted file mode 100644
index 55b9e16e65..0000000000
--- a/benchs/bench_fw_test.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-
-import logging
-
-from bench_fw.benchmark import Benchmark
-from bench_fw.benchmark_io import BenchmarkIO
-from bench_fw.descriptors import DatasetDescriptor, IndexDescriptor
-
-logging.basicConfig(level=logging.INFO)
-
-benchmark = Benchmark(
-    training_vectors=DatasetDescriptor(
-        tablename="training.npy", num_vectors=200000
-    ),
-    database_vectors=DatasetDescriptor(
-        tablename="database.npy", num_vectors=200000
-    ),
-    query_vectors=DatasetDescriptor(tablename="query.npy", num_vectors=2000),
-    index_descs=[
-        IndexDescriptor(
-            factory="Flat",
-            range_metrics={
-                "weighted": [
-                    [0.1, 0.928],
-                    [0.2, 0.865],
-                    [0.3, 0.788],
-                    [0.4, 0.689],
-                    [0.5, 0.49],
-                    [0.6, 0.308],
-                    [0.7, 0.193],
-                    [0.8, 0.0],
-                ]
-            },
-        ),
-        IndexDescriptor(
-            factory="OPQ32_128,IVF512,PQ32",
-        ),
-        IndexDescriptor(
-            factory="OPQ32_256,IVF512,PQ32",
-        ),
-        IndexDescriptor(
-            factory="HNSW32",
-            construction_params=[
-                {
-                    "efConstruction": 64,
-                }
-            ],
-        ),
-    ],
-    k=10,
-    distance_metric="L2",
-    range_ref_index_desc="Flat",
-)
-io = BenchmarkIO(
-    path="/checkpoint",
-)
-benchmark.set_io(io)
-print(benchmark.benchmark("result.json"))
diff --git a/benchs/bench_ivfflat_raft.py b/benchs/bench_ivfflat_raft.py
new file mode 100644
index 0000000000..9ebfcb3422
--- /dev/null
+++ b/benchs/bench_ivfflat_raft.py
@@ -0,0 +1,193 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import numpy as np
+import faiss
+import time
+import argparse
+import rmm
+
+######################################################
+# Command-line parsing
+######################################################
+
+parser = argparse.ArgumentParser()
+
+
+def aa(*args, **kwargs):  # add an option to the argument group currently bound to `group`
+    group.add_argument(*args, **kwargs)
+
+
+def _str2bool(value):  # bool("False") is True, so boolean option values are parsed explicitly
+    return value.lower() in ('1', 'true', 'yes', 'y', 'on')
+
+
+group = parser.add_argument_group('benchmarking options')
+aa('--bm_train', default=False, action='store_true',
+   help='whether to benchmark train operation on GPU index')
+aa('--bm_add', default=False, action='store_true',
+   help='whether to benchmark add operation on GPU index')
+aa('--bm_search', default=True, type=_str2bool,
+   help='whether to benchmark search operation on GPU index')
+aa('--raft_only', default=False, action='store_true',
+   help='whether to only produce RAFT enabled benchmarks')
+
+group = parser.add_argument_group('IVF options')
+aa('--n_centroids', default=256, type=int,
+    help="number of IVF centroids")
+
+group = parser.add_argument_group('searching')
+aa('--k', default=100, type=int, help='nb of nearest neighbors')
+aa('--nprobe', default=50, type=int, help='nb of IVF lists to probe')
+
+args = parser.parse_args()
+
+print("args:", args)
+
+rs = np.random.RandomState(123)  # fixed seed: the random benchmark vectors are identical on every run
+
+res = faiss.StandardGpuResources()  # GPU resources object shared by the bench_* helpers below
+
+# Use an RMM pool memory resource for device allocations
+mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaMemoryResource())
+rmm.mr.set_current_device_resource(mr)
+
+def bench_train_milliseconds(index, trainVecs, use_raft):  # wall-clock ms of train() on a GPU clone
+    cloner_opts = faiss.GpuMultipleClonerOptions()
+    cloner_opts.use_raft = use_raft
+    gpu_index = faiss.index_cpu_to_gpu(res, 0, index, cloner_opts)
+    started = time.time()
+    gpu_index.train(trainVecs)
+    return (time.time() - started) * 1000
+
+
+if args.bm_train:  # train benchmark: sweep training-set size x vector dimension
+    print("=" * 40)
+    print("GPU Train Benchmarks")
+    print("=" * 40)
+    trainset_sizes = [5000, 10000, 100000, 1000000, 5000000]
+    dataset_dims = [128, 256, 1024]
+    for n_rows in trainset_sizes:
+        for n_cols in dataset_dims:
+            index = faiss.index_factory(n_cols, "IVF{},Flat".format(args.n_centroids))  # fresh CPU index per configuration
+            trainVecs = rs.rand(n_rows, n_cols).astype('float32')
+            raft_gpu_train_time = bench_train_milliseconds(
+                index, trainVecs, True)
+            if args.raft_only:
+                print("Method: IVFFlat, Operation: TRAIN, dim: %d, n_centroids %d, numTrain: %d, RAFT enabled GPU train time: %.3f milliseconds" % (
+                    n_cols, args.n_centroids, n_rows, raft_gpu_train_time))
+            else:  # also time the use_raft=False path and report both
+                classical_gpu_train_time = bench_train_milliseconds(
+                    index, trainVecs, False)
+                print("Method: IVFFlat, Operation: TRAIN, dim: %d, n_centroids %d, numTrain: %d, classical GPU train time: %.3f milliseconds, RAFT enabled GPU train time: %.3f milliseconds" % (
+                    n_cols, args.n_centroids, n_rows, classical_gpu_train_time, raft_gpu_train_time))
+
+
+def bench_add_milliseconds(index, addVecs, use_raft):  # wall-clock ms of add() on a GPU clone
+    cloner_opts = faiss.GpuMultipleClonerOptions()
+    cloner_opts.use_raft = use_raft
+    gpu_index = faiss.index_cpu_to_gpu(res, 0, index, cloner_opts)
+    gpu_index.copyFrom(index)
+    started = time.time()
+    gpu_index.add(addVecs)
+    return (time.time() - started) * 1000
+
+
+if args.bm_add:
+    print("=" * 40)
+    print("GPU Add Benchmarks")
+    print("=" * 40)
+    addset_sizes = [5000, 10000, 100000, 1000000]
+    dataset_dims = [128, 256, 1024]
+    n_train = 10000
+    for n_cols in dataset_dims:
+        # An index only accepts vectors of its own dimension: build and train one per dim.
+        trainVecs = rs.rand(n_train, n_cols).astype('float32')
+        index = faiss.index_factory(n_cols, "IVF{},Flat".format(args.n_centroids))
+        index.train(trainVecs)
+        for n_rows in addset_sizes:
+            addVecs = rs.rand(n_rows, n_cols).astype('float32')
+            raft_gpu_add_time = bench_add_milliseconds(index, addVecs, True)
+            if args.raft_only:
+                print("Method: IVFFlat, Operation: ADD, dim: %d, n_centroids %d, numAdd: %d, RAFT enabled GPU add time: %.3f milliseconds" % (
+                    n_cols, args.n_centroids, n_rows, raft_gpu_add_time))
+            else:
+                classical_gpu_add_time = bench_add_milliseconds(
+                    index, addVecs, False)
+                print("Method: IVFFlat, Operation: ADD, dim: %d, n_centroids %d, numAdd: %d, classical GPU add time: %.3f milliseconds, RAFT enabled GPU add time: %.3f milliseconds" % (
+                    n_cols, args.n_centroids, n_rows, classical_gpu_add_time, raft_gpu_add_time))
+
+
+def bench_search_milliseconds(index, addVecs, queryVecs, nprobe, k, use_raft):  # wall-clock ms of search() only
+    cloner_opts = faiss.GpuMultipleClonerOptions()
+    cloner_opts.use_raft = use_raft
+    gpu_index = faiss.index_cpu_to_gpu(res, 0, index, cloner_opts)
+    gpu_index.copyFrom(index)
+    gpu_index.add(addVecs)
+    gpu_index.nprobe = nprobe
+    started = time.time()  # cloning and add() above are not part of the measurement
+    gpu_index.search(queryVecs, k)
+    return (time.time() - started) * 1000
+
+
+if args.bm_search:  # search benchmark: small dims on both GPU paths, then large dims RAFT-only
+    print("=" * 40)
+    print("GPU Search Benchmarks")
+    print("=" * 40)
+    queryset_sizes = [5000, 10000, 100000, 500000]
+    n_train = 10000
+    n_add = 100000
+    search_bm_dims = [8, 16, 32]
+    for n_cols in search_bm_dims:
+        index = faiss.index_factory(n_cols, "IVF{},Flat".format(args.n_centroids))
+        trainVecs = rs.rand(n_train, n_cols).astype('float32')
+        index.train(trainVecs)
+        addVecs = rs.rand(n_add, n_cols).astype('float32')
+        for n_rows in queryset_sizes:
+            queryVecs = rs.rand(n_rows, n_cols).astype('float32')
+            raft_gpu_search_time = bench_search_milliseconds(
+                index, addVecs, queryVecs, args.nprobe, args.k, True)
+            if args.raft_only:
+                print("Method: IVFFlat, Operation: SEARCH, dim: %d, n_centroids: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, RAFT enabled GPU search time: %.3f milliseconds" % (
+                    n_cols, args.n_centroids, n_add, n_rows, args.nprobe, args.k, raft_gpu_search_time))
+            else:
+                classical_gpu_search_time = bench_search_milliseconds(
+                    index, addVecs, queryVecs, args.nprobe, args.k, False)
+                print("Method: IVFFlat, Operation: SEARCH, dim: %d, n_centroids: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, classical GPU search time: %.3f milliseconds, RAFT enabled GPU search time: %.3f milliseconds" % (
+                    n_cols, args.n_centroids, n_add, n_rows, args.nprobe, args.k, classical_gpu_search_time, raft_gpu_search_time))
+
+    print("=" * 40)
+    print("Large RAFT Enabled Benchmarks")
+    print("=" * 40)
+    # Avoid classical GPU Benchmarks for large datasets because of OOM for more than 500000 queries and/or large dims as well as for large k
+    queryset_sizes = [100000, 500000, 1000000]
+    large_search_bm_dims = [128, 256, 1024]
+    for n_cols in large_search_bm_dims:
+        trainVecs = rs.rand(n_train, n_cols).astype('float32')
+        index = faiss.index_factory(
+            n_cols, "IVF" + str(args.n_centroids) + ",Flat")
+        index.train(trainVecs)
+        addVecs = rs.rand(n_add, n_cols).astype('float32')
+        for n_rows in queryset_sizes:
+            queryVecs = rs.rand(n_rows, n_cols).astype('float32')
+            raft_gpu_search_time = bench_search_milliseconds(
+                index, addVecs, queryVecs, args.nprobe, args.k, True)
+            print("Method: IVFFlat, Operation: SEARCH, numTrain: %d, dim: %d, n_centroids: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, RAFT enabled GPU search time: %.3f milliseconds" % (
+                n_train, n_cols, args.n_centroids, n_add, n_rows, args.nprobe, args.k, raft_gpu_search_time))
diff --git a/benchs/bench_ivfpq_raft.py b/benchs/bench_ivfpq_raft.py
new file mode 100644
index 0000000000..3494a18741
--- /dev/null
+++ b/benchs/bench_ivfpq_raft.py
@@ -0,0 +1,168 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import numpy as np
+import faiss
+import time
+import argparse
+import rmm
+
+######################################################
+# Command-line parsing
+######################################################
+
+parser = argparse.ArgumentParser()
+
+from datasets import load_sift1M, evaluate
+print("load data")
+xb, xq, xt, gt = load_sift1M()
+
+def aa(*args, **kwargs):  # add an option to the argument group currently bound to `group`
+    group.add_argument(*args, **kwargs)
+
+def _str2bool(value):  # type=bool would turn any non-empty string, even "False", into True
+    return value.lower() in ('1', 'true', 'yes', 'y', 'on')
+
+group = parser.add_argument_group('benchmarking options')
+aa('--raft_only', default=False, action='store_true', help='whether to only produce RAFT enabled benchmarks')
+aa('--bm_search', default=True, type=_str2bool, help='whether to benchmark search operation on GPU index')
+
+group = parser.add_argument_group('IVF options')
+aa('--bits_per_code', default=8, type=int, help='bits per code. Note that < 8 is only supported when RAFT is enabled')
+aa('--pq_len', default=2, type=int, help='number of vector elements represented by one PQ code')
+aa('--use_precomputed', default=True, type=_str2bool, help='use precomputed codes (not with RAFT enabled)')
+
+group = parser.add_argument_group('searching')
+aa('--k', default=10, type=int, help='nb of nearest neighbors')
+aa('--nprobe', default=50, type=int, help='nb of IVF lists to probe')
+
+args = parser.parse_args()
+
+print("args:", args)
+
+rs = np.random.RandomState(123)  # random generator with a fixed seed (123)
+
+res = faiss.StandardGpuResources()  # GPU resources object shared by the bench_* helpers below
+
+# Use an RMM pool memory resource for device allocations
+mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaMemoryResource())
+rmm.mr.set_current_device_resource(mr)
+
+# Heuristic: sqrt(numVecs) lists, reduced so a list holds ~1000+ vectors; never fewer than 1 list.
+def compute_nlist(numVecs):
+    nlist = np.sqrt(numVecs)
+    if numVecs / nlist < 1000:
+        nlist = numVecs / 1000
+    return max(1, int(nlist))  # fewer than 1000 vectors used to yield 0, i.e. an invalid "IVF0" factory
+
+
+def bench_train_milliseconds(index, trainVecs, use_raft):
+    # Wall-clock milliseconds taken by train() on a GPU clone of `index`.
+    cloner_opts = faiss.GpuMultipleClonerOptions()
+    cloner_opts.useFloat16LookupTables = True  # float16 lookup tables save space
+    cloner_opts.use_raft = use_raft
+    gpu_index = faiss.index_cpu_to_gpu(res, 0, index, cloner_opts)
+    started = time.time()
+    gpu_index.train(trainVecs)
+    return (time.time() - started) * 1000
+
+n_rows, n_cols = xb.shape  # database vectors: count and dimension
+n_train, _ = xt.shape  # number of training vectors
+M = n_cols // args.pq_len  # PQ sub-quantizers: one code per `pq_len` vector elements
+nlist = compute_nlist(n_rows)  # number of IVF lists, from the heuristic above
+index = faiss.index_factory(n_cols, "IVF{},PQ{}x{}np".format(nlist, M, args.bits_per_code))
+
+print("=" * 40)
+print("GPU Train Benchmarks")
+print("=" * 40)
+raft_gpu_train_time = bench_train_milliseconds(index, xt, True)
+if args.raft_only:
+    print("Method: IVFPQ, Operation: TRAIN, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numTrain: %d, RAFT enabled GPU train time: %.3f milliseconds" % (
+        n_cols, nlist, M, args.bits_per_code, n_train, raft_gpu_train_time))
+else:  # also time the use_raft=False path and report both
+    classical_gpu_train_time = bench_train_milliseconds(
+        index, xt, False)
+    print("Method: IVFPQ, Operation: TRAIN, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numTrain: %d, classical GPU train time: %.3f milliseconds, RAFT enabled GPU train time: %.3f milliseconds" % (
+        n_cols, nlist, M, args.bits_per_code, n_train, classical_gpu_train_time, raft_gpu_train_time))
+
+
+def bench_add_milliseconds(index, addVecs, use_raft):
+    # Wall-clock milliseconds taken by add() on a GPU clone of `index`.
+    cloner_opts = faiss.GpuMultipleClonerOptions()
+    cloner_opts.useFloat16LookupTables = True  # float16 lookup tables save space
+    cloner_opts.use_raft = use_raft
+    gpu_index = faiss.index_cpu_to_gpu(res, 0, index, cloner_opts)
+    gpu_index.copyFrom(index)
+    started = time.time()
+    gpu_index.add(addVecs)
+    return (time.time() - started) * 1000
+
+print("=" * 40)
+print("GPU Add Benchmarks")
+print("=" * 40)
+index.train(xt)  # bench_add_milliseconds copies this CPU index to the GPU, so train it first
+raft_gpu_add_time = bench_add_milliseconds(index, xb, True)
+if args.raft_only:
+    print("Method: IVFPQ, Operation: ADD, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numAdd %d, RAFT enabled GPU add time: %.3f milliseconds" % (
+        n_cols, nlist, M, args.bits_per_code, n_rows, raft_gpu_add_time))
+else:  # also time the use_raft=False path and report both
+    classical_gpu_add_time = bench_add_milliseconds(
+        index, xb, False)
+    print("Method: IVFPQ, Operation: ADD, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numAdd %d, classical GPU add time: %.3f milliseconds, RAFT enabled GPU add time: %.3f milliseconds" % (
+        n_cols, nlist, M, args.bits_per_code, n_rows, classical_gpu_add_time, raft_gpu_add_time))
+
+
+def bench_search_milliseconds(index, addVecs, queryVecs, nprobe, k, use_raft):  # wall-clock ms of search() only
+    cloner_opts = faiss.GpuMultipleClonerOptions()
+    cloner_opts.use_raft = use_raft
+    cloner_opts.useFloat16LookupTables = True
+    gpu_index = faiss.index_cpu_to_gpu(res, 0, index, cloner_opts)
+    gpu_index.copyFrom(index)
+    gpu_index.add(addVecs)
+    gpu_index.nprobe = nprobe
+    started = time.time()  # cloning and add() above are not part of the measurement
+    gpu_index.search(queryVecs, k)
+    return (time.time() - started) * 1000
+
+
+if args.bm_search:  # search benchmark over increasing query batch sizes
+    print("=" * 40)
+    print("GPU Search Benchmarks")
+    print("=" * 40)
+    queryset_sizes = [1, 10, 100, 1000, 10000]
+    n_train, n_cols = xt.shape
+    n_add, _ = xb.shape
+    print(xq.shape)
+    M = n_cols // args.pq_len
+    nlist = compute_nlist(n_add)
+    index = faiss.index_factory(n_cols, "IVF{},PQ{}x{}np".format(nlist, M, args.bits_per_code))
+    index.train(xt)
+    for n_rows in queryset_sizes:
+        queryVecs = xq[rs.choice(xq.shape[0], n_rows, replace=False)]  # seeded generator: same queries on every run
+        raft_gpu_search_time = bench_search_milliseconds(
+            index, xb, queryVecs, args.nprobe, args.k, True)
+        if args.raft_only:
+            print("Method: IVFPQ, Operation: SEARCH, dim: %d, n_centroids: %d, numSubQuantizers %d, bitsPerCode %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, RAFT enabled GPU search time: %.3f milliseconds" % (
+                n_cols, nlist, M, args.bits_per_code, n_add, n_rows, args.nprobe, args.k, raft_gpu_search_time))
+        else:
+            classical_gpu_search_time = bench_search_milliseconds(
+                index, xb, queryVecs, args.nprobe, args.k, False)
+            print("Method: IVFPQ, Operation: SEARCH, dim: %d, n_centroids: %d, numSubQuantizers %d, bitsPerCode %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, classical GPU search time: %.3f milliseconds, RAFT enabled GPU search time: %.3f milliseconds" % (
+                n_cols, nlist, M, args.bits_per_code, n_add, n_rows, args.nprobe, args.k, classical_gpu_search_time, raft_gpu_search_time))
\ No newline at end of file
diff --git a/benchs/link_and_code/README.md b/benchs/link_and_code/README.md
index bbf034bc60..0c04cadac5 100644
--- a/benchs/link_and_code/README.md
+++ b/benchs/link_and_code/README.md
@@ -21,136 +21,5 @@ graph to improve the reconstruction. It is described in
 
 ArXiV [here](https://arxiv.org/abs/1804.09996)
 
-Code structure
---------------
-
-The test runs with 3 files:
-
-- `bench_link_and_code.py`: driver script
-
-- `datasets.py`: code to load the datasets. The example code runs on the
-  deep1b and bigann datasets. See the [toplevel README](../README.md)
-  on how to download them. They should be put in a directory, edit
-  datasets.py to set the path.
-
-- `neighbor_codec.py`: this is where the representation is trained.
-
-The code runs on top of Faiss. The HNSW index can be extended with a
-`ReconstructFromNeighbors` C++ object that refines the distances. The
-training is implemented in Python.
-
-
-Reproducing Table 2 in the paper
---------------------------------
-
-The results of table 2 (accuracy on deep100M) in the paper can be
-obtained with:
-
-```bash
-python bench_link_and_code.py \
-   --db deep100M \
-   --M0 6 \
-   --indexkey OPQ36_144,HNSW32_PQ36 \
-   --indexfile $bdir/deep100M_PQ36_L6.index \
-   --beta_nsq 4  \
-   --beta_centroids $bdir/deep100M_PQ36_L6_nsq4.npy \
-   --neigh_recons_codes $bdir/deep100M_PQ36_L6_nsq4_codes.npy \
-   --k_reorder 0,5 --efSearch 1,1024
-```
-
-Set `bdir` to a scratch directory.
-
-Explanation of the flags:
-
-- `--db deep1M`: dataset to process
-
-- `--M0 6`: number of links on the base level (L6)
-
-- `--indexkey OPQ36_144,HNSW32_PQ36`: Faiss index key to construct the
-  HNSW structure. It means that vectors are transformed by OPQ and
-  encoded with PQ 36x8 (with an intermediate size of 144D). The HNSW
-  level>0 nodes have 32 links (theses ones are "cheap" to store
-  because there are fewer nodes in the upper levels.
-
-- `--indexfile $bdir/deep1M_PQ36_M6.index`: name of the index file
-  (without information for the L&C extension)
-
-- `--beta_nsq 4`: number of bytes to allocate for the codes (M in the
-  paper)
-
-- `--beta_centroids $bdir/deep1M_PQ36_M6_nsq4.npy`: filename to store
-  the trained beta centroids
-
-- `--neigh_recons_codes $bdir/deep1M_PQ36_M6_nsq4_codes.npy`: filename
-  for the encoded weights (beta) of the combination
-
-- `--k_reorder 0,5`: number of results to reorder. 0 = baseline
-  without reordering, 5 = value used throughout the paper
-
-- `--efSearch 1,1024`: number of nodes to visit (T in the paper)
-
-The script will proceed with the following steps:
-
-0. load dataset (and possibly compute the ground-truth if the
-ground-truth file is not provided)
-
-1. train the OPQ encoder
-
-2. build the index and store it
-
-3. compute the residuals and train the beta vocabulary to do the reconstruction
-
-4. encode the vertices
-
-5. search and evaluate the search results.
-
-With option `--exhaustive` the results of the exhaustive column can be
-obtained.
-
-The run above should output:
-```bash
-...
-setting k_reorder=5
-...
-efSearch=1024      0.3132 ms per query,  R@1: 0.4283 R@10: 0.6337 R@100: 0.6520 ndis 40941919 nreorder 50000
-
-```
-which matches the paper's table 2.
-
-Note that in multi-threaded mode, the building of the HNSW structure
-is not deterministic. Therefore, the results across runs may not be exactly the same.
-
-Reproducing Figure 5 in the paper
----------------------------------
-
-Figure 5 just evaluates the combination of HNSW and PQ. For example,
-the operating point L6&OPQ40 can be obtained with
-
-```bash
-python bench_link_and_code.py \
-   --db deep1M \
-   --M0 6 \
-   --indexkey OPQ40_160,HNSW32_PQ40 \
-   --indexfile $bdir/deep1M_PQ40_M6.index \
-   --beta_nsq 1 --beta_k 1  \
-   --beta_centroids $bdir/deep1M_PQ40_M6_nsq0.npy \
-   --neigh_recons_codes $bdir/deep1M_PQ36_M6_nsq0_codes.npy \
-   --k_reorder 0 --efSearch 16,64,256,1024
-```
-
-The arguments are similar to the previous table. Note that nsq = 0 is
-simulated by setting beta_nsq = 1 and beta_k = 1 (ie a code with a single
-reproduction value).
-
-The output should look like:
-
-```bash
-setting k_reorder=0
-efSearch=16        0.0147 ms per query,  R@1: 0.3409 R@10: 0.4388 R@100: 0.4394 ndis 2629735 nreorder 0
-efSearch=64        0.0122 ms per query,  R@1: 0.4836 R@10: 0.6490 R@100: 0.6509 ndis 4623221 nreorder 0
-efSearch=256       0.0344 ms per query,  R@1: 0.5730 R@10: 0.7915 R@100: 0.7951 ndis 11090176 nreorder 0
-efSearch=1024      0.2656 ms per query,  R@1: 0.6212 R@10: 0.8722 R@100: 0.8765 ndis 33501951 nreorder 0
-```
-
-The results with k_reorder=5 are not reported in the paper, they
-represent the performance of a "free coding" version of the algorithm.
+The necessary code for this paper was removed from Faiss in version 1.8.0.
+For a functioning version, use Faiss 1.7.4.
diff --git a/benchs/link_and_code/bench_link_and_code.py b/benchs/link_and_code/bench_link_and_code.py
deleted file mode 100755
index ed8f86d631..0000000000
--- a/benchs/link_and_code/bench_link_and_code.py
+++ /dev/null
@@ -1,300 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-
-from __future__ import print_function
-import os
-import sys
-import time
-import numpy as np
-import faiss
-import argparse
-import datasets
-from datasets import sanitize
-import neighbor_codec
-
-######################################################
-# Command-line parsing
-######################################################
-
-
-parser = argparse.ArgumentParser()
-
-def aa(*args, **kwargs):
-    group.add_argument(*args, **kwargs)
-
-group = parser.add_argument_group('dataset options')
-
-aa('--db', default='deep1M', help='dataset')
-aa( '--compute_gt', default=False, action='store_true',
-    help='compute and store the groundtruth')
-
-group = parser.add_argument_group('index consturction')
-
-aa('--indexkey', default='HNSW32', help='index_factory type')
-aa('--efConstruction', default=200, type=int,
-   help='HNSW construction factor')
-aa('--M0', default=-1, type=int, help='size of base level')
-aa('--maxtrain', default=256 * 256, type=int,
-   help='maximum number of training points')
-aa('--indexfile', default='', help='file to read or write index from')
-aa('--add_bs', default=-1, type=int,
-   help='add elements index by batches of this size')
-aa('--link_singletons', default=False, action='store_true',
-   help='do a pass to link in the singletons')
-
-group = parser.add_argument_group(
-    'searching (reconstruct_from_neighbors options)')
-
-aa('--beta_centroids', default='',
-   help='file with codebook')
-aa('--neigh_recons_codes', default='',
-   help='file with codes for reconstruction')
-aa('--beta_ntrain', default=250000, type=int, help='')
-aa('--beta_k', default=256, type=int, help='beta codebook size')
-aa('--beta_nsq', default=1, type=int, help='number of beta sub-vectors')
-aa('--beta_niter', default=10, type=int, help='')
-aa('--k_reorder', default='-1', help='')
-
-group = parser.add_argument_group('searching')
-
-aa('--k', default=100, type=int, help='nb of nearest neighbors')
-aa('--exhaustive', default=False, action='store_true',
-    help='report the exhaustive search topline')
-aa('--searchthreads', default=-1, type=int,
-   help='nb of threads to use at search time')
-aa('--efSearch', default='', type=str,
-   help='comma-separated values of efSearch to try')
-
-args = parser.parse_args()
-
-print("args:", args)
-
-
-######################################################
-# Load dataset
-######################################################
-
-xt, xb, xq, gt = datasets.load_data(
-    dataset=args.db, compute_gt=args.compute_gt)
-
-nq, d = xq.shape
-nb, d = xb.shape
-
-
-######################################################
-# Make index
-######################################################
-
-if os.path.exists(args.indexfile):
-
-    print("reading", args.indexfile)
-    index = faiss.read_index(args.indexfile)
-
-    if isinstance(index, faiss.IndexPreTransform):
-        index_hnsw = faiss.downcast_index(index.index)
-        vec_transform = index.chain.at(0).apply_py
-    else:
-        index_hnsw = index
-        vec_transform = lambda x:x
-
-    hnsw = index_hnsw.hnsw
-    hnsw_stats = faiss.cvar.hnsw_stats
-
-else:
-
-    print("build index, key=", args.indexkey)
-
-    index = faiss.index_factory(d, args.indexkey)
-
-    if isinstance(index, faiss.IndexPreTransform):
-        index_hnsw = faiss.downcast_index(index.index)
-        vec_transform = index.chain.at(0).apply_py
-    else:
-        index_hnsw = index
-        vec_transform = lambda x:x
-
-    hnsw = index_hnsw.hnsw
-    hnsw.efConstruction = args.efConstruction
-    hnsw_stats = faiss.cvar.hnsw_stats
-    index.verbose = True
-    index_hnsw.verbose = True
-    index_hnsw.storage.verbose = True
-
-    if args.M0 != -1:
-        print("set level 0 nb of neighbors to", args.M0)
-        hnsw.set_nb_neighbors(0, args.M0)
-
-    xt2 = sanitize(xt[:args.maxtrain])
-    assert np.all(np.isfinite(xt2))
-
-    print("train, size", xt.shape)
-    t0 = time.time()
-    index.train(xt2)
-    print("  train in %.3f s" % (time.time() - t0))
-
-    print("adding")
-    t0 = time.time()
-    if args.add_bs == -1:
-        index.add(sanitize(xb))
-    else:
-        for i0 in range(0, nb, args.add_bs):
-            i1 = min(nb, i0 + args.add_bs)
-            print("  adding %d:%d / %d" % (i0, i1, nb))
-            index.add(sanitize(xb[i0:i1]))
-
-    print("  add in %.3f s" % (time.time() - t0))
-    print("storing", args.indexfile)
-    faiss.write_index(index, args.indexfile)
-
-
-######################################################
-# Train beta centroids and encode dataset
-######################################################
-
-if args.beta_centroids:
-    print("reordering links")
-    index_hnsw.reorder_links()
-
-    if os.path.exists(args.beta_centroids):
-        print("load", args.beta_centroids)
-        beta_centroids = np.load(args.beta_centroids)
-        nsq, k, M1 = beta_centroids.shape
-        assert M1 == hnsw.nb_neighbors(0) + 1
-
-        rfn = faiss.ReconstructFromNeighbors(index_hnsw, k, nsq)
-    else:
-        print("train beta centroids")
-        rfn = faiss.ReconstructFromNeighbors(
-            index_hnsw, args.beta_k, args.beta_nsq)
-
-        xb_full = vec_transform(sanitize(xb[:args.beta_ntrain]))
-
-        beta_centroids = neighbor_codec.train_beta_codebook(
-            rfn, xb_full, niter=args.beta_niter)
-
-        print("  storing", args.beta_centroids)
-        np.save(args.beta_centroids, beta_centroids)
-
-
-    faiss.copy_array_to_vector(beta_centroids.ravel(),
-                               rfn.codebook)
-    index_hnsw.reconstruct_from_neighbors = rfn
-
-    if rfn.k == 1:
-        pass     # no codes to take care of
-    elif os.path.exists(args.neigh_recons_codes):
-        print("loading neigh codes", args.neigh_recons_codes)
-        codes = np.load(args.neigh_recons_codes)
-        assert codes.size == rfn.code_size * index.ntotal
-        faiss.copy_array_to_vector(codes.astype('uint8'),
-                                   rfn.codes)
-        rfn.ntotal = index.ntotal
-    else:
-        print("encoding neigh codes")
-        t0 = time.time()
-
-        bs = 1000000 if args.add_bs == -1 else args.add_bs
-
-        for i0 in range(0, nb, bs):
-            i1 = min(i0 + bs, nb)
-            print("   encode %d:%d / %d [%.3f s]\r" % (
-                i0, i1, nb, time.time() - t0), end=' ')
-            sys.stdout.flush()
-            xbatch = vec_transform(sanitize(xb[i0:i1]))
-            rfn.add_codes(i1 - i0, faiss.swig_ptr(xbatch))
-        print()
-
-        print("storing %s" % args.neigh_recons_codes)
-        codes = faiss.vector_to_array(rfn.codes)
-        np.save(args.neigh_recons_codes, codes)
-
-######################################################
-# Exhaustive evaluation
-######################################################
-
-if args.exhaustive:
-    print("exhaustive evaluation")
-    xq_tr = vec_transform(sanitize(xq))
-    index2 = faiss.IndexFlatL2(index_hnsw.d)
-    accu_recons_error = 0.0
-
-    if faiss.get_num_gpus() > 0:
-        print("do eval on GPU")
-        co = faiss.GpuMultipleClonerOptions()
-        co.shard = False
-        index2 = faiss.index_cpu_to_all_gpus(index2, co)
-
-    # process in batches in case the dataset does not fit in RAM
-    rh = datasets.ResultHeap(xq_tr.shape[0], 100)
-    t0 = time.time()
-    bs = 500000
-    for i0 in range(0, nb, bs):
-        i1 = min(nb, i0 + bs)
-        print('  handling batch %d:%d' % (i0, i1))
-
-        xb_recons = np.empty(
-            (i1 - i0, index_hnsw.d), dtype='float32')
-        rfn.reconstruct_n(i0, i1 - i0, faiss.swig_ptr(xb_recons))
-
-        accu_recons_error += (
-            (vec_transform(sanitize(xb[i0:i1])) -
-             xb_recons)**2).sum()
-
-        index2.reset()
-        index2.add(xb_recons)
-        D, I = index2.search(xq_tr, 100)
-        rh.add_batch_result(D, I, i0)
-
-    rh.finalize()
-    del index2
-    t1 = time.time()
-    print("done in %.3f s" % (t1 - t0))
-    print("total reconstruction error: ", accu_recons_error)
-    print("eval retrieval:")
-    datasets.evaluate_DI(rh.D, rh.I, gt)
-
-
-def get_neighbors(hnsw, i, level):
-    " list the neighbors for node i at level "
-    assert i < hnsw.levels.size()
-    assert level < hnsw.levels.at(i)
-    be = np.empty(2, 'uint64')
-    hnsw.neighbor_range(i, level, faiss.swig_ptr(be), faiss.swig_ptr(be[1:]))
-    return [hnsw.neighbors.at(j) for j in range(be[0], be[1])]
-
-
-#############################################################
-# Index is ready
-#############################################################
-
-xq = sanitize(xq)
-
-if args.searchthreads != -1:
-    print("Setting nb of threads to", args.searchthreads)
-    faiss.omp_set_num_threads(args.searchthreads)
-
-
-if gt is None:
-    print("no valid groundtruth -- exit")
-    sys.exit()
-
-
-k_reorders = [int(x) for x in args.k_reorder.split(',')]
-efSearchs = [int(x) for x in args.efSearch.split(',')]
-
-
-for k_reorder in k_reorders:
-
-    if index_hnsw.reconstruct_from_neighbors:
-        print("setting k_reorder=%d" % k_reorder)
-        index_hnsw.reconstruct_from_neighbors.k_reorder = k_reorder
-
-    for efSearch in efSearchs:
-        print("efSearch=%-4d" % efSearch, end=' ')
-        hnsw.efSearch = efSearch
-        hnsw_stats.reset()
-        datasets.evaluate(xq, gt, index, k=args.k, endl=False)
-
-        print("ndis %d nreorder %d" % (hnsw_stats.ndis, hnsw_stats.nreorder))
diff --git a/benchs/link_and_code/datasets.py b/benchs/link_and_code/datasets.py
deleted file mode 100755
index a043eb8883..0000000000
--- a/benchs/link_and_code/datasets.py
+++ /dev/null
@@ -1,236 +0,0 @@
-#! /usr/bin/env python2
-
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-
-"""
-Common functions to load datasets and compute their ground-truth
-"""
-from __future__ import print_function
-
-import time
-import numpy as np
-import faiss
-import pdb
-import sys
-
-# set this to the directory that contains the datafiles.
-# deep1b data should be at simdir + 'deep1b'
-# bigann data should be at simdir + 'bigann'
-simdir = '/mnt/vol/gfsai-east/ai-group/datasets/simsearch/'
-
-#################################################################
-# Small I/O functions
-#################################################################
-
-
-def ivecs_read(fname):
-    a = np.fromfile(fname, dtype='int32')
-    d = a[0]
-    return a.reshape(-1, d + 1)[:, 1:].copy()
-
-
-def fvecs_read(fname):
-    return ivecs_read(fname).view('float32')
-
-
-def ivecs_mmap(fname):
-    a = np.memmap(fname, dtype='int32', mode='r')
-    d = a[0]
-    return a.reshape(-1, d + 1)[:, 1:]
-
-
-def fvecs_mmap(fname):
-    return ivecs_mmap(fname).view('float32')
-
-
-def bvecs_mmap(fname):
-    x = np.memmap(fname, dtype='uint8', mode='r')
-    d = x[:4].view('int32')[0]
-    return x.reshape(-1, d + 4)[:, 4:]
-
-
-def ivecs_write(fname, m):
-    n, d = m.shape
-    m1 = np.empty((n, d + 1), dtype='int32')
-    m1[:, 0] = d
-    m1[:, 1:] = m
-    m1.tofile(fname)
-
-
-def fvecs_write(fname, m):
-    m = m.astype('float32')
-    ivecs_write(fname, m.view('int32'))
-
-
-#################################################################
-# Dataset
-#################################################################
-
-def sanitize(x):
-    return np.ascontiguousarray(x, dtype='float32')
-
-
-class ResultHeap:
-    """ Combine query results from a sliced dataset """
-
-    def __init__(self, nq, k):
-        " nq: number of query vectors, k: number of results per query "
-        self.I = np.zeros((nq, k), dtype='int64')
-        self.D = np.zeros((nq, k), dtype='float32')
-        self.nq, self.k = nq, k
-        heaps = faiss.float_maxheap_array_t()
-        heaps.k = k
-        heaps.nh = nq
-        heaps.val = faiss.swig_ptr(self.D)
-        heaps.ids = faiss.swig_ptr(self.I)
-        heaps.heapify()
-        self.heaps = heaps
-
-    def add_batch_result(self, D, I, i0):
-        assert D.shape == (self.nq, self.k)
-        assert I.shape == (self.nq, self.k)
-        I += i0
-        self.heaps.addn_with_ids(
-            self.k, faiss.swig_ptr(D),
-            faiss.swig_ptr(I), self.k)
-
-    def finalize(self):
-        self.heaps.reorder()
-
-
-
-def compute_GT_sliced(xb, xq, k):
-    print("compute GT")
-    t0 = time.time()
-    nb, d = xb.shape
-    nq, d = xq.shape
-    rh = ResultHeap(nq, k)
-    bs = 10 ** 5
-
-    xqs = sanitize(xq)
-
-    db_gt = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d))
-
-    # compute ground-truth by blocks of bs, and add to heaps
-    for i0 in range(0, nb, bs):
-        i1 = min(nb, i0 + bs)
-        xsl = sanitize(xb[i0:i1])
-        db_gt.add(xsl)
-        D, I = db_gt.search(xqs, k)
-        rh.add_batch_result(D, I, i0)
-        db_gt.reset()
-        print("\r   %d/%d, %.3f s" % (i0, nb, time.time() - t0), end=' ')
-        sys.stdout.flush()
-    print()
-    rh.finalize()
-    gt_I = rh.I
-
-    print("GT time: %.3f s" % (time.time() - t0))
-    return gt_I
-
-
-def do_compute_gt(xb, xq, k):
-    print("computing GT")
-    nb, d = xb.shape
-    index = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d))
-    if nb < 100 * 1000:
-        print("   add")
-        index.add(np.ascontiguousarray(xb, dtype='float32'))
-        print("   search")
-        D, I = index.search(np.ascontiguousarray(xq, dtype='float32'), k)
-    else:
-        I = compute_GT_sliced(xb, xq, k)
-
-    return I.astype('int32')
-
-
-def load_data(dataset='deep1M', compute_gt=False):
-
-    print("load data", dataset)
-
-    if dataset == 'sift1M':
-        basedir = simdir + 'sift1M/'
-
-        xt = fvecs_read(basedir + "sift_learn.fvecs")
-        xb = fvecs_read(basedir + "sift_base.fvecs")
-        xq = fvecs_read(basedir + "sift_query.fvecs")
-        gt = ivecs_read(basedir + "sift_groundtruth.ivecs")
-
-    elif dataset.startswith('bigann'):
-        basedir = simdir + 'bigann/'
-
-        dbsize = 1000 if dataset == "bigann1B" else int(dataset[6:-1])
-        xb = bvecs_mmap(basedir + 'bigann_base.bvecs')
-        xq = bvecs_mmap(basedir + 'bigann_query.bvecs')
-        xt = bvecs_mmap(basedir + 'bigann_learn.bvecs')
-        # trim xb to correct size
-        xb = xb[:dbsize * 1000 * 1000]
-        gt = ivecs_read(basedir + 'gnd/idx_%dM.ivecs' % dbsize)
-
-    elif dataset.startswith("deep"):
-        basedir = simdir + 'deep1b/'
-        szsuf = dataset[4:]
-        if szsuf[-1] == 'M':
-            dbsize = 10 ** 6 * int(szsuf[:-1])
-        elif szsuf == '1B':
-            dbsize = 10 ** 9
-        elif szsuf[-1] == 'k':
-            dbsize = 1000 * int(szsuf[:-1])
-        else:
-            assert False, "did not recognize suffix " + szsuf
-
-        xt = fvecs_mmap(basedir + "learn.fvecs")
-        xb = fvecs_mmap(basedir + "base.fvecs")
-        xq = fvecs_read(basedir + "deep1B_queries.fvecs")
-
-        xb = xb[:dbsize]
-
-        gt_fname = basedir + "%s_groundtruth.ivecs" % dataset
-        if compute_gt:
-            gt = do_compute_gt(xb, xq, 100)
-            print("store", gt_fname)
-            ivecs_write(gt_fname, gt)
-
-        gt = ivecs_read(gt_fname)
-
-    else:
-        assert False
-
-    print("dataset %s sizes: B %s Q %s T %s" % (
-        dataset, xb.shape, xq.shape, xt.shape))
-
-    return xt, xb, xq, gt
-
-#################################################################
-# Evaluation
-#################################################################
-
-
-def evaluate_DI(D, I, gt):
-    nq = gt.shape[0]
-    k = I.shape[1]
-    rank = 1
-    while rank <= k:
-        recall = (I[:, :rank] == gt[:, :1]).sum() / float(nq)
-        print("R@%d: %.4f" % (rank, recall), end=' ')
-        rank *= 10
-
-
-def evaluate(xq, gt, index, k=100, endl=True):
-    t0 = time.time()
-    D, I = index.search(xq, k)
-    t1 = time.time()
-    nq = xq.shape[0]
-    print("\t %8.4f ms per query, " % (
-        (t1 - t0) * 1000.0 / nq), end=' ')
-    rank = 1
-    while rank <= k:
-        recall = (I[:, :rank] == gt[:, :1]).sum() / float(nq)
-        print("R@%d: %.4f" % (rank, recall), end=' ')
-        rank *= 10
-    if endl:
-        print()
-    return D, I
diff --git a/benchs/link_and_code/neighbor_codec.py b/benchs/link_and_code/neighbor_codec.py
deleted file mode 100755
index 54cad8168a..0000000000
--- a/benchs/link_and_code/neighbor_codec.py
+++ /dev/null
@@ -1,241 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-
-"""
-This is the training code for the link and code. Especially the
-neighbors_kmeans function implements the EM-algorithm to find the
-appropriate weightings and cluster them.
-"""
-from __future__ import print_function
-
-import time
-import numpy as np
-import faiss
-
-#----------------------------------------------------------
-# Utils
-#----------------------------------------------------------
-
-def sanitize(x):
-    return np.ascontiguousarray(x, dtype='float32')
-
-
-def train_kmeans(x, k, ngpu, max_points_per_centroid=256):
-    "Runs kmeans on one or several GPUs"
-    d = x.shape[1]
-    clus = faiss.Clustering(d, k)
-    clus.verbose = True
-    clus.niter = 20
-    clus.max_points_per_centroid = max_points_per_centroid
-
-    if ngpu == 0:
-        index = faiss.IndexFlatL2(d)
-    else:
-        res = [faiss.StandardGpuResources() for i in range(ngpu)]
-
-        flat_config = []
-        for i in range(ngpu):
-            cfg = faiss.GpuIndexFlatConfig()
-            cfg.useFloat16 = False
-            cfg.device = i
-            flat_config.append(cfg)
-
-        if ngpu == 1:
-            index = faiss.GpuIndexFlatL2(res[0], d, flat_config[0])
-        else:
-            indexes = [faiss.GpuIndexFlatL2(res[i], d, flat_config[i])
-                       for i in range(ngpu)]
-            index = faiss.IndexReplicas()
-            for sub_index in indexes:
-                index.addIndex(sub_index)
-
-    # perform the training
-    clus.train(x, index)
-    centroids = faiss.vector_float_to_array(clus.centroids)
-
-    stats = clus.iteration_stats
-    stats = [stats.at(i) for i in range(stats.size())]
-    obj = np.array([st.obj for st in stats])
-    print("final objective: %.4g" % obj[-1])
-
-    return centroids.reshape(k, d)
-
-
-#----------------------------------------------------------
-# Learning the codebook from neighbors
-#----------------------------------------------------------
-
-
-# works with both a full Inn table and dynamically generated neighbors
-
-def get_Inn_shape(Inn):
-    if type(Inn) != tuple:
-        return Inn.shape
-    return Inn[:2]
-
-def get_neighbor_table(x_coded, Inn, i):
-    if type(Inn) != tuple:
-        return x_coded[Inn[i,:],:]
-    rfn = x_coded
-    M, d = rfn.M, rfn.index.d
-    out = np.zeros((M + 1, d), dtype='float32')
-    int_i = int(i)
-    rfn.get_neighbor_table(int_i, faiss.swig_ptr(out))
-    _, _, sq = Inn
-    return out[:, sq * rfn.dsub : (sq + 1) * rfn.dsub]
-
-
-# Function that produces the best regression values from the vector
-# and its neighbors
-def regress_from_neighbors (x, x_coded, Inn):
-    (N, knn) = get_Inn_shape(Inn)
-    betas = np.zeros((N,knn))
-    t0 = time.time()
-    for i in range (N):
-        xi = x[i,:]
-        NNi = get_neighbor_table(x_coded, Inn, i)
-        betas[i,:] = np.linalg.lstsq(NNi.transpose(), xi, rcond=0.01)[0]
-        if i % (N / 10) == 0:
-            print ("[%d:%d]  %6.3fs" % (i, i + N / 10, time.time() - t0))
-    return betas
-
-
-
-# find the best beta minimizing ||x-x_coded[Inn,:]*beta||^2
-def regress_opt_beta (x, x_coded, Inn):
-    (N, knn) = get_Inn_shape(Inn)
-    d = x.shape[1]
-
-    # construct the linear system to be solved
-    X = np.zeros ((d*N))
-    Y = np.zeros ((d*N, knn))
-    for i in range (N):
-        X[i*d:(i+1)*d] = x[i,:]
-        neighbor_table = get_neighbor_table(x_coded, Inn, i)
-        Y[i*d:(i+1)*d, :] = neighbor_table.transpose()
-    beta_opt = np.linalg.lstsq(Y, X, rcond=0.01)[0]
-    return beta_opt
-
-
-# Find the best encoding by minimizing the reconstruction error using
-# a set of pre-computed beta values
-def assign_beta (beta_centroids, x, x_coded, Inn, verbose=True):
-    if type(Inn) == tuple:
-        return assign_beta_2(beta_centroids, x, x_coded, Inn)
-    (N, knn) = Inn.shape
-    x_ibeta = np.zeros ((N), dtype='int32')
-    t0= time.time()
-    for i in range (N):
-        NNi = x_coded[Inn[i,:]]
-        # Consider all possible betas for the encoding and compute the
-        # encoding error
-        x_reg_all = np.dot (beta_centroids, NNi)
-        err = ((x_reg_all - x[i,:]) ** 2).sum(axis=1)
-        x_ibeta[i] = err.argmin()
-        if verbose:
-            if i % (N / 10) == 0:
-                print ("[%d:%d]  %6.3fs" % (i, i + N / 10, time.time() - t0))
-    return x_ibeta
-
-
-# Reconstruct a set of vectors using the beta_centroids, the
-# assignment, the encoded neighbors identified by the list Inn (which
-# includes the vector itself)
-def recons_from_neighbors (beta_centroids, x_ibeta, x_coded, Inn):
-    (N, knn) = Inn.shape
-    x_rec = np.zeros(x_coded.shape)
-    t0= time.time()
-    for i in range (N):
-        NNi = x_coded[Inn[i,:]]
-        x_rec[i, :] = np.dot (beta_centroids[x_ibeta[i]], NNi)
-        if i % (N / 10) == 0:
-            print ("[%d:%d]  %6.3fs" % (i, i + N / 10, time.time() - t0))
-    return x_rec
-
-
-# Compute a EM-like algorithm trying at optimizing the beta such as they
-# minimize the reconstruction error from the neighbors
-def neighbors_kmeans (x, x_coded, Inn, K, ngpus=1, niter=5):
-    # First compute centroids using a regular k-means algorithm
-    betas = regress_from_neighbors (x, x_coded, Inn)
-    beta_centroids = train_kmeans(
-        sanitize(betas), K, ngpus, max_points_per_centroid=1000000)
-    _, knn = get_Inn_shape(Inn)
-    d = x.shape[1]
-
-    rs = np.random.RandomState()
-    for iter in range(niter):
-        print('iter', iter)
-        idx = assign_beta (beta_centroids, x, x_coded, Inn, verbose=False)
-
-        hist = np.bincount(idx)
-        for cl0 in np.where(hist == 0)[0]:
-            print("  cluster %d empty, split" % cl0, end=' ')
-            cl1 = idx[np.random.randint(idx.size)]
-            pos = np.nonzero (idx == cl1)[0]
-            pos = rs.choice(pos, pos.size / 2)
-            print("   cl %d -> %d + %d" % (cl1, len(pos), hist[cl1] - len(pos)))
-            idx[pos] = cl0
-            hist = np.bincount(idx)
-
-        tot_err = 0
-        for k in range (K):
-            pos = np.nonzero (idx == k)[0]
-            npos = pos.shape[0]
-
-            X = np.zeros (d*npos)
-            Y = np.zeros ((d*npos, knn))
-
-            for i in range(npos):
-                X[i*d:(i+1)*d] = x[pos[i],:]
-                neighbor_table = get_neighbor_table(x_coded, Inn, pos[i])
-                Y[i*d:(i+1)*d, :] = neighbor_table.transpose()
-            sol, residuals, _, _ = np.linalg.lstsq(Y, X, rcond=0.01)
-            if residuals.size > 0:
-                tot_err += residuals.sum()
-            beta_centroids[k, :] = sol
-        print('  err=%g' % tot_err)
-    return beta_centroids
-
-
-# assign the betas in C++
-def assign_beta_2(beta_centroids, x, rfn, Inn):
-    _, _, sq = Inn
-    if rfn.k == 1:
-        return np.zeros(x.shape[0], dtype=int)
-    # add dummy dimensions to beta_centroids and x
-    all_beta_centroids = np.zeros(
-        (rfn.nsq, rfn.k, rfn.M + 1), dtype='float32')
-    all_beta_centroids[sq] = beta_centroids
-    all_x = np.zeros((len(x), rfn.d), dtype='float32')
-    all_x[:, sq * rfn.dsub : (sq + 1) * rfn.dsub] = x
-    rfn.codes.clear()
-    rfn.ntotal = 0
-    faiss.copy_array_to_vector(
-        all_beta_centroids.ravel(), rfn.codebook)
-    rfn.add_codes(len(x), faiss.swig_ptr(all_x))
-    codes = faiss.vector_to_array(rfn.codes)
-    codes = codes.reshape(-1, rfn.nsq)
-    return codes[:, sq]
-
-
-#######################################################
-# For usage from bench_storages.py
-
-def train_beta_codebook(rfn, xb_full, niter=10):
-    beta_centroids = []
-    for sq in range(rfn.nsq):
-        d0, d1 = sq * rfn.dsub, (sq + 1) * rfn.dsub
-        print("training subquantizer %d/%d on dimensions %d:%d" % (
-            sq, rfn.nsq, d0, d1))
-        beta_centroids_i = neighbors_kmeans(
-            xb_full[:, d0:d1], rfn, (xb_full.shape[0], rfn.M + 1, sq),
-            rfn.k,
-            ngpus=0, niter=niter)
-        beta_centroids.append(beta_centroids_i)
-        rfn.ntotal = 0
-        rfn.codes.clear()
-        rfn.codebook.clear()
-    return np.stack(beta_centroids)
diff --git a/c_api/IndexScalarQuantizer_c.h b/c_api/IndexScalarQuantizer_c.h
index 2c5e3f2942..87fe6d3415 100644
--- a/c_api/IndexScalarQuantizer_c.h
+++ b/c_api/IndexScalarQuantizer_c.h
@@ -26,6 +26,7 @@ typedef enum FaissQuantizerType {
     QT_fp16,
     QT_8bit_direct, ///< fast indexing of uint8s
     QT_6bit,        ///< 6 bits per component
+    QT_bf16,
 } FaissQuantizerType;
 
 // forward declaration
diff --git a/c_api/clone_index_c.cpp b/c_api/clone_index_c.cpp
index 8211156aaa..606e5f9b0a 100644
--- a/c_api/clone_index_c.cpp
+++ b/c_api/clone_index_c.cpp
@@ -14,6 +14,7 @@
 #include "macros_impl.h"
 
 using faiss::Index;
+using faiss::IndexBinary;
 
 int faiss_clone_index(const FaissIndex* idx, FaissIndex** p_out) {
     try {
@@ -22,3 +23,14 @@ int faiss_clone_index(const FaissIndex* idx, FaissIndex** p_out) {
     }
     CATCH_AND_HANDLE
 }
+
+int faiss_clone_index_binary(
+        const FaissIndexBinary* idx,
+        FaissIndexBinary** p_out) {
+    try {
+        auto out = faiss::clone_binary_index(
+                reinterpret_cast<const IndexBinary*>(idx));
+        *p_out = reinterpret_cast<FaissIndexBinary*>(out);
+    }
+    CATCH_AND_HANDLE
+}
\ No newline at end of file
diff --git a/c_api/clone_index_c.h b/c_api/clone_index_c.h
index 3d0bd6745f..d2da35b82f 100644
--- a/c_api/clone_index_c.h
+++ b/c_api/clone_index_c.h
@@ -13,6 +13,7 @@
 #define FAISS_CLONE_INDEX_C_H
 
 #include <stdio.h>
+#include "IndexBinary_c.h"
 #include "Index_c.h"
 #include "faiss_c.h"
 
@@ -25,6 +26,9 @@ extern "C" {
 /** Clone an index. This is equivalent to `faiss::clone_index` */
 int faiss_clone_index(const FaissIndex*, FaissIndex** p_out);
 
+/** Clone a binary index. This is equivalent to `faiss::clone_binary_index` */
+int faiss_clone_index_binary(const FaissIndexBinary*, FaissIndexBinary** p_out);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/c_api/index_factory_c.cpp b/c_api/index_factory_c.cpp
index e9abf141f8..3a1ab9bab9 100644
--- a/c_api/index_factory_c.cpp
+++ b/c_api/index_factory_c.cpp
@@ -15,7 +15,7 @@
 
 using faiss::Index;
 
-/** Build and index with the sequence of processing steps described in
+/** Build an index with the sequence of processing steps described in
  *  the string.
  */
 int faiss_index_factory(
@@ -29,3 +29,17 @@ int faiss_index_factory(
     }
     CATCH_AND_HANDLE
 }
+
+/** Build a binary index with the sequence of processing steps described in
+ *  the string.
+ */
+int faiss_index_binary_factory(
+        FaissIndexBinary** p_index,
+        int d,
+        const char* description) {
+    try {
+        *p_index = reinterpret_cast<FaissIndexBinary*>(
+                faiss::index_binary_factory(d, description));
+    }
+    CATCH_AND_HANDLE
+}
\ No newline at end of file
diff --git a/c_api/index_factory_c.h b/c_api/index_factory_c.h
index 11fb0faa16..ccd58ac778 100644
--- a/c_api/index_factory_c.h
+++ b/c_api/index_factory_c.h
@@ -11,6 +11,7 @@
 #ifndef FAISS_INDEX_FACTORY_C_H
 #define FAISS_INDEX_FACTORY_C_H
 
+#include "IndexBinary_c.h"
 #include "Index_c.h"
 #include "faiss_c.h"
 
@@ -18,7 +19,7 @@
 extern "C" {
 #endif
 
-/** Build and index with the sequence of processing steps described in
+/** Build an index with the sequence of processing steps described in
  *  the string.
  */
 int faiss_index_factory(
@@ -27,6 +28,14 @@ int faiss_index_factory(
         const char* description,
         FaissMetricType metric);
 
+/** Build a binary index with the sequence of processing steps described in
+ *  the string.
+ */
+int faiss_index_binary_factory(
+        FaissIndexBinary** p_index,
+        int d,
+        const char* description);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/cmake/thirdparty/fetch_rapids.cmake b/cmake/thirdparty/fetch_rapids.cmake
index 32ec15b6a4..3e0f6b6ac4 100644
--- a/cmake/thirdparty/fetch_rapids.cmake
+++ b/cmake/thirdparty/fetch_rapids.cmake
@@ -15,7 +15,7 @@
 # or implied. See the License for the specific language governing permissions and limitations under
 # the License.
 # =============================================================================
-set(RAPIDS_VERSION "23.12")
+set(RAPIDS_VERSION "24.02")
 
 if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/FAISS_RAPIDS.cmake)
     file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake
diff --git a/conda/conda_build_config.yaml b/conda/conda_build_config.yaml
index 77f0eec0a2..4df05146df 100644
--- a/conda/conda_build_config.yaml
+++ b/conda/conda_build_config.yaml
@@ -2,3 +2,4 @@ python:
   - 3.9
   - 3.10
   - 3.11
+  - 3.12  # [not aarch64]
diff --git a/conda/faiss-gpu-raft/build-lib.sh b/conda/faiss-gpu-raft/build-lib.sh
index 7ca17180a4..79ca8da2cd 100644
--- a/conda/faiss-gpu-raft/build-lib.sh
+++ b/conda/faiss-gpu-raft/build-lib.sh
@@ -7,11 +7,11 @@
 set -e
 
 
-# Build libfaiss.so/libfaiss_avx2.so.
+# Build libfaiss.so/libfaiss_avx2.so/libfaiss_avx512.so
 cmake -B _build \
       -DBUILD_SHARED_LIBS=ON \
       -DBUILD_TESTING=OFF \
-      -DFAISS_OPT_LEVEL=avx2 \
+      -DFAISS_OPT_LEVEL=avx512 \
       -DFAISS_ENABLE_GPU=ON \
       -DFAISS_ENABLE_RAFT=ON \
       -DCMAKE_CUDA_ARCHITECTURES="${CUDA_ARCHS}" \
@@ -20,7 +20,7 @@ cmake -B _build \
       -DCMAKE_INSTALL_LIBDIR=lib \
       -DCMAKE_BUILD_TYPE=Release .
 
-make -C _build -j$(nproc) faiss faiss_avx2
+make -C _build -j$(nproc) faiss faiss_avx2 faiss_avx512
 
 cmake --install _build --prefix $PREFIX
 cmake --install _build --prefix _libfaiss_stage/
diff --git a/conda/faiss-gpu-raft/build-pkg.sh b/conda/faiss-gpu-raft/build-pkg.sh
index 3bb61588e5..da5fdefca9 100644
--- a/conda/faiss-gpu-raft/build-pkg.sh
+++ b/conda/faiss-gpu-raft/build-pkg.sh
@@ -7,17 +7,17 @@
 set -e
 
 
-# Build swigfaiss.so/swigfaiss_avx2.so.
+# Build swigfaiss.so/swigfaiss_avx2.so/swigfaiss_avx512.so
 cmake -B _build_python_${PY_VER} \
       -Dfaiss_ROOT=_libfaiss_stage/ \
-      -DFAISS_OPT_LEVEL=avx2 \
+      -DFAISS_OPT_LEVEL=avx512 \
       -DFAISS_ENABLE_GPU=ON \
       -DFAISS_ENABLE_RAFT=ON \
       -DCMAKE_BUILD_TYPE=Release \
       -DPython_EXECUTABLE=$PYTHON \
       faiss/python
 
-make -C _build_python_${PY_VER} -j$(nproc) swigfaiss swigfaiss_avx2
+make -C _build_python_${PY_VER} -j$(nproc) swigfaiss swigfaiss_avx2 swigfaiss_avx512
 
 # Build actual python module.
 cd _build_python_${PY_VER}/
diff --git a/conda/faiss-gpu-raft/meta.yaml b/conda/faiss-gpu-raft/meta.yaml
index 387f8b4ac0..1dde8e9868 100644
--- a/conda/faiss-gpu-raft/meta.yaml
+++ b/conda/faiss-gpu-raft/meta.yaml
@@ -6,6 +6,13 @@
 {% set version = environ.get('GIT_DESCRIBE_TAG').lstrip('v') %}
 {% set suffix = "_nightly" if environ.get('PACKAGE_TYPE') == 'nightly' else "" %}
 {% set number = GIT_DESCRIBE_NUMBER %}
+{% if cudatoolkit == '11.8.0' %}
+{% set cuda_constraints=">=11.8,<12" %}
+{% set libcublas_constraints=">=11.11,<12" %}
+{% elif cudatoolkit == '12.1.1' %}
+{% set cuda_constraints=">=12.1,<13" %}
+{% set libcublas_constraints=">=12.1,<13" %}
+{% endif %}
 
 package:
   name: faiss-pkg
@@ -41,19 +48,26 @@ outputs:
         - {{ compiler('cxx') }}
         - sysroot_linux-64  # [linux64]
         - llvm-openmp  # [osx]
-        - cmake >=3.23.1
+        - cmake >=3.24.0
         - make  # [not win]
+        - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
+        - mkl =2023  # [x86_64]
         - mkl-devel =2023  # [x86_64]
+        - cuda-toolkit {{ cudatoolkit }}
       host:
+        - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
         - mkl =2023  # [x86_64]
         - openblas  # [not x86_64]
-        - cuda-version {{ cudatoolkit }}
-        - libraft =23.12
+        - libraft =24.06
+        - cuda-version {{ cuda_constraints }}
       run:
+        - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
         - mkl =2023  # [x86_64]
         - openblas  # [not x86_64]
-        - {{ pin_compatible('cuda-version', max_pin='x') }}
-        - libraft =23.12
+        - cuda-cudart {{ cuda_constraints }}
+        - libcublas {{ libcublas_constraints }}
+        - libraft =24.06
+        - cuda-version {{ cuda_constraints }}
     test:
       requires:
         - conda-build
@@ -75,23 +89,28 @@ outputs:
         - {{ compiler('cxx') }}
         - sysroot_linux-64 =2.17 # [linux64]
         - swig
-        - cmake >=3.23.1
+        - cmake >=3.24.0
         - make  # [not win]
+        - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
+        - mkl =2023  # [x86_64]
+        - cuda-toolkit {{ cudatoolkit }}
       host:
+        - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
         - python {{ python }}
         - numpy >=1.19,<2
         - {{ pin_subpackage('libfaiss', exact=True) }}
       run:
+        - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
         - python {{ python }}
         - numpy >=1.19,<2
+        - packaging
         - {{ pin_subpackage('libfaiss', exact=True) }}
     test:
       requires:
         - numpy
         - scipy
         - pytorch
-        - pytorch-cuda =11.8
-        - cuda-version =11.8
+        - pytorch-cuda {{ cuda_constraints }}
       commands:
         - python -X faulthandler -m unittest discover -v -s tests/ -p "test_*"
         - python -X faulthandler -m unittest discover -v -s tests/ -p "torch_*"
diff --git a/conda/faiss-gpu-raft/test_cpu_dispatch.sh b/conda/faiss-gpu-raft/test_cpu_dispatch.sh
index b2891919d5..a7c1b2da72 100755
--- a/conda/faiss-gpu-raft/test_cpu_dispatch.sh
+++ b/conda/faiss-gpu-raft/test_cpu_dispatch.sh
@@ -6,5 +6,6 @@
 
 set -e
 
-FAISS_DISABLE_CPU_FEATURES=AVX2 LD_DEBUG=libs python -c "import faiss" 2>&1 | grep libfaiss.so
-LD_DEBUG=libs python -c "import faiss" 2>&1 | grep libfaiss_avx2.so
+FAISS_OPT_LEVEL= LD_DEBUG=libs python -c "import faiss" 2>&1 | grep libfaiss.so
+FAISS_OPT_LEVEL=AVX2 LD_DEBUG=libs python -c "import faiss" 2>&1 | grep libfaiss_avx2.so
+FAISS_OPT_LEVEL=AVX512 LD_DEBUG=libs python -c "import faiss" 2>&1 | grep libfaiss_avx512.so
diff --git a/conda/faiss-gpu/build-lib.sh b/conda/faiss-gpu/build-lib.sh
index 6b6b1c28d0..9957be96ea 100755
--- a/conda/faiss-gpu/build-lib.sh
+++ b/conda/faiss-gpu/build-lib.sh
@@ -6,12 +6,18 @@
 
 set -e
 
+# Workaround for CUDA 11.4.4 builds. Copies all necessary headers into the include root (without overwriting existing files).
+if [ -n "$FAISS_FLATTEN_CONDA_INCLUDES" ] && [ "$FAISS_FLATTEN_CONDA_INCLUDES" = "1" ]; then
+  cp -r -n "$CONDA_PREFIX/x86_64-conda-linux-gnu/sysroot/usr/include/"* "$CONDA_PREFIX/include/"
+  cp -r -n "$CONDA_PREFIX/x86_64-conda-linux-gnu/include/c++/11.2.0/"* "$CONDA_PREFIX/include/"
+  cp -r -n "$CONDA_PREFIX/x86_64-conda-linux-gnu/include/c++/11.2.0/x86_64-conda-linux-gnu/"* "$CONDA_PREFIX/include/"
+fi
 
-# Build libfaiss.so/libfaiss_avx2.so.
+# Build libfaiss.so/libfaiss_avx2.so/libfaiss_avx512.so
 cmake -B _build \
       -DBUILD_SHARED_LIBS=ON \
       -DBUILD_TESTING=OFF \
-      -DFAISS_OPT_LEVEL=avx2 \
+      -DFAISS_OPT_LEVEL=avx512 \
       -DFAISS_ENABLE_GPU=ON \
       -DFAISS_ENABLE_RAFT=OFF \
       -DCMAKE_CUDA_ARCHITECTURES="${CUDA_ARCHS}" \
@@ -20,7 +26,7 @@ cmake -B _build \
       -DCMAKE_INSTALL_LIBDIR=lib \
       -DCMAKE_BUILD_TYPE=Release .
 
-make -C _build -j$(nproc) faiss faiss_avx2
+make -C _build -j$(nproc) faiss faiss_avx2 faiss_avx512
 
 cmake --install _build --prefix $PREFIX
 cmake --install _build --prefix _libfaiss_stage/
diff --git a/conda/faiss-gpu/build-pkg.sh b/conda/faiss-gpu/build-pkg.sh
index 3a41511921..e529a83d80 100755
--- a/conda/faiss-gpu/build-pkg.sh
+++ b/conda/faiss-gpu/build-pkg.sh
@@ -7,17 +7,17 @@
 set -e
 
 
-# Build swigfaiss.so/swigfaiss_avx2.so.
+# Build swigfaiss.so/swigfaiss_avx2.so/swigfaiss_avx512.so
 cmake -B _build_python_${PY_VER} \
       -Dfaiss_ROOT=_libfaiss_stage/ \
-      -DFAISS_OPT_LEVEL=avx2 \
+      -DFAISS_OPT_LEVEL=avx512 \
       -DFAISS_ENABLE_GPU=ON \
       -DFAISS_ENABLE_RAFT=OFF \
       -DCMAKE_BUILD_TYPE=Release \
       -DPython_EXECUTABLE=$PYTHON \
       faiss/python
 
-make -C _build_python_${PY_VER} -j$(nproc) swigfaiss swigfaiss_avx2
+make -C _build_python_${PY_VER} -j$(nproc) swigfaiss swigfaiss_avx2 swigfaiss_avx512
 
 # Build actual python module.
 cd _build_python_${PY_VER}/
diff --git a/conda/faiss-gpu/meta.yaml b/conda/faiss-gpu/meta.yaml
index e7b839975f..05f7b59008 100644
--- a/conda/faiss-gpu/meta.yaml
+++ b/conda/faiss-gpu/meta.yaml
@@ -6,6 +6,13 @@
 {% set version = environ.get('GIT_DESCRIBE_TAG').lstrip('v') %}
 {% set suffix = "_nightly" if environ.get('PACKAGE_TYPE') == 'nightly' else "" %}
 {% set number = GIT_DESCRIBE_NUMBER %}
+{% if cudatoolkit == '11.4.4' %}
+{% set cuda_constraints=">=11.4,<12" %}
+{% set libcublas_constraints=">=11.6,<12" %}
+{% elif cudatoolkit == '12.1.1' %}
+{% set cuda_constraints=">=12.1,<13" %}
+{% set libcublas_constraints=">=12.1,<13" %}
+{% endif %}
 
 package:
   name: faiss-pkg
@@ -36,22 +43,24 @@ outputs:
         - {{ pin_compatible('libfaiss', exact=True) }}
       script_env:
         - CUDA_ARCHS
+        - FAISS_FLATTEN_CONDA_INCLUDES
     requirements:
       build:
         - {{ compiler('cxx') }}
         - sysroot_linux-64  # [linux64]
         - llvm-openmp  # [osx]
-        - cmake >=3.23.1
+        - cmake >=3.24.0
         - make  # [not win]
         - mkl-devel =2023  # [x86_64]
+        - cuda-toolkit {{ cudatoolkit }}
       host:
         - mkl =2023  # [x86_64]
         - openblas  # [not x86_64]
-        - cudatoolkit {{ cudatoolkit }}
       run:
         - mkl =2023  # [x86_64]
         - openblas  # [not x86_64]
-        - {{ pin_compatible('cudatoolkit', max_pin='x') }}
+        - cuda-cudart {{ cuda_constraints }}
+        - libcublas {{ libcublas_constraints }}
     test:
       requires:
         - conda-build
@@ -73,8 +82,9 @@ outputs:
         - {{ compiler('cxx') }}
         - sysroot_linux-64 =2.17 # [linux64]
         - swig
-        - cmake >=3.23.1
+        - cmake >=3.24.0
         - make  # [not win]
+        - cuda-toolkit {{ cudatoolkit }}
       host:
         - python {{ python }}
         - numpy >=1.19,<2
@@ -82,14 +92,14 @@ outputs:
       run:
         - python {{ python }}
         - numpy >=1.19,<2
+        - packaging
         - {{ pin_subpackage('libfaiss', exact=True) }}
     test:
       requires:
         - numpy
         - scipy
         - pytorch
-        - pytorch-cuda =11.8
-        - cudatoolkit =11.8
+        - pytorch-cuda {{ cuda_constraints }}
       commands:
         - python -X faulthandler -m unittest discover -v -s tests/ -p "test_*"
         - python -X faulthandler -m unittest discover -v -s tests/ -p "torch_*"
diff --git a/conda/faiss-gpu/test_cpu_dispatch.sh b/conda/faiss-gpu/test_cpu_dispatch.sh
index b2891919d5..a7c1b2da72 100755
--- a/conda/faiss-gpu/test_cpu_dispatch.sh
+++ b/conda/faiss-gpu/test_cpu_dispatch.sh
@@ -6,5 +6,6 @@
 
 set -e
 
-FAISS_DISABLE_CPU_FEATURES=AVX2 LD_DEBUG=libs python -c "import faiss" 2>&1 | grep libfaiss.so
-LD_DEBUG=libs python -c "import faiss" 2>&1 | grep libfaiss_avx2.so
+FAISS_OPT_LEVEL= LD_DEBUG=libs python -c "import faiss" 2>&1 | grep libfaiss.so
+FAISS_OPT_LEVEL=AVX2 LD_DEBUG=libs python -c "import faiss" 2>&1 | grep libfaiss_avx2.so
+FAISS_OPT_LEVEL=AVX512 LD_DEBUG=libs python -c "import faiss" 2>&1 | grep libfaiss_avx512.so
diff --git a/conda/faiss/build-lib-osx.sh b/conda/faiss/build-lib-osx.sh
index a30de2d000..e858106bf2 100755
--- a/conda/faiss/build-lib-osx.sh
+++ b/conda/faiss/build-lib-osx.sh
@@ -7,11 +7,11 @@
 set -e
 
 
-# Build libfaiss.so/libfaiss_avx2.so.
+# Build libfaiss.so/libfaiss_avx2.so/libfaiss_avx512.so
 cmake -B _build \
       -DBUILD_SHARED_LIBS=ON \
       -DBUILD_TESTING=OFF \
-      -DFAISS_OPT_LEVEL=avx2 \
+      -DFAISS_OPT_LEVEL=avx512 \
       -DFAISS_ENABLE_GPU=OFF \
       -DFAISS_ENABLE_PYTHON=OFF \
       -DBLA_VENDOR=Intel10_64lp \
@@ -21,7 +21,7 @@ cmake -B _build \
       -DCMAKE_INSTALL_LIBDIR=lib \
       -DCMAKE_BUILD_TYPE=Release .
 
-make -C _build -j$(nproc) faiss faiss_avx2
+make -C _build -j$(nproc) faiss faiss_avx2 faiss_avx512
 
 cmake --install _build --prefix $PREFIX
 cmake --install _build --prefix _libfaiss_stage/
diff --git a/conda/faiss/build-lib.sh b/conda/faiss/build-lib.sh
index 8aed84ba41..5028891d93 100755
--- a/conda/faiss/build-lib.sh
+++ b/conda/faiss/build-lib.sh
@@ -7,18 +7,18 @@
 set -e
 
 
-# Build libfaiss.so/libfaiss_avx2.so.
+# Build libfaiss.so/libfaiss_avx2.so/libfaiss_avx512.so
 cmake -B _build \
       -DBUILD_SHARED_LIBS=ON \
       -DBUILD_TESTING=OFF \
-      -DFAISS_OPT_LEVEL=avx2 \
+      -DFAISS_OPT_LEVEL=avx512 \
       -DFAISS_ENABLE_GPU=OFF \
       -DFAISS_ENABLE_PYTHON=OFF \
       -DBLA_VENDOR=Intel10_64lp \
       -DCMAKE_INSTALL_LIBDIR=lib \
       -DCMAKE_BUILD_TYPE=Release .
 
-make -C _build -j$(nproc) faiss faiss_avx2
+make -C _build -j$(nproc) faiss faiss_avx2 faiss_avx512
 
 cmake --install _build --prefix $PREFIX
 cmake --install _build --prefix _libfaiss_stage/
diff --git a/conda/faiss/build-pkg-arm64.sh b/conda/faiss/build-pkg-arm64.sh
index c63380ab01..70fc7312e5 100755
--- a/conda/faiss/build-pkg-arm64.sh
+++ b/conda/faiss/build-pkg-arm64.sh
@@ -7,7 +7,7 @@
 set -e
 
 
-# Build swigfaiss.so/swigfaiss_avx2.so.
+# Build swigfaiss.so
 cmake -B _build_python_${PY_VER} \
       -Dfaiss_ROOT=_libfaiss_stage/ \
       -DFAISS_ENABLE_GPU=OFF \
diff --git a/conda/faiss/build-pkg-osx.sh b/conda/faiss/build-pkg-osx.sh
index 15016face9..914aed174d 100755
--- a/conda/faiss/build-pkg-osx.sh
+++ b/conda/faiss/build-pkg-osx.sh
@@ -7,10 +7,10 @@
 set -e
 
 
-# Build swigfaiss.so/swigfaiss_avx2.so.
+# Build swigfaiss.so/swigfaiss_avx2.so/swigfaiss_avx512.so
 cmake -B _build_python_${PY_VER} \
       -Dfaiss_ROOT=_libfaiss_stage/ \
-      -DFAISS_OPT_LEVEL=avx2 \
+      -DFAISS_OPT_LEVEL=avx512 \
       -DFAISS_ENABLE_GPU=OFF \
       -DOpenMP_CXX_FLAGS=-fopenmp=libiomp5 \
       -DOpenMP_CXX_LIB_NAMES=libiomp5 \
@@ -19,7 +19,7 @@ cmake -B _build_python_${PY_VER} \
       -DPython_EXECUTABLE=$PYTHON \
       faiss/python
 
-make -C _build_python_${PY_VER} -j$(nproc) swigfaiss swigfaiss_avx2
+make -C _build_python_${PY_VER} -j$(nproc) swigfaiss swigfaiss_avx2 swigfaiss_avx512
 
 # Build actual python module.
 cd _build_python_${PY_VER}/
diff --git a/conda/faiss/build-pkg.sh b/conda/faiss/build-pkg.sh
index 005aec2fcc..92e0febfa8 100755
--- a/conda/faiss/build-pkg.sh
+++ b/conda/faiss/build-pkg.sh
@@ -7,16 +7,16 @@
 set -e
 
 
-# Build swigfaiss.so/swigfaiss_avx2.so.
+# Build swigfaiss.so/swigfaiss_avx2.so/swigfaiss_avx512.so
 cmake -B _build_python_${PY_VER} \
       -Dfaiss_ROOT=_libfaiss_stage/ \
-      -DFAISS_OPT_LEVEL=avx2 \
+      -DFAISS_OPT_LEVEL=avx512 \
       -DFAISS_ENABLE_GPU=OFF \
       -DCMAKE_BUILD_TYPE=Release \
       -DPython_EXECUTABLE=$PYTHON \
       faiss/python
 
-make -C _build_python_${PY_VER} -j$(nproc) swigfaiss swigfaiss_avx2
+make -C _build_python_${PY_VER} -j$(nproc) swigfaiss swigfaiss_avx2 swigfaiss_avx512
 
 # Build actual python module.
 cd _build_python_${PY_VER}/
diff --git a/conda/faiss/meta.yaml b/conda/faiss/meta.yaml
index a0431a4041..79e7be953e 100644
--- a/conda/faiss/meta.yaml
+++ b/conda/faiss/meta.yaml
@@ -39,7 +39,7 @@ outputs:
         - {{ compiler('cxx') }}
         - sysroot_linux-64  # [linux64]
         - llvm-openmp  # [osx]
-        - cmake >=3.23.1
+        - cmake >=3.24.0
         - make  # [not win]
         - mkl-devel =2023  # [x86_64]
       host:
@@ -69,7 +69,7 @@ outputs:
         - {{ compiler('cxx') }}
         - sysroot_linux-64 =2.17 # [linux64]
         - swig
-        - cmake >=3.23.1
+        - cmake >=3.24.0
         - make  # [not win]
       host:
         - python {{ python }}
@@ -78,6 +78,7 @@ outputs:
       run:
         - python {{ python }}
         - numpy >=1.19,<2
+        - packaging
         - {{ pin_subpackage('libfaiss', exact=True) }}
     test:
       requires:
diff --git a/conda/faiss/test_cpu_dispatch.sh b/conda/faiss/test_cpu_dispatch.sh
index b2891919d5..a7c1b2da72 100755
--- a/conda/faiss/test_cpu_dispatch.sh
+++ b/conda/faiss/test_cpu_dispatch.sh
@@ -6,5 +6,6 @@
 
 set -e
 
-FAISS_DISABLE_CPU_FEATURES=AVX2 LD_DEBUG=libs python -c "import faiss" 2>&1 | grep libfaiss.so
-LD_DEBUG=libs python -c "import faiss" 2>&1 | grep libfaiss_avx2.so
+FAISS_OPT_LEVEL= LD_DEBUG=libs python -c "import faiss" 2>&1 | grep libfaiss.so
+FAISS_OPT_LEVEL=AVX2 LD_DEBUG=libs python -c "import faiss" 2>&1 | grep libfaiss_avx2.so
+FAISS_OPT_LEVEL=AVX512 LD_DEBUG=libs python -c "import faiss" 2>&1 | grep libfaiss_avx512.so
diff --git a/contrib/big_batch_search.py b/contrib/big_batch_search.py
index 6b0fd36e91..440a538c15 100644
--- a/contrib/big_batch_search.py
+++ b/contrib/big_batch_search.py
@@ -6,6 +6,7 @@
 import time
 import pickle
 import os
+import logging
 from multiprocessing.pool import ThreadPool
 import threading
 import _thread
@@ -41,7 +42,7 @@ def __init__(
         self.use_float16 = use_float16
         keep_max = faiss.is_similarity_metric(index.metric_type)
         self.rh = faiss.ResultHeap(len(xq), k, keep_max=keep_max)
-        self.t_accu = [0] * 5
+        self.t_accu = [0] * 6
         self.t_display = self.t0 = time.time()
 
     def start_t_accu(self):
@@ -74,11 +75,12 @@ def report(self, l):
             f"[{t:.1f} s] list {l}/{self.index.nlist} "
             f"times prep q {self.t_accu[0]:.3f} prep b {self.t_accu[1]:.3f} "
             f"comp {self.t_accu[2]:.3f} res {self.t_accu[3]:.3f} "
-            f"wait {self.t_accu[4]:.3f} "
+            f"wait in {self.t_accu[4]:.3f} "
+            f"wait out {self.t_accu[5]:.3f} "
             f"eta {datetime.timedelta(seconds=t*self.index.nlist/(l+1)-t)} "
             f"mem {faiss.get_mem_usage_kb()}",
-            end="\r" if self.verbose <= 2 else "\n",
-            flush=True,
+             end="\r" if self.verbose <= 2 else "\n",
+             flush=True,
         )
         self.t_display = time.time()
 
@@ -293,7 +295,7 @@ def big_batch_search(
     )
     mem_tot = mem_queries + mem_assign + mem_res
     if verbose > 0:
-        print(
+        logging.info(
             f"memory: queries {mem_queries} assign {mem_assign} "
             f"result {mem_res} total {mem_tot} = {mem_tot / (1<<30):.3f} GiB"
         )
@@ -312,8 +314,8 @@ def big_batch_search(
     )
 
     bbs.decode_func = comp.decode_func
-    bbs.by_residual = comp.by_residual
 
+    bbs.by_residual = comp.by_residual
     if q_assign is None:
         bbs.coarse_quantization()
     else:
@@ -327,11 +329,11 @@ def big_batch_search(
     if checkpoint is not None:
         assert (start_list, end_list) == (0, index.nlist)
         if os.path.exists(checkpoint):
-            print("recovering checkpoint", checkpoint)
+            logging.info(f"recovering checkpoint: {checkpoint}")
             completed = bbs.read_checkpoint(checkpoint)
-            print("   already completed", len(completed))
+            logging.info(f"   already completed: {len(completed)}")
         else:
-            print("no checkpoint: starting from scratch")
+            logging.info("no checkpoint: starting from scratch")
 
     if threaded == 0:
         # simple sequential version
@@ -414,10 +416,10 @@ def task_manager(*args):
 
         def prepare_task(task_id, output_queue, input_queue=None):
             try:
-                # print(f"Prepare start: {task_id}")
+                logging.info(f"Prepare start: {task_id}")
                 q_subset, xq_l, list_ids, xb_l = bbs.prepare_bucket(task_id)
                 output_queue.put((task_id, q_subset, xq_l, list_ids, xb_l))
-                # print(f"Prepare end: {task_id}")
+                logging.info(f"Prepare end: {task_id}")
             except:
                 traceback.print_exc()
                 _thread.interrupt_main()
@@ -425,18 +427,19 @@ def prepare_task(task_id, output_queue, input_queue=None):
 
         def compute_task(task_id, output_queue, input_queue):
             try:
-                # print(f"Compute start: {task_id}")
-                t_wait = 0
+                logging.info(f"Compute start: {task_id}")
+                t_wait_out = 0
                 while True:
                     t0 = time.time()
+                    logging.info(f'Compute input: task {task_id}')
                     input_value = input_queue.get()
-                    t_wait += time.time() - t0
+                    t_wait_in = time.time() - t0
                     if input_value is None:
                         # signal for other compute tasks
                         input_queue.put(None)
                         break
                     centroid, q_subset, xq_l, list_ids, xb_l = input_value
-                    # print(f'Compute work start: task {task_id}, centroid {centroid}')
+                    logging.info(f'Compute work: task {task_id}, centroid {centroid}')
                     t0 = time.time()
                     if computation_threads > 1:
                         D, I = comp.block_search(
@@ -445,13 +448,13 @@ def compute_task(task_id, output_queue, input_queue):
                     else:
                         D, I = comp.block_search(xq_l, xb_l, list_ids, k)
                     t_compute = time.time() - t0
-                    # print(f'Compute work end: task {task_id}, centroid {centroid}')
+                    logging.info(f'Compute output: task {task_id}, centroid {centroid}')
                     t0 = time.time()
                     output_queue.put(
-                        (centroid, t_wait, t_compute, q_subset, D, list_ids, I)
+                        (centroid, t_wait_in, t_wait_out, t_compute, q_subset, D, list_ids, I)
                     )
-                    t_wait = time.time() - t0
-                # print(f"Compute end: {task_id}")
+                    t_wait_out = time.time() - t0
+                logging.info(f"Compute end: {task_id}")
             except:
                 traceback.print_exc()
                 _thread.interrupt_main()
@@ -480,21 +483,25 @@ def compute_task(task_id, output_queue, input_queue):
 
         t_checkpoint = time.time()
         while True:
+            logging.info("Waiting for result")
             value = compute_to_main_queue.get()
             if not value:
                 break
-            centroid, t_wait, t_compute, q_subset, D, list_ids, I = value
+            centroid, t_wait_in, t_wait_out, t_compute, q_subset, D, list_ids, I = value
             # to test checkpointing
             if centroid == crash_at:
                 1 / 0
             bbs.t_accu[2] += t_compute
-            bbs.t_accu[4] += t_wait
+            bbs.t_accu[4] += t_wait_in
+            bbs.t_accu[5] += t_wait_out
+            logging.info(f"Adding to heap start: centroid {centroid}")
             bbs.add_results_to_heap(q_subset, D, list_ids, I)
+            logging.info(f"Adding to heap end: centroid {centroid}")
             completed.add(centroid)
             bbs.report(centroid)
             if checkpoint is not None:
                 if time.time() - t_checkpoint > checkpoint_freq:
-                    print("writing checkpoint")
+                    logging.info("writing checkpoint")
                     bbs.write_checkpoint(checkpoint, completed)
                     t_checkpoint = time.time()
 
diff --git a/contrib/datasets.py b/contrib/datasets.py
index f37a2fb6e4..281f16e2fa 100644
--- a/contrib/datasets.py
+++ b/contrib/datasets.py
@@ -6,6 +6,8 @@
 import os
 import numpy as np
 import faiss
+import getpass
+
 
 from .vecs_io import fvecs_read, ivecs_read, bvecs_mmap, fvecs_mmap
 from .exhaustive_search import knn
@@ -115,10 +117,12 @@ def get_groundtruth(self, k=100):
 # that directory is
 ############################################################################
 
+username = getpass.getuser()
 
 for dataset_basedir in (
         '/datasets01/simsearch/041218/',
-        '/mnt/vol/gfsai-flash3-east/ai-group/datasets/simsearch/'):
+        '/mnt/vol/gfsai-flash3-east/ai-group/datasets/simsearch/',
+        f'/home/{username}/simsearch/data/'):
     if os.path.exists(dataset_basedir):
         break
 else:
diff --git a/contrib/evaluation.py b/contrib/evaluation.py
index 50e8a93319..435c390594 100644
--- a/contrib/evaluation.py
+++ b/contrib/evaluation.py
@@ -261,6 +261,7 @@ def check_ref_knn_with_draws(Dref, Iref, Dnew, Inew, rtol=1e-5):
             mask = DrefC == dis
             testcase.assertEqual(set(Iref[i, mask]), set(Inew[i, mask]))
 
+
 def check_ref_range_results(Lref, Dref, Iref,
                             Lnew, Dnew, Inew):
     """ compare range search results wrt. a reference result,
diff --git a/contrib/factory_tools.py b/contrib/factory_tools.py
index da90e986f8..cfad7c7b5c 100644
--- a/contrib/factory_tools.py
+++ b/contrib/factory_tools.py
@@ -56,6 +56,8 @@ def get_code_size(d, indexkey):
         return (d * 6 + 7) // 8
     elif indexkey == 'SQfp16':
         return d * 2
+    elif indexkey == 'SQbf16':
+        return d * 2
 
     mo = re.match('PCAR?(\\d+),(.*)$', indexkey)
     if mo:
@@ -101,12 +103,23 @@ def reverse_index_factory(index):
             return prefix + ",SQ8"
         if isinstance(index, faiss.IndexIVFPQ):
             return prefix + f",PQ{index.pq.M}x{index.pq.nbits}"
+        if isinstance(index, faiss.IndexIVFPQFastScan):
+            return prefix + f",PQ{index.pq.M}x{index.pq.nbits}fs"
 
     elif isinstance(index, faiss.IndexPreTransform):
-        assert index.chain.size() == 1
+        if index.chain.size() != 1:
+            raise NotImplementedError()
         vt = faiss.downcast_VectorTransform(index.chain.at(0))
         if isinstance(vt, faiss.OPQMatrix):
-            return f"OPQ{vt.M}_{vt.d_out},{reverse_index_factory(index.index)}"
+            prefix = f"OPQ{vt.M}_{vt.d_out}"
+        elif isinstance(vt, faiss.ITQTransform):
+            prefix = f"ITQ{vt.itq.d_out}"
+        elif isinstance(vt, faiss.PCAMatrix):
+            assert vt.eigen_power == 0
+            prefix = "PCA" + ("R" if vt.random_rotation else "") + str(vt.d_out)
+        else:
+            raise NotImplementedError()
+        return f"{prefix},{reverse_index_factory(index.index)}"
 
     elif isinstance(index, faiss.IndexHNSW):
         return f"HNSW{get_hnsw_M(index)}"
@@ -117,12 +130,19 @@ def reverse_index_factory(index):
     elif isinstance(index, faiss.IndexPQFastScan):
         return f"PQ{index.pq.M}x{index.pq.nbits}fs"
 
+    elif isinstance(index, faiss.IndexPQ):
+        return f"PQ{index.pq.M}x{index.pq.nbits}"
+
+    elif isinstance(index, faiss.IndexLSH):
+        return "LSH" + ("r" if index.rotate_data else "") + ("t" if index.train_thresholds else "")
+
     elif isinstance(index, faiss.IndexScalarQuantizer):
         sqtypes = {
             faiss.ScalarQuantizer.QT_8bit: "8",
             faiss.ScalarQuantizer.QT_4bit: "4",
             faiss.ScalarQuantizer.QT_6bit: "6",
             faiss.ScalarQuantizer.QT_fp16: "fp16",
+            faiss.ScalarQuantizer.QT_bf16: "bf16",
         }
         return f"SQ{sqtypes[index.sq.qtype]}"
 
diff --git a/contrib/ivf_tools.py b/contrib/ivf_tools.py
index 26ada886a1..1c10eb0386 100644
--- a/contrib/ivf_tools.py
+++ b/contrib/ivf_tools.py
@@ -32,6 +32,11 @@ def search_preassigned(index_ivf, xq, k, list_nos, coarse_dis=None):
     Supports indexes with pretransforms (as opposed to the
     IndexIVF.search_preassigned, that cannot be applied with pretransform).
     """
+    if isinstance(index_ivf, faiss.IndexPreTransform):
+        assert index_ivf.chain.size() == 1, "chain must have only one component"
+        transform = faiss.downcast_VectorTransform(index_ivf.chain.at(0))
+        xq = transform.apply(xq)
+        index_ivf = faiss.downcast_index(index_ivf.index)
     n, d = xq.shape
     if isinstance(index_ivf, faiss.IndexBinaryIVF):
         d *= 8
diff --git a/contrib/ondisk.py b/contrib/ondisk.py
index 26a95f44f5..81ec71941c 100644
--- a/contrib/ondisk.py
+++ b/contrib/ondisk.py
@@ -11,7 +11,7 @@
 
 
 def merge_ondisk(
-    trained_index: faiss.Index, shard_fnames: List[str], ivfdata_fname: str
+    trained_index: faiss.Index, shard_fnames: List[str], ivfdata_fname: str, shift_ids=False
 ) -> None:
     """Add the contents of the indexes stored in shard_fnames into the index
     trained_index. The on-disk data is stored in ivfdata_fname"""
@@ -51,7 +51,7 @@ def merge_ondisk(
         ivf_vector.push_back(ivf)
 
     LOG.info("merge %d inverted lists " % ivf_vector.size())
-    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())
+    ntotal = invlists.merge_from_multiple(ivf_vector.data(), ivf_vector.size(), shift_ids)
 
     # now replace the inverted lists in the output index
     index.ntotal = index_ivf.ntotal = ntotal
diff --git a/contrib/rpc.py b/contrib/rpc.py
index cf89862260..e7145b9815 100755
--- a/contrib/rpc.py
+++ b/contrib/rpc.py
@@ -7,9 +7,12 @@
 Simplistic RPC implementation.
 Exposes all functions of a Server object.
 
-Uses pickle for serialization and the socket interface.
+This code is for demonstration purposes only, and does not include certain
+security protections. It is not meant to be run on an untrusted network or
+in a production environment.
 """
 
+import importlib
 import os
 import pickle
 import sys
@@ -23,22 +26,21 @@
 # default
 PORT = 12032
 
+safe_modules = {
+    'numpy',
+    'numpy.core.multiarray',
+}
 
-#########################################################################
-# simple I/O functions
 
+class RestrictedUnpickler(pickle.Unpickler):
 
-def inline_send_handle(f, conn):
-    st = os.fstat(f.fileno())
-    size = st.st_size
-    pickle.dump(size, conn)
-    conn.write(f.read(size))
-
-
-def inline_send_string(s, conn):
-    size = len(s)
-    pickle.dump(size, conn)
-    conn.write(s)
+    def find_class(self, module, name):
+        # Only allow safe modules.
+        if module in safe_modules:
+            return getattr(importlib.import_module(module), name)
+        # Forbid everything else.
+        raise pickle.UnpicklingError("global '%s.%s' is forbidden" %
+                                     (module, name))
 
 
 class FileSock:
@@ -123,7 +125,7 @@ def one_function(self):
         """
 
         try:
-            (fname,args)=pickle.load(self.fs)
+            (fname, args) = RestrictedUnpickler(self.fs).load()
         except EOFError:
             raise ClientExit("read args")
         self.log("executing method %s"%(fname))
@@ -214,7 +216,7 @@ def generic_fun(self, fname, args):
         return self.get_result()
 
     def get_result(self):
-        (st, ret) = pickle.load(self.fs)
+        (st, ret) = RestrictedUnpickler(self.fs).load()
         if st!=None:
             raise ServerException(st)
         else:
diff --git a/contrib/torch_utils.py b/contrib/torch_utils.py
index 790c295e48..18f136e914 100644
--- a/contrib/torch_utils.py
+++ b/contrib/torch_utils.py
@@ -33,7 +33,7 @@ def swig_ptr_from_UInt8Tensor(x):
     assert x.is_contiguous()
     assert x.dtype == torch.uint8
     return faiss.cast_integer_to_uint8_ptr(
-        x.storage().data_ptr() + x.storage_offset())
+        x.untyped_storage().data_ptr() + x.storage_offset())
 
 def swig_ptr_from_HalfTensor(x):
     """ gets a Faiss SWIG pointer from a pytorch tensor (on CPU or GPU) """
@@ -41,28 +41,28 @@ def swig_ptr_from_HalfTensor(x):
     assert x.dtype == torch.float16
     # no canonical half type in C/C++
     return faiss.cast_integer_to_void_ptr(
-        x.storage().data_ptr() + x.storage_offset() * 2)
+        x.untyped_storage().data_ptr() + x.storage_offset() * 2)
 
 def swig_ptr_from_FloatTensor(x):
     """ gets a Faiss SWIG pointer from a pytorch tensor (on CPU or GPU) """
     assert x.is_contiguous()
     assert x.dtype == torch.float32
     return faiss.cast_integer_to_float_ptr(
-        x.storage().data_ptr() + x.storage_offset() * 4)
+        x.untyped_storage().data_ptr() + x.storage_offset() * 4)
 
 def swig_ptr_from_IntTensor(x):
     """ gets a Faiss SWIG pointer from a pytorch tensor (on CPU or GPU) """
     assert x.is_contiguous()
     assert x.dtype == torch.int32, 'dtype=%s' % x.dtype
     return faiss.cast_integer_to_int_ptr(
-        x.storage().data_ptr() + x.storage_offset() * 4)
+        x.untyped_storage().data_ptr() + x.storage_offset() * 4)
 
 def swig_ptr_from_IndicesTensor(x):
     """ gets a Faiss SWIG pointer from a pytorch tensor (on CPU or GPU) """
     assert x.is_contiguous()
     assert x.dtype == torch.int64, 'dtype=%s' % x.dtype
     return faiss.cast_integer_to_idx_t_ptr(
-        x.storage().data_ptr() + x.storage_offset() * 8)
+        x.untyped_storage().data_ptr() + x.storage_offset() * 8)
 
 @contextlib.contextmanager
 def using_stream(res, pytorch_stream=None):
@@ -492,8 +492,9 @@ def torch_replacement_sa_decode(self, codes, x=None):
         if issubclass(the_class, faiss.Index):
             handle_torch_Index(the_class)
 
+
 # allows torch tensor usage with bfKnn
-def torch_replacement_knn_gpu(res, xq, xb, k, D=None, I=None, metric=faiss.METRIC_L2, device=-1):
+def torch_replacement_knn_gpu(res, xq, xb, k, D=None, I=None, metric=faiss.METRIC_L2, device=-1, use_raft=False):
     if type(xb) is np.ndarray:
         # Forward to faiss __init__.py base method
         return faiss.knn_gpu_numpy(res, xq, xb, k, D, I, metric, device)
@@ -574,6 +575,7 @@ def torch_replacement_knn_gpu(res, xq, xb, k, D=None, I=None, metric=faiss.METRI
     args.outIndices = I_ptr
     args.outIndicesType = I_type
     args.device = device
+    args.use_raft = use_raft
 
     with using_stream(res):
         faiss.bfKnn(res, args)
diff --git a/contrib/vecs_io.py b/contrib/vecs_io.py
index ea75d5f94d..9ef9e0ab64 100644
--- a/contrib/vecs_io.py
+++ b/contrib/vecs_io.py
@@ -3,6 +3,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
+import sys
 import numpy as np
 
 """
@@ -13,6 +14,8 @@
 
 def ivecs_read(fname):
     a = np.fromfile(fname, dtype='int32')
+    if sys.byteorder == 'big':
+        a.byteswap(inplace=True)
     d = a[0]
     return a.reshape(-1, d + 1)[:, 1:].copy()
 
@@ -22,6 +25,7 @@ def fvecs_read(fname):
 
 
 def ivecs_mmap(fname):
+    assert sys.byteorder != 'big'
     a = np.memmap(fname, dtype='int32', mode='r')
     d = a[0]
     return a.reshape(-1, d + 1)[:, 1:]
@@ -33,7 +37,11 @@ def fvecs_mmap(fname):
 
 def bvecs_mmap(fname):
     x = np.memmap(fname, dtype='uint8', mode='r')
-    d = x[:4].view('int32')[0]
+    if sys.byteorder == 'big':
+        da = x[:4][::-1].copy()
+        d = da.view('int32')[0]
+    else:
+        d = x[:4].view('int32')[0]
     return x.reshape(-1, d + 4)[:, 4:]
 
 
@@ -42,6 +50,8 @@ def ivecs_write(fname, m):
     m1 = np.empty((n, d + 1), dtype='int32')
     m1[:, 0] = d
     m1[:, 1:] = m
+    if sys.byteorder == 'big':
+        m1.byteswap(inplace=True)
     m1.tofile(fname)
 
 
diff --git a/demos/demo_imi_pq.cpp b/demos/demo_imi_pq.cpp
index a2af65e792..4fab0778d8 100644
--- a/demos/demo_imi_pq.cpp
+++ b/demos/demo_imi_pq.cpp
@@ -77,7 +77,6 @@ int main() {
     // the coarse quantizer should not be dealloced before the index
     // 4 = nb of bytes per code (d must be a multiple of this)
     // 8 = nb of bits per sub-code (almost always 8)
-    faiss::MetricType metric = faiss::METRIC_L2; // can be METRIC_INNER_PRODUCT
     faiss::IndexIVFPQ index(
             &coarse_quantizer, d, ncentroids, bytes_per_code, 8);
     index.quantizer_trains_alone = true;
diff --git a/demos/offline_ivf/README.md b/demos/offline_ivf/README.md
new file mode 100644
index 0000000000..df848ba0ab
--- /dev/null
+++ b/demos/offline_ivf/README.md
@@ -0,0 +1,52 @@
+
+# Offline IVF
+
+This folder contains the code for the offline ivf algorithm powered by faiss big batch search.
+
+Create a conda env:
+
+`conda create --name oivf python=3.10`
+
+`conda activate oivf`
+
+`conda install -c pytorch/label/nightly -c nvidia faiss-gpu=1.7.4`
+
+`conda install tqdm`
+
+`conda install pyyaml`
+
+`conda install -c conda-forge submitit`
+
+
+## Run book
+
+1. Optionally shard your dataset (see `create_sharded_ssnpp_files.py`) and create the corresponding yaml file `config_ssnpp.yaml`. You can use `generate_config.py` by specifying the root directory of your dataset and the files with the data shards.
+
+`python generate_config.py`
+
+2. Run the train index command
+
+`python run.py --command train_index --config config_ssnpp.yaml --xb ssnpp_1B`
+
+
+3. Run the index-shard command so it produces sharded indexes, required for the search step
+
+`python run.py --command index_shard --config config_ssnpp.yaml --xb ssnpp_1B`
+
+
+4. Send jobs to the cluster to run search
+
+`python run.py  --command search --config config_ssnpp.yaml --xb ssnpp_1B  --cluster_run --partition <PARTITION-NAME>`
+
+
+Remarks about the `search` command: by default, the database vectors are also used as the query vectors when performing the search step.
+a. If the query vectors are different from the database vectors, they should be passed via the `--xq` argument.
+b. A new dataset needs to be prepared (step 1) before passing it to the query vectors argument `--xq`.
+
+`python run.py --command search --config config_ssnpp.yaml --xb ssnpp_1B --xq <QUERIES_DATASET_NAME>`
+
+
+5. We can always run the consistency-check for sanity checks!
+
+`python run.py --command consistency_check --config config_ssnpp.yaml --xb ssnpp_1B`
+
diff --git a/demos/offline_ivf/__init__.py b/demos/offline_ivf/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/demos/offline_ivf/config_ssnpp.yaml b/demos/offline_ivf/config_ssnpp.yaml
new file mode 100644
index 0000000000..88e0394155
--- /dev/null
+++ b/demos/offline_ivf/config_ssnpp.yaml
@@ -0,0 +1,110 @@
+d: 256
+output: /checkpoint/marialomeli/offline_faiss/ssnpp
+index:
+  prod:
+  - 'IVF8192,PQ128'
+  non-prod:
+  - 'IVF16384,PQ128'
+  - 'IVF32768,PQ128'
+  - 'OPQ64_128,IVF4096,PQ64'
+nprobe:
+  prod:
+    - 512
+  non-prod:
+    - 256
+    - 128
+    - 1024
+    - 2048
+    - 4096
+    - 8192
+
+k: 50
+index_shard_size: 50000000
+query_batch_size: 50000000
+evaluation_sample: 10000
+training_sample: 1572864
+datasets:
+  ssnpp_1B:
+    root: /checkpoint/marialomeli/ssnpp_data
+    size: 1000000000
+    files:
+    - dtype: uint8
+      format: npy
+      name: ssnpp_0000000000.npy
+      size: 50000000
+    - dtype: uint8
+      format: npy
+      name: ssnpp_0000000001.npy
+      size: 50000000
+    - dtype: uint8
+      format: npy
+      name: ssnpp_0000000002.npy
+      size: 50000000
+    - dtype: uint8
+      format: npy
+      name: ssnpp_0000000003.npy
+      size: 50000000
+    - dtype: uint8
+      format: npy
+      name: ssnpp_0000000004.npy
+      size: 50000000
+    - dtype: uint8
+      format: npy
+      name: ssnpp_0000000005.npy
+      size: 50000000
+    - dtype: uint8
+      format: npy
+      name: ssnpp_0000000006.npy
+      size: 50000000
+    - dtype: uint8
+      format: npy
+      name: ssnpp_0000000007.npy
+      size: 50000000
+    - dtype: uint8
+      format: npy
+      name: ssnpp_0000000008.npy
+      size: 50000000
+    - dtype: uint8
+      format: npy
+      name: ssnpp_0000000009.npy
+      size: 50000000
+    - dtype: uint8
+      format: npy
+      name: ssnpp_0000000010.npy
+      size: 50000000
+    - dtype: uint8
+      format: npy
+      name: ssnpp_0000000011.npy
+      size: 50000000
+    - dtype: uint8
+      format: npy
+      name: ssnpp_0000000012.npy
+      size: 50000000
+    - dtype: uint8
+      format: npy
+      name: ssnpp_0000000013.npy
+      size: 50000000
+    - dtype: uint8
+      format: npy
+      name: ssnpp_0000000014.npy
+      size: 50000000
+    - dtype: uint8
+      format: npy
+      name: ssnpp_0000000015.npy
+      size: 50000000
+    - dtype: uint8
+      format: npy
+      name: ssnpp_0000000016.npy
+      size: 50000000
+    - dtype: uint8
+      format: npy
+      name: ssnpp_0000000017.npy
+      size: 50000000
+    - dtype: uint8
+      format: npy
+      name: ssnpp_0000000018.npy
+      size: 50000000
+    - dtype: uint8
+      format: npy
+      name: ssnpp_0000000019.npy
+      size: 50000000
diff --git a/demos/offline_ivf/create_sharded_ssnpp_files.py b/demos/offline_ivf/create_sharded_ssnpp_files.py
new file mode 100644
index 0000000000..1dd22d2be8
--- /dev/null
+++ b/demos/offline_ivf/create_sharded_ssnpp_files.py
@@ -0,0 +1,63 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import argparse
+import os
+
+
+def xbin_mmap(fname, dtype, maxn=-1):
+    """
+    Code from
+    https://github.com/harsha-simhadri/big-ann-benchmarks/blob/main/benchmark/dataset_io.py#L94
+    mmap the competition file format for a given type of items
+    """
+    n, d = map(int, np.fromfile(fname, dtype="uint32", count=2))
+    assert os.stat(fname).st_size == 8 + n * d * np.dtype(dtype).itemsize
+    if maxn > 0:
+        n = min(n, maxn)
+    return np.memmap(fname, dtype=dtype, mode="r", offset=8, shape=(n, d))
+
+
+def main(args: argparse.Namespace):
+    ssnpp_data = xbin_mmap(fname=args.filepath, dtype="uint8")
+    num_batches = ssnpp_data.shape[0] // args.data_batch
+    assert (
+        ssnpp_data.shape[0] % args.data_batch == 0
+    ), "num of embeddings per file should divide total num of embeddings"
+    for i in range(num_batches):
+        xb_batch = ssnpp_data[
+            i * args.data_batch:(i + 1) * args.data_batch, :
+        ]
+        filename = args.output_dir + f"/ssnpp_{(i):010}.npy"
+        np.save(filename, xb_batch)
+        print(f"File {filename} is saved!")
+
+
+if __name__ == "__main__":  # CLI entry point: parse the options below, then call main()
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--data_batch",
+        dest="data_batch",
+        type=int,
+        default=50000000,
+        help="Number of embeddings per file, should be a divisor of 1B",
+    )
+    parser.add_argument(
+        "--filepath",
+        dest="filepath",
+        type=str,
+        default="/datasets01/big-ann-challenge-data/FB_ssnpp/FB_ssnpp_database.u8bin",
+        help="path of 1B ssnpp database vectors' original file",
+    )
+    parser.add_argument(
+        "--output_dir",  # must differ from "--filepath": argparse rejects duplicate option strings
+        dest="output_dir",
+        type=str,
+        default="/checkpoint/marialomeli/ssnpp_data",
+        help="path to put sharded files",
+    )
+
+    args = parser.parse_args()
+    main(args)
diff --git a/demos/offline_ivf/dataset.py b/demos/offline_ivf/dataset.py
new file mode 100644
index 0000000000..f9e30009c5
--- /dev/null
+++ b/demos/offline_ivf/dataset.py
@@ -0,0 +1,173 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import numpy as np
+import faiss
+from typing import List
+import random
+import logging
+from functools import lru_cache
+
+
+def create_dataset_from_oivf_config(cfg, ds_name):
+    normalise = cfg["normalise"] if "normalise" in cfg else False
+    return MultiFileVectorDataset(
+        cfg["datasets"][ds_name]["root"],
+        [
+            FileDescriptor(
+                f["name"], f["format"], np.dtype(f["dtype"]), f["size"]
+            )
+            for f in cfg["datasets"][ds_name]["files"]
+        ],
+        cfg["d"],
+        normalise,
+        cfg["datasets"][ds_name]["size"],
+    )
+
+
+@lru_cache(maxsize=100)
+def _memmap_vecs(
+    file_name: str, format: str, dtype: np.dtype, size: int, d: int
+) -> np.array:
+    """
+    Memory-map the vectors stored in file_name (results are cached).
+    A "raw" file has no header, so its size must equal size * d * itemsize.
+    An "npy" file is opened with np.load(mmap_mode="r") and its shape and
+    dtype are checked against size, d and dtype.
+    Raises ValueError for any other format.
+    """
+
+    assert os.path.exists(file_name), f"file does not exist {file_name}"
+    if format == "raw":
+        fl = os.path.getsize(file_name)
+        nb = fl // d // dtype.itemsize
+        assert nb == size, f"{nb} is different than config's {size}"
+        assert fl == d * dtype.itemsize * nb  # no header
+        return np.memmap(file_name, shape=(nb, d), dtype=dtype, mode="r")
+    elif format == "npy":
+        vecs = np.load(file_name, mmap_mode="r")
+        assert vecs.shape[0] == size, f"size:{size},shape {vecs.shape[0]}"
+        assert vecs.shape[1] == d
+        assert vecs.dtype == dtype
+        return vecs
+    else:
+        raise ValueError("The file cannot be loaded in the current format.")
+
+
+class FileDescriptor:
+    def __init__(self, name: str, format: str, dtype: np.dtype, size: int):
+        self.name = name
+        self.format = format
+        self.dtype = dtype
+        self.size = size
+
+
+class MultiFileVectorDataset:
+    def __init__(
+        self,
+        root: str,
+        file_descriptors: List[FileDescriptor],
+        d: int,
+        normalize: bool,
+        size: int,
+    ):
+        assert os.path.exists(root)
+        self.root = root
+        self.file_descriptors = file_descriptors
+        self.d = d
+        self.normalize = normalize
+        self.size = size
+        self.file_offsets = [0]
+        t = 0
+        for f in self.file_descriptors:
+            xb = _memmap_vecs(
+                f"{self.root}/{f.name}", f.format, f.dtype, f.size, self.d
+            )
+            t += xb.shape[0]
+            self.file_offsets.append(t)
+        assert (
+            t == self.size
+        ), "the sum of num of embeddings per file!=total num of embeddings"
+
+    def iterate(self, start: int, batch_size: int, dt: np.dtype):
+        buffer = np.empty(shape=(batch_size, self.d), dtype=dt)
+        rem = 0
+        for f in self.file_descriptors:
+            if start >= f.size:
+                start -= f.size
+                continue
+            logging.info(f"processing: {f.name}...")
+            xb = _memmap_vecs(
+                f"{self.root}/{f.name}",
+                f.format,
+                f.dtype,
+                f.size,
+                self.d,
+            )
+            if start > 0:
+                xb = xb[start:]
+                start = 0
+            req = min(batch_size - rem, xb.shape[0])
+            buffer[rem:rem + req] = xb[:req]
+            rem += req
+            if rem == batch_size:
+                if self.normalize:
+                    faiss.normalize_L2(buffer)
+                yield buffer.copy()
+                rem = 0
+            for i in range(req, xb.shape[0], batch_size):
+                j = i + batch_size
+                if j <= xb.shape[0]:
+                    tmp = xb[i:j].astype(dt)
+                    if self.normalize:
+                        faiss.normalize_L2(tmp)
+                    yield tmp
+                else:
+                    rem = xb.shape[0] - i
+                    buffer[:rem] = xb[i:j]
+        if rem > 0:
+            tmp = buffer[:rem]
+            if self.normalize:
+                faiss.normalize_L2(tmp)
+            yield tmp
+
+    def get(self, idx: List[int]):
+        n = len(idx)
+        fidx = np.searchsorted(self.file_offsets, idx, "right")
+        res = np.empty(shape=(len(idx), self.d), dtype=np.float32)
+        for r, id, fid in zip(range(n), idx, fidx):
+            assert fid > 0 and fid <= len(self.file_descriptors), f"{fid}"
+            f = self.file_descriptors[fid - 1]
+            # deferring normalization until after reading the vec
+            vecs = _memmap_vecs(
+                f"{self.root}/{f.name}", f.format, f.dtype, f.size, self.d
+            )
+            i = id - self.file_offsets[fid - 1]
+            assert i >= 0 and i < vecs.shape[0]
+            res[r, :] = vecs[i]  # TODO: find a faster way
+        if self.normalize:
+            faiss.normalize_L2(res)
+        return res
+
+    def sample(self, n, idx_fn, vecs_fn):
+        if vecs_fn and os.path.exists(vecs_fn):
+            vecs = np.load(vecs_fn)
+            assert vecs.shape == (n, self.d)
+            return vecs
+        if idx_fn and os.path.exists(idx_fn):
+            idx = np.load(idx_fn)
+            assert idx.size == n
+        else:
+            idx = np.array(sorted(random.sample(range(self.size), n)))
+            if idx_fn:
+                np.save(idx_fn, idx)
+        vecs = self.get(idx)
+        if vecs_fn:
+            np.save(vecs_fn, vecs)
+        return vecs
+
+    def get_first_n(self, n, dt):
+        assert n <= self.size
+        return next(self.iterate(0, n, dt))
diff --git a/demos/offline_ivf/generate_config.py b/demos/offline_ivf/generate_config.py
new file mode 100644
index 0000000000..b5a12645ab
--- /dev/null
+++ b/demos/offline_ivf/generate_config.py
@@ -0,0 +1,45 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import os
+import yaml
+
+# with ssnpp sharded data
+root = "/checkpoint/marialomeli/ssnpp_data"
+file_names = [f"ssnpp_{i:010}.npy" for i in range(20)]
+d = 256
+dt = np.dtype(np.uint8)
+
+
+def read_embeddings(fp):
+    fl = os.path.getsize(fp)
+    nb = fl // d // dt.itemsize
+    print(nb)
+    if fl == d * dt.itemsize * nb:  # no header
+        return ("raw", np.memmap(fp, shape=(nb, d), dtype=dt, mode="r"))
+    else:  # assume npy
+        vecs = np.load(fp, mmap_mode="r")
+        assert vecs.shape[1] == d
+        assert vecs.dtype == dt
+        return ("npy", vecs)
+
+
+cfg = {}
+files = []
+size = 0
+for fn in file_names:
+    fp = f"{root}/{fn}"
+    assert os.path.exists(fp), f"{fp} is missing"
+    ft, xb = read_embeddings(fp)
+    files.append(
+        {"name": fn, "size": xb.shape[0], "dtype": dt.name, "format": ft}
+    )
+    size += xb.shape[0]
+
+cfg["size"] = size
+cfg["root"] = root
+cfg["d"] = d
+cfg["files"] = files
+print(yaml.dump(cfg))
diff --git a/demos/offline_ivf/offline_ivf.py b/demos/offline_ivf/offline_ivf.py
new file mode 100644
index 0000000000..eccd2b95cb
--- /dev/null
+++ b/demos/offline_ivf/offline_ivf.py
@@ -0,0 +1,890 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import faiss
+import numpy as np
+import os
+from tqdm import tqdm, trange
+import sys
+import logging
+from faiss.contrib.ondisk import merge_ondisk
+from faiss.contrib.big_batch_search import big_batch_search
+from faiss.contrib.exhaustive_search import knn_ground_truth
+from faiss.contrib.evaluation import knn_intersection_measure
+from utils import (
+    get_intersection_cardinality_frequencies,
+    margin,
+    is_pretransform_index,
+)
+from dataset import create_dataset_from_oivf_config
+
+logging.basicConfig(
+    format=(
+        "%(asctime)s.%(msecs)03d %(levelname)-8s %(threadName)-12s %(message)s"
+    ),
+    level=logging.INFO,
+    datefmt="%Y-%m-%d %H:%M:%S",
+    force=True,
+)
+
+EMBEDDINGS_BATCH_SIZE: int = 100_000
+NUM_SUBSAMPLES: int = 100
+SMALL_DATA_SAMPLE: int = 10000
+
+
+class OfflineIVF:
+    def __init__(self, cfg, args, nprobe, index_factory_str):  # derive paths, datasets and search settings from cfg/args
+        self.input_d = cfg["d"]
+        self.dt = cfg["datasets"][args.xb]["files"][0]["dtype"]
+        assert self.input_d > 0
+        output_dir = cfg["output"]
+        assert os.path.exists(output_dir)
+        self.index_factory = index_factory_str
+        assert self.index_factory is not None
+        self.index_factory_fn = self.index_factory.replace(",", "_")
+        self.index_template_file = (
+            f"{output_dir}/{args.xb}/{self.index_factory_fn}.empty.faissindex"
+        )
+        logging.info(f"index template: {self.index_template_file}")
+
+        if not args.xq:
+            args.xq = args.xb
+
+        self.by_residual = True
+        if args.no_residuals:
+            self.by_residual = False
+
+        xb_output_dir = f"{output_dir}/{args.xb}"
+        if not os.path.exists(xb_output_dir):
+            os.makedirs(xb_output_dir)
+        xq_output_dir = f"{output_dir}/{args.xq}"
+        if not os.path.exists(xq_output_dir):
+            os.makedirs(xq_output_dir)
+        search_output_dir = f"{output_dir}/{args.xq}_in_{args.xb}"
+        if not os.path.exists(search_output_dir):
+            os.makedirs(search_output_dir)
+        self.knn_dir = f"{search_output_dir}/knn"
+        if not os.path.exists(self.knn_dir):
+            os.makedirs(self.knn_dir)
+        self.eval_dir = f"{search_output_dir}/eval"
+        if not os.path.exists(self.eval_dir):
+            os.makedirs(self.eval_dir)
+        self.index = {}  # to keep a reference to opened indices,
+        self.ivls = {}  # hstack inverted lists,
+        self.index_shards = {}  # and index shards
+        self.index_shard_prefix = (
+            f"{xb_output_dir}/{self.index_factory_fn}.shard_"
+        )
+        self.xq_index_shard_prefix = (
+            f"{xq_output_dir}/{self.index_factory_fn}.shard_"
+        )
+        self.index_file = (  # TODO: added back temporarily for evaluate, handle name of non-sharded index file and remove.
+            f"{xb_output_dir}/{self.index_factory_fn}.faissindex"
+        )
+        self.xq_index_file = (
+            f"{xq_output_dir}/{self.index_factory_fn}.faissindex"
+        )
+        self.training_sample = cfg["training_sample"]
+        self.evaluation_sample = cfg["evaluation_sample"]
+        self.xq_ds = create_dataset_from_oivf_config(cfg, args.xq)
+        self.xb_ds = create_dataset_from_oivf_config(cfg, args.xb)
+        file_descriptors = self.xq_ds.file_descriptors
+        self.file_sizes = [fd.size for fd in file_descriptors]
+        self.shard_size = cfg["index_shard_size"]  # ~100GB
+        self.nshards = self.xb_ds.size // self.shard_size
+        if self.xb_ds.size % self.shard_size != 0:
+            self.nshards += 1
+        self.xq_nshards = self.xq_ds.size // self.shard_size
+        if self.xq_ds.size % self.shard_size != 0:
+            self.xq_nshards += 1
+        self.nprobe = nprobe
+        assert self.nprobe > 0, "Invalid nprobe parameter."
+        if "deduper" in cfg:
+            self.deduper = cfg["deduper"]
+            self.deduper_codec_fn = [
+                f"{xb_output_dir}/deduper_codec_{codec.replace(',', '_')}"
+                for codec in self.deduper
+            ]
+            self.deduper_idx_fn = [
+                f"{xb_output_dir}/deduper_idx_{codec.replace(',', '_')}"
+                for codec in self.deduper
+            ]
+        else:
+            self.deduper = None
+        self.k = cfg["k"]
+        assert self.k > 0, "Invalid number of neighbours parameter."
+        self.knn_output_file_suffix = (
+            f"{self.index_factory_fn}_np{self.nprobe}.npy"
+        )
+
+        fp = 32
+        if self.dt == "float16":
+            fp = 16
+
+        self.xq_bs = cfg["query_batch_size"]
+        if "metric" in cfg:
+            self.metric = getattr(faiss, cfg["metric"])  # plain attribute lookup; avoids eval() on config text
+        else:
+            self.metric = faiss.METRIC_L2
+
+        if "evaluate_by_margin" in cfg:
+            self.evaluate_by_margin = cfg["evaluate_by_margin"]
+        else:
+            self.evaluate_by_margin = False
+
+        os.system("grep -m1 'model name' < /proc/cpuinfo")
+        os.system("grep -E 'MemTotal|MemFree' /proc/meminfo")
+        os.system("nvidia-smi")
+        os.system("nvcc --version")
+
+        self.knn_queries_memory_limit = 4 * 1024 * 1024 * 1024  # 4 GB
+        self.knn_vectors_memory_limit = 8 * 1024 * 1024 * 1024  # 8 GB
+
+    def input_stats(self):
+        """
+        Logs the shape and the faiss.MatrixStats comments of the first `training_sample` database vectors (read as float32); nothing is trained or written.
+        """
+        xb_sample = self.xb_ds.get_first_n(self.training_sample, np.float32)
+        logging.info(f"input shape: {xb_sample.shape}")
+        logging.info("running MatrixStats on training sample...")
+        logging.info(faiss.MatrixStats(xb_sample).comments)
+        logging.info("done")
+
+    def dedupe(self):
+        logging.info(self.deduper)
+        if self.deduper is None:
+            logging.info("No deduper configured")
+            return
+        codecs = []
+        codesets = []
+        idxs = []
+        for factory, filename in zip(self.deduper, self.deduper_codec_fn):
+            if os.path.exists(filename):
+                logging.info(f"loading trained dedupe codec: {filename}")
+                codec = faiss.read_index(filename)
+            else:
+                logging.info(f"training dedupe codec: {factory}")
+                codec = faiss.index_factory(self.input_d, factory)
+                xb_sample = np.unique(
+                    self.xb_ds.get_first_n(100_000, np.float32), axis=0
+                )
+                faiss.ParameterSpace().set_index_parameter(codec, "verbose", 1)
+                codec.train(xb_sample)
+                logging.info(f"writing trained dedupe codec: {filename}")
+                faiss.write_index(codec, filename)
+            codecs.append(codec)
+            codesets.append(faiss.CodeSet(codec.sa_code_size()))
+            idxs.append(np.empty((0,), dtype=np.uint32))
+        bs = 1_000_000
+        i = 0
+        for buffer in tqdm(self._iterate_transformed(self.xb_ds, 0, bs, np.float32)):
+            for j in range(len(codecs)):
+                codec, codeset, idx = codecs[j], codesets[j], idxs[j]
+                uniq = codeset.insert(codec.sa_encode(buffer))
+                idxs[j] = np.append(
+                    idx,
+                    np.arange(i, i + buffer.shape[0], dtype=np.uint32)[uniq],
+                )
+            i += buffer.shape[0]
+        for idx, filename in zip(idxs, self.deduper_idx_fn):
+            logging.info(f"writing {filename}, shape: {idx.shape}")
+            np.save(filename, idx)
+        logging.info("done")
+
+    def train_index(self):
+        """
+        Trains the index using a subsample of the first chunk of data in the database and saves it in the template file (with no vectors added).
+        """
+        assert not os.path.exists(self.index_template_file), (
+            "The train command has been ran, the index template file already"
+            " exists."
+        )
+        xb_sample = np.unique(
+            self.xb_ds.get_first_n(self.training_sample, np.float32), axis=0
+        )
+        logging.info(f"input shape: {xb_sample.shape}")
+        index = faiss.index_factory(
+            self.input_d, self.index_factory, self.metric
+        )
+        index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index))
+        index_ivf.by_residual = True
+        faiss.ParameterSpace().set_index_parameter(index, "verbose", 1)
+        logging.info("running training...")
+        index.train(xb_sample)
+        logging.info(f"writing trained index {self.index_template_file}...")
+        faiss.write_index(index, self.index_template_file)
+        logging.info("done")
+
+    def _iterate_transformed(self, ds, start, batch_size, dt):
+        assert os.path.exists(self.index_template_file)
+        index = faiss.read_index(self.index_template_file)
+        if is_pretransform_index(index):
+            vt = index.chain.at(0)  # fetch pretransform
+            for buffer in ds.iterate(start, batch_size, dt):
+                yield vt.apply(buffer)
+        else:
+            for buffer in ds.iterate(start, batch_size, dt):
+                yield buffer
+
+    def index_shard(self):
+        assert os.path.exists(self.index_template_file)
+        index = faiss.read_index(self.index_template_file)
+        index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index))
+        assert self.nprobe <= index_ivf.quantizer.ntotal, (
+            f"the number of vectors {index_ivf.quantizer.ntotal} is not enough"
+            f" to retrieve {self.nprobe} neighbours, check."
+        )
+        cpu_quantizer = index_ivf.quantizer
+        gpu_quantizer = faiss.index_cpu_to_all_gpus(cpu_quantizer)
+
+        for i in range(0, self.nshards):
+            sfn = f"{self.index_shard_prefix}{i}"
+            try:
+                index.reset()
+                index_ivf.quantizer = gpu_quantizer
+                with open(sfn, "xb"):
+                    start = i * self.shard_size
+                    jj = 0
+                    embeddings_batch_size = min(
+                        EMBEDDINGS_BATCH_SIZE, self.shard_size
+                    )
+                    assert (
+                        self.shard_size % embeddings_batch_size == 0
+                        or EMBEDDINGS_BATCH_SIZE % embeddings_batch_size == 0
+                    ), (
+                        f"the shard size {self.shard_size} and embeddings"
+                        f" shard size  {EMBEDDINGS_BATCH_SIZE} are not"
+                        " divisible"
+                    )
+
+                    for xb_j in tqdm(
+                        self._iterate_transformed(
+                            self.xb_ds,
+                            start,
+                            embeddings_batch_size,
+                            np.float32,
+                        ),
+                        file=sys.stdout,
+                    ):
+                        if is_pretransform_index(index):
+                            assert xb_j.shape[1] == index.chain.at(0).d_out
+                            index_ivf.add_with_ids(
+                                xb_j,
+                                np.arange(start + jj, start + jj + xb_j.shape[0]),
+                            )
+                        else:
+                            assert xb_j.shape[1] == index.d
+                            index.add_with_ids(
+                                xb_j,
+                                np.arange(start + jj, start + jj + xb_j.shape[0]),
+                            )
+                        jj += xb_j.shape[0]
+                        logging.info(jj)
+                        assert (
+                            jj <= self.shard_size
+                        ), f"jj {jj} and shard_zide {self.shard_size}"
+                        if jj == self.shard_size:
+                            break
+                logging.info(f"writing {sfn}...")
+                index_ivf.quantizer = cpu_quantizer
+                faiss.write_index(index, sfn)
+            except FileExistsError:
+                logging.info(f"skipping shard: {i}")
+                continue
+        logging.info("done")
+
+    def merge_index(self):
+        ivf_file = f"{self.index_file}.ivfdata"
+
+        assert os.path.exists(self.index_template_file)
+        assert not os.path.exists(
+            ivf_file
+        ), f"file with embeddings data {ivf_file} not found, check."
+        assert not os.path.exists(self.index_file)
+        index = faiss.read_index(self.index_template_file)
+        block_fnames = [
+            f"{self.index_shard_prefix}{i}" for i in range(self.nshards)
+        ]
+        for fn in block_fnames:
+            assert os.path.exists(fn)
+        logging.info(block_fnames)
+        logging.info("merging...")
+        merge_ondisk(index, block_fnames, ivf_file)
+        logging.info("writing index...")
+        faiss.write_index(index, self.index_file)
+        logging.info("done")
+
+    def _cached_search(
+        self,
+        sample,
+        xq_ds,
+        xb_ds,
+        idx_file,
+        vecs_file,
+        I_file,
+        D_file,
+        index_file=None,
+        nprobe=None,
+    ):
+        if not os.path.exists(I_file):
+            assert not os.path.exists(I_file), f"file {I_file} does not exist "
+            assert not os.path.exists(D_file), f"file {D_file} does not exist "
+            xq = xq_ds.sample(sample, idx_file, vecs_file)
+
+            if index_file:
+                D, I = self._index_nonsharded_search(index_file, xq, nprobe)
+            else:
+                logging.info("ground truth computations")
+                db_iterator = xb_ds.iterate(0, 100_000, np.float32)
+                D, I = knn_ground_truth(
+                    xq, db_iterator, self.k, metric_type=self.metric
+                )
+                assert np.amin(I) >= 0
+
+            np.save(I_file, I)
+            np.save(D_file, D)
+        else:
+            assert os.path.exists(idx_file), f"file {idx_file} does not exist "
+            assert os.path.exists(
+                vecs_file
+            ), f"file {vecs_file} does not exist "
+            assert os.path.exists(I_file), f"file {I_file} does not exist "
+            assert os.path.exists(D_file), f"file {D_file} does not exist "
+            I = np.load(I_file)
+            D = np.load(D_file)
+        assert I.shape == (sample, self.k), f"{I_file} shape mismatch"
+        assert D.shape == (sample, self.k), f"{D_file} shape mismatch"
+        return (D, I)
+
+    def _index_search(self, index_shard_prefix, xq, nprobe):
+        assert nprobe is not None
+        logging.info(
+            f"open sharded index: {index_shard_prefix}, {self.nshards}"
+        )
+        index = self._open_sharded_index(index_shard_prefix)
+        index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index))
+        logging.info(f"setting nprobe to {nprobe}")
+        index_ivf.nprobe = nprobe
+        return index.search(xq, self.k)
+
+    def _index_nonsharded_search(self, index_file, xq, nprobe):
+        assert nprobe is not None
+        logging.info(f"index {index_file}")
+        assert os.path.exists(index_file), f"file {index_file} does not exist "
+        index = faiss.read_index(index_file, faiss.IO_FLAG_ONDISK_SAME_DIR)
+        logging.info(f"index size {index.ntotal} ")
+        index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index))
+        logging.info(f"setting nprobe to {nprobe}")
+        index_ivf.nprobe = nprobe
+        return index.search(xq, self.k)
+
+    def _refine_distances(self, xq_ds, idx, xb_ds, I):
+        xq = xq_ds.get(idx).repeat(self.k, axis=0)
+        xb = xb_ds.get(I.reshape(-1))
+        if self.metric == faiss.METRIC_INNER_PRODUCT:
+            return (xq * xb).sum(axis=1).reshape(I.shape)
+        elif self.metric == faiss.METRIC_L2:
+            return ((xq - xb) ** 2).sum(axis=1).reshape(I.shape)
+        else:
+            raise ValueError(f"metric not supported {self.metric}")
+
+    def evaluate(self):
+        self._evaluate(
+            self.index_factory_fn,
+            self.index_file,
+            self.xq_index_file,
+            self.nprobe,
+        )
+
+    def _evaluate(self, index_factory_fn, index_file, xq_index_file, nprobe):
+        """
+        Compares approximate search results against exact (ground truth)
+        search results on a sample of `self.evaluation_sample` queries.
+
+        All intermediate results are stored as .npy files in `self.eval_dir`
+        (search results are cached through `_cached_search`). The method
+        logs the knn intersection measure for every rank up to `self.k`, the
+        mean distances and the intersection cardinality frequencies. When
+        `self.evaluate_by_margin` is set it also computes and saves the
+        margins for the exact, refined and approximate searches.
+
+        Args:
+            index_factory_fn: index name used in the output file names.
+            index_file: index file used for the forward (xq -> xb)
+                approximate search.
+            xq_index_file: index file used for the backward (xb -> xq)
+                approximate search.
+            nprobe: nprobe used for the approximate searches and in the
+                output file names.
+        """
+        idx_a_file = f"{self.eval_dir}/idx_a.npy"
+        idx_b_gt_file = f"{self.eval_dir}/idx_b_gt.npy"
+        idx_b_ann_file = (
+            f"{self.eval_dir}/idx_b_ann_{index_factory_fn}_np{nprobe}.npy"
+        )
+        vecs_a_file = f"{self.eval_dir}/vecs_a.npy"
+        vecs_b_gt_file = f"{self.eval_dir}/vecs_b_gt.npy"
+        vecs_b_ann_file = (
+            f"{self.eval_dir}/vecs_b_ann_{index_factory_fn}_np{nprobe}.npy"
+        )
+        D_a_gt_file = f"{self.eval_dir}/D_a_gt.npy"
+        D_a_ann_file = (
+            f"{self.eval_dir}/D_a_ann_{index_factory_fn}_np{nprobe}.npy"
+        )
+        D_a_ann_refined_file = f"{self.eval_dir}/D_a_ann_refined_{index_factory_fn}_np{nprobe}.npy"
+        D_b_gt_file = f"{self.eval_dir}/D_b_gt.npy"
+        D_b_ann_file = (
+            f"{self.eval_dir}/D_b_ann_{index_factory_fn}_np{nprobe}.npy"
+        )
+        D_b_ann_gt_file = (
+            f"{self.eval_dir}/D_b_ann_gt_{index_factory_fn}_np{nprobe}.npy"
+        )
+        I_a_gt_file = f"{self.eval_dir}/I_a_gt.npy"
+        I_a_ann_file = (
+            f"{self.eval_dir}/I_a_ann_{index_factory_fn}_np{nprobe}.npy"
+        )
+        I_b_gt_file = f"{self.eval_dir}/I_b_gt.npy"
+        I_b_ann_file = (
+            f"{self.eval_dir}/I_b_ann_{index_factory_fn}_np{nprobe}.npy"
+        )
+        I_b_ann_gt_file = (
+            f"{self.eval_dir}/I_b_ann_gt_{index_factory_fn}_np{nprobe}.npy"
+        )
+        margin_gt_file = f"{self.eval_dir}/margin_gt.npy"
+        margin_refined_file = (
+            f"{self.eval_dir}/margin_refined_{index_factory_fn}_np{nprobe}.npy"
+        )
+        margin_ann_file = (
+            f"{self.eval_dir}/margin_ann_{index_factory_fn}_np{nprobe}.npy"
+        )
+
+        logging.info("exact search forward")
+        # xq -> xb AKA a -> b
+        D_a_gt, I_a_gt = self._cached_search(
+            self.evaluation_sample,
+            self.xq_ds,
+            self.xb_ds,
+            idx_a_file,
+            vecs_a_file,
+            I_a_gt_file,
+            D_a_gt_file,
+        )
+        # ids of the sampled queries; idx_a_file is expected to exist once
+        # the exact forward search above has run
+        idx_a = np.load(idx_a_file)
+
+        logging.info("approximate search forward")
+        D_a_ann, I_a_ann = self._cached_search(
+            self.evaluation_sample,
+            self.xq_ds,
+            self.xb_ds,
+            idx_a_file,
+            vecs_a_file,
+            I_a_ann_file,
+            D_a_ann_file,
+            index_file,
+            nprobe,
+        )
+
+        logging.info(
+            "calculate refined distances on approximate search forward"
+        )
+        # the refined distances are cached on disk as well
+        if os.path.exists(D_a_ann_refined_file):
+            D_a_ann_refined = np.load(D_a_ann_refined_file)
+            assert D_a_ann.shape == D_a_ann_refined.shape
+        else:
+            D_a_ann_refined = self._refine_distances(
+                self.xq_ds, idx_a, self.xb_ds, I_a_ann
+            )
+            np.save(D_a_ann_refined_file, D_a_ann_refined)
+
+        if self.evaluate_by_margin:
+            k_extract = self.k
+            margin_threshold = 1.05
+            logging.info(
+                "exact search backward from the k_extract NN results of"
+                " forward search"
+            )
+            # xb -> xq AKA b -> a
+            D_a_b_gt = D_a_gt[:, :k_extract].ravel()
+            idx_b_gt = I_a_gt[:, :k_extract].ravel()
+            assert len(idx_b_gt) == self.evaluation_sample * k_extract
+            np.save(idx_b_gt_file, idx_b_gt)
+            # exact search
+            D_b_gt, _ = self._cached_search(
+                len(idx_b_gt),
+                self.xb_ds,
+                self.xq_ds,
+                idx_b_gt_file,
+                vecs_b_gt_file,
+                I_b_gt_file,
+                D_b_gt_file,
+            )  # xb and xq ^^^ are inverted
+
+            logging.info("margin on exact search")
+            margin_gt = margin(
+                self.evaluation_sample,
+                idx_a,
+                idx_b_gt,
+                D_a_b_gt,
+                D_a_gt,
+                D_b_gt,
+                self.k,
+                k_extract,
+                margin_threshold,
+            )
+            np.save(margin_gt_file, margin_gt)
+
+            logging.info(
+                "exact search backward from the k_extract NN results of"
+                " approximate forward search"
+            )
+            D_a_b_refined = D_a_ann_refined[:, :k_extract].ravel()
+            idx_b_ann = I_a_ann[:, :k_extract].ravel()
+            assert len(idx_b_ann) == self.evaluation_sample * k_extract
+            np.save(idx_b_ann_file, idx_b_ann)
+            # exact search
+            D_b_ann_gt, _ = self._cached_search(
+                len(idx_b_ann),
+                self.xb_ds,
+                self.xq_ds,
+                idx_b_ann_file,
+                vecs_b_ann_file,
+                I_b_ann_gt_file,
+                D_b_ann_gt_file,
+            )  # xb and xq ^^^ are inverted
+
+            logging.info("refined margin on approximate search")
+            margin_refined = margin(
+                self.evaluation_sample,
+                idx_a,
+                idx_b_ann,
+                D_a_b_refined,
+                D_a_gt,  # not D_a_ann_refined(!)
+                D_b_ann_gt,
+                self.k,
+                k_extract,
+                margin_threshold,
+            )
+            np.save(margin_refined_file, margin_refined)
+
+            # approximate backward search, using the index over the queries
+            D_b_ann, I_b_ann = self._cached_search(
+                len(idx_b_ann),
+                self.xb_ds,
+                self.xq_ds,
+                idx_b_ann_file,
+                vecs_b_ann_file,
+                I_b_ann_file,
+                D_b_ann_file,
+                xq_index_file,
+                nprobe,
+            )
+
+            D_a_b_ann = D_a_ann[:, :k_extract].ravel()
+
+            logging.info("approximate search margin")
+
+            margin_ann = margin(
+                self.evaluation_sample,
+                idx_a,
+                idx_b_ann,
+                D_a_b_ann,
+                D_a_ann,
+                D_b_ann,
+                self.k,
+                k_extract,
+                margin_threshold,
+            )
+            np.save(margin_ann_file, margin_ann)
+
+        logging.info("intersection")
+        logging.info(I_a_gt)
+        logging.info(I_a_ann)
+
+        # intersection measure between exact and approximate results for
+        # every rank cut-off from 1 to k
+        for i in range(1, self.k + 1):
+            logging.info(
+                f"{i}: {knn_intersection_measure(I_a_gt[:,:i], I_a_ann[:,:i])}"
+            )
+
+        logging.info(f"mean of gt distances: {D_a_gt.mean()}")
+        logging.info(f"mean of approx distances: {D_a_ann.mean()}")
+        logging.info(f"mean of refined distances: {D_a_ann_refined.mean()}")
+
+        logging.info("intersection cardinality frequencies")
+        logging.info(get_intersection_cardinality_frequencies(I_a_ann, I_a_gt))
+
+        logging.info("done")
+        pass
+
+    def _knn_function(self, xq, xb, k, metric, thread_id=None):
+        try:
+            return faiss.knn_gpu(
+                self.all_gpu_resources[thread_id],
+                xq,
+                xb,
+                k,
+                metric=metric,
+                device=thread_id,
+                vectorsMemoryLimit=self.knn_vectors_memory_limit,
+                queriesMemoryLimit=self.knn_queries_memory_limit,
+            )
+        except Exception:
+            logging.info(f"knn_function failed: {xq.shape}, {xb.shape}")
+            raise
+
+    def _coarse_quantize(self, index_ivf, xq, nprobe):
+        assert nprobe <= index_ivf.quantizer.ntotal
+        quantizer = faiss.index_cpu_to_all_gpus(index_ivf.quantizer)
+        bs = 100_000
+        nq = len(xq)
+        q_assign = np.empty((nq, nprobe), dtype="int32")
+        for i0 in trange(0, nq, bs):
+            i1 = min(nq, i0 + bs)
+            _, q_assign_i = quantizer.search(xq[i0:i1], nprobe)
+            q_assign[i0:i1] = q_assign_i
+        return q_assign
+
+    def search(self):
+        logging.info(f"search: {self.knn_dir}")
+        slurm_job_id = os.environ.get("SLURM_JOB_ID")
+
+        ngpu = faiss.get_num_gpus()
+        logging.info(f"number of gpus: {ngpu}")
+        self.all_gpu_resources = [
+            faiss.StandardGpuResources() for _ in range(ngpu)
+        ]
+        self._knn_function(
+            np.zeros((10, 10), dtype=np.float16),
+            np.zeros((10, 10), dtype=np.float16),
+            self.k,
+            metric=self.metric,
+            thread_id=0,
+        )
+
+        index = self._open_sharded_index()
+        index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index))
+        logging.info(f"setting nprobe to {self.nprobe}")
+        index_ivf.nprobe = self.nprobe
+        # quantizer = faiss.index_cpu_to_all_gpus(index_ivf.quantizer)
+        for i in range(0, self.xq_ds.size, self.xq_bs):
+            Ifn = f"{self.knn_dir}/I{(i):010}_{self.knn_output_file_suffix}"
+            Dfn = f"{self.knn_dir}/D_approx{(i):010}_{self.knn_output_file_suffix}"
+            CPfn = f"{self.knn_dir}/CP{(i):010}_{self.knn_output_file_suffix}"
+
+            if slurm_job_id:
+                worker_record = (
+                    self.knn_dir
+                    + f"/record_{(i):010}_{self.knn_output_file_suffix}.txt"
+                )
+                if not os.path.exists(worker_record):
+                    logging.info(
+                        f"creating record file {worker_record} and saving job"
+                        f" id: {slurm_job_id}"
+                    )
+                    with open(worker_record, "w") as h:
+                        h.write(slurm_job_id)
+                else:
+                    old_slurm_id = open(worker_record, "r").read()
+                    logging.info(
+                        f"old job slurm id {old_slurm_id} and current job id:"
+                        f" {slurm_job_id}"
+                    )
+                    if old_slurm_id == slurm_job_id:
+                        if os.path.getsize(Ifn) == 0:
+                            logging.info(
+                                f"cleaning up zero length files {Ifn} and"
+                                f" {Dfn}"
+                            )
+                            os.remove(Ifn)
+                            os.remove(Dfn)
+
+            try:
+                if is_pretransform_index(index):
+                    d = index.chain.at(0).d_out
+                else:
+                    d = self.input_d
+                with open(Ifn, "xb") as f, open(Dfn, "xb") as g:
+                    xq_i = np.empty(
+                        shape=(self.xq_bs, d), dtype=np.float16
+                    )
+                    q_assign = np.empty(
+                        (self.xq_bs, self.nprobe), dtype=np.int32
+                    )
+                    j = 0
+                    quantizer = faiss.index_cpu_to_all_gpus(
+                        index_ivf.quantizer
+                    )
+                    for xq_i_j in tqdm(
+                        self._iterate_transformed(
+                            self.xq_ds, i, min(100_000, self.xq_bs), np.float16
+                        ),
+                        file=sys.stdout,
+                    ):
+                        xq_i[j:j + xq_i_j.shape[0]] = xq_i_j
+                        (
+                            _,
+                            q_assign[j:j + xq_i_j.shape[0]],
+                        ) = quantizer.search(xq_i_j, self.nprobe)
+                        j += xq_i_j.shape[0]
+                        assert j <= xq_i.shape[0]
+                        if j == xq_i.shape[0]:
+                            break
+                    xq_i = xq_i[:j]
+                    q_assign = q_assign[:j]
+
+                    assert q_assign.shape == (xq_i.shape[0], index_ivf.nprobe)
+                    del quantizer
+                    logging.info(f"computing: {Ifn}")
+                    logging.info(f"computing: {Dfn}")
+                    prefetch_threads = faiss.get_num_gpus()
+                    D_ann, I = big_batch_search(
+                        index_ivf,
+                        xq_i,
+                        self.k,
+                        verbose=10,
+                        method="knn_function",
+                        knn=self._knn_function,
+                        threaded=faiss.get_num_gpus() * 8,
+                        use_float16=True,
+                        prefetch_threads=prefetch_threads,
+                        computation_threads=faiss.get_num_gpus(),
+                        q_assign=q_assign,
+                        checkpoint=CPfn,
+                        checkpoint_freq=7200,  # in seconds
+                    )
+                    assert (
+                        np.amin(I) >= 0
+                    ), f"{I}, there exists negative indices, check"
+                    logging.info(f"saving: {Ifn}")
+                    np.save(f, I)
+                    logging.info(f"saving: {Dfn}")
+                    np.save(g, D_ann)
+
+                    if os.path.exists(CPfn):
+                        logging.info(f"removing: {CPfn}")
+                        os.remove(CPfn)
+
+            except FileExistsError:
+                logging.info(f"skipping {Ifn}, already exists")
+                logging.info(f"skipping {Dfn}, already exists")
+                continue
+
+    def _open_index_shard(self, fn):
+        if fn in self.index_shards:
+            index_shard = self.index_shards[fn]
+        else:
+            logging.info(f"open index shard: {fn}")
+            index_shard = faiss.read_index(
+                fn, faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY
+            )
+            self.index_shards[fn] = index_shard
+        return index_shard
+
+    def _open_sharded_index(self, index_shard_prefix=None):
+        if index_shard_prefix is None:
+            index_shard_prefix = self.index_shard_prefix
+        if index_shard_prefix in self.index:
+            return self.index[index_shard_prefix]
+        assert os.path.exists(
+            self.index_template_file
+        ), f"file {self.index_template_file} does not exist "
+        logging.info(f"open index template: {self.index_template_file}")
+        index = faiss.read_index(self.index_template_file)
+        index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index))
+        ilv = faiss.InvertedListsPtrVector()
+        for i in range(self.nshards):
+            fn = f"{index_shard_prefix}{i}"
+            assert os.path.exists(fn), f"file {fn} does not exist "
+            logging.info(fn)
+            index_shard = self._open_index_shard(fn)
+            il = faiss.downcast_index(
+                faiss.extract_index_ivf(index_shard)
+            ).invlists
+            ilv.push_back(il)
+        hsil = faiss.HStackInvertedLists(ilv.size(), ilv.data())
+        index_ivf.replace_invlists(hsil, False)
+        self.ivls[index_shard_prefix] = hsil
+        self.index[index_shard_prefix] = index
+        return index
+
+    def index_shard_stats(self):
+        for i in range(self.nshards):
+            fn = f"{self.index_shard_prefix}{i}"
+            assert os.path.exists(fn)
+            index = faiss.read_index(
+                fn, faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY
+            )
+            index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index))
+            il = index_ivf.invlists
+            il.print_stats()
+
+    def index_stats(self):
+        index = self._open_sharded_index()
+        index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index))
+        il = index_ivf.invlists
+        list_sizes = [il.list_size(i) for i in range(il.nlist)]
+        logging.info(np.max(list_sizes))
+        logging.info(np.mean(list_sizes))
+        logging.info(np.argmax(list_sizes))
+        logging.info("index_stats:")
+        il.print_stats()
+
+    def consistency_check(self):
+        """
+        Sanity checks of the index shards, the sharded index and the saved
+        search results. Every check is an assertion, so the first
+        inconsistency raises an AssertionError.
+
+        Three stages are run:
+          1. each index shard: the first SMALL_DATA_SAMPLE vectors of the
+             shard must find their own id among their 100 nearest
+             neighbors (nprobe = 1);
+          2. the sharded index: same check through the stacked index;
+          3. the search results: the I / D files in `self.knn_dir` must
+             exist, be non empty, have the expected shapes and roughly
+             agree with an online search of the first SMALL_DATA_SAMPLE
+             queries of every batch.
+        """
+        logging.info("consistency-check")
+
+        logging.info("index template...")
+
+        assert os.path.exists(self.index_template_file)
+        index = faiss.read_index(self.index_template_file)
+
+        offset = 0  # 2**24
+        assert self.shard_size > offset + SMALL_DATA_SAMPLE
+
+        logging.info("index shards...")
+        for i in range(self.nshards):
+            # r: id of the first vector checked in shard i
+            r = i * self.shard_size + offset
+            xb = next(self.xb_ds.iterate(r, SMALL_DATA_SAMPLE, np.float32))
+            fn = f"{self.index_shard_prefix}{i}"
+            assert os.path.exists(fn), f"There is no index shard file {fn}"
+            index = self._open_index_shard(fn)
+            index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index))
+            index_ivf.nprobe = 1
+            _, I = index.search(xb, 100)
+            for j in range(SMALL_DATA_SAMPLE):
+                # vector j + r must be among its own nearest neighbors
+                assert np.where(I[j] == j + r)[0].size > 0, (
+                    f"I[j]: {I[j]}, j: {j}, i: {i}, shard_size:"
+                    f" {self.shard_size}"
+                )
+
+        logging.info("merged index...")
+        index = self._open_sharded_index()
+        index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index))
+        index_ivf.nprobe = 1
+        for i in range(self.nshards):
+            r = i * self.shard_size + offset
+            xb = next(self.xb_ds.iterate(r, SMALL_DATA_SAMPLE, np.float32))
+            _, I = index.search(xb, 100)
+            for j in range(SMALL_DATA_SAMPLE):
+                assert np.where(I[j] == j + r)[0].size > 0, (
+                    f"I[j]: {I[j]}, j: {j}, i: {i}, shard_size:"
+                    f" {self.shard_size}")
+
+        logging.info("search results...")
+        index_ivf.nprobe = self.nprobe
+        for i in range(0, self.xq_ds.size, self.xq_bs):
+            # NOTE(review): search() names its files with
+            # self.knn_output_file_suffix; presumably that equals
+            # "{index_factory_fn}_np{nprobe}.npy" - confirm.
+            Ifn = f"{self.knn_dir}/I{i:010}_{self.index_factory_fn}_np{self.nprobe}.npy"
+            assert os.path.exists(Ifn)
+            assert os.path.getsize(Ifn) > 0, f"The file {Ifn} is empty."
+            logging.info(Ifn)
+            I = np.load(Ifn, mmap_mode="r")
+
+            assert I.shape[1] == self.k
+            assert I.shape[0] == min(self.xq_bs, self.xq_ds.size - i)
+            # NOTE(review): only column 1 (the second neighbor) is checked
+            # for negative ids - confirm that this is intended.
+            assert np.all(I[:, 1] >= 0)
+
+            Dfn = f"{self.knn_dir}/D_approx{i:010}_{self.index_factory_fn}_np{self.nprobe}.npy"
+            assert os.path.exists(Dfn)
+            assert os.path.getsize(Dfn) > 0, f"The file {Dfn} is empty."
+            logging.info(Dfn)
+            D = np.load(Dfn, mmap_mode="r")
+            assert D.shape == I.shape
+
+            # compare the saved results with an online search of the first
+            # SMALL_DATA_SAMPLE queries of the batch
+            xq = next(self.xq_ds.iterate(i, SMALL_DATA_SAMPLE, np.float32))
+            D_online, I_online = index.search(xq, self.k)
+            # more than 95% of the ids must match position by position
+            assert (
+                np.where(I[:SMALL_DATA_SAMPLE] == I_online)[0].size
+                / (self.k * SMALL_DATA_SAMPLE)
+                > 0.95
+            ), (
+                "the ratio is"
+                f" {np.where(I[:SMALL_DATA_SAMPLE] == I_online)[0].size / (self.k * SMALL_DATA_SAMPLE)}"
+            )
+            # the per-query sums of distances must agree within 1%
+            assert np.allclose(
+                D[:SMALL_DATA_SAMPLE].sum(axis=1),
+                D_online.sum(axis=1),
+                rtol=0.01,
+            ), (
+                "the difference is"
+                f" {D[:SMALL_DATA_SAMPLE].sum(axis=1), D_online.sum(axis=1)}"
+            )
+
+        logging.info("done")
diff --git a/demos/offline_ivf/run.py b/demos/offline_ivf/run.py
new file mode 100644
index 0000000000..dfa831d6f0
--- /dev/null
+++ b/demos/offline_ivf/run.py
@@ -0,0 +1,218 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+from utils import (
+    load_config,
+    add_group_args,
+)
+from offline_ivf import OfflineIVF
+import faiss
+from typing import List, Callable, Dict
+import submitit
+
+
+def join_lists_in_dict(poss: List[str]) -> List[str]:
+    """
+    Joins two lists of prod and non-prod values, checking if the prod value is already included.
+    If there is no non-prod list, it returns the prod list.
+    """
+    if "non-prod" in poss.keys():
+        all_poss = poss["non-prod"]
+        if poss["prod"][-1] not in poss["non-prod"]:
+            all_poss += poss["prod"]
+        return all_poss
+    else:
+        return poss["prod"]
+
+
+def main(
+    args: argparse.Namespace,
+    cfg: Dict[str, str],
+    nprobe: int,
+    index_factory_str: str,
+) -> None:
+    oivf = OfflineIVF(cfg, args, nprobe, index_factory_str)
+    eval(f"oivf.{args.command}()")
+
+
+def process_options_and_run_jobs(args: argparse.Namespace) -> None:
+    """
+    If "--cluster_run", it launches an array of jobs to the cluster using the submitit library for all the index strings. In
+    the case of evaluate, it launches a job for each index string and nprobe pair. Otherwise, it launches a single job
+    that is ran locally with the prod values for index string and nprobe.
+    """
+
+    cfg = load_config(args.config)
+    index_strings = cfg["index"]
+    nprobes = cfg["nprobe"]
+    if args.command == "evaluate":
+        if args.cluster_run:
+            all_nprobes = join_lists_in_dict(nprobes)
+            all_index_strings = join_lists_in_dict(index_strings)
+            for index_factory_str in all_index_strings:
+                for nprobe in all_nprobes:
+                    launch_job(main, args, cfg, nprobe, index_factory_str)
+        else:
+            launch_job(
+                main, args, cfg, nprobes["prod"][-1], index_strings["prod"][-1]
+            )
+    else:
+        if args.cluster_run:
+            all_index_strings = join_lists_in_dict(index_strings)
+            for index_factory_str in all_index_strings:
+                launch_job(
+                    main, args, cfg, nprobes["prod"][-1], index_factory_str
+                )
+        else:
+            launch_job(
+                main, args, cfg, nprobes["prod"][-1], index_strings["prod"][-1]
+            )
+
+
+def launch_job(
+    func: Callable,
+    args: argparse.Namespace,
+    cfg: Dict[str, str],
+    n_probe: int,
+    index_str: str,
+) -> None:
+    """
+    Launches an array of slurm jobs to the cluster using the submitit library.
+    """
+
+    if args.cluster_run:
+        assert args.num_nodes >= 1
+        executor = submitit.AutoExecutor(folder=args.logs_dir)
+
+        executor.update_parameters(
+            nodes=args.num_nodes,
+            gpus_per_node=args.gpus_per_node,
+            cpus_per_task=args.cpus_per_task,
+            tasks_per_node=args.tasks_per_node,
+            name=args.job_name,
+            slurm_partition=args.partition,
+            slurm_time=70 * 60,
+        )
+        if args.slurm_constraint:
+            executor.update_parameters(slurm_constraint=args.slurm_constrain)
+
+        job = executor.submit(func, args, cfg, n_probe, index_str)
+        print(f"Job id: {job.job_id}")
+    else:
+        func(args, cfg, n_probe, index_str)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    group = parser.add_argument_group("general")
+
+    add_group_args(group, "--command", required=True, help="command to run")
+    add_group_args(
+        group,
+        "--config",
+        required=True,
+        help="config yaml with the dataset specs",
+    )
+    add_group_args(
+        group, "--nt", type=int, default=96, help="nb search threads"
+    )
+    add_group_args(
+        group,
+        "--no_residuals",
+        action="store_false",
+        help="set index.by_residual to False during train index.",
+    )
+
+    group = parser.add_argument_group("slurm_job")
+
+    add_group_args(
+        group,
+        "--cluster_run",
+        action="store_true",
+        help=" if True, runs in cluster",
+    )
+    add_group_args(
+        group,
+        "--job_name",
+        type=str,
+        default="oivf",
+        help="cluster job name",
+    )
+    add_group_args(
+        group,
+        "--num_nodes",
+        type=str,
+        default=1,
+        help="num of nodes per job",
+    )
+    add_group_args(
+        group,
+        "--tasks_per_node",
+        type=int,
+        default=1,
+        help="tasks per job",
+    )
+
+    add_group_args(
+        group,
+        "--gpus_per_node",
+        type=int,
+        default=8,
+        help="cluster job name",
+    )
+    add_group_args(
+        group,
+        "--cpus_per_task",
+        type=int,
+        default=80,
+        help="cluster job name",
+    )
+
+    add_group_args(
+        group,
+        "--logs_dir",
+        type=str,
+        default="/checkpoint/marialomeli/offline_faiss/logs",
+        help="cluster job name",
+    )
+
+    add_group_args(
+        group,
+        "--slurm_constraint",
+        type=str,
+        default=None,
+        help="can be volta32gb for the fair cluster",
+    )
+
+    add_group_args(
+        group,
+        "--partition",
+        type=str,
+        default="learnlab",
+        help="specify which partition to use if ran on cluster with job arrays",
+        choices=[
+            "learnfair",
+            "devlab",
+            "scavenge",
+            "learnlab",
+            "nllb",
+            "seamless",
+            "seamless_medium",
+            "learnaccel",
+            "onellm_low",
+            "learn",
+            "scavenge",
+        ],
+    )
+
+    group = parser.add_argument_group("dataset")
+
+    add_group_args(group, "--xb", required=True, help="database vectors")
+    add_group_args(group, "--xq", help="query vectors")
+
+    args = parser.parse_args()
+    print("args:", args)
+    faiss.omp_set_num_threads(args.nt)
+    process_options_and_run_jobs(args=args)
diff --git a/demos/offline_ivf/tests/test_iterate_input.py b/demos/offline_ivf/tests/test_iterate_input.py
new file mode 100644
index 0000000000..3f59071102
--- /dev/null
+++ b/demos/offline_ivf/tests/test_iterate_input.py
@@ -0,0 +1,132 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import unittest
+from typing import List
+from utils import load_config
+from tests.testing_utils import TestDataCreator
+import tempfile
+from dataset import create_dataset_from_oivf_config
+
+# Dimension of the vectors generated for every test in this module.
+DIMENSION: int = 768
+# Row counts of the generated files; each one is below TEST_BATCH_SIZE.
+SMALL_FILE_SIZES: List[int] = [100, 210, 450]
+# Row counts of the generated files; each one is above TEST_BATCH_SIZE.
+LARGE_FILE_SIZES: List[int] = [1253, 3459, 890]
+# Batch size passed to the dataset iterator by the batch-size tests.
+TEST_BATCH_SIZE: int = 500
+# Rows per file in the tests that build NUM_FILES equally sized files.
+SMALL_SAMPLE_SIZE: int = 1000
+# Number of equally sized files in those tests.
+NUM_FILES: int = 3
+
+
+class TestUtilsMethods(unittest.TestCase):
+    """
+    Unit tests for the ``iterate`` and ``get`` methods of the dataset returned
+    by create_dataset_from_oivf_config.
+    """
+
+    def _assert_batch_sizes(self, batches, total_size: int) -> None:
+        """
+        Asserts that ``batches`` consists of full batches of TEST_BATCH_SIZE
+        rows followed, if ``total_size`` is not a multiple of the batch size,
+        by a single batch holding the remaining rows.
+        """
+        num_full_batches, remainder = divmod(total_size, TEST_BATCH_SIZE)
+        expected_sizes = [TEST_BATCH_SIZE] * num_full_batches
+        if remainder:
+            expected_sizes.append(remainder)
+        self.assertEqual([vecs.shape[0] for vecs in batches], expected_sizes)
+
+    def test_iterate_input_file_smaller_than_batch(self):
+        """
+        Tests when batch size is larger than the file size.
+        """
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            data_creator = TestDataCreator(
+                tempdir=tmpdirname,
+                dimension=DIMENSION,
+                data_type=np.float16,
+                file_sizes=SMALL_FILE_SIZES,
+            )
+            data_creator.create_test_data()
+            args = data_creator.setup_cli()
+            cfg = load_config(args.config)
+            db_iterator = create_dataset_from_oivf_config(
+                cfg, args.xb
+            ).iterate(0, TEST_BATCH_SIZE, np.float32)
+
+            self._assert_batch_sizes(list(db_iterator), sum(SMALL_FILE_SIZES))
+
+    def test_iterate_input_file_larger_than_batch(self):
+        """
+        Tests when batch size is smaller than the file size.
+        """
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            data_creator = TestDataCreator(
+                tempdir=tmpdirname,
+                dimension=DIMENSION,
+                data_type=np.float16,
+                file_sizes=LARGE_FILE_SIZES,
+            )
+            data_creator.create_test_data()
+            args = data_creator.setup_cli()
+            cfg = load_config(args.config)
+            db_iterator = create_dataset_from_oivf_config(
+                cfg, args.xb
+            ).iterate(0, TEST_BATCH_SIZE, np.float32)
+
+            # Every batch is checked, including the final partial one; the
+            # previous loop stopped after two batches and never reached it.
+            self._assert_batch_sizes(list(db_iterator), sum(LARGE_FILE_SIZES))
+
+    def test_get_vs_iterate(self) -> None:
+        """
+        Loads vectors with iterator and get, and checks that they match, non-aligned by file size case.
+        """
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            data_creator = TestDataCreator(
+                tempdir=tmpdirname,
+                dimension=DIMENSION,
+                data_type=np.float16,
+                file_size=SMALL_SAMPLE_SIZE,
+                num_files=NUM_FILES,
+                normalise=True,
+            )
+            data_creator.create_test_data()
+            args = data_creator.setup_cli()
+            cfg = load_config(args.config)
+            ds = create_dataset_from_oivf_config(cfg, args.xb)
+            # 317 does not divide the file size, so batches straddle files.
+            vecs_by_iterator = np.vstack(list(ds.iterate(0, 317, np.float32)))
+            self.assertEqual(
+                vecs_by_iterator.shape[0], SMALL_SAMPLE_SIZE * NUM_FILES
+            )
+            vecs_by_get = ds.get(list(range(vecs_by_iterator.shape[0])))
+            self.assertTrue(np.all(vecs_by_iterator == vecs_by_get))
+
+    def test_iterate_back(self) -> None:
+        """
+        Reads the dataset chunk by chunk, restarting the iterator at each chunk
+        offset and keeping only its first batch, and checks that the result
+        matches a single pass over the whole dataset.
+        """
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            data_creator = TestDataCreator(
+                tempdir=tmpdirname,
+                dimension=DIMENSION,
+                data_type=np.float16,
+                file_size=SMALL_SAMPLE_SIZE,
+                num_files=NUM_FILES,
+                normalise=True,
+            )
+            data_creator.create_test_data()
+            args = data_creator.setup_cli()
+            cfg = load_config(args.config)
+            ds = create_dataset_from_oivf_config(cfg, args.xb)
+            vecs_by_iterator = np.vstack(list(ds.iterate(0, 317, np.float32)))
+            self.assertEqual(
+                vecs_by_iterator.shape[0], SMALL_SAMPLE_SIZE * NUM_FILES
+            )
+            # 543 is used both as the restart stride and as the batch size.
+            vecs_chunk = np.vstack(
+                [
+                    next(ds.iterate(i, 543, np.float32))
+                    for i in range(0, SMALL_SAMPLE_SIZE * NUM_FILES, 543)
+                ]
+            )
+            self.assertTrue(np.all(vecs_by_iterator == vecs_chunk))
diff --git a/demos/offline_ivf/tests/test_offline_ivf.py b/demos/offline_ivf/tests/test_offline_ivf.py
new file mode 100644
index 0000000000..557a0b37dd
--- /dev/null
+++ b/demos/offline_ivf/tests/test_offline_ivf.py
@@ -0,0 +1,288 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import unittest
+from utils import load_config
+import pathlib as pl
+import tempfile
+from typing import List
+from tests.testing_utils import TestDataCreator
+from run import process_options_and_run_jobs
+
+# Path suffix appended to cfg["output"] in test_search to locate the search
+# result file that must exist after the "search" command.
+KNN_RESULTS_FILE: str = (
+    "/my_test_data_in_my_test_data/knn/I0000000000_IVF256_PQ4_np2.npy"
+)
+
+# File names test_evaluate_without_margin expects in the eval directory.
+# NOTE(review): the "IVF256_PQ4_np2" suffix presumably derives from that
+# test's index factory string and nprobe=2 -- confirm against the evaluate step.
+A_INDEX_FILES: List[str] = [
+    "I_a_gt.npy",
+    "D_a_gt.npy",
+    "vecs_a.npy",
+    "D_a_ann_IVF256_PQ4_np2.npy",
+    "I_a_ann_IVF256_PQ4_np2.npy",
+    "D_a_ann_refined_IVF256_PQ4_np2.npy",
+]
+
+# File names test_evaluate_without_margin_OPQ expects in the eval directory
+# (that test uses the "OPQ4,IVF256,PQ4" factory string and nprobe=200).
+A_INDEX_OPQ_FILES: List[str] = [
+    "I_a_gt.npy",
+    "D_a_gt.npy",
+    "vecs_a.npy",
+    "D_a_ann_OPQ4_IVF256_PQ4_np200.npy",
+    "I_a_ann_OPQ4_IVF256_PQ4_np200.npy",
+    "D_a_ann_refined_OPQ4_IVF256_PQ4_np200.npy",
+]
+
+
+class TestOIVF(unittest.TestCase):
+    """
+    Unit tests for the OIVF commands. Every test generates its own data and
+    config in a temporary directory through TestDataCreator, then runs one or
+    more commands with process_options_and_run_jobs.
+    """
+
+    def assert_file_exists(self, filepath: str) -> None:
+        """Fails, naming the path, unless ``filepath`` is an existing file."""
+        path = pl.Path(filepath)
+        self.assertTrue(path.is_file(), f"file not found: {path}")
+
+    def _run_commands(self, data_creator, commands):
+        """
+        Runs the given OIVF commands one after the other and returns the
+        parsed CLI arguments of the last command.
+        """
+        test_args = None
+        for command in commands:
+            test_args = data_creator.setup_cli(command)
+            process_options_and_run_jobs(test_args)
+        return test_args
+
+    def _assert_index_shards_exist(
+        self, cfg, xb_ds_size: int, index_shard_size: int
+    ) -> None:
+        """
+        Checks that one shard file exists for every ``index_shard_size`` slice
+        of the database, counting a trailing partial slice as a shard.
+        """
+        num_shards = xb_ds_size // index_shard_size
+        if xb_ds_size % index_shard_size != 0:
+            num_shards += 1
+        print(f"number of shards:{num_shards}")
+        shard_prefix = (
+            cfg["output"]
+            + "/my_test_data/"
+            + cfg["index"]["prod"][-1].replace(",", "_")
+        )
+        for i in range(num_shards):
+            self.assert_file_exists(shard_prefix + f".shard_{i}")
+
+    def _check_evaluate_without_margin(
+        self, index_factory: List[str], nprobe: int, expected_files: List[str]
+    ) -> None:
+        """
+        Runs train_index, index_shard, merge_index and evaluate on a database
+        plus queries dataset, then checks that every name in
+        ``expected_files`` exists in the evaluation directory.
+        """
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            data_creator = TestDataCreator(
+                tempdir=tmpdirname,
+                dimension=8,
+                data_type=np.float32,
+                index_factory=index_factory,
+                training_sample=9984,
+                num_files=3,
+                file_size=10000,
+                nprobe=nprobe,
+                k=2,
+                metric="METRIC_L2",
+                index_shard_size=10000,
+                query_batch_size=10000,
+                evaluation_sample=100,
+                with_queries_ds=True,
+            )
+            data_creator.create_test_data()
+            self._run_commands(
+                data_creator,
+                ["train_index", "index_shard", "merge_index", "evaluate"],
+            )
+            eval_dir = tmpdirname + "/my_queries_data_in_my_test_data/eval/"
+            for filename in expected_files:
+                self.assert_file_exists(eval_dir + filename)
+
+    def test_consistency_check(self) -> None:
+        """
+        Test the OIVF consistency check step, that it throws if no other steps have been ran.
+        """
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            data_creator = TestDataCreator(
+                tempdir=tmpdirname,
+                dimension=8,
+                data_type=np.float16,
+                index_factory=["OPQ4,IVF256,PQ4"],
+                training_sample=9984,
+                num_files=3,
+                file_size=10000,
+                nprobe=2,
+                k=2,
+                metric="METRIC_L2",
+            )
+            data_creator.create_test_data()
+            test_args = data_creator.setup_cli("consistency_check")
+            with self.assertRaises(AssertionError):
+                process_options_and_run_jobs(test_args)
+
+    def test_train_index(self) -> None:
+        """
+        Test the OIVF train index step, that it correctly produces the empty.faissindex template file.
+        """
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            data_creator = TestDataCreator(
+                tempdir=tmpdirname,
+                dimension=8,
+                data_type=np.float16,
+                index_factory=["OPQ4,IVF256,PQ4"],
+                training_sample=9984,
+                num_files=3,
+                file_size=10000,
+                nprobe=2,
+                k=2,
+                metric="METRIC_L2",
+            )
+            data_creator.create_test_data()
+            test_args = self._run_commands(data_creator, ["train_index"])
+            cfg = load_config(test_args.config)
+            empty_index = (
+                cfg["output"]
+                + "/my_test_data/"
+                + cfg["index"]["prod"][-1].replace(",", "_")
+                + ".empty.faissindex"
+            )
+            self.assert_file_exists(empty_index)
+
+    def test_index_shard_equal_file_sizes(self) -> None:
+        """
+        Test the case where the shard size is a divisor of the database size and it is equal to the first file size.
+        """
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            index_shard_size = 10000
+            num_files = 3
+            file_size = 10000
+            data_creator = TestDataCreator(
+                tempdir=tmpdirname,
+                dimension=8,
+                data_type=np.float16,
+                index_factory=["IVF256,PQ4"],
+                training_sample=9984,
+                num_files=num_files,
+                file_size=file_size,
+                nprobe=2,
+                k=2,
+                metric="METRIC_L2",
+                index_shard_size=index_shard_size,
+                query_batch_size=1000,
+                evaluation_sample=100,
+            )
+            data_creator.create_test_data()
+            test_args = self._run_commands(
+                data_creator, ["train_index", "index_shard"]
+            )
+            cfg = load_config(test_args.config)
+            self._assert_index_shards_exist(
+                cfg, num_files * file_size, index_shard_size
+            )
+
+    def test_index_shard_unequal_file_sizes(self) -> None:
+        """
+        Test the case where the shard size is not a divisor of the database size and is greater than the first file size.
+        """
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            file_sizes = [20000, 15001, 13990]
+            index_shard_size = 30000
+            data_creator = TestDataCreator(
+                tempdir=tmpdirname,
+                dimension=8,
+                data_type=np.float16,
+                index_factory=["IVF256,PQ4"],
+                training_sample=9984,
+                file_sizes=file_sizes,
+                nprobe=2,
+                k=2,
+                metric="METRIC_L2",
+                index_shard_size=index_shard_size,
+                evaluation_sample=100,
+            )
+            data_creator.create_test_data()
+            test_args = self._run_commands(
+                data_creator, ["train_index", "index_shard"]
+            )
+            cfg = load_config(test_args.config)
+            self._assert_index_shards_exist(
+                cfg, sum(file_sizes), index_shard_size
+            )
+
+    def test_search(self) -> None:
+        """
+        Test the search step after running the train_index and index_shard steps it depends on.
+        """
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            num_files = 3
+            file_size = 10000
+            query_batch_size = 10000
+            data_creator = TestDataCreator(
+                tempdir=tmpdirname,
+                dimension=8,
+                data_type=np.float32,
+                index_factory=["IVF256,PQ4"],
+                training_sample=9984,
+                num_files=num_files,
+                file_size=file_size,
+                nprobe=2,
+                k=2,
+                metric="METRIC_L2",
+                index_shard_size=10000,
+                query_batch_size=query_batch_size,
+                evaluation_sample=100,
+            )
+            data_creator.create_test_data()
+            test_args = self._run_commands(
+                data_creator, ["train_index", "index_shard", "search"]
+            )
+            cfg = load_config(test_args.config)
+            # TODO: add check that there are number of batches total of files
+            knn_file = cfg["output"] + KNN_RESULTS_FILE
+            self.assert_file_exists(knn_file)
+
+    def test_evaluate_without_margin(self) -> None:
+        """
+        Test evaluate step using test data objects, no margin evaluation, single index.
+        """
+        self._check_evaluate_without_margin(
+            index_factory=["IVF256,PQ4"],
+            nprobe=2,
+            expected_files=A_INDEX_FILES,
+        )
+
+    def test_evaluate_without_margin_OPQ(self) -> None:
+        """
+        Test evaluate step using test data objects, no margin evaluation, single index with an OPQ pre-transform.
+        """
+        self._check_evaluate_without_margin(
+            index_factory=["OPQ4,IVF256,PQ4"],
+            nprobe=200,
+            expected_files=A_INDEX_OPQ_FILES,
+        )
diff --git a/demos/offline_ivf/tests/testing_utils.py b/demos/offline_ivf/tests/testing_utils.py
new file mode 100644
index 0000000000..34751f278a
--- /dev/null
+++ b/demos/offline_ivf/tests/testing_utils.py
@@ -0,0 +1,180 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import yaml
+import numpy as np
+from typing import Dict, List, Optional
+
+# Option strings registered on the parser that TestDataCreator.setup_cli
+# builds through get_test_parser.
+OIVF_TEST_ARGS: List[str] = [
+    "--config",
+    "--xb",
+    "--xq",
+    "--command",
+    "--cluster_run",
+    "--no_residuals",
+]
+
+
+def get_test_parser(args) -> argparse.ArgumentParser:
+    """
+    Builds a parser with one argument per entry of ``args``.
+
+    Each entry is passed to ``add_argument`` on its own, so every argument
+    keeps argparse's default settings.
+    """
+    test_parser = argparse.ArgumentParser()
+    for option_string in args:
+        test_parser.add_argument(option_string)
+    return test_parser
+
+
+class TestDataCreator:
+    """
+    Writes random ``.npy`` vector files and a yaml config describing them into
+    ``tempdir``, and builds the CLI arguments that point at that config.
+    """
+
+    def __init__(
+        self,
+        tempdir: str,
+        dimension: int,
+        data_type: np.dtype,
+        index_factory: Optional[List] = None,
+        training_sample: Optional[int] = 9984,
+        index_shard_size: Optional[int] = 1000,
+        query_batch_size: Optional[int] = 1000,
+        evaluation_sample: Optional[int] = 100,
+        num_files: Optional[int] = None,
+        file_size: Optional[int] = None,
+        file_sizes: Optional[List] = None,
+        nprobe: Optional[int] = 64,
+        k: Optional[int] = 10,
+        metric: Optional[str] = "METRIC_L2",
+        normalise: Optional[bool] = False,
+        with_queries_ds: Optional[bool] = False,
+        evaluate_by_margin: Optional[bool] = False,
+    ) -> None:
+        """
+        Stores the settings later written to the config file.
+
+        File sizes come either from ``file_size`` and ``num_files`` (equally
+        sized files) or from the explicit ``file_sizes`` list; the former wins
+        when both are given. ``index_factory`` defaults to
+        ``["OPQ4,IVF256,PQ4"]``.
+
+        Raises:
+            ValueError: if neither way of specifying file sizes is used.
+        """
+        # A fresh list per instance: a list literal in the signature would be
+        # shared by every instance relying on the default.
+        if index_factory is None:
+            index_factory = ["OPQ4,IVF256,PQ4"]
+        self.tempdir = tempdir
+        self.dimension = dimension
+        # Kept as the dtype name so it can be written to the yaml config.
+        self.data_type = np.dtype(data_type).name
+        self.index_factory = {"prod": index_factory}
+        if file_size and num_files:
+            self.file_sizes = [file_size for _ in range(num_files)]
+        elif file_sizes:
+            self.file_sizes = file_sizes
+        else:
+            raise ValueError("no file sizes provided")
+        self.num_files = len(self.file_sizes)
+        self.training_sample = training_sample
+        self.index_shard_size = index_shard_size
+        self.query_batch_size = query_batch_size
+        self.evaluation_sample = evaluation_sample
+        self.nprobe = {"prod": [nprobe]}
+        self.k = k
+        self.metric = metric
+        self.normalise = normalise
+        self.config_file = self.tempdir + "/config_test.yaml"
+        self.ds_name = "my_test_data"
+        self.qs_name = "my_queries_data"
+        self.evaluate_by_margin = evaluate_by_margin
+        self.with_queries_ds = with_queries_ds
+
+    def create_test_data(self) -> None:
+        """
+        Writes the database files, plus the query files when
+        ``with_queries_ds`` is set, and then the yaml config describing them.
+        """
+        config_for_yaml = {
+            "d": self.dimension,
+            "output": self.tempdir,
+            "index": self.index_factory,
+            "nprobe": self.nprobe,
+            "k": self.k,
+            "normalise": self.normalise,
+            "metric": self.metric,
+            "training_sample": self.training_sample,
+            "evaluation_sample": self.evaluation_sample,
+            "index_shard_size": self.index_shard_size,
+            "query_batch_size": self.query_batch_size,
+            "datasets": {
+                self.ds_name: self._create_dataset_entry("my_data"),
+            },
+        }
+        if self.evaluate_by_margin:
+            config_for_yaml["evaluate_by_margin"] = self.evaluate_by_margin
+        # Query files are only generated when a queries dataset is requested;
+        # without one nothing in the config refers to them.
+        if self.with_queries_ds:
+            config_for_yaml["datasets"][
+                self.qs_name
+            ] = self._create_dataset_entry("my_q_data")
+
+        self._create_config_yaml(config_for_yaml)
+
+    def setup_cli(self, command="consistency_check") -> argparse.Namespace:
+        """
+        Returns parsed arguments selecting ``command`` on the test dataset,
+        with ``--xq`` added when a queries dataset was requested.
+        """
+        cli_args = [
+            "--xb",
+            self.ds_name,
+            "--config",
+            self.config_file,
+            "--command",
+            command,
+        ]
+        if self.with_queries_ds:
+            cli_args += ["--xq", self.qs_name]
+        return get_test_parser(OIVF_TEST_ARGS).parse_args(cli_args)
+
+    def _create_dataset_entry(self, name_of_file: str) -> Dict[str, object]:
+        """
+        Writes one data file per entry of ``self.file_sizes`` and returns the
+        config entry (root, total size, per-file info) describing them.
+        """
+        files_info = [
+            {
+                "dtype": self.data_type,
+                "format": "npy",
+                "name": filename,
+                "size": file_size,
+            }
+            for filename, file_size in zip(
+                self._create_data_files(name_of_file), self.file_sizes
+            )
+        ]
+        return {
+            "root": self.tempdir,
+            "size": sum(self.file_sizes),
+            "files": files_info,
+        }
+
+    def _create_data_files(self, name_of_file="my_data") -> List[str]:
+        """
+        Saves one ``.npy`` file of unseeded random vectors per entry of
+        ``self.file_sizes`` in the temp dir and returns the file names, which
+        are ``name_of_file`` followed by a two-digit, zero-padded index.
+        """
+        filenames = []
+        for i, file_size in enumerate(self.file_sizes):
+            db_vectors = np.random.random((file_size, self.dimension)).astype(
+                self.data_type
+            )
+            filename = name_of_file + f"{i:02}" + ".npy"
+            filenames.append(filename)
+            np.save(self.tempdir + "/" + filename, db_vectors)
+        return filenames
+
+    def _create_config_yaml(self, dict_file: Dict[str, object]) -> None:
+        """
+        Dumps ``dict_file`` as yaml to ``self.config_file``.
+        """
+        with open(self.config_file, "w") as file:
+            yaml.dump(dict_file, file, default_flow_style=False)
diff --git a/demos/offline_ivf/utils.py b/demos/offline_ivf/utils.py
new file mode 100644
index 0000000000..378af00c30
--- /dev/null
+++ b/demos/offline_ivf/utils.py
@@ -0,0 +1,94 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import os
+from typing import Dict
+import yaml
+import faiss
+from faiss.contrib.datasets import SyntheticDataset
+
+
+def load_config(config):
+    """
+    Parses the yaml file at path ``config`` with the safe loader and returns
+    the result. Fails with an AssertionError if the path does not exist.
+    """
+    config_found = os.path.exists(config)
+    assert config_found
+    with open(config, "r") as stream:
+        parsed = yaml.safe_load(stream)
+    return parsed
+
+
+def faiss_sanity_check():
+    """
+    Checks that a flat index returns the same 10 nearest neighbours, with
+    close distances, whether it is searched on CPU or through
+    ``faiss.index_cpu_to_all_gpus``.
+
+    Fails with an AssertionError ("faiss sanity check failed") on mismatch.
+    """
+    # NOTE(review): positional arguments presumably are d, nt, nb, nq --
+    # confirm against faiss.contrib.datasets.SyntheticDataset.
+    dataset = SyntheticDataset(256, 0, 100, 100)
+    queries = dataset.get_queries()
+    database = dataset.get_database()
+    cpu_index = faiss.IndexFlat(dataset.d)
+    # The GPU index is derived before any vector is added; the database is
+    # then added to both indexes separately.
+    gpu_index = faiss.index_cpu_to_all_gpus(cpu_index)
+    cpu_index.add(database)
+    gpu_index.add(database)
+    cpu_distances, cpu_ids = cpu_index.search(queries, 10)
+    gpu_distances, gpu_ids = gpu_index.search(queries, 10)
+    same_ids = np.all(cpu_ids == gpu_ids)
+    assert same_ids, "faiss sanity check failed"
+    close_distances = np.all(np.isclose(cpu_distances, gpu_distances))
+    assert close_distances, "faiss sanity check failed"
+
+
+def margin(sample, idx_a, idx_b, D_a_b, D_a, D_b, k, k_extract, threshold):
+    """
+    Computes the margin score of each (query, candidate) pair between two
+    datasets xa and xb, and prints the pairs scoring above ``threshold``.
+
+    With n = sample and nk = sample * k_extract, the shapes asserted below are:
+    idx_a - (n,) - query vector ids in xa
+    idx_b - (nk,) - candidate vector ids in xb, k_extract per query
+    D_a_b - (nk,) - pairwise distances between each query and its candidates
+    D_a - (n, k) - distances from each query to its k nearest neighbours
+    D_b - (nk, k) - distances from each candidate to its k nearest neighbours
+    k - number of nearest neighbours averaged for the margin
+    k_extract - number of candidates considered per query
+    threshold - margin threshold used only for the printed report
+
+    Returns the (nk,) array 2 * D_a_b / (mean(D_a) + mean(D_b)), where the
+    per-query mean of D_a is repeated for each of its k_extract candidates.
+    """
+    num_queries = sample
+    num_pairs = num_queries * k_extract
+
+    assert idx_a.shape == (num_queries,)
+    # One copy of the query id per candidate, so it lines up with idx_b.
+    idx_a_k = np.repeat(idx_a, k_extract)
+    assert idx_a_k.shape == (num_pairs,)
+    assert idx_b.shape == (num_pairs,)
+    assert D_a_b.shape == (num_pairs,)
+    assert D_a.shape == (num_queries, k)
+    assert D_b.shape == (num_pairs, k)
+
+    mean_a = np.mean(D_a, axis=1)
+    assert mean_a.shape == (num_queries,)
+    mean_a_k = np.repeat(mean_a, k_extract)
+    assert mean_a_k.shape == (num_pairs,)
+    mean_b = np.mean(D_b, axis=1)
+    assert mean_b.shape == (num_pairs,)
+
+    scores = 2 * D_a_b / (mean_a_k + mean_b)
+    selected = scores > threshold
+    # Report: count of selected pairs, then their query ids, candidate ids
+    # and scores.
+    for report_item in (
+        np.count_nonzero(selected),
+        idx_a_k[selected],
+        idx_b[selected],
+        scores[selected],
+    ):
+        print(report_item)
+    return scores
+
+
+def add_group_args(group, *args, **kwargs):
+    """
+    Forwards ``args`` and ``kwargs`` to ``group.add_argument`` and returns
+    whatever that call returns.
+    """
+    added = group.add_argument(*args, **kwargs)
+    return added
+
+
+def get_intersection_cardinality_frequencies(
+    I: np.ndarray, I_gt: np.ndarray
+) -> Dict[int, int]:
+    """
+    For each row, counts how many distinct neighbour ids ``I`` and ``I_gt``
+    have in common, then returns a mapping from each observed count to the
+    number of rows with that count.
+
+    Keys and values are the numpy scalars produced by ``np.unique``.
+    """
+    overlap_sizes = [
+        len(np.intersect1d(I[row, :], I_gt[row, :]))
+        for row in range(I.shape[0])
+    ]
+    sizes, frequencies = np.unique(overlap_sizes, return_counts=True)
+    return {size: frequency for size, frequency in zip(sizes, frequencies)}
+
+
+def is_pretransform_index(index):
+    """
+    Returns True when the class of ``index`` is exactly
+    ``faiss.IndexPreTransform``, False otherwise.
+
+    Also asserts that the index has a ``chain`` attribute in the first case
+    and lacks one in the second.
+    """
+    is_pretransform = index.__class__ == faiss.IndexPreTransform
+    has_chain = hasattr(index, "chain")
+    assert has_chain if is_pretransform else not has_chain
+    return True if is_pretransform else False
diff --git a/demos/rocksdb_ivf/CMakeLists.txt b/demos/rocksdb_ivf/CMakeLists.txt
new file mode 100644
index 0000000000..7bc3fe079c
--- /dev/null
+++ b/demos/rocksdb_ivf/CMakeLists.txt
@@ -0,0 +1,8 @@
+cmake_minimum_required(VERSION 3.17 FATAL_ERROR)
+project (ROCKSDB_IVF)
+
+# Default to a Debug build, but let an explicit -DCMAKE_BUILD_TYPE=... win
+# instead of silently overriding it.
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Debug)
+endif()
+
+# Both packages are mandatory; configuration stops if either is missing.
+find_package(faiss REQUIRED)
+find_package(RocksDB REQUIRED)
+
+# Single demo binary built from the demo driver and the RocksDB-backed
+# inverted lists implementation.
+add_executable(demo_rocksdb_ivf demo_rocksdb_ivf.cpp RocksDBInvertedLists.cpp)
+target_link_libraries(demo_rocksdb_ivf faiss RocksDB::rocksdb)
diff --git a/demos/rocksdb_ivf/README.md b/demos/rocksdb_ivf/README.md
new file mode 100644
index 0000000000..cf29ee2fde
--- /dev/null
+++ b/demos/rocksdb_ivf/README.md
@@ -0,0 +1,23 @@
+# Storing Faiss inverted lists in RocksDB
+
+Demo of storing the inverted lists of any IVF index in RocksDB or any similar key-value store which supports the prefix scan operation.
+
+# How to build
+
+We use conda to create the build environment for simplicity. Only tested on Linux x86.
+
+```
+conda create -n rocksdb_ivf
+conda activate rocksdb_ivf
+conda install pytorch::faiss-cpu conda-forge::rocksdb cmake make gxx_linux-64 sysroot_linux-64
+cd ~/faiss/demos/rocksdb_ivf
+cmake -B build .
+make -C build -j$(nproc)
+```
+
+# Run the example
+
+```
+cd ~/faiss/demos/rocksdb_ivf/build
+./demo_rocksdb_ivf test_db
+```
diff --git a/demos/rocksdb_ivf/RocksDBInvertedLists.cpp b/demos/rocksdb_ivf/RocksDBInvertedLists.cpp
new file mode 100644
index 0000000000..8d692f0b54
--- /dev/null
+++ b/demos/rocksdb_ivf/RocksDBInvertedLists.cpp
@@ -0,0 +1,109 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#include "RocksDBInvertedLists.h"
+
+#include <faiss/impl/FaissAssert.h>
+
+using namespace faiss;
+
+namespace faiss_rocksdb {
+
+// Storage layout used throughout this file: every inverted-list entry is one
+// RocksDB record whose key is the raw bytes of list_no (size_t) followed by
+// the raw bytes of the vector id (idx_t), and whose value is the code
+// (code_size bytes). All entries of a list therefore share a key prefix.
+
+/// Creates a RocksDB iterator and seeks it to the first key that is not
+/// smaller than the prefix made of the bytes of list_no.
+RocksDBInvertedListsIterator::RocksDBInvertedListsIterator(
+        rocksdb::DB* db,
+        size_t list_no,
+        size_t code_size)
+        : InvertedListsIterator(),
+          it(db->NewIterator(rocksdb::ReadOptions())),
+          list_no(list_no),
+          code_size(code_size),
+          codes(code_size) {
+    it->Seek(rocksdb::Slice(
+            reinterpret_cast<const char*>(&list_no), sizeof(size_t)));
+}
+
+/// True while the iterator points at a record whose key starts with the
+/// bytes of list_no, i.e. while there are entries of this list left.
+bool RocksDBInvertedListsIterator::is_available() const {
+    return it->Valid() &&
+            it->key().starts_with(rocksdb::Slice(
+                    reinterpret_cast<const char*>(&list_no), sizeof(size_t)));
+}
+
+/// Moves to the next record in key order.
+void RocksDBInvertedListsIterator::next() {
+    it->Next();
+}
+
+/// Returns the id decoded from the current key and a pointer to the current
+/// value. The pointer refers to memory owned by the RocksDB iterator.
+std::pair<idx_t, const uint8_t*> RocksDBInvertedListsIterator::
+        get_id_and_codes() {
+    assert(it->key().size() == sizeof(size_t) + sizeof(idx_t));
+    // Copy the id out of the key bytes instead of dereferencing a cast
+    // pointer: nothing guarantees the key buffer is aligned for idx_t.
+    idx_t id;
+    memcpy(&id, it->key().data() + sizeof(size_t), sizeof(idx_t));
+    assert(code_size == it->value().size());
+    return {id, reinterpret_cast<const uint8_t*>(it->value().data())};
+}
+
+/// Opens the database in db_directory, creating it if it does not exist.
+/// Throws a Faiss exception if RocksDB reports a failure.
+RocksDBInvertedLists::RocksDBInvertedLists(
+        const char* db_directory,
+        size_t nlist,
+        size_t code_size)
+        : InvertedLists(nlist, code_size) {
+    use_iterator = true;
+
+    rocksdb::Options options;
+    options.create_if_missing = true;
+    rocksdb::DB* db = nullptr;
+    rocksdb::Status status = rocksdb::DB::Open(options, db_directory, &db);
+    // Checked unconditionally (not with assert) so that a failed open is
+    // also detected when NDEBUG is defined, and before taking ownership.
+    FAISS_THROW_IF_NOT_FMT(
+            status.ok(),
+            "could not open RocksDB database %s: %s",
+            db_directory,
+            status.ToString().c_str());
+    db_.reset(db);
+}
+
+size_t RocksDBInvertedLists::list_size(size_t /*list_no*/) const {
+    FAISS_THROW_MSG("list_size is not supported");
+}
+
+const uint8_t* RocksDBInvertedLists::get_codes(size_t /*list_no*/) const {
+    FAISS_THROW_MSG("get_codes is not supported");
+}
+
+const idx_t* RocksDBInvertedLists::get_ids(size_t /*list_no*/) const {
+    FAISS_THROW_MSG("get_ids is not supported");
+}
+
+/// Writes n_entry records for list list_no, one Put per entry.
+/// Throws a Faiss exception if a write fails. The return value is always 0.
+size_t RocksDBInvertedLists::add_entries(
+        size_t list_no,
+        size_t n_entry,
+        const idx_t* ids,
+        const uint8_t* code) {
+    rocksdb::WriteOptions wo;
+    std::vector<char> key(sizeof(size_t) + sizeof(idx_t));
+    // The list_no prefix is the same for every entry; only the id part of
+    // the key is rewritten inside the loop.
+    memcpy(key.data(), &list_no, sizeof(size_t));
+    for (size_t i = 0; i < n_entry; i++) {
+        memcpy(key.data() + sizeof(size_t), ids + i, sizeof(idx_t));
+        rocksdb::Status status = db_->Put(
+                wo,
+                rocksdb::Slice(key.data(), key.size()),
+                rocksdb::Slice(
+                        reinterpret_cast<const char*>(code + i * code_size),
+                        code_size));
+        FAISS_THROW_IF_NOT_FMT(
+                status.ok(),
+                "could not write entry to RocksDB: %s",
+                status.ToString().c_str());
+    }
+    return 0; // ignored
+}
+
+void RocksDBInvertedLists::update_entries(
+        size_t /*list_no*/,
+        size_t /*offset*/,
+        size_t /*n_entry*/,
+        const idx_t* /*ids*/,
+        const uint8_t* /*code*/) {
+    FAISS_THROW_MSG("update_entries is not supported");
+}
+
+void RocksDBInvertedLists::resize(size_t /*list_no*/, size_t /*new_size*/) {
+    FAISS_THROW_MSG("resize is not supported");
+}
+
+/// Returns a newly allocated iterator over list list_no; the context pointer
+/// is not used.
+InvertedListsIterator* RocksDBInvertedLists::get_iterator(
+        size_t list_no,
+        void* /*inverted_list_context*/) const {
+    return new RocksDBInvertedListsIterator(db_.get(), list_no, code_size);
+}
+
+} // namespace faiss_rocksdb
diff --git a/demos/rocksdb_ivf/RocksDBInvertedLists.h b/demos/rocksdb_ivf/RocksDBInvertedLists.h
new file mode 100644
index 0000000000..f9d70a4f97
--- /dev/null
+++ b/demos/rocksdb_ivf/RocksDBInvertedLists.h
@@ -0,0 +1,60 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#pragma once
+
+#include <faiss/invlists/InvertedLists.h>
+
+#include <rocksdb/db.h>
+
+namespace faiss_rocksdb {
+
+struct RocksDBInvertedListsIterator : faiss::InvertedListsIterator {
+    RocksDBInvertedListsIterator(
+            rocksdb::DB* db,
+            size_t list_no,
+            size_t code_size);
+    bool is_available() const override;
+    void next() override;
+    std::pair<faiss::idx_t, const uint8_t*> get_id_and_codes() override;
+
+   private:
+    std::unique_ptr<rocksdb::Iterator> it;
+    size_t list_no;
+    size_t code_size;
+    std::vector<uint8_t> codes; // buffer for returning codes in next()
+};
+
+struct RocksDBInvertedLists : faiss::InvertedLists {
+    RocksDBInvertedLists(
+            const char* db_directory,
+            size_t nlist,
+            size_t code_size);
+
+    size_t list_size(size_t list_no) const override;            // unsupported
+    const uint8_t* get_codes(size_t list_no) const override;    // unsupported
+    const faiss::idx_t* get_ids(size_t list_no) const override; // unsupported
+
+    size_t add_entries(
+            size_t list_no,
+            size_t n_entry,
+            const faiss::idx_t* ids,
+            const uint8_t* code) override; // one Put per entry, returns 0
+
+    void update_entries(
+            size_t list_no,
+            size_t offset,
+            size_t n_entry,
+            const faiss::idx_t* ids,
+            const uint8_t* code) override; // unsupported
+
+    void resize(size_t list_no, size_t new_size) override; // unsupported
+
+    faiss::InvertedListsIterator* get_iterator(
+            size_t list_no,
+            void* inverted_list_context) const override; // context is unused
+
+   private:
+    std::unique_ptr<rocksdb::DB> db_; // owning handle to the database
+};
+
+} // namespace faiss_rocksdb
diff --git a/demos/rocksdb_ivf/demo_rocksdb_ivf.cpp b/demos/rocksdb_ivf/demo_rocksdb_ivf.cpp
new file mode 100644
index 0000000000..72cf39eb03
--- /dev/null
+++ b/demos/rocksdb_ivf/demo_rocksdb_ivf.cpp
@@ -0,0 +1,81 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#include <exception>
+#include <iostream>
+#include <memory>
+
+#include "RocksDBInvertedLists.h"
+
+#include <faiss/IndexFlat.h>
+#include <faiss/IndexIVFFlat.h>
+#include <faiss/impl/AuxIndexStructures.h>
+#include <faiss/impl/FaissException.h>
+#include <faiss/utils/random.h>
+
+using namespace faiss;
+
+int main(int argc, char* argv[]) {
+    // Demo: IVFFlat index whose inverted lists live in the RocksDB database
+    // at argv[1]. Returns 0 on success, -1 on bad usage or any exception.
+    try {
+        if (argc != 2) {
+            std::cerr << "missing db directory argument" << std::endl;
+            return -1;
+        }
+        size_t d = 128;
+        size_t nlist = 100;
+        IndexFlatL2 quantizer(d);
+        IndexIVFFlat index(&quantizer, d, nlist);
+        faiss_rocksdb::RocksDBInvertedLists ril(
+                argv[1], nlist, index.code_size);
+        index.replace_invlists(&ril, false);
+
+        idx_t nb = 10000;
+        std::vector<float> xb(d * nb);
+        float_rand(xb.data(), d * nb, 12345);
+        std::vector<idx_t> xids(nb);
+        std::iota(xids.begin(), xids.end(), 0);
+
+        index.train(nb, xb.data());
+        index.add_with_ids(nb, xb.data(), xids.data());
+
+        idx_t nq = 20; // nb;
+        index.nprobe = 2;
+
+        std::cout << "search" << std::endl;
+        idx_t k = 5;
+        std::vector<float> distances(nq * k);
+        std::vector<idx_t> labels(nq * k, -1);
+        index.search(
+                nq, xb.data(), k, distances.data(), labels.data(), nullptr);
+
+        for (idx_t iq = 0; iq < nq; iq++) {
+            std::cout << iq << ": ";
+            for (idx_t j = 0; j < k; j++) {
+                std::cout << labels[iq * k + j] << " " << distances[iq * k + j]
+                          << " | ";
+            }
+            std::cout << std::endl;
+        }
+
+        std::cout << std::endl << "range search" << std::endl;
+        float range = 15.0f;
+        RangeSearchResult result(nq);
+        index.range_search(nq, xb.data(), range, &result);
+
+        for (idx_t iq = 0; iq < nq; iq++) {
+            std::cout << iq << ": ";
+            for (auto j = result.lims[iq]; j < result.lims[iq + 1]; j++) {
+                std::cout << result.labels[j] << " " << result.distances[j]
+                          << " | ";
+            }
+            std::cout << std::endl;
+        }
+        return 0;
+    } catch (const std::exception& e) { // also covers FaissException
+        std::cerr << e.what() << '\n';
+    } catch (...) {
+        std::cerr << "Unrecognized exception!\n";
+    }
+    return -1; // only reached after an exception was reported
+}
diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt
index 27701586c8..1b0860f3fb 100644
--- a/faiss/CMakeLists.txt
+++ b/faiss/CMakeLists.txt
@@ -147,6 +147,7 @@ set(FAISS_HEADERS
   index_io.h
   impl/AdditiveQuantizer.h
   impl/AuxIndexStructures.h
+  impl/CodePacker.h
   impl/IDSelector.h
   impl/DistanceComputer.h
   impl/FaissAssert.h
@@ -182,6 +183,7 @@ set(FAISS_HEADERS
   invlists/InvertedLists.h
   invlists/InvertedListsIOHook.h
   utils/AlignedTable.h
+  utils/bf16.h
   utils/Heap.h
   utils/WorkerThread.h
   utils/distances.h
@@ -189,6 +191,7 @@ set(FAISS_HEADERS
   utils/extra_distances.h
   utils/fp16-fp16c.h
   utils/fp16-inl.h
+  utils/fp16-arm.h
   utils/fp16.h
   utils/hamming-inl.h
   utils/hamming.h
@@ -197,6 +200,7 @@ set(FAISS_HEADERS
   utils/prefetch.h
   utils/quantize_lut.h
   utils/random.h
+  utils/sorting.h
   utils/simdlib.h
   utils/simdlib_avx2.h
   utils/simdlib_emulated.h
@@ -229,7 +233,7 @@ set(FAISS_HEADERS ${FAISS_HEADERS} PARENT_SCOPE)
 add_library(faiss ${FAISS_SRC})
 
 add_library(faiss_avx2 ${FAISS_SRC})
-if(NOT FAISS_OPT_LEVEL STREQUAL "avx2")
+if(NOT FAISS_OPT_LEVEL STREQUAL "avx2" AND NOT FAISS_OPT_LEVEL STREQUAL "avx512")
   set_target_properties(faiss_avx2 PROPERTIES EXCLUDE_FROM_ALL TRUE)
 endif()
 if(NOT WIN32)
@@ -287,7 +291,10 @@ if(WIN32)
   target_compile_definitions(faiss_avx512 PRIVATE FAISS_MAIN_LIB)
 endif()
 
-target_compile_definitions(faiss PRIVATE FINTEGER=int)
+string(FIND "${CMAKE_CXX_FLAGS}" "FINTEGER" finteger_idx)
+if (${finteger_idx} EQUAL -1)
+  target_compile_definitions(faiss PRIVATE FINTEGER=int)
+endif()
 target_compile_definitions(faiss_avx2 PRIVATE FINTEGER=int)
 target_compile_definitions(faiss_avx512 PRIVATE FINTEGER=int)
 
@@ -328,7 +335,7 @@ if(FAISS_OPT_LEVEL STREQUAL "avx2")
   )
 endif()
 if(FAISS_OPT_LEVEL STREQUAL "avx512")
-  install(TARGETS faiss_avx512
+  install(TARGETS faiss_avx2 faiss_avx512
     EXPORT faiss-targets
     ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
     LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
diff --git a/faiss/Clustering.cpp b/faiss/Clustering.cpp
index 1fd33bb2e7..31955bd531 100644
--- a/faiss/Clustering.cpp
+++ b/faiss/Clustering.cpp
@@ -250,7 +250,7 @@ int split_clusters(
     return nsplit;
 }
 
-}; // namespace
+} // namespace
 
 void Clustering::train_encoded(
         idx_t nx,
@@ -617,7 +617,7 @@ void copy_columns(idx_t n, idx_t d1, const float* src, idx_t d2, float* dest) {
     }
 }
 
-}; // namespace
+} // namespace
 
 void ProgressiveDimClustering::train(
         idx_t n,
diff --git a/faiss/IVFlib.cpp b/faiss/IVFlib.cpp
index 91aa7af7f3..f2c975f4de 100644
--- a/faiss/IVFlib.cpp
+++ b/faiss/IVFlib.cpp
@@ -352,7 +352,10 @@ void search_with_parameters(
     const IndexIVF* index_ivf = dynamic_cast<const IndexIVF*>(index);
     FAISS_THROW_IF_NOT(index_ivf);
 
-    index_ivf->quantizer->search(n, x, params->nprobe, Dq.data(), Iq.data());
+    SearchParameters* quantizer_params =
+            (params) ? params->quantizer_params : nullptr;
+    index_ivf->quantizer->search(
+            n, x, params->nprobe, Dq.data(), Iq.data(), quantizer_params);
 
     if (nb_dis_ptr) {
         *nb_dis_ptr = count_ndis(index_ivf, n * params->nprobe, Iq.data());
diff --git a/faiss/Index.h b/faiss/Index.h
index 4b4b302b47..3d1bdb996a 100644
--- a/faiss/Index.h
+++ b/faiss/Index.h
@@ -17,8 +17,8 @@
 #include <typeinfo>
 
 #define FAISS_VERSION_MAJOR 1
-#define FAISS_VERSION_MINOR 7
-#define FAISS_VERSION_PATCH 4
+#define FAISS_VERSION_MINOR 8
+#define FAISS_VERSION_PATCH 0
 
 /**
  * @namespace faiss
diff --git a/faiss/IndexAdditiveQuantizer.cpp b/faiss/IndexAdditiveQuantizer.cpp
index 5bf06c4a4a..719dcafbc9 100644
--- a/faiss/IndexAdditiveQuantizer.cpp
+++ b/faiss/IndexAdditiveQuantizer.cpp
@@ -114,18 +114,19 @@ struct AQDistanceComputerLUT : FlatCodesDistanceComputer {
  * scanning implementation for search
  ************************************************************/
 
-template <class VectorDistance, class ResultHandler>
+template <class VectorDistance, class BlockResultHandler>
 void search_with_decompress(
         const IndexAdditiveQuantizer& ir,
         const float* xq,
         VectorDistance& vd,
-        ResultHandler& res) {
+        BlockResultHandler& res) {
     const uint8_t* codes = ir.codes.data();
     size_t ntotal = ir.ntotal;
     size_t code_size = ir.code_size;
     const AdditiveQuantizer* aq = ir.aq;
 
-    using SingleResultHandler = typename ResultHandler::SingleResultHandler;
+    using SingleResultHandler =
+            typename BlockResultHandler::SingleResultHandler;
 
 #pragma omp parallel for if (res.nq > 100)
     for (int64_t q = 0; q < res.nq; q++) {
@@ -142,11 +143,14 @@ void search_with_decompress(
     }
 }
 
-template <bool is_IP, AdditiveQuantizer::Search_type_t st, class ResultHandler>
+template <
+        bool is_IP,
+        AdditiveQuantizer::Search_type_t st,
+        class BlockResultHandler>
 void search_with_LUT(
         const IndexAdditiveQuantizer& ir,
         const float* xq,
-        ResultHandler& res) {
+        BlockResultHandler& res) {
     const AdditiveQuantizer& aq = *ir.aq;
     const uint8_t* codes = ir.codes.data();
     size_t ntotal = ir.ntotal;
@@ -154,7 +158,8 @@ void search_with_LUT(
     size_t nq = res.nq;
     size_t d = ir.d;
 
-    using SingleResultHandler = typename ResultHandler::SingleResultHandler;
+    using SingleResultHandler =
+            typename BlockResultHandler::SingleResultHandler;
     std::unique_ptr<float[]> LUT(new float[nq * aq.total_codebook_size]);
 
     aq.compute_LUT(nq, xq, LUT.get());
@@ -241,21 +246,23 @@ void IndexAdditiveQuantizer::search(
         if (metric_type == METRIC_L2) {
             using VD = VectorDistance<METRIC_L2>;
             VD vd = {size_t(d), metric_arg};
-            HeapResultHandler<VD::C> rh(n, distances, labels, k);
+            HeapBlockResultHandler<VD::C> rh(n, distances, labels, k);
             search_with_decompress(*this, x, vd, rh);
         } else if (metric_type == METRIC_INNER_PRODUCT) {
             using VD = VectorDistance<METRIC_INNER_PRODUCT>;
             VD vd = {size_t(d), metric_arg};
-            HeapResultHandler<VD::C> rh(n, distances, labels, k);
+            HeapBlockResultHandler<VD::C> rh(n, distances, labels, k);
             search_with_decompress(*this, x, vd, rh);
         }
     } else {
         if (metric_type == METRIC_INNER_PRODUCT) {
-            HeapResultHandler<CMin<float, idx_t>> rh(n, distances, labels, k);
+            HeapBlockResultHandler<CMin<float, idx_t>> rh(
+                    n, distances, labels, k);
             search_with_LUT<true, AdditiveQuantizer::ST_LUT_nonorm>(
                     *this, x, rh);
         } else {
-            HeapResultHandler<CMax<float, idx_t>> rh(n, distances, labels, k);
+            HeapBlockResultHandler<CMax<float, idx_t>> rh(
+                    n, distances, labels, k);
             switch (aq->search_type) {
 #define DISPATCH(st)                                                 \
     case AdditiveQuantizer::st:                                      \
diff --git a/faiss/IndexAdditiveQuantizerFastScan.cpp b/faiss/IndexAdditiveQuantizerFastScan.cpp
index 709ccc87e2..1ad4d60926 100644
--- a/faiss/IndexAdditiveQuantizerFastScan.cpp
+++ b/faiss/IndexAdditiveQuantizerFastScan.cpp
@@ -35,30 +35,30 @@ IndexAdditiveQuantizerFastScan::IndexAdditiveQuantizerFastScan(
 }
 
 void IndexAdditiveQuantizerFastScan::init(
-        AdditiveQuantizer* aq,
+        AdditiveQuantizer* aq_2,
         MetricType metric,
         int bbs) {
-    FAISS_THROW_IF_NOT(aq != nullptr);
-    FAISS_THROW_IF_NOT(!aq->nbits.empty());
-    FAISS_THROW_IF_NOT(aq->nbits[0] == 4);
+    FAISS_THROW_IF_NOT(aq_2 != nullptr);
+    FAISS_THROW_IF_NOT(!aq_2->nbits.empty());
+    FAISS_THROW_IF_NOT(aq_2->nbits[0] == 4);
     if (metric == METRIC_INNER_PRODUCT) {
         FAISS_THROW_IF_NOT_MSG(
-                aq->search_type == AdditiveQuantizer::ST_LUT_nonorm,
+                aq_2->search_type == AdditiveQuantizer::ST_LUT_nonorm,
                 "Search type must be ST_LUT_nonorm for IP metric");
     } else {
         FAISS_THROW_IF_NOT_MSG(
-                aq->search_type == AdditiveQuantizer::ST_norm_lsq2x4 ||
-                        aq->search_type == AdditiveQuantizer::ST_norm_rq2x4,
+                aq_2->search_type == AdditiveQuantizer::ST_norm_lsq2x4 ||
+                        aq_2->search_type == AdditiveQuantizer::ST_norm_rq2x4,
                 "Search type must be lsq2x4 or rq2x4 for L2 metric");
     }
 
-    this->aq = aq;
+    this->aq = aq_2;
     if (metric == METRIC_L2) {
-        M = aq->M + 2; // 2x4 bits AQ
+        M = aq_2->M + 2; // 2x4 bits AQ
     } else {
-        M = aq->M;
+        M = aq_2->M;
     }
-    init_fastscan(aq->d, M, 4, metric, bbs);
+    init_fastscan(aq_2->d, M, 4, metric, bbs);
 
     max_train_points = 1024 * ksub * M;
 }
@@ -203,9 +203,9 @@ void IndexAdditiveQuantizerFastScan::search(
 
     NormTableScaler scaler(norm_scale);
     if (metric_type == METRIC_L2) {
-        search_dispatch_implem<true>(n, x, k, distances, labels, scaler);
+        search_dispatch_implem<true>(n, x, k, distances, labels, &scaler);
     } else {
-        search_dispatch_implem<false>(n, x, k, distances, labels, scaler);
+        search_dispatch_implem<false>(n, x, k, distances, labels, &scaler);
     }
 }
 
diff --git a/faiss/IndexBinaryHNSW.cpp b/faiss/IndexBinaryHNSW.cpp
index e6fda8e4bf..f1bda08fbc 100644
--- a/faiss/IndexBinaryHNSW.cpp
+++ b/faiss/IndexBinaryHNSW.cpp
@@ -5,8 +5,6 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-// -*- c++ -*-
-
 #include <faiss/IndexBinaryHNSW.h>
 
 #include <omp.h>
@@ -28,6 +26,7 @@
 #include <faiss/impl/AuxIndexStructures.h>
 #include <faiss/impl/DistanceComputer.h>
 #include <faiss/impl/FaissAssert.h>
+#include <faiss/impl/ResultHandler.h>
 #include <faiss/utils/Heap.h>
 #include <faiss/utils/hamming.h>
 #include <faiss/utils/random.h>
@@ -201,27 +200,31 @@ void IndexBinaryHNSW::search(
             !params, "search params not supported for this index");
     FAISS_THROW_IF_NOT(k > 0);
 
+    // we use the buffer for distances as float but convert them back
+    // to int in the end
+    float* distances_f = (float*)distances;
+
+    using RH = HeapBlockResultHandler<HNSW::C>;
+    RH bres(n, distances_f, labels, k);
+
 #pragma omp parallel
     {
         VisitedTable vt(ntotal);
         std::unique_ptr<DistanceComputer> dis(get_distance_computer());
+        RH::SingleResultHandler res(bres);
 
 #pragma omp for
         for (idx_t i = 0; i < n; i++) {
-            idx_t* idxi = labels + i * k;
-            float* simi = (float*)(distances + i * k);
-
+            res.begin(i);
             dis->set_query((float*)(x + i * code_size));
-
-            maxheap_heapify(k, simi, idxi);
-            hnsw.search(*dis, k, idxi, simi, vt);
-            maxheap_reorder(k, simi, idxi);
+            hnsw.search(*dis, res, vt);
+            res.end();
         }
     }
 
 #pragma omp parallel for
     for (int i = 0; i < n * k; ++i) {
-        distances[i] = std::round(((float*)distances)[i]);
+        distances[i] = std::round(distances_f[i]);
     }
 }
 
diff --git a/faiss/IndexBinaryHash.cpp b/faiss/IndexBinaryHash.cpp
index 86a6d52ded..aca332d8d8 100644
--- a/faiss/IndexBinaryHash.cpp
+++ b/faiss/IndexBinaryHash.cpp
@@ -342,7 +342,6 @@ static void verify_shortlist(
         const std::unordered_set<idx_t>& shortlist,
         SearchResults& res) {
     size_t code_size = index->code_size;
-    size_t nlist = 0, ndis = 0, n0 = 0;
 
     HammingComputer hc(q, code_size);
     const uint8_t* codes = index->xb.data();
diff --git a/faiss/IndexBinaryIVF.cpp b/faiss/IndexBinaryIVF.cpp
index f49127f6b2..ab1b9fd89a 100644
--- a/faiss/IndexBinaryIVF.cpp
+++ b/faiss/IndexBinaryIVF.cpp
@@ -119,16 +119,16 @@ void IndexBinaryIVF::search(
     FAISS_THROW_IF_NOT(k > 0);
     FAISS_THROW_IF_NOT(nprobe > 0);
 
-    const size_t nprobe = std::min(nlist, this->nprobe);
-    std::unique_ptr<idx_t[]> idx(new idx_t[n * nprobe]);
-    std::unique_ptr<int32_t[]> coarse_dis(new int32_t[n * nprobe]);
+    const size_t nprobe_2 = std::min(nlist, this->nprobe);
+    std::unique_ptr<idx_t[]> idx(new idx_t[n * nprobe_2]);
+    std::unique_ptr<int32_t[]> coarse_dis(new int32_t[n * nprobe_2]);
 
     double t0 = getmillisecs();
-    quantizer->search(n, x, nprobe, coarse_dis.get(), idx.get());
+    quantizer->search(n, x, nprobe_2, coarse_dis.get(), idx.get());
     indexIVF_stats.quantization_time += getmillisecs() - t0;
 
     t0 = getmillisecs();
-    invlists->prefetch_lists(idx.get(), n * nprobe);
+    invlists->prefetch_lists(idx.get(), n * nprobe_2);
 
     search_preassigned(
             n, x, k, idx.get(), coarse_dis.get(), distances, labels, false);
@@ -169,16 +169,16 @@ void IndexBinaryIVF::search_and_reconstruct(
         const SearchParameters* params) const {
     FAISS_THROW_IF_NOT_MSG(
             !params, "search params not supported for this index");
-    const size_t nprobe = std::min(nlist, this->nprobe);
+    const size_t nprobe_2 = std::min(nlist, this->nprobe);
     FAISS_THROW_IF_NOT(k > 0);
-    FAISS_THROW_IF_NOT(nprobe > 0);
+    FAISS_THROW_IF_NOT(nprobe_2 > 0);
 
-    std::unique_ptr<idx_t[]> idx(new idx_t[n * nprobe]);
-    std::unique_ptr<int32_t[]> coarse_dis(new int32_t[n * nprobe]);
+    std::unique_ptr<idx_t[]> idx(new idx_t[n * nprobe_2]);
+    std::unique_ptr<int32_t[]> coarse_dis(new int32_t[n * nprobe_2]);
 
-    quantizer->search(n, x, nprobe, coarse_dis.get(), idx.get());
+    quantizer->search(n, x, nprobe_2, coarse_dis.get(), idx.get());
 
-    invlists->prefetch_lists(idx.get(), n * nprobe);
+    invlists->prefetch_lists(idx.get(), n * nprobe_2);
 
     // search_preassigned() with `store_pairs` enabled to obtain the list_no
     // and offset into `codes` for reconstruction
@@ -321,8 +321,8 @@ struct IVFBinaryScannerL2 : BinaryInvertedListScanner {
     }
 
     idx_t list_no;
-    void set_list(idx_t list_no, uint8_t /* coarse_dis */) override {
-        this->list_no = list_no;
+    void set_list(idx_t list_no_2, uint8_t /* coarse_dis */) override {
+        this->list_no = list_no_2;
     }
 
     uint32_t distance_to_code(const uint8_t* code) const override {
@@ -357,7 +357,6 @@ struct IVFBinaryScannerL2 : BinaryInvertedListScanner {
             const idx_t* __restrict ids,
             int radius,
             RangeQueryResult& result) const override {
-        size_t nup = 0;
         for (size_t j = 0; j < n; j++) {
             uint32_t dis = hc.hamming(codes);
             if (dis < radius) {
@@ -457,7 +456,7 @@ void search_knn_hamming_heap(
             }
 
         } // parallel for
-    }     // parallel
+    } // parallel
 
     indexIVF_stats.nq += n;
     indexIVF_stats.nlist += nlistv;
@@ -651,7 +650,6 @@ void search_knn_hamming_per_invlist(
     idx_t max_codes = params ? params->max_codes : ivf->max_codes;
     FAISS_THROW_IF_NOT(max_codes == 0);
     FAISS_THROW_IF_NOT(!store_pairs);
-    MetricType metric_type = ivf->metric_type;
 
     // reorder buckets
     std::vector<int64_t> lims(n + 1);
@@ -812,16 +810,16 @@ void IndexBinaryIVF::range_search(
         const SearchParameters* params) const {
     FAISS_THROW_IF_NOT_MSG(
             !params, "search params not supported for this index");
-    const size_t nprobe = std::min(nlist, this->nprobe);
-    std::unique_ptr<idx_t[]> idx(new idx_t[n * nprobe]);
-    std::unique_ptr<int32_t[]> coarse_dis(new int32_t[n * nprobe]);
+    const size_t nprobe_2 = std::min(nlist, this->nprobe);
+    std::unique_ptr<idx_t[]> idx(new idx_t[n * nprobe_2]);
+    std::unique_ptr<int32_t[]> coarse_dis(new int32_t[n * nprobe_2]);
 
     double t0 = getmillisecs();
-    quantizer->search(n, x, nprobe, coarse_dis.get(), idx.get());
+    quantizer->search(n, x, nprobe_2, coarse_dis.get(), idx.get());
     indexIVF_stats.quantization_time += getmillisecs() - t0;
 
     t0 = getmillisecs();
-    invlists->prefetch_lists(idx.get(), n * nprobe);
+    invlists->prefetch_lists(idx.get(), n * nprobe_2);
 
     range_search_preassigned(n, x, radius, idx.get(), coarse_dis.get(), res);
 
@@ -835,7 +833,7 @@ void IndexBinaryIVF::range_search_preassigned(
         const idx_t* __restrict assign,
         const int32_t* __restrict centroid_dis,
         RangeSearchResult* __restrict res) const {
-    const size_t nprobe = std::min(nlist, this->nprobe);
+    const size_t nprobe_2 = std::min(nlist, this->nprobe);
     bool store_pairs = false;
     size_t nlistv = 0, ndis = 0;
 
@@ -851,7 +849,7 @@ void IndexBinaryIVF::range_search_preassigned(
         all_pres[omp_get_thread_num()] = &pres;
 
         auto scan_list_func = [&](size_t i, size_t ik, RangeQueryResult& qres) {
-            idx_t key = assign[i * nprobe + ik]; /* select the list  */
+            idx_t key = assign[i * nprobe_2 + ik]; /* select the list  */
             if (key < 0)
                 return;
             FAISS_THROW_IF_NOT_FMT(
@@ -868,7 +866,7 @@ void IndexBinaryIVF::range_search_preassigned(
             InvertedLists::ScopedCodes scodes(invlists, key);
             InvertedLists::ScopedIds ids(invlists, key);
 
-            scanner->set_list(key, assign[i * nprobe + ik]);
+            scanner->set_list(key, assign[i * nprobe_2 + ik]);
             nlistv++;
             ndis += list_size;
             scanner->scan_codes_range(
@@ -881,7 +879,7 @@ void IndexBinaryIVF::range_search_preassigned(
 
             RangeQueryResult& qres = pres.new_result(i);
 
-            for (size_t ik = 0; ik < nprobe; ik++) {
+            for (size_t ik = 0; ik < nprobe_2; ik++) {
                 scan_list_func(i, ik, qres);
             }
         }
diff --git a/faiss/IndexFastScan.cpp b/faiss/IndexFastScan.cpp
index 02840767d1..529465da3e 100644
--- a/faiss/IndexFastScan.cpp
+++ b/faiss/IndexFastScan.cpp
@@ -37,22 +37,22 @@ inline size_t roundup(size_t a, size_t b) {
 
 void IndexFastScan::init_fastscan(
         int d,
-        size_t M,
-        size_t nbits,
+        size_t M_2,
+        size_t nbits_2,
         MetricType metric,
         int bbs) {
-    FAISS_THROW_IF_NOT(nbits == 4);
+    FAISS_THROW_IF_NOT(nbits_2 == 4);
     FAISS_THROW_IF_NOT(bbs % 32 == 0);
     this->d = d;
-    this->M = M;
-    this->nbits = nbits;
+    this->M = M_2;
+    this->nbits = nbits_2;
     this->metric_type = metric;
     this->bbs = bbs;
-    ksub = (1 << nbits);
+    ksub = (1 << nbits_2);
 
-    code_size = (M * nbits + 7) / 8;
+    code_size = (M_2 * nbits_2 + 7) / 8;
     ntotal = ntotal2 = 0;
-    M2 = roundup(M, 2);
+    M2 = roundup(M_2, 2);
     is_trained = false;
 }
 
@@ -158,7 +158,7 @@ void IndexFastScan::merge_from(Index& otherIndex, idx_t add_id) {
 
 namespace {
 
-template <class C, typename dis_t, class Scaler>
+template <class C, typename dis_t>
 void estimators_from_tables_generic(
         const IndexFastScan& index,
         const uint8_t* codes,
@@ -167,23 +167,27 @@ void estimators_from_tables_generic(
         size_t k,
         typename C::T* heap_dis,
         int64_t* heap_ids,
-        const Scaler& scaler) {
+        const NormTableScaler* scaler) {
     using accu_t = typename C::T;
 
     for (size_t j = 0; j < ncodes; ++j) {
         BitstringReader bsr(codes + j * index.code_size, index.code_size);
         accu_t dis = 0;
         const dis_t* dt = dis_table;
-        for (size_t m = 0; m < index.M - scaler.nscale; m++) {
+        int nscale = scaler ? scaler->nscale : 0;
+
+        for (size_t m = 0; m < index.M - nscale; m++) {
             uint64_t c = bsr.read(index.nbits);
             dis += dt[c];
             dt += index.ksub;
         }
 
-        for (size_t m = 0; m < scaler.nscale; m++) {
-            uint64_t c = bsr.read(index.nbits);
-            dis += scaler.scale_one(dt[c]);
-            dt += index.ksub;
+        if (nscale) {
+            for (size_t m = 0; m < nscale; m++) {
+                uint64_t c = bsr.read(index.nbits);
+                dis += scaler->scale_one(dt[c]);
+                dt += index.ksub;
+            }
         }
 
         if (C::cmp(heap_dis[0], dis)) {
@@ -193,6 +197,28 @@ void estimators_from_tables_generic(
     }
 }
 
+template <class C>
+ResultHandlerCompare<C, false>* make_knn_handler(
+        int impl,
+        idx_t n,
+        idx_t k,
+        size_t ntotal,
+        float* distances,
+        idx_t* labels,
+        const IDSelector* sel = nullptr) {
+    // k == 1 -> single-result handler; otherwise even impl -> heap handler,
+    // any other impl -> reservoir handler built with 2 * k. Allocated with new.
+    if (k == 1) {
+        return new SingleResultHandler<C, false>(
+                n, ntotal, distances, labels, sel);
+    }
+    if (impl % 2 == 0) {
+        return new HeapHandler<C, false>(n, ntotal, k, distances, labels, sel);
+    }
+    return new ReservoirHandler<C, false>(
+            n, ntotal, k, 2 * k, distances, labels, sel);
+}
+
 } // anonymous namespace
 
 using namespace quantize_lut;
@@ -241,22 +267,21 @@ void IndexFastScan::search(
             !params, "search params not supported for this index");
     FAISS_THROW_IF_NOT(k > 0);
 
-    DummyScaler scaler;
     if (metric_type == METRIC_L2) {
-        search_dispatch_implem<true>(n, x, k, distances, labels, scaler);
+        search_dispatch_implem<true>(n, x, k, distances, labels, nullptr);
     } else {
-        search_dispatch_implem<false>(n, x, k, distances, labels, scaler);
+        search_dispatch_implem<false>(n, x, k, distances, labels, nullptr);
     }
 }
 
-template <bool is_max, class Scaler>
+template <bool is_max>
 void IndexFastScan::search_dispatch_implem(
         idx_t n,
         const float* x,
         idx_t k,
         float* distances,
         idx_t* labels,
-        const Scaler& scaler) const {
+        const NormTableScaler* scaler) const {
     using Cfloat = typename std::conditional<
             is_max,
             CMax<float, int64_t>,
@@ -319,14 +344,14 @@ void IndexFastScan::search_dispatch_implem(
     }
 }
 
-template <class Cfloat, class Scaler>
+template <class Cfloat>
 void IndexFastScan::search_implem_234(
         idx_t n,
         const float* x,
         idx_t k,
         float* distances,
         idx_t* labels,
-        const Scaler& scaler) const {
+        const NormTableScaler* scaler) const {
     FAISS_THROW_IF_NOT(implem == 2 || implem == 3 || implem == 4);
 
     const size_t dim12 = ksub * M;
@@ -378,7 +403,7 @@ void IndexFastScan::search_implem_234(
     }
 }
 
-template <class C, class Scaler>
+template <class C>
 void IndexFastScan::search_implem_12(
         idx_t n,
         const float* x,
@@ -386,7 +411,8 @@ void IndexFastScan::search_implem_12(
         float* distances,
         idx_t* labels,
         int impl,
-        const Scaler& scaler) const {
+        const NormTableScaler* scaler) const {
+    using RH = ResultHandlerCompare<C, false>;
     FAISS_THROW_IF_NOT(bbs == 32);
 
     // handle qbs2 blocking by recursive call
@@ -432,63 +458,31 @@ void IndexFastScan::search_implem_12(
             pq4_pack_LUT_qbs(qbs, M2, quantized_dis_tables.get(), LUT.get());
     FAISS_THROW_IF_NOT(LUT_nq == n);
 
-    if (k == 1) {
-        SingleResultHandler<C> handler(n, ntotal);
-        if (skip & 4) {
-            // pass
-        } else {
-            handler.disable = bool(skip & 2);
-            pq4_accumulate_loop_qbs(
-                    qbs, ntotal2, M2, codes.get(), LUT.get(), handler, scaler);
-        }
+    std::unique_ptr<RH> handler(
+            make_knn_handler<C>(impl, n, k, ntotal, distances, labels));
+    handler->disable = bool(skip & 2);
+    handler->normalizers = normalizers.get();
 
-        handler.to_flat_arrays(distances, labels, normalizers.get());
-
-    } else if (impl == 12) {
-        std::vector<uint16_t> tmp_dis(n * k);
-        std::vector<int32_t> tmp_ids(n * k);
-
-        if (skip & 4) {
-            // skip
-        } else {
-            HeapHandler<C> handler(
-                    n, tmp_dis.data(), tmp_ids.data(), k, ntotal);
-            handler.disable = bool(skip & 2);
-
-            pq4_accumulate_loop_qbs(
-                    qbs, ntotal2, M2, codes.get(), LUT.get(), handler, scaler);
-
-            if (!(skip & 8)) {
-                handler.to_flat_arrays(distances, labels, normalizers.get());
-            }
-        }
-
-    } else { // impl == 13
-
-        ReservoirHandler<C> handler(n, ntotal, k, 2 * k);
-        handler.disable = bool(skip & 2);
-
-        if (skip & 4) {
-            // skip
-        } else {
-            pq4_accumulate_loop_qbs(
-                    qbs, ntotal2, M2, codes.get(), LUT.get(), handler, scaler);
-        }
-
-        if (!(skip & 8)) {
-            handler.to_flat_arrays(distances, labels, normalizers.get());
-        }
-
-        FastScan_stats.t0 += handler.times[0];
-        FastScan_stats.t1 += handler.times[1];
-        FastScan_stats.t2 += handler.times[2];
-        FastScan_stats.t3 += handler.times[3];
+    if (skip & 4) {
+        // pass
+    } else {
+        pq4_accumulate_loop_qbs(
+                qbs,
+                ntotal2,
+                M2,
+                codes.get(),
+                LUT.get(),
+                *handler.get(),
+                scaler);
+    }
+    if (!(skip & 8)) {
+        handler->end();
     }
 }
 
 FastScanStats FastScan_stats;
 
-template <class C, class Scaler>
+template <class C>
 void IndexFastScan::search_implem_14(
         idx_t n,
         const float* x,
@@ -496,7 +490,8 @@ void IndexFastScan::search_implem_14(
         float* distances,
         idx_t* labels,
         int impl,
-        const Scaler& scaler) const {
+        const NormTableScaler* scaler) const {
+    using RH = ResultHandlerCompare<C, false>;
     FAISS_THROW_IF_NOT(bbs % 32 == 0);
 
     int qbs2 = qbs == 0 ? 4 : qbs;
@@ -531,91 +526,29 @@ void IndexFastScan::search_implem_14(
     AlignedTable<uint8_t> LUT(n * dim12);
     pq4_pack_LUT(n, M2, quantized_dis_tables.get(), LUT.get());
 
-    if (k == 1) {
-        SingleResultHandler<C> handler(n, ntotal);
-        if (skip & 4) {
-            // pass
-        } else {
-            handler.disable = bool(skip & 2);
-            pq4_accumulate_loop(
-                    n,
-                    ntotal2,
-                    bbs,
-                    M2,
-                    codes.get(),
-                    LUT.get(),
-                    handler,
-                    scaler);
-        }
-        handler.to_flat_arrays(distances, labels, normalizers.get());
-
-    } else if (impl == 14) {
-        std::vector<uint16_t> tmp_dis(n * k);
-        std::vector<int32_t> tmp_ids(n * k);
-
-        if (skip & 4) {
-            // skip
-        } else if (k > 1) {
-            HeapHandler<C> handler(
-                    n, tmp_dis.data(), tmp_ids.data(), k, ntotal);
-            handler.disable = bool(skip & 2);
-
-            pq4_accumulate_loop(
-                    n,
-                    ntotal2,
-                    bbs,
-                    M2,
-                    codes.get(),
-                    LUT.get(),
-                    handler,
-                    scaler);
-
-            if (!(skip & 8)) {
-                handler.to_flat_arrays(distances, labels, normalizers.get());
-            }
-        }
-
-    } else { // impl == 15
-
-        ReservoirHandler<C> handler(n, ntotal, k, 2 * k);
-        handler.disable = bool(skip & 2);
-
-        if (skip & 4) {
-            // skip
-        } else {
-            pq4_accumulate_loop(
-                    n,
-                    ntotal2,
-                    bbs,
-                    M2,
-                    codes.get(),
-                    LUT.get(),
-                    handler,
-                    scaler);
-        }
+    std::unique_ptr<RH> handler(
+            make_knn_handler<C>(impl, n, k, ntotal, distances, labels));
+    handler->disable = bool(skip & 2);
+    handler->normalizers = normalizers.get();
 
-        if (!(skip & 8)) {
-            handler.to_flat_arrays(distances, labels, normalizers.get());
-        }
+    if (skip & 4) {
+        // pass
+    } else {
+        pq4_accumulate_loop(
+                n,
+                ntotal2,
+                bbs,
+                M2,
+                codes.get(),
+                LUT.get(),
+                *handler.get(),
+                scaler);
+    }
+    if (!(skip & 8)) {
+        handler->end();
     }
 }
 
-template void IndexFastScan::search_dispatch_implem<true, NormTableScaler>(
-        idx_t n,
-        const float* x,
-        idx_t k,
-        float* distances,
-        idx_t* labels,
-        const NormTableScaler& scaler) const;
-
-template void IndexFastScan::search_dispatch_implem<false, NormTableScaler>(
-        idx_t n,
-        const float* x,
-        idx_t k,
-        float* distances,
-        idx_t* labels,
-        const NormTableScaler& scaler) const;
-
 void IndexFastScan::reconstruct(idx_t key, float* recons) const {
     std::vector<uint8_t> code(code_size, 0);
     BitstringWriter bsw(code.data(), code_size);
diff --git a/faiss/IndexFastScan.h b/faiss/IndexFastScan.h
index 19aad2a8ee..3c89dcf928 100644
--- a/faiss/IndexFastScan.h
+++ b/faiss/IndexFastScan.h
@@ -13,6 +13,7 @@
 namespace faiss {
 
 struct CodePacker;
+struct NormTableScaler;
 
 /** Fast scan version of IndexPQ and IndexAQ. Works for 4-bit PQ and AQ for now.
  *
@@ -87,25 +88,25 @@ struct IndexFastScan : Index {
             uint8_t* lut,
             float* normalizers) const;
 
-    template <bool is_max, class Scaler>
+    template <bool is_max>
     void search_dispatch_implem(
             idx_t n,
             const float* x,
             idx_t k,
             float* distances,
             idx_t* labels,
-            const Scaler& scaler) const;
+            const NormTableScaler* scaler) const;
 
-    template <class Cfloat, class Scaler>
+    template <class Cfloat>
     void search_implem_234(
             idx_t n,
             const float* x,
             idx_t k,
             float* distances,
             idx_t* labels,
-            const Scaler& scaler) const;
+            const NormTableScaler* scaler) const;
 
-    template <class C, class Scaler>
+    template <class C>
     void search_implem_12(
             idx_t n,
             const float* x,
@@ -113,9 +114,9 @@ struct IndexFastScan : Index {
             float* distances,
             idx_t* labels,
             int impl,
-            const Scaler& scaler) const;
+            const NormTableScaler* scaler) const;
 
-    template <class C, class Scaler>
+    template <class C>
     void search_implem_14(
             idx_t n,
             const float* x,
@@ -123,7 +124,7 @@ struct IndexFastScan : Index {
             float* distances,
             idx_t* labels,
             int impl,
-            const Scaler& scaler) const;
+            const NormTableScaler* scaler) const;
 
     void reconstruct(idx_t key, float* recons) const override;
     size_t remove_ids(const IDSelector& sel) override;
diff --git a/faiss/IndexFlat.cpp b/faiss/IndexFlat.cpp
index f606f8e621..7d29ca5387 100644
--- a/faiss/IndexFlat.cpp
+++ b/faiss/IndexFlat.cpp
@@ -41,15 +41,19 @@ void IndexFlat::search(
     } else if (metric_type == METRIC_L2) {
         float_maxheap_array_t res = {size_t(n), size_t(k), labels, distances};
         knn_L2sqr(x, get_xb(), d, n, ntotal, &res, nullptr, sel);
-    } else if (is_similarity_metric(metric_type)) {
-        float_minheap_array_t res = {size_t(n), size_t(k), labels, distances};
-        knn_extra_metrics(
-                x, get_xb(), d, n, ntotal, metric_type, metric_arg, &res);
     } else {
-        FAISS_THROW_IF_NOT(!sel);
-        float_maxheap_array_t res = {size_t(n), size_t(k), labels, distances};
+        FAISS_THROW_IF_NOT(!sel); // TODO implement with selector
         knn_extra_metrics(
-                x, get_xb(), d, n, ntotal, metric_type, metric_arg, &res);
+                x,
+                get_xb(),
+                d,
+                n,
+                ntotal,
+                metric_type,
+                metric_arg,
+                k,
+                distances,
+                labels);
     }
 }
 
diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp
index 8c0e0afde8..8e5c654f04 100644
--- a/faiss/IndexHNSW.cpp
+++ b/faiss/IndexHNSW.cpp
@@ -5,8 +5,6 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-// -*- c++ -*-
-
 #include <faiss/IndexHNSW.h>
 
 #include <omp.h>
@@ -17,7 +15,10 @@
 #include <cstdlib>
 #include <cstring>
 
+#include <limits>
+#include <memory>
 #include <queue>
+#include <random>
 #include <unordered_set>
 
 #include <sys/stat.h>
@@ -29,7 +30,7 @@
 #include <faiss/IndexIVFPQ.h>
 #include <faiss/impl/AuxIndexStructures.h>
 #include <faiss/impl/FaissAssert.h>
-#include <faiss/utils/Heap.h>
+#include <faiss/impl/ResultHandler.h>
 #include <faiss/utils/distances.h>
 #include <faiss/utils/random.h>
 #include <faiss/utils/sorting.h>
@@ -68,52 +69,6 @@ HNSWStats hnsw_stats;
 
 namespace {
 
-/* Wrap the distance computer into one that negates the
-   distances. This makes supporting INNER_PRODUCE search easier */
-
-struct NegativeDistanceComputer : DistanceComputer {
-    /// owned by this
-    DistanceComputer* basedis;
-
-    explicit NegativeDistanceComputer(DistanceComputer* basedis)
-            : basedis(basedis) {}
-
-    void set_query(const float* x) override {
-        basedis->set_query(x);
-    }
-
-    /// compute distance of vector i to current query
-    float operator()(idx_t i) override {
-        return -(*basedis)(i);
-    }
-
-    void distances_batch_4(
-            const idx_t idx0,
-            const idx_t idx1,
-            const idx_t idx2,
-            const idx_t idx3,
-            float& dis0,
-            float& dis1,
-            float& dis2,
-            float& dis3) override {
-        basedis->distances_batch_4(
-                idx0, idx1, idx2, idx3, dis0, dis1, dis2, dis3);
-        dis0 = -dis0;
-        dis1 = -dis1;
-        dis2 = -dis2;
-        dis3 = -dis3;
-    }
-
-    /// compute distance between two stored vectors
-    float symmetric_dis(idx_t i, idx_t j) override {
-        return -basedis->symmetric_dis(i, j);
-    }
-
-    virtual ~NegativeDistanceComputer() {
-        delete basedis;
-    }
-};
-
 DistanceComputer* storage_distance_computer(const Index* storage) {
     if (is_similarity_metric(storage->metric_type)) {
         return new NegativeDistanceComputer(storage->get_distance_computer());
@@ -192,7 +147,9 @@ void hnsw_add_vertices(
 
         int i1 = n;
 
-        for (int pt_level = hist.size() - 1; pt_level >= 0; pt_level--) {
+        for (int pt_level = hist.size() - 1;
+             pt_level >= !index_hnsw.init_level0;
+             pt_level--) {
             int i0 = i1 - hist[pt_level];
 
             if (verbose) {
@@ -228,7 +185,13 @@ void hnsw_add_vertices(
                         continue;
                     }
 
-                    hnsw.add_with_locks(*dis, pt_level, pt_id, locks, vt);
+                    hnsw.add_with_locks(
+                            *dis,
+                            pt_level,
+                            pt_id,
+                            locks,
+                            vt,
+                            index_hnsw.keep_max_size_level0 && (pt_level == 0));
 
                     if (prev_display >= 0 && i - i0 > prev_display + 10000) {
                         prev_display = i - i0;
@@ -248,7 +211,11 @@ void hnsw_add_vertices(
             }
             i1 = i0;
         }
-        FAISS_ASSERT(i1 == 0);
+        if (index_hnsw.init_level0) {
+            FAISS_ASSERT(i1 == 0);
+        } else {
+            FAISS_ASSERT((i1 - hist[0]) == 0);
+        }
     }
     if (verbose) {
         printf("Done in %.3f ms\n", getmillisecs() - t0);
@@ -286,18 +253,21 @@ void IndexHNSW::train(idx_t n, const float* x) {
     is_trained = true;
 }
 
-void IndexHNSW::search(
+namespace {
+
+template <class BlockResultHandler>
+void hnsw_search(
+        const IndexHNSW* index,
         idx_t n,
         const float* x,
-        idx_t k,
-        float* distances,
-        idx_t* labels,
-        const SearchParameters* params_in) const {
-    FAISS_THROW_IF_NOT(k > 0);
+        BlockResultHandler& bres,
+        const SearchParameters* params_in) {
     FAISS_THROW_IF_NOT_MSG(
-            storage,
-            "Please use IndexHNSWFlat (or variants) instead of IndexHNSW directly");
+            index->storage,
+            "No storage index, please use IndexHNSWFlat (or variants) "
+            "instead of IndexHNSW directly");
     const SearchParametersHNSW* params = nullptr;
+    const HNSW& hnsw = index->hnsw;
 
     int efSearch = hnsw.efSearch;
     if (params_in) {
@@ -305,63 +275,81 @@ void IndexHNSW::search(
         FAISS_THROW_IF_NOT_MSG(params, "params type invalid");
         efSearch = params->efSearch;
     }
-    size_t n1 = 0, n2 = 0, n3 = 0, ndis = 0, nreorder = 0;
+    size_t n1 = 0, n2 = 0, ndis = 0;
 
-    idx_t check_period =
-            InterruptCallback::get_period_hint(hnsw.max_level * d * efSearch);
+    idx_t check_period = InterruptCallback::get_period_hint(
+            hnsw.max_level * index->d * efSearch);
 
     for (idx_t i0 = 0; i0 < n; i0 += check_period) {
         idx_t i1 = std::min(i0 + check_period, n);
 
 #pragma omp parallel
         {
-            VisitedTable vt(ntotal);
+            VisitedTable vt(index->ntotal);
+            typename BlockResultHandler::SingleResultHandler res(bres);
 
             std::unique_ptr<DistanceComputer> dis(
-                    storage_distance_computer(storage));
+                    storage_distance_computer(index->storage));
 
-#pragma omp for reduction(+ : n1, n2, n3, ndis, nreorder) schedule(guided)
+#pragma omp for reduction(+ : n1, n2, ndis) schedule(guided)
             for (idx_t i = i0; i < i1; i++) {
-                idx_t* idxi = labels + i * k;
-                float* simi = distances + i * k;
-                dis->set_query(x + i * d);
+                res.begin(i);
+                dis->set_query(x + i * index->d);
 
-                maxheap_heapify(k, simi, idxi);
-                HNSWStats stats = hnsw.search(*dis, k, idxi, simi, vt, params);
+                HNSWStats stats = hnsw.search(*dis, res, vt, params);
                 n1 += stats.n1;
                 n2 += stats.n2;
-                n3 += stats.n3;
                 ndis += stats.ndis;
-                nreorder += stats.nreorder;
-                maxheap_reorder(k, simi, idxi);
-
-                if (reconstruct_from_neighbors &&
-                    reconstruct_from_neighbors->k_reorder != 0) {
-                    int k_reorder = reconstruct_from_neighbors->k_reorder;
-                    if (k_reorder == -1 || k_reorder > k)
-                        k_reorder = k;
-
-                    nreorder += reconstruct_from_neighbors->compute_distances(
-                            k_reorder, idxi, x + i * d, simi);
-
-                    // sort top k_reorder
-                    maxheap_heapify(
-                            k_reorder, simi, idxi, simi, idxi, k_reorder);
-                    maxheap_reorder(k_reorder, simi, idxi);
-                }
+                res.end();
             }
         }
         InterruptCallback::check();
     }
 
-    if (is_similarity_metric(metric_type)) {
+    hnsw_stats.combine({n1, n2, ndis});
+}
+
+} // anonymous namespace
+
+void IndexHNSW::search(
+        idx_t n,
+        const float* x,
+        idx_t k,
+        float* distances,
+        idx_t* labels,
+        const SearchParameters* params_in) const {
+    FAISS_THROW_IF_NOT(k > 0);
+
+    using RH = HeapBlockResultHandler<HNSW::C>;
+    RH bres(n, distances, labels, k);
+
+    hnsw_search(this, n, x, bres, params_in);
+
+    if (is_similarity_metric(this->metric_type)) {
         // we need to revert the negated distances
         for (size_t i = 0; i < k * n; i++) {
             distances[i] = -distances[i];
         }
     }
+}
+
+void IndexHNSW::range_search(
+        idx_t n,
+        const float* x,
+        float radius,
+        RangeSearchResult* result,
+        const SearchParameters* params) const {
+    using RH = RangeSearchBlockResultHandler<HNSW::C>;
+    RH bres(result, radius);
+
+    hnsw_search(this, n, x, bres, params);
 
-    hnsw_stats.combine({n1, n2, n3, ndis, nreorder});
+    if (is_similarity_metric(this->metric_type)) {
+        // we need to revert the negated distances
+        for (size_t i = 0; i < result->lims[result->nq]; i++) {
+            result->distances[i] = -result->distances[i];
+        }
+    }
 }
 
 void IndexHNSW::add(idx_t n, const float* x) {
@@ -431,45 +419,59 @@ void IndexHNSW::search_level_0(
         float* distances,
         idx_t* labels,
         int nprobe,
-        int search_type) const {
+        int search_type,
+        const SearchParameters* params_in) const {
     FAISS_THROW_IF_NOT(k > 0);
     FAISS_THROW_IF_NOT(nprobe > 0);
 
+    const SearchParametersHNSW* params = nullptr;
+
+    if (params_in) {
+        params = dynamic_cast<const SearchParametersHNSW*>(params_in);
+        FAISS_THROW_IF_NOT_MSG(params, "params type invalid");
+    }
+
     storage_idx_t ntotal = hnsw.levels.size();
 
+    using RH = HeapBlockResultHandler<HNSW::C>;
+    RH bres(n, distances, labels, k);
+
 #pragma omp parallel
     {
         std::unique_ptr<DistanceComputer> qdis(
                 storage_distance_computer(storage));
         HNSWStats search_stats;
         VisitedTable vt(ntotal);
+        RH::SingleResultHandler res(bres);
 
 #pragma omp for
         for (idx_t i = 0; i < n; i++) {
-            idx_t* idxi = labels + i * k;
-            float* simi = distances + i * k;
-
+            res.begin(i);
             qdis->set_query(x + i * d);
-            maxheap_heapify(k, simi, idxi);
 
             hnsw.search_level_0(
                     *qdis.get(),
-                    k,
-                    idxi,
-                    simi,
+                    res,
                     nprobe,
                     nearest + i * nprobe,
                     nearest_d + i * nprobe,
                     search_type,
                     search_stats,
-                    vt);
-
+                    vt,
+                    params);
+            res.end();
             vt.advance();
-            maxheap_reorder(k, simi, idxi);
         }
 #pragma omp critical
         { hnsw_stats.combine(search_stats); }
     }
+    if (is_similarity_metric(this->metric_type)) {
+// we need to revert the negated distances
+#pragma omp parallel for
+        for (int64_t i = 0; i < k * n; i++) {
+            distances[i] = -distances[i];
+        }
+    }
 }
 
 void IndexHNSW::init_level_0_from_knngraph(
@@ -630,246 +632,6 @@ void IndexHNSW::permute_entries(const idx_t* perm) {
     hnsw.permute_entries(perm);
 }
 
-/**************************************************************
- * ReconstructFromNeighbors implementation
- **************************************************************/
-
-ReconstructFromNeighbors::ReconstructFromNeighbors(
-        const IndexHNSW& index,
-        size_t k,
-        size_t nsq)
-        : index(index), k(k), nsq(nsq) {
-    M = index.hnsw.nb_neighbors(0);
-    FAISS_ASSERT(k <= 256);
-    code_size = k == 1 ? 0 : nsq;
-    ntotal = 0;
-    d = index.d;
-    FAISS_ASSERT(d % nsq == 0);
-    dsub = d / nsq;
-    k_reorder = -1;
-}
-
-void ReconstructFromNeighbors::reconstruct(
-        storage_idx_t i,
-        float* x,
-        float* tmp) const {
-    const HNSW& hnsw = index.hnsw;
-    size_t begin, end;
-    hnsw.neighbor_range(i, 0, &begin, &end);
-
-    if (k == 1 || nsq == 1) {
-        const float* beta;
-        if (k == 1) {
-            beta = codebook.data();
-        } else {
-            int idx = codes[i];
-            beta = codebook.data() + idx * (M + 1);
-        }
-
-        float w0 = beta[0]; // weight of image itself
-        index.storage->reconstruct(i, tmp);
-
-        for (int l = 0; l < d; l++)
-            x[l] = w0 * tmp[l];
-
-        for (size_t j = begin; j < end; j++) {
-            storage_idx_t ji = hnsw.neighbors[j];
-            if (ji < 0)
-                ji = i;
-            float w = beta[j - begin + 1];
-            index.storage->reconstruct(ji, tmp);
-            for (int l = 0; l < d; l++)
-                x[l] += w * tmp[l];
-        }
-    } else if (nsq == 2) {
-        int idx0 = codes[2 * i];
-        int idx1 = codes[2 * i + 1];
-
-        const float* beta0 = codebook.data() + idx0 * (M + 1);
-        const float* beta1 = codebook.data() + (idx1 + k) * (M + 1);
-
-        index.storage->reconstruct(i, tmp);
-
-        float w0;
-
-        w0 = beta0[0];
-        for (int l = 0; l < dsub; l++)
-            x[l] = w0 * tmp[l];
-
-        w0 = beta1[0];
-        for (int l = dsub; l < d; l++)
-            x[l] = w0 * tmp[l];
-
-        for (size_t j = begin; j < end; j++) {
-            storage_idx_t ji = hnsw.neighbors[j];
-            if (ji < 0)
-                ji = i;
-            index.storage->reconstruct(ji, tmp);
-            float w;
-            w = beta0[j - begin + 1];
-            for (int l = 0; l < dsub; l++)
-                x[l] += w * tmp[l];
-
-            w = beta1[j - begin + 1];
-            for (int l = dsub; l < d; l++)
-                x[l] += w * tmp[l];
-        }
-    } else {
-        std::vector<const float*> betas(nsq);
-        {
-            const float* b = codebook.data();
-            const uint8_t* c = &codes[i * code_size];
-            for (int sq = 0; sq < nsq; sq++) {
-                betas[sq] = b + (*c++) * (M + 1);
-                b += (M + 1) * k;
-            }
-        }
-
-        index.storage->reconstruct(i, tmp);
-        {
-            int d0 = 0;
-            for (int sq = 0; sq < nsq; sq++) {
-                float w = *(betas[sq]++);
-                int d1 = d0 + dsub;
-                for (int l = d0; l < d1; l++) {
-                    x[l] = w * tmp[l];
-                }
-                d0 = d1;
-            }
-        }
-
-        for (size_t j = begin; j < end; j++) {
-            storage_idx_t ji = hnsw.neighbors[j];
-            if (ji < 0)
-                ji = i;
-
-            index.storage->reconstruct(ji, tmp);
-            int d0 = 0;
-            for (int sq = 0; sq < nsq; sq++) {
-                float w = *(betas[sq]++);
-                int d1 = d0 + dsub;
-                for (int l = d0; l < d1; l++) {
-                    x[l] += w * tmp[l];
-                }
-                d0 = d1;
-            }
-        }
-    }
-}
-
-void ReconstructFromNeighbors::reconstruct_n(
-        storage_idx_t n0,
-        storage_idx_t ni,
-        float* x) const {
-#pragma omp parallel
-    {
-        std::vector<float> tmp(index.d);
-#pragma omp for
-        for (storage_idx_t i = 0; i < ni; i++) {
-            reconstruct(n0 + i, x + i * index.d, tmp.data());
-        }
-    }
-}
-
-size_t ReconstructFromNeighbors::compute_distances(
-        size_t n,
-        const idx_t* shortlist,
-        const float* query,
-        float* distances) const {
-    std::vector<float> tmp(2 * index.d);
-    size_t ncomp = 0;
-    for (int i = 0; i < n; i++) {
-        if (shortlist[i] < 0)
-            break;
-        reconstruct(shortlist[i], tmp.data(), tmp.data() + index.d);
-        distances[i] = fvec_L2sqr(query, tmp.data(), index.d);
-        ncomp++;
-    }
-    return ncomp;
-}
-
-void ReconstructFromNeighbors::get_neighbor_table(storage_idx_t i, float* tmp1)
-        const {
-    const HNSW& hnsw = index.hnsw;
-    size_t begin, end;
-    hnsw.neighbor_range(i, 0, &begin, &end);
-    size_t d = index.d;
-
-    index.storage->reconstruct(i, tmp1);
-
-    for (size_t j = begin; j < end; j++) {
-        storage_idx_t ji = hnsw.neighbors[j];
-        if (ji < 0)
-            ji = i;
-        index.storage->reconstruct(ji, tmp1 + (j - begin + 1) * d);
-    }
-}
-
-/// called by add_codes
-void ReconstructFromNeighbors::estimate_code(
-        const float* x,
-        storage_idx_t i,
-        uint8_t* code) const {
-    // fill in tmp table with the neighbor values
-    std::unique_ptr<float[]> tmp1(new float[d * (M + 1) + (d * k)]);
-    float* tmp2 = tmp1.get() + d * (M + 1);
-
-    // collect coordinates of base
-    get_neighbor_table(i, tmp1.get());
-
-    for (size_t sq = 0; sq < nsq; sq++) {
-        int d0 = sq * dsub;
-
-        {
-            FINTEGER ki = k, di = d, m1 = M + 1;
-            FINTEGER dsubi = dsub;
-            float zero = 0, one = 1;
-
-            sgemm_("N",
-                   "N",
-                   &dsubi,
-                   &ki,
-                   &m1,
-                   &one,
-                   tmp1.get() + d0,
-                   &di,
-                   codebook.data() + sq * (m1 * k),
-                   &m1,
-                   &zero,
-                   tmp2,
-                   &dsubi);
-        }
-
-        float min = HUGE_VAL;
-        int argmin = -1;
-        for (size_t j = 0; j < k; j++) {
-            float dis = fvec_L2sqr(x + d0, tmp2 + j * dsub, dsub);
-            if (dis < min) {
-                min = dis;
-                argmin = j;
-            }
-        }
-        code[sq] = argmin;
-    }
-}
-
-void ReconstructFromNeighbors::add_codes(size_t n, const float* x) {
-    if (k == 1) { // nothing to encode
-        ntotal += n;
-        return;
-    }
-    codes.resize(codes.size() + code_size * n);
-#pragma omp parallel for
-    for (int i = 0; i < n; i++) {
-        estimate_code(
-                x + i * index.d,
-                ntotal + i,
-                codes.data() + (ntotal + i) * code_size);
-    }
-    ntotal += n;
-    FAISS_ASSERT(codes.size() == ntotal * code_size);
-}
-
 /**************************************************************
  * IndexHNSWFlat implementation
  **************************************************************/
@@ -953,7 +715,6 @@ int search_from_candidates_2(
         int level,
         int nres_in = 0) {
     int nres = nres_in;
-    int ndis = 0;
     for (int i = 0; i < candidates.size(); i++) {
         idx_t v1 = candidates.ids[i];
         FAISS_ASSERT(v1 >= 0);
@@ -976,7 +737,6 @@ int search_from_candidates_2(
             if (vt.visited[v1] == vt.visno + 1) {
                 // nothing to do
             } else {
-                ndis++;
                 float d = qdis(v1);
                 candidates.push(v1, d);
 
@@ -1022,7 +782,7 @@ void IndexHNSW2Level::search(
         IndexHNSW::search(n, x, k, distances, labels);
 
     } else { // "mixed" search
-        size_t n1 = 0, n2 = 0, n3 = 0, ndis = 0, nreorder = 0;
+        size_t n1 = 0, n2 = 0, ndis = 0;
 
         const IndexIVFPQ* index_ivfpq =
                 dynamic_cast<const IndexIVFPQ*>(storage);
@@ -1054,7 +814,7 @@ void IndexHNSW2Level::search(
             int candidates_size = hnsw.upper_beam;
             MinimaxHeap candidates(candidates_size);
 
-#pragma omp for reduction(+ : n1, n2, n3, ndis, nreorder)
+#pragma omp for reduction(+ : n1, n2, ndis)
             for (idx_t i = 0; i < n; i++) {
                 idx_t* idxi = labels + i * k;
                 float* simi = distances + i * k;
@@ -1099,9 +859,7 @@ void IndexHNSW2Level::search(
                         k);
                 n1 += search_stats.n1;
                 n2 += search_stats.n2;
-                n3 += search_stats.n3;
                 ndis += search_stats.ndis;
-                nreorder += search_stats.nreorder;
 
                 vt.advance();
                 vt.advance();
@@ -1110,7 +868,7 @@ void IndexHNSW2Level::search(
             }
         }
 
-        hnsw_stats.combine({n1, n2, n3, ndis, nreorder});
+        hnsw_stats.combine({n1, n2, ndis});
     }
 }
 
@@ -1136,4 +894,86 @@ void IndexHNSW2Level::flip_to_ivf() {
     delete storage2l;
 }
 
+/**************************************************************
+ * IndexHNSWCagra implementation
+ **************************************************************/
+
+IndexHNSWCagra::IndexHNSWCagra() {
+    is_trained = true;
+}
+
+IndexHNSWCagra::IndexHNSWCagra(int d, int M, MetricType metric)
+        : IndexHNSW(
+                  (metric == METRIC_L2)
+                          ? static_cast<IndexFlat*>(new IndexFlatL2(d))
+                          : static_cast<IndexFlat*>(new IndexFlatIP(d)),
+                  M) {
+    own_fields = true; // set first so storage is presumably freed on throw
+    FAISS_THROW_IF_NOT_MSG(
+            ((metric == METRIC_L2) || (metric == METRIC_INNER_PRODUCT)),
+            "unsupported metric type for IndexHNSWCagra");
+    is_trained = true;
+    init_level0 = true;
+    keep_max_size_level0 = true;
+}
+
+void IndexHNSWCagra::add(idx_t n, const float* x) {
+    FAISS_THROW_IF_NOT_MSG(
+            !base_level_only,
+            "Cannot add vectors when base_level_only is set to True");
+
+    IndexHNSW::add(n, x);
+}
+
+void IndexHNSWCagra::search(
+        idx_t n,
+        const float* x,
+        idx_t k,
+        float* distances,
+        idx_t* labels,
+        const SearchParameters* params) const {
+    if (!base_level_only) {
+        IndexHNSW::search(n, x, k, distances, labels, params);
+    } else {
+        std::vector<storage_idx_t> nearest(n);
+        std::vector<float> nearest_d(n);
+        // per query: keep the closest of a few random points as entry point
+#pragma omp for
+        for (idx_t i = 0; i < n; i++) {
+            std::unique_ptr<DistanceComputer> dis(
+                    storage_distance_computer(this->storage));
+            dis->set_query(x + i * d);
+            nearest[i] = -1;
+            nearest_d[i] = std::numeric_limits<float>::max();
+            // candidate ids are drawn from the inclusive range [0, ntotal - 1]
+            FAISS_THROW_IF_NOT_MSG(ntotal > 0, "index is empty");
+            std::random_device rd;
+            std::mt19937 gen(rd());
+            std::uniform_int_distribution<idx_t> distrib(0, this->ntotal - 1);
+            for (idx_t j = 0; j < num_base_level_search_entrypoints; j++) {
+                auto idx = distrib(gen);
+                auto distance = (*dis)(idx);
+                if (distance < nearest_d[i]) {
+                    nearest[i] = idx;
+                    nearest_d[i] = distance;
+                }
+            }
+            FAISS_THROW_IF_NOT_MSG(
+                    nearest[i] >= 0, "Could not find a valid entrypoint.");
+        }
+
+        search_level_0(
+                n,
+                x,
+                k,
+                nearest.data(),
+                nearest_d.data(),
+                distances,
+                labels,
+                1, // n_probes
+                1, // search_type
+                params);
+    }
+}
+
 } // namespace faiss
diff --git a/faiss/IndexHNSW.h b/faiss/IndexHNSW.h
index 13855d3037..71807c6537 100644
--- a/faiss/IndexHNSW.h
+++ b/faiss/IndexHNSW.h
@@ -21,49 +21,6 @@ namespace faiss {
 
 struct IndexHNSW;
 
-struct ReconstructFromNeighbors {
-    typedef HNSW::storage_idx_t storage_idx_t;
-
-    const IndexHNSW& index;
-    size_t M;   // number of neighbors
-    size_t k;   // number of codebook entries
-    size_t nsq; // number of subvectors
-    size_t code_size;
-    int k_reorder; // nb to reorder. -1 = all
-
-    std::vector<float> codebook; // size nsq * k * (M + 1)
-
-    std::vector<uint8_t> codes; // size ntotal * code_size
-    size_t ntotal;
-    size_t d, dsub; // derived values
-
-    explicit ReconstructFromNeighbors(
-            const IndexHNSW& index,
-            size_t k = 256,
-            size_t nsq = 1);
-
-    /// codes must be added in the correct order and the IndexHNSW
-    /// must be populated and sorted
-    void add_codes(size_t n, const float* x);
-
-    size_t compute_distances(
-            size_t n,
-            const idx_t* shortlist,
-            const float* query,
-            float* distances) const;
-
-    /// called by add_codes
-    void estimate_code(const float* x, storage_idx_t i, uint8_t* code) const;
-
-    /// called by compute_distances
-    void reconstruct(storage_idx_t i, float* x, float* tmp) const;
-
-    void reconstruct_n(storage_idx_t n0, storage_idx_t ni, float* x) const;
-
-    /// get the M+1 -by-d table for neighbor coordinates for vector i
-    void get_neighbor_table(storage_idx_t i, float* out) const;
-};
-
 /** The HNSW index is a normal random-access index with a HNSW
  * link structure built on top */
 
@@ -77,7 +34,17 @@ struct IndexHNSW : Index {
     bool own_fields = false;
     Index* storage = nullptr;
 
-    ReconstructFromNeighbors* reconstruct_from_neighbors = nullptr;
+    // When set to false, level 0 in the knn graph is not initialized.
+    // This option is used by GpuIndexCagra::copyTo(IndexHNSWCagra*)
+    // as level 0 knn graph is copied over from the index built by
+    // GpuIndexCagra.
+    bool init_level0 = true;
+
+    // When set to true, all neighbors in level 0 are filled up
+    // to the maximum size allowed (2 * M). This option is used by
+    // IndexHNSWCagra to create a full base layer graph that is
+    // used when GpuIndexCagra::copyFrom(IndexHNSWCagra*) is invoked.
+    bool keep_max_size_level0 = false;
 
     explicit IndexHNSW(int d = 0, int M = 32, MetricType metric = METRIC_L2);
     explicit IndexHNSW(Index* storage, int M = 32);
@@ -98,6 +65,13 @@ struct IndexHNSW : Index {
             idx_t* labels,
             const SearchParameters* params = nullptr) const override;
 
+    void range_search(
+            idx_t n,
+            const float* x,
+            float radius,
+            RangeSearchResult* result,
+            const SearchParameters* params = nullptr) const override;
+
     void reconstruct(idx_t key, float* recons) const override;
 
     void reset() override;
@@ -119,7 +93,8 @@ struct IndexHNSW : Index {
             float* distances,
             idx_t* labels,
             int nprobe = 1,
-            int search_type = 1) const;
+            int search_type = 1,
+            const SearchParameters* params = nullptr) const;
 
     /// alternative graph building
     void init_level_0_from_knngraph(int k, const float* D, const idx_t* I);
@@ -186,4 +161,33 @@ struct IndexHNSW2Level : IndexHNSW {
             const SearchParameters* params = nullptr) const override;
 };
 
+struct IndexHNSWCagra : IndexHNSW { // HNSW index whose base level can hold a GpuIndexCagra knn graph
+    IndexHNSWCagra();
+    IndexHNSWCagra(int d, int M, MetricType metric = METRIC_L2);
+
+    /// When set to true, the index is immutable.
+    /// This option is used to copy the knn graph from GpuIndexCagra
+    /// to the base level of IndexHNSWCagra without adding upper levels.
+    /// Doing so makes it possible to search the HNSW index, but removes
+    /// the ability to add vectors.
+    bool base_level_only = false;
+
+    /// When `base_level_only` is set to `true`, the search function
+    /// searches only the base level knn graph of the HNSW index.
+    /// The entry point is then chosen by randomly selecting some points
+    /// (presumably this many -- TODO confirm) and using the best one.
+    int num_base_level_search_entrypoints = 32;
+
+    void add(idx_t n, const float* x) override;
+
+    /// k-nearest-neighbor search entry point (overrides the base-class search)
+    void search(
+            idx_t n,
+            const float* x,
+            idx_t k,
+            float* distances,
+            idx_t* labels,
+            const SearchParameters* params = nullptr) const override;
+};
+
 } // namespace faiss
diff --git a/faiss/IndexIDMap.cpp b/faiss/IndexIDMap.cpp
index 7972bec9a0..dc84052b2f 100644
--- a/faiss/IndexIDMap.cpp
+++ b/faiss/IndexIDMap.cpp
@@ -90,10 +90,10 @@ struct ScopedSelChange {
     SearchParameters* params = nullptr;
     IDSelector* old_sel = nullptr;
 
-    void set(SearchParameters* params, IDSelector* new_sel) {
-        this->params = params;
-        old_sel = params->sel;
-        params->sel = new_sel;
+    void set(SearchParameters* params_2, IDSelector* new_sel) {
+        this->params = params_2;
+        old_sel = params_2->sel;
+        params_2->sel = new_sel;
     }
     ~ScopedSelChange() {
         if (params) {
@@ -146,9 +146,16 @@ void IndexIDMapTemplate<IndexT>::range_search(
         typename IndexT::distance_t radius,
         RangeSearchResult* result,
         const SearchParameters* params) const {
-    FAISS_THROW_IF_NOT_MSG(
-            !params, "search params not supported for this index");
-    index->range_search(n, x, radius, result);
+    if (params) {
+        SearchParameters internal_search_parameters;
+        IDSelectorTranslated id_selector_translated(id_map, params->sel);
+        internal_search_parameters.sel = &id_selector_translated;
+
+        index->range_search(n, x, radius, result, &internal_search_parameters);
+    } else {
+        index->range_search(n, x, radius, result);
+    }
+
 #pragma omp parallel for
     for (idx_t i = 0; i < result->lims[result->nq]; i++) {
         result->labels[i] = result->labels[i] < 0 ? result->labels[i]
@@ -266,7 +273,7 @@ void IndexIDMap2Template<IndexT>::reconstruct(
         typename IndexT::component_t* recons) const {
     try {
         this->index->reconstruct(rev_map.at(key), recons);
-    } catch (const std::out_of_range& e) {
+    } catch (const std::out_of_range&) {
         FAISS_THROW_FMT("key %" PRId64 " not found", key);
     }
 }
diff --git a/faiss/IndexIVF.cpp b/faiss/IndexIVF.cpp
index a1fa8cd16b..548aaa4cc7 100644
--- a/faiss/IndexIVF.cpp
+++ b/faiss/IndexIVF.cpp
@@ -203,7 +203,8 @@ void IndexIVF::add_core(
         idx_t n,
         const float* x,
         const idx_t* xids,
-        const idx_t* coarse_idx) {
+        const idx_t* coarse_idx,
+        void* inverted_list_context) {
     // do some blocking to avoid excessive allocs
     idx_t bs = 65536;
     if (n > bs) {
@@ -218,7 +219,8 @@ void IndexIVF::add_core(
                     i1 - i0,
                     x + i0 * d,
                     xids ? xids + i0 : nullptr,
-                    coarse_idx + i0);
+                    coarse_idx + i0,
+                    inverted_list_context);
         }
         return;
     }
@@ -249,7 +251,10 @@ void IndexIVF::add_core(
             if (list_no >= 0 && list_no % nt == rank) {
                 idx_t id = xids ? xids[i] : ntotal + i;
                 size_t ofs = invlists->add_entry(
-                        list_no, id, flat_codes.get() + i * code_size);
+                        list_no,
+                        id,
+                        flat_codes.get() + i * code_size,
+                        inverted_list_context);
 
                 dm_adder.add(i, list_no, ofs);
 
@@ -439,12 +444,15 @@ void IndexIVF::search_preassigned(
         max_codes = unlimited_list_size;
     }
 
-    bool do_parallel = omp_get_max_threads() >= 2 &&
+    [[maybe_unused]] bool do_parallel = omp_get_max_threads() >= 2 &&
             (pmode == 0           ? false
                      : pmode == 3 ? n > 1
                      : pmode == 1 ? nprobe > 1
                                   : nprobe * n > 1);
 
+    void* inverted_list_context =
+            params ? params->inverted_list_context : nullptr;
+
 #pragma omp parallel if (do_parallel) reduction(+ : nlistv, ndis, nheap)
     {
         std::unique_ptr<InvertedListScanner> scanner(
@@ -507,7 +515,7 @@ void IndexIVF::search_preassigned(
                     nlist);
 
             // don't waste time on empty lists
-            if (invlists->is_empty(key)) {
+            if (invlists->is_empty(key, inverted_list_context)) {
                 return (size_t)0;
             }
 
@@ -520,7 +528,7 @@ void IndexIVF::search_preassigned(
                     size_t list_size = 0;
 
                     std::unique_ptr<InvertedListsIterator> it(
-                            invlists->get_iterator(key));
+                            invlists->get_iterator(key, inverted_list_context));
 
                     nheap += scanner->iterate_codes(
                             it.get(), simi, idxi, k, list_size);
@@ -660,7 +668,6 @@ void IndexIVF::search_preassigned(
 #pragma omp for schedule(dynamic)
             for (int64_t ij = 0; ij < n * nprobe; ij++) {
                 size_t i = ij / nprobe;
-                size_t j = ij % nprobe;
 
                 scanner->set_query(x + i * d);
                 init_result(local_dis.data(), local_idx.data());
@@ -777,12 +784,15 @@ void IndexIVF::range_search_preassigned(
 
     int pmode = this->parallel_mode & ~PARALLEL_MODE_NO_HEAP_INIT;
     // don't start parallel section if single query
-    bool do_parallel = omp_get_max_threads() >= 2 &&
+    [[maybe_unused]] bool do_parallel = omp_get_max_threads() >= 2 &&
             (pmode == 3           ? false
                      : pmode == 0 ? nx > 1
                      : pmode == 1 ? nprobe > 1
                                   : nprobe * nx > 1);
 
+    void* inverted_list_context =
+            params ? params->inverted_list_context : nullptr;
+
 #pragma omp parallel if (do_parallel) reduction(+ : nlistv, ndis)
     {
         RangeSearchPartialResult pres(result);
@@ -804,7 +814,7 @@ void IndexIVF::range_search_preassigned(
                     ik,
                     nlist);
 
-            if (invlists->is_empty(key)) {
+            if (invlists->is_empty(key, inverted_list_context)) {
                 return;
             }
 
@@ -813,7 +823,7 @@ void IndexIVF::range_search_preassigned(
                 scanner->set_list(key, coarse_dis[i * nprobe + ik]);
                 if (invlists->use_iterator) {
                     std::unique_ptr<InvertedListsIterator> it(
-                            invlists->get_iterator(key));
+                            invlists->get_iterator(key, inverted_list_context));
 
                     scanner->iterate_codes_range(
                             it.get(), radius, qres, list_size);
diff --git a/faiss/IndexIVF.h b/faiss/IndexIVF.h
index d0981caa42..185561d086 100644
--- a/faiss/IndexIVF.h
+++ b/faiss/IndexIVF.h
@@ -72,6 +72,8 @@ struct SearchParametersIVF : SearchParameters {
     size_t nprobe = 1;    ///< number of probes at query time
     size_t max_codes = 0; ///< max nb of codes to visit to do a query
     SearchParameters* quantizer_params = nullptr;
+    /// context object to pass to InvertedLists
+    void* inverted_list_context = nullptr;
 
     virtual ~SearchParametersIVF() {}
 };
@@ -232,7 +234,8 @@ struct IndexIVF : Index, IndexIVFInterface {
             idx_t n,
             const float* x,
             const idx_t* xids,
-            const idx_t* precomputed_idx);
+            const idx_t* precomputed_idx,
+            void* inverted_list_context = nullptr);
 
     /** Encodes a set of vectors as they would appear in the inverted lists
      *
@@ -430,6 +433,14 @@ struct IndexIVF : Index, IndexIVFInterface {
 
     /* The standalone codec interface (except sa_decode that is specific) */
     size_t sa_code_size() const override;
+
+    /** encode a set of vectors
+     * sa_encode will call encode_vector with include_listno=true
+     * @param n      nb of vectors to encode
+     * @param x      the vectors to encode
+     * @param bytes  output array for the codes
+     * (no return value: the codes are written to bytes)
+     */
     void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override;
 
     IndexIVF();
diff --git a/faiss/IndexIVFAdditiveQuantizerFastScan.cpp b/faiss/IndexIVFAdditiveQuantizerFastScan.cpp
index 25c3aa2b06..23a2de554d 100644
--- a/faiss/IndexIVFAdditiveQuantizerFastScan.cpp
+++ b/faiss/IndexIVFAdditiveQuantizerFastScan.cpp
@@ -211,7 +211,8 @@ void IndexIVFAdditiveQuantizerFastScan::estimate_norm_scale(
 
     size_t index_nprobe = nprobe;
     nprobe = 1;
-    compute_LUT(n, x, coarse_ids.data(), coarse_dis.data(), dis_tables, biases);
+    CoarseQuantized cq{index_nprobe, coarse_dis.data(), coarse_ids.data()};
+    compute_LUT(n, x, cq, dis_tables, biases);
     nprobe = index_nprobe;
 
     float scale = 0;
@@ -313,13 +314,8 @@ void IndexIVFAdditiveQuantizerFastScan::search(
     }
 
     NormTableScaler scaler(norm_scale);
-    if (metric_type == METRIC_L2) {
-        search_dispatch_implem<true>(
-                n, x, k, distances, labels, nullptr, nullptr, scaler);
-    } else {
-        search_dispatch_implem<false>(
-                n, x, k, distances, labels, nullptr, nullptr, scaler);
-    }
+    IndexIVFFastScan::CoarseQuantized cq{nprobe};
+    search_dispatch_implem(n, x, k, distances, labels, cq, &scaler);
 }
 
 /*********************************************************
@@ -385,12 +381,12 @@ bool IndexIVFAdditiveQuantizerFastScan::lookup_table_is_3d() const {
 void IndexIVFAdditiveQuantizerFastScan::compute_LUT(
         size_t n,
         const float* x,
-        const idx_t* coarse_ids,
-        const float*,
+        const CoarseQuantized& cq,
         AlignedTable<float>& dis_tables,
         AlignedTable<float>& biases) const {
     const size_t dim12 = ksub * M;
     const size_t ip_dim12 = aq->M * ksub;
+    const size_t nprobe = cq.nprobe;
 
     dis_tables.resize(n * dim12);
 
@@ -411,7 +407,7 @@ void IndexIVFAdditiveQuantizerFastScan::compute_LUT(
 #pragma omp for
             for (idx_t ij = 0; ij < n * nprobe; ij++) {
                 int i = ij / nprobe;
-                quantizer->reconstruct(coarse_ids[ij], c);
+                quantizer->reconstruct(cq.ids[ij], c);
                 biases[ij] = coef * fvec_inner_product(c, x + i * d, d);
             }
         }
diff --git a/faiss/IndexIVFAdditiveQuantizerFastScan.h b/faiss/IndexIVFAdditiveQuantizerFastScan.h
index 24ce7287ec..643628dec1 100644
--- a/faiss/IndexIVFAdditiveQuantizerFastScan.h
+++ b/faiss/IndexIVFAdditiveQuantizerFastScan.h
@@ -93,8 +93,7 @@ struct IndexIVFAdditiveQuantizerFastScan : IndexIVFFastScan {
     void compute_LUT(
             size_t n,
             const float* x,
-            const idx_t* coarse_ids,
-            const float* coarse_dis,
+            const CoarseQuantized& cq,
             AlignedTable<float>& dis_tables,
             AlignedTable<float>& biases) const override;
 
diff --git a/faiss/IndexIVFFastScan.cpp b/faiss/IndexIVFFastScan.cpp
index 0b9c4e0992..3e40f7a3da 100644
--- a/faiss/IndexIVFFastScan.cpp
+++ b/faiss/IndexIVFFastScan.cpp
@@ -198,7 +198,7 @@ CodePacker* IndexIVFFastScan::get_CodePacker() const {
 
 namespace {
 
-template <class C, typename dis_t, class Scaler>
+template <class C, typename dis_t>
 void estimators_from_tables_generic(
         const IndexIVFFastScan& index,
         const uint8_t* codes,
@@ -209,22 +209,26 @@ void estimators_from_tables_generic(
         size_t k,
         typename C::T* heap_dis,
         int64_t* heap_ids,
-        const Scaler& scaler) {
+        const NormTableScaler* scaler) {
     using accu_t = typename C::T;
+    size_t nscale = scaler ? scaler->nscale : 0;
     for (size_t j = 0; j < ncodes; ++j) {
         BitstringReader bsr(codes + j * index.code_size, index.code_size);
         accu_t dis = bias;
         const dis_t* __restrict dt = dis_table;
-        for (size_t m = 0; m < index.M - scaler.nscale; m++) {
+
+        for (size_t m = 0; m < index.M - nscale; m++) {
             uint64_t c = bsr.read(index.nbits);
             dis += dt[c];
             dt += index.ksub;
         }
 
-        for (size_t m = 0; m < scaler.nscale; m++) {
-            uint64_t c = bsr.read(index.nbits);
-            dis += scaler.scale_one(dt[c]);
-            dt += index.ksub;
+        if (scaler) {
+            for (size_t m = 0; m < nscale; m++) {
+                uint64_t c = bsr.read(index.nbits);
+                dis += scaler->scale_one(dt[c]);
+                dt += index.ksub;
+            }
         }
 
         if (C::cmp(heap_dis[0], dis)) {
@@ -245,18 +249,15 @@ using namespace quantize_lut;
 void IndexIVFFastScan::compute_LUT_uint8(
         size_t n,
         const float* x,
-        const idx_t* coarse_ids,
-        const float* coarse_dis,
+        const CoarseQuantized& cq,
         AlignedTable<uint8_t>& dis_tables,
         AlignedTable<uint16_t>& biases,
         float* normalizers) const {
     AlignedTable<float> dis_tables_float;
     AlignedTable<float> biases_float;
 
-    uint64_t t0 = get_cy();
-    compute_LUT(n, x, coarse_ids, coarse_dis, dis_tables_float, biases_float);
-    IVFFastScan_stats.t_compute_distance_tables += get_cy() - t0;
-
+    compute_LUT(n, x, cq, dis_tables_float, biases_float);
+    size_t nprobe = cq.nprobe;
     bool lut_is_3d = lookup_table_is_3d();
     size_t dim123 = ksub * M;
     size_t dim123_2 = ksub * M2;
@@ -268,8 +269,8 @@ void IndexIVFFastScan::compute_LUT_uint8(
     if (biases_float.get()) {
         biases.resize(n * nprobe);
     }
-    uint64_t t1 = get_cy();
 
+    // OMP for MSVC requires i to have signed integral type
 #pragma omp parallel for if (n > 100)
     for (int64_t i = 0; i < n; i++) {
         const float* t_in = dis_tables_float.get() + i * dim123;
@@ -294,7 +295,6 @@ void IndexIVFFastScan::compute_LUT_uint8(
                 normalizers + 2 * i,
                 normalizers + 2 * i + 1);
     }
-    IVFFastScan_stats.t_round += get_cy() - t1;
 }
 
 /*********************************************************
@@ -307,19 +307,16 @@ void IndexIVFFastScan::search(
         idx_t k,
         float* distances,
         idx_t* labels,
-        const SearchParameters* params) const {
-    FAISS_THROW_IF_NOT_MSG(
-            !params, "search params not supported for this index");
-    FAISS_THROW_IF_NOT(k > 0);
-
-    DummyScaler scaler;
-    if (metric_type == METRIC_L2) {
-        search_dispatch_implem<true>(
-                n, x, k, distances, labels, nullptr, nullptr, scaler);
-    } else {
-        search_dispatch_implem<false>(
-                n, x, k, distances, labels, nullptr, nullptr, scaler);
+        const SearchParameters* params_in) const {
+    const IVFSearchParameters* params = nullptr;
+    if (params_in) {
+        params = dynamic_cast<const IVFSearchParameters*>(params_in);
+        FAISS_THROW_IF_NOT_MSG(
+                params, "IndexIVFFastScan params have incorrect type");
     }
+
+    search_preassigned(
+            n, x, k, nullptr, nullptr, distances, labels, false, params);
 }
 
 void IndexIVFFastScan::search_preassigned(
@@ -333,51 +330,172 @@ void IndexIVFFastScan::search_preassigned(
         bool store_pairs,
         const IVFSearchParameters* params,
         IndexIVFStats* stats) const {
-    FAISS_THROW_IF_NOT_MSG(
-            !params, "search params not supported for this index");
+    size_t nprobe = this->nprobe;
+    if (params) {
+        FAISS_THROW_IF_NOT(params->max_codes == 0);
+        nprobe = params->nprobe;
+    }
+
     FAISS_THROW_IF_NOT_MSG(
             !store_pairs, "store_pairs not supported for this index");
     FAISS_THROW_IF_NOT_MSG(!stats, "stats not supported for this index");
     FAISS_THROW_IF_NOT(k > 0);
 
-    DummyScaler scaler;
-    if (metric_type == METRIC_L2) {
-        search_dispatch_implem<true>(
-                n, x, k, distances, labels, assign, centroid_dis, scaler);
+    const CoarseQuantized cq = {nprobe, centroid_dis, assign};
+    search_dispatch_implem(n, x, k, distances, labels, cq, nullptr, params);
+}
+
+void IndexIVFFastScan::range_search( // range search; optional IVF params may override nprobe
+        idx_t n,
+        const float* x,
+        float radius,
+        RangeSearchResult* result,
+        const SearchParameters* params_in) const {
+    size_t nprobe = this->nprobe; // default: the index-level nprobe
+    const IVFSearchParameters* params = nullptr;
+    if (params_in) { // caller-supplied parameters must be IVFSearchParameters
+        params = dynamic_cast<const IVFSearchParameters*>(params_in);
+        FAISS_THROW_IF_NOT_MSG(
+                params, "IndexIVFFastScan params have incorrect type");
+        nprobe = params->nprobe; // per-call override of nprobe
+    }
+
+    const CoarseQuantized cq = {nprobe, nullptr, nullptr}; // no precomputed coarse distances / ids
+    range_search_dispatch_implem(n, x, radius, *result, cq, nullptr, params); // scaler argument is nullptr
+}
+
+namespace {
+
+template <class C> // C: comparator type; instantiated below with CMax / CMin over uint16_t
+ResultHandlerCompare<C, true>* make_knn_handler_fixC( // returns a handler allocated with new; caller owns it
+        int impl,
+        idx_t n,
+        idx_t k,
+        float* distances,
+        idx_t* labels,
+        const IDSelector* sel) {
+    using HeapHC = HeapHandler<C, true>;
+    using ReservoirHC = ReservoirHandler<C, true>;
+    using SingleResultHC = SingleResultHandler<C, true>;
+
+    if (k == 1) { // k == 1: dedicated single-result handler, whatever impl is
+        return new SingleResultHC(n, 0, distances, labels, sel);
+    } else if (impl % 2 == 0) { // even implementation numbers collect results in a heap
+        return new HeapHC(n, 0, k, distances, labels, sel);
+    } else /* if (impl % 2 == 1) */ { // odd implementation numbers use a reservoir
+        return new ReservoirHC(n, 0, k, 2 * k, distances, labels, sel);
+    }
+}
+
+SIMDResultHandlerToFloat* make_knn_handler(
+        bool is_max,
+        int impl,
+        idx_t n,
+        idx_t k,
+        float* distances,
+        idx_t* labels,
+        const IDSelector* sel) {
+    if (is_max) {
+        return make_knn_handler_fixC<CMax<uint16_t, int64_t>>(
+                impl, n, k, distances, labels, sel);
     } else {
-        search_dispatch_implem<false>(
-                n, x, k, distances, labels, assign, centroid_dis, scaler);
+        return make_knn_handler_fixC<CMin<uint16_t, int64_t>>(
+                impl, n, k, distances, labels, sel);
     }
 }
 
-void IndexIVFFastScan::range_search(
-        idx_t,
-        const float*,
-        float,
-        RangeSearchResult*,
-        const SearchParameters*) const {
-    FAISS_THROW_MSG("not implemented");
+using CoarseQuantized = IndexIVFFastScan::CoarseQuantized;
+
+struct CoarseQuantizedWithBuffer : CoarseQuantized { // CoarseQuantized that can own the arrays dis / ids point to
+    explicit CoarseQuantizedWithBuffer(const CoarseQuantized& cq)
+            : CoarseQuantized(cq) {}
+
+    bool done() const { // true when coarse ids are already available
+        return ids != nullptr;
+    }
+
+    std::vector<idx_t> ids_buffer; // backing storage for ids, filled by quantize()
+    std::vector<float> dis_buffer; // backing storage for dis, filled by quantize()
+
+    void quantize( // searches the quantizer for n queries, nprobe results per query
+            const Index* quantizer,
+            idx_t n,
+            const float* x,
+            const SearchParameters* quantizer_params) {
+        dis_buffer.resize(nprobe * n);
+        ids_buffer.resize(nprobe * n);
+        quantizer->search(
+                n,
+                x,
+                nprobe,
+                dis_buffer.data(),
+                ids_buffer.data(),
+                quantizer_params);
+        dis = dis_buffer.data(); // point the inherited dis / ids at the owned buffers
+        ids = ids_buffer.data();
+    }
+};
+
+struct CoarseQuantizedSlice : CoarseQuantizedWithBuffer { // coarse quantization restricted to queries [i0, i1)
+    size_t i0, i1; // first query of the slice, and one past its last query
+    CoarseQuantizedSlice(const CoarseQuantized& cq, size_t i0, size_t i1)
+            : CoarseQuantizedWithBuffer(cq), i0(i0), i1(i1) {
+        if (done()) { // already quantized: advance dis / ids to the slice's first query
+            dis += nprobe * i0;
+            ids += nprobe * i0;
+        }
+    }
+
+    void quantize_slice( // x is the full query array; only queries i0 .. i1 - 1 are quantized
+            const Index* quantizer,
+            const float* x,
+            const SearchParameters* quantizer_params) {
+        quantize(quantizer, i1 - i0, x + quantizer->d * i0, quantizer_params);
+    }
+};
+
+int compute_search_nslice( // number of query slices used when a search is split over threads
+        const IndexIVFFastScan* index,
+        size_t n,
+        size_t nprobe) {
+    int nslice;
+    if (n <= omp_get_max_threads()) { // no more queries than threads: one slice per query
+        nslice = n;
+    } else if (index->lookup_table_is_3d()) {
+        // keep the lookup tables built for one slice from getting too big
+        size_t lut_size_per_query = index->M * index->ksub * nprobe *
+                (sizeof(float) + sizeof(uint8_t));
+
+        size_t max_lut_size = precomputed_table_max_bytes;
+        // number of queries that fit within the memory budget (at least 1)
+        size_t nq_ok = std::max(max_lut_size / lut_size_per_query, size_t(1));
+        nslice = roundup(
+                std::max(size_t(n / nq_ok), size_t(1)), omp_get_max_threads());
+    } else {
+        // LUTs are unlikely to be a limiting factor: one slice per thread
+        nslice = omp_get_max_threads();
+    }
+    return nslice;
 }
 
-template <bool is_max, class Scaler>
+} // namespace
+
 void IndexIVFFastScan::search_dispatch_implem(
         idx_t n,
         const float* x,
         idx_t k,
         float* distances,
         idx_t* labels,
-        const idx_t* coarse_ids,
-        const float* coarse_dis,
-        const Scaler& scaler) const {
-    using Cfloat = typename std::conditional<
-            is_max,
-            CMax<float, int64_t>,
-            CMin<float, int64_t>>::type;
-
-    using C = typename std::conditional<
-            is_max,
-            CMax<uint16_t, int64_t>,
-            CMin<uint16_t, int64_t>>::type;
+        const CoarseQuantized& cq_in,
+        const NormTableScaler* scaler,
+        const IVFSearchParameters* params) const {
+    const idx_t nprobe = params ? params->nprobe : this->nprobe;
+    const IDSelector* sel = (params) ? params->sel : nullptr;
+    const SearchParameters* quantizer_params =
+            params ? params->quantizer_params : nullptr;
+
+    bool is_max = !is_similarity_metric(metric_type);
+    using RH = SIMDResultHandlerToFloat;
 
     if (n == 0) {
         return;
@@ -392,94 +510,93 @@ void IndexIVFFastScan::search_dispatch_implem(
         } else {
             impl = 10;
         }
-        if (k > 20) {
+        if (k > 20) { // use reservoir rather than heap
             impl++;
         }
     }
 
+    bool multiple_threads =
+            n > 1 && impl >= 10 && impl <= 13 && omp_get_max_threads() > 1;
+    if (impl >= 100) {
+        multiple_threads = false;
+        impl -= 100;
+    }
+
+    CoarseQuantizedWithBuffer cq(cq_in);
+    cq.nprobe = nprobe;
+
+    if (!cq.done() && !multiple_threads) {
+        // we do the coarse quantization here except when search is
+        // sliced over threads (then it is more efficient to have each thread do
+        // its own coarse quantization)
+        cq.quantize(quantizer, n, x, quantizer_params);
+        invlists->prefetch_lists(cq.ids, n * cq.nprobe);
+    }
+
     if (impl == 1) {
-        search_implem_1<Cfloat>(
-                n, x, k, distances, labels, coarse_ids, coarse_dis, scaler);
+        if (is_max) {
+            search_implem_1<CMax<float, int64_t>>(
+                    n, x, k, distances, labels, cq, scaler, params);
+        } else {
+            search_implem_1<CMin<float, int64_t>>(
+                    n, x, k, distances, labels, cq, scaler, params);
+        }
     } else if (impl == 2) {
-        search_implem_2<C>(
-                n, x, k, distances, labels, coarse_ids, coarse_dis, scaler);
-
+        if (is_max) {
+            search_implem_2<CMax<uint16_t, int64_t>>(
+                    n, x, k, distances, labels, cq, scaler, params);
+        } else {
+            search_implem_2<CMin<uint16_t, int64_t>>(
+                    n, x, k, distances, labels, cq, scaler, params);
+        }
     } else if (impl >= 10 && impl <= 15) {
         size_t ndis = 0, nlist_visited = 0;
 
-        if (n < 2) {
+        if (!multiple_threads) {
+            // clang-format off
             if (impl == 12 || impl == 13) {
-                search_implem_12<C>(
-                        n,
-                        x,
-                        k,
-                        distances,
-                        labels,
-                        coarse_ids,
-                        coarse_dis,
-                        impl,
-                        &ndis,
-                        &nlist_visited,
-                        scaler);
+                std::unique_ptr<RH> handler(
+                    make_knn_handler(
+                        is_max, 
+                        impl, 
+                        n, 
+                        k, 
+                        distances, 
+                        labels, sel
+                    )
+                );
+                search_implem_12(
+                        n, x, *handler.get(),
+                        cq, &ndis, &nlist_visited, scaler, params);
             } else if (impl == 14 || impl == 15) {
-                search_implem_14<C>(
-                        n,
-                        x,
-                        k,
-                        distances,
-                        labels,
-                        coarse_ids,
-                        coarse_dis,
-                        impl,
-                        scaler);
+                search_implem_14(
+                        n, x, k, distances, labels,
+                        cq, impl, scaler, params);
             } else {
-                search_implem_10<C>(
-                        n,
-                        x,
-                        k,
-                        distances,
+                std::unique_ptr<RH> handler(
+                    make_knn_handler(
+                        is_max, 
+                        impl, 
+                        n, 
+                        k, 
+                        distances, 
                         labels,
-                        coarse_ids,
-                        coarse_dis,
-                        impl,
-                        &ndis,
-                        &nlist_visited,
-                        scaler);
+                        sel
+                    )
+                );
+                search_implem_10(
+                        n, x, *handler.get(), cq,
+                        &ndis, &nlist_visited, scaler, params);
             }
+            // clang-format on
         } else {
             // explicitly slice over threads
-            int nslice;
-            if (n <= omp_get_max_threads()) {
-                nslice = n;
-            } else if (lookup_table_is_3d()) {
-                // make sure we don't make too big LUT tables
-                size_t lut_size_per_query =
-                        M * ksub * nprobe * (sizeof(float) + sizeof(uint8_t));
-
-                size_t max_lut_size = precomputed_table_max_bytes;
-                // how many queries we can handle within mem budget
-                size_t nq_ok =
-                        std::max(max_lut_size / lut_size_per_query, size_t(1));
-                nslice =
-                        roundup(std::max(size_t(n / nq_ok), size_t(1)),
-                                omp_get_max_threads());
-            } else {
-                // LUTs unlikely to be a limiting factor
-                nslice = omp_get_max_threads();
-            }
-            if (impl == 14 ||
-                impl == 15) { // this might require slicing if there are too
-                              // many queries (for now we keep this simple)
-                search_implem_14<C>(
-                        n,
-                        x,
-                        k,
-                        distances,
-                        labels,
-                        coarse_ids,
-                        coarse_dis,
-                        impl,
-                        scaler);
+            int nslice = compute_search_nslice(this, n, cq.nprobe);
+            if (impl == 14 || impl == 15) {
+                // this might require slicing if there are too
+                // many queries (for now we keep this simple)
+                search_implem_14(
+                        n, x, k, distances, labels, cq, impl, scaler, params);
             } else {
 #pragma omp parallel for reduction(+ : ndis, nlist_visited)
                 for (int slice = 0; slice < nslice; slice++) {
@@ -487,39 +604,23 @@ void IndexIVFFastScan::search_dispatch_implem(
                     idx_t i1 = n * (slice + 1) / nslice;
                     float* dis_i = distances + i0 * k;
                     idx_t* lab_i = labels + i0 * k;
-                    const idx_t* coarse_ids_i = coarse_ids != nullptr
-                            ? coarse_ids + i0 * nprobe
-                            : nullptr;
-                    const float* coarse_dis_i = coarse_dis != nullptr
-                            ? coarse_dis + i0 * nprobe
-                            : nullptr;
+                    CoarseQuantizedSlice cq_i(cq, i0, i1);
+                    if (!cq_i.done()) {
+                        cq_i.quantize_slice(quantizer, x, quantizer_params);
+                    }
+                    std::unique_ptr<RH> handler(make_knn_handler(
+                            is_max, impl, i1 - i0, k, dis_i, lab_i, sel));
+                    // clang-format off
                     if (impl == 12 || impl == 13) {
-                        search_implem_12<C>(
-                                i1 - i0,
-                                x + i0 * d,
-                                k,
-                                dis_i,
-                                lab_i,
-                                coarse_ids_i,
-                                coarse_dis_i,
-                                impl,
-                                &ndis,
-                                &nlist_visited,
-                                scaler);
+                        search_implem_12(
+                                i1 - i0, x + i0 * d, *handler.get(),
+                                cq_i, &ndis, &nlist_visited, scaler, params);
                     } else {
-                        search_implem_10<C>(
-                                i1 - i0,
-                                x + i0 * d,
-                                k,
-                                dis_i,
-                                lab_i,
-                                coarse_ids_i,
-                                coarse_dis_i,
-                                impl,
-                                &ndis,
-                                &nlist_visited,
-                                scaler);
+                        search_implem_10(
+                                i1 - i0, x + i0 * d, *handler.get(),
+                                cq_i, &ndis, &nlist_visited, scaler, params);
                     }
+                    // clang-format on
                 }
             }
         }
@@ -531,46 +632,149 @@ void IndexIVFFastScan::search_dispatch_implem(
     }
 }
 
-#define COARSE_QUANTIZE                                   \
-    std::unique_ptr<idx_t[]> coarse_ids_buffer;           \
-    std::unique_ptr<float[]> coarse_dis_buffer;           \
-    if (coarse_ids == nullptr || coarse_dis == nullptr) { \
-        coarse_ids_buffer.reset(new idx_t[n * nprobe]);   \
-        coarse_dis_buffer.reset(new float[n * nprobe]);   \
-        quantizer->search(                                \
-                n,                                        \
-                x,                                        \
-                nprobe,                                   \
-                coarse_dis_buffer.get(),                  \
-                coarse_ids_buffer.get());                 \
-        coarse_ids = coarse_ids_buffer.get();             \
-        coarse_dis = coarse_dis_buffer.get();             \
+void IndexIVFFastScan::range_search_dispatch_implem(
+        idx_t n,
+        const float* x,
+        float radius,
+        RangeSearchResult& rres,
+        const CoarseQuantized& cq_in,
+        const NormTableScaler* scaler,
+        const IVFSearchParameters* params) const {
+    // Range-search dispatcher. nprobe is read from cq below, not from params.
+    const IDSelector* sel = (params) ? params->sel : nullptr;
+    const SearchParameters* quantizer_params =
+            params ? params->quantizer_params : nullptr;
+
+    bool is_max = !is_similarity_metric(metric_type);
+
+    if (n == 0) {
+        return;
+    }
+
+    // actual implementation used (implem 0 means: choose from bbs)
+    int impl = implem;
+
+    if (impl == 0) {
+        if (bbs == 32) {
+            impl = 12;
+        } else {
+            impl = 10;
+        }
+    }
+
+    CoarseQuantizedWithBuffer cq(cq_in);
+
+    bool multiple_threads =
+            n > 1 && impl >= 10 && impl <= 13 && omp_get_max_threads() > 1;
+    if (impl >= 100) { // +100 offset: force single-thread scanning
+        multiple_threads = false;
+        impl -= 100;
+    }
+
+    if (!multiple_threads && !cq.done()) { // threaded path quantizes per slice
+        cq.quantize(quantizer, n, x, quantizer_params);
+        invlists->prefetch_lists(cq.ids, n * cq.nprobe);
+    }
+
+    size_t ndis = 0, nlist_visited = 0;
+
+    if (!multiple_threads) { // single thread
+        std::unique_ptr<SIMDResultHandlerToFloat> handler;
+        if (is_max) {
+            handler.reset(new RangeHandler<CMax<uint16_t, int64_t>, true>(
+                    rres, radius, 0, sel));
+        } else {
+            handler.reset(new RangeHandler<CMin<uint16_t, int64_t>, true>(
+                    rres, radius, 0, sel));
+        }
+        if (impl == 12) {
+            search_implem_12(
+                    n, x, *handler.get(), cq, &ndis, &nlist_visited, scaler);
+        } else if (impl == 10) {
+            search_implem_10(
+                    n, x, *handler.get(), cq, &ndis, &nlist_visited, scaler);
+        } else { // NOTE(review): the message below has a typo (impemented)
+            FAISS_THROW_FMT("Range search implem %d not impemented", impl);
+        }
+    } else {
+        // explicitly slice the queries over threads
+        int nslice = compute_search_nslice(this, n, cq.nprobe);
+#pragma omp parallel
+        {
+            RangeSearchPartialResult pres(&rres); // per-thread partial result
+
+#pragma omp for reduction(+ : ndis, nlist_visited)
+            for (int slice = 0; slice < nslice; slice++) {
+                idx_t i0 = n * slice / nslice;
+                idx_t i1 = n * (slice + 1) / nslice;
+                CoarseQuantizedSlice cq_i(cq, i0, i1);
+                if (!cq_i.done()) {
+                    cq_i.quantize_slice(quantizer, x, quantizer_params);
+                }
+                std::unique_ptr<SIMDResultHandlerToFloat> handler;
+                if (is_max) {
+                    handler.reset(new PartialRangeHandler<
+                                  CMax<uint16_t, int64_t>,
+                                  true>(pres, radius, 0, i0, i1, sel));
+                } else {
+                    handler.reset(new PartialRangeHandler<
+                                  CMin<uint16_t, int64_t>,
+                                  true>(pres, radius, 0, i0, i1, sel));
+                }
+
+                if (impl == 12 || impl == 13) {
+                    search_implem_12(
+                            i1 - i0,
+                            x + i0 * d,
+                            *handler.get(),
+                            cq_i,
+                            &ndis,
+                            &nlist_visited,
+                            scaler,
+                            params);
+                } else { // 10 or 11. NOTE(review): 11/13 throw when single-threaded
+                    search_implem_10(
+                            i1 - i0,
+                            x + i0 * d,
+                            *handler.get(),
+                            cq_i,
+                            &ndis,
+                            &nlist_visited,
+                            scaler,
+                            params);
+                }
+            }
+            pres.finalize();
+        }
     }
 
-template <class C, class Scaler>
+    indexIVF_stats.nq += n;
+    indexIVF_stats.ndis += ndis;
+    indexIVF_stats.nlist += nlist_visited;
+}
+
+template <class C>
 void IndexIVFFastScan::search_implem_1(
         idx_t n,
         const float* x,
         idx_t k,
         float* distances,
         idx_t* labels,
-        const idx_t* coarse_ids,
-        const float* coarse_dis,
-        const Scaler& scaler) const {
+        const CoarseQuantized& cq,
+        const NormTableScaler* scaler,
+        const IVFSearchParameters* params) const {
     FAISS_THROW_IF_NOT(orig_invlists);
 
-    COARSE_QUANTIZE;
-
     size_t dim12 = ksub * M;
     AlignedTable<float> dis_tables;
     AlignedTable<float> biases;
 
-    compute_LUT(n, x, coarse_ids, coarse_dis, dis_tables, biases);
+    compute_LUT(n, x, cq, dis_tables, biases);
 
     bool single_LUT = !lookup_table_is_3d();
 
     size_t ndis = 0, nlist_visited = 0;
-
+    size_t nprobe = cq.nprobe;
 #pragma omp parallel for reduction(+ : ndis, nlist_visited)
     for (idx_t i = 0; i < n; i++) {
         int64_t* heap_ids = labels + i * k;
@@ -585,7 +789,7 @@ void IndexIVFFastScan::search_implem_1(
             if (!single_LUT) {
                 LUT = dis_tables.get() + (i * nprobe + j) * dim12;
             }
-            idx_t list_no = coarse_ids[i * nprobe + j];
+            idx_t list_no = cq.ids[i * nprobe + j];
             if (list_no < 0)
                 continue;
             size_t ls = orig_invlists->list_size(list_no);
@@ -617,36 +821,29 @@ void IndexIVFFastScan::search_implem_1(
     indexIVF_stats.nlist += nlist_visited;
 }
 
-template <class C, class Scaler>
+template <class C>
 void IndexIVFFastScan::search_implem_2(
         idx_t n,
         const float* x,
         idx_t k,
         float* distances,
         idx_t* labels,
-        const idx_t* coarse_ids,
-        const float* coarse_dis,
-        const Scaler& scaler) const {
+        const CoarseQuantized& cq,
+        const NormTableScaler* scaler,
+        const IVFSearchParameters* params) const {
     FAISS_THROW_IF_NOT(orig_invlists);
 
-    COARSE_QUANTIZE;
     size_t dim12 = ksub * M2;
     AlignedTable<uint8_t> dis_tables;
     AlignedTable<uint16_t> biases;
     std::unique_ptr<float[]> normalizers(new float[2 * n]);
 
-    compute_LUT_uint8(
-            n,
-            x,
-            coarse_ids,
-            coarse_dis,
-            dis_tables,
-            biases,
-            normalizers.get());
+    compute_LUT_uint8(n, x, cq, dis_tables, biases, normalizers.get());
 
     bool single_LUT = !lookup_table_is_3d();
 
     size_t ndis = 0, nlist_visited = 0;
+    size_t nprobe = cq.nprobe;
 
 #pragma omp parallel for reduction(+ : ndis, nlist_visited)
     for (idx_t i = 0; i < n; i++) {
@@ -663,7 +860,7 @@ void IndexIVFFastScan::search_implem_2(
             if (!single_LUT) {
                 LUT = dis_tables.get() + (i * nprobe + j) * dim12;
             }
-            idx_t list_no = coarse_ids[i * nprobe + j];
+            idx_t list_no = cq.ids[i * nprobe + j];
             if (list_no < 0)
                 continue;
             size_t ls = orig_invlists->list_size(list_no);
@@ -708,169 +905,103 @@ void IndexIVFFastScan::search_implem_2(
     indexIVF_stats.nlist += nlist_visited;
 }
 
-template <class C, class Scaler>
 void IndexIVFFastScan::search_implem_10(
         idx_t n,
         const float* x,
-        idx_t k,
-        float* distances,
-        idx_t* labels,
-        const idx_t* coarse_ids,
-        const float* coarse_dis,
-        int impl,
+        SIMDResultHandlerToFloat& handler,
+        const CoarseQuantized& cq,
         size_t* ndis_out,
         size_t* nlist_out,
-        const Scaler& scaler) const {
-    memset(distances, -1, sizeof(float) * k * n);
-    memset(labels, -1, sizeof(idx_t) * k * n);
-
-    using HeapHC = HeapHandler<C, true>;
-    using ReservoirHC = ReservoirHandler<C, true>;
-    using SingleResultHC = SingleResultHandler<C, true>;
-
-    uint64_t times[10];
-    memset(times, 0, sizeof(times));
-    int ti = 0;
-#define TIC times[ti++] = get_cy()
-    TIC;
-
-    COARSE_QUANTIZE;
-
-    TIC;
-
+        const NormTableScaler* scaler,
+        const IVFSearchParameters* params) const { // params is not read in this body
     size_t dim12 = ksub * M2;
     AlignedTable<uint8_t> dis_tables;
     AlignedTable<uint16_t> biases;
     std::unique_ptr<float[]> normalizers(new float[2 * n]);
 
-    compute_LUT_uint8(
-            n,
-            x,
-            coarse_ids,
-            coarse_dis,
-            dis_tables,
-            biases,
-            normalizers.get());
-
-    TIC;
+    compute_LUT_uint8(n, x, cq, dis_tables, biases, normalizers.get());
 
     bool single_LUT = !lookup_table_is_3d();
 
-    TIC;
-    size_t ndis = 0, nlist_visited = 0;
+    size_t ndis = 0; // NOTE(review): no visited-list count kept; nlist_out gets nlist
+    int qmap1[1]; // one-entry query map; slot 0 is set to i for each query
 
-    {
-        AlignedTable<uint16_t> tmp_distances(k);
-        for (idx_t i = 0; i < n; i++) {
-            const uint8_t* LUT = nullptr;
-            int qmap1[1] = {0};
-            std::unique_ptr<SIMDResultHandler<C, true>> handler;
-
-            if (k == 1) {
-                handler.reset(new SingleResultHC(1, 0));
-            } else if (impl == 10) {
-                handler.reset(new HeapHC(
-                        1, tmp_distances.get(), labels + i * k, k, 0));
-            } else if (impl == 11) {
-                handler.reset(new ReservoirHC(1, 0, k, 2 * k));
-            } else {
-                FAISS_THROW_MSG("invalid");
-            }
+    handler.q_map = qmap1;
+    handler.begin(skip & 16 ? nullptr : normalizers.get()); // skip bit 16: no normalizers
+    size_t nprobe = cq.nprobe;
 
-            handler->q_map = qmap1;
+    for (idx_t i = 0; i < n; i++) {
+        const uint8_t* LUT = nullptr;
+        qmap1[0] = i;
 
-            if (single_LUT) {
-                LUT = dis_tables.get() + i * dim12;
+        if (single_LUT) {
+            LUT = dis_tables.get() + i * dim12;
+        }
+        for (idx_t j = 0; j < nprobe; j++) {
+            size_t ij = i * nprobe + j;
+            if (!single_LUT) {
+                LUT = dis_tables.get() + ij * dim12;
+            }
+            if (biases.get()) {
+                handler.dbias = biases.get() + ij;
             }
-            for (idx_t j = 0; j < nprobe; j++) {
-                size_t ij = i * nprobe + j;
-                if (!single_LUT) {
-                    LUT = dis_tables.get() + ij * dim12;
-                }
-                if (biases.get()) {
-                    handler->dbias = biases.get() + ij;
-                }
-
-                idx_t list_no = coarse_ids[ij];
-                if (list_no < 0)
-                    continue;
-                size_t ls = invlists->list_size(list_no);
-                if (ls == 0)
-                    continue;
 
-                InvertedLists::ScopedCodes codes(invlists, list_no);
-                InvertedLists::ScopedIds ids(invlists, list_no);
+            idx_t list_no = cq.ids[ij];
+            if (list_no < 0) {
+                continue;
+            }
+            size_t ls = invlists->list_size(list_no);
+            if (ls == 0) {
+                continue;
+            }
 
-                handler->ntotal = ls;
-                handler->id_map = ids.get();
+            InvertedLists::ScopedCodes codes(invlists, list_no);
+            InvertedLists::ScopedIds ids(invlists, list_no);
 
-#define DISPATCH(classHC)                                                      \
-    if (dynamic_cast<classHC*>(handler.get())) {                               \
-        auto* res = static_cast<classHC*>(handler.get());                      \
-        pq4_accumulate_loop(                                                   \
-                1, roundup(ls, bbs), bbs, M2, codes.get(), LUT, *res, scaler); \
-    }
-                DISPATCH(HeapHC)
-                else DISPATCH(ReservoirHC) else DISPATCH(SingleResultHC)
-#undef DISPATCH
+            handler.ntotal = ls;
+            handler.id_map = ids.get();
 
-                        nlist_visited++;
-                ndis++;
-            }
+            pq4_accumulate_loop(
+                    1,
+                    roundup(ls, bbs),
+                    bbs,
+                    M2,
+                    codes.get(),
+                    LUT,
+                    handler,
+                    scaler);
 
-            handler->to_flat_arrays(
-                    distances + i * k,
-                    labels + i * k,
-                    skip & 16 ? nullptr : normalizers.get() + i * 2);
+            ndis++; // one increment per scanned (valid, non-empty) list
         }
     }
+
+    handler.end();
     *ndis_out = ndis;
     *nlist_out = nlist;
 }
 
-template <class C, class Scaler>
 void IndexIVFFastScan::search_implem_12(
         idx_t n,
         const float* x,
-        idx_t k,
-        float* distances,
-        idx_t* labels,
-        const idx_t* coarse_ids,
-        const float* coarse_dis,
-        int impl,
+        SIMDResultHandlerToFloat& handler,
+        const CoarseQuantized& cq,
         size_t* ndis_out,
         size_t* nlist_out,
-        const Scaler& scaler) const {
+        const NormTableScaler* scaler,
+        const IVFSearchParameters* params) const {
     if (n == 0) { // does not work well with reservoir
         return;
     }
     FAISS_THROW_IF_NOT(bbs == 32);
 
-    uint64_t times[10];
-    memset(times, 0, sizeof(times));
-    int ti = 0;
-#define TIC times[ti++] = get_cy()
-    TIC;
-
-    COARSE_QUANTIZE;
-
-    TIC;
-
     size_t dim12 = ksub * M2;
     AlignedTable<uint8_t> dis_tables;
     AlignedTable<uint16_t> biases;
     std::unique_ptr<float[]> normalizers(new float[2 * n]);
 
-    compute_LUT_uint8(
-            n,
-            x,
-            coarse_ids,
-            coarse_dis,
-            dis_tables,
-            biases,
-            normalizers.get());
+    compute_LUT_uint8(n, x, cq, dis_tables, biases, normalizers.get());
 
-    TIC;
+    handler.begin(skip & 16 ? nullptr : normalizers.get());
 
     struct QC {
         int qno;     // sequence number of the query
@@ -878,14 +1009,15 @@ void IndexIVFFastScan::search_implem_12(
         int rank;    // this is the rank'th result of the coarse quantizer
     };
     bool single_LUT = !lookup_table_is_3d();
+    size_t nprobe = cq.nprobe;
 
     std::vector<QC> qcs;
     {
         int ij = 0;
         for (int i = 0; i < n; i++) {
             for (int j = 0; j < nprobe; j++) {
-                if (coarse_ids[ij] >= 0) {
-                    qcs.push_back(QC{i, int(coarse_ids[ij]), int(j)});
+                if (cq.ids[ij] >= 0) {
+                    qcs.push_back(QC{i, int(cq.ids[ij]), int(j)});
                 }
                 ij++;
             }
@@ -894,42 +1026,22 @@ void IndexIVFFastScan::search_implem_12(
             return a.list_no < b.list_no;
         });
     }
-    TIC;
 
     // prepare the result handlers
 
-    std::unique_ptr<SIMDResultHandler<C, true>> handler;
-    AlignedTable<uint16_t> tmp_distances;
-
-    using HeapHC = HeapHandler<C, true>;
-    using ReservoirHC = ReservoirHandler<C, true>;
-    using SingleResultHC = SingleResultHandler<C, true>;
-
-    if (k == 1) {
-        handler.reset(new SingleResultHC(n, 0));
-    } else if (impl == 12) {
-        tmp_distances.resize(n * k);
-        handler.reset(new HeapHC(n, tmp_distances.get(), labels, k, 0));
-    } else if (impl == 13) {
-        handler.reset(new ReservoirHC(n, 0, k, 2 * k));
-    }
-
     int qbs2 = this->qbs2 ? this->qbs2 : 11;
 
     std::vector<uint16_t> tmp_bias;
     if (biases.get()) {
         tmp_bias.resize(qbs2);
-        handler->dbias = tmp_bias.data();
+        handler.dbias = tmp_bias.data();
     }
-    TIC;
 
     size_t ndis = 0;
 
     size_t i0 = 0;
     uint64_t t_copy_pack = 0, t_scan = 0;
     while (i0 < qcs.size()) {
-        uint64_t tt0 = get_cy();
-
         // find all queries that access this inverted list
         int list_no = qcs[i0].list_no;
         size_t i1 = i0 + 1;
@@ -977,92 +1089,50 @@ void IndexIVFFastScan::search_implem_12(
 
         // prepare the handler
 
-        handler->ntotal = list_size;
-        handler->q_map = q_map.data();
-        handler->id_map = ids.get();
-        uint64_t tt1 = get_cy();
+        handler.ntotal = list_size;
+        handler.q_map = q_map.data();
+        handler.id_map = ids.get();
 
-#define DISPATCH(classHC)                                                  \
-    if (dynamic_cast<classHC*>(handler.get())) {                           \
-        auto* res = static_cast<classHC*>(handler.get());                  \
-        pq4_accumulate_loop_qbs(                                           \
-                qbs, list_size, M2, codes.get(), LUT.get(), *res, scaler); \
-    }
-        DISPATCH(HeapHC)
-        else DISPATCH(ReservoirHC) else DISPATCH(SingleResultHC)
-
-                // prepare for next loop
-                i0 = i1;
-
-        uint64_t tt2 = get_cy();
-        t_copy_pack += tt1 - tt0;
-        t_scan += tt2 - tt1;
+        pq4_accumulate_loop_qbs(
+                qbs, list_size, M2, codes.get(), LUT.get(), handler, scaler);
+        // prepare for next loop
+        i0 = i1;
     }
-    TIC;
 
-    // labels is in-place for HeapHC
-    handler->to_flat_arrays(
-            distances, labels, skip & 16 ? nullptr : normalizers.get());
-
-    TIC;
+    handler.end();
 
     // these stats are not thread-safe
 
-    for (int i = 1; i < ti; i++) {
-        IVFFastScan_stats.times[i] += times[i] - times[i - 1];
-    }
     IVFFastScan_stats.t_copy_pack += t_copy_pack;
     IVFFastScan_stats.t_scan += t_scan;
 
-    if (auto* rh = dynamic_cast<ReservoirHC*>(handler.get())) {
-        for (int i = 0; i < 4; i++) {
-            IVFFastScan_stats.reservoir_times[i] += rh->times[i];
-        }
-    }
-
     *ndis_out = ndis;
     *nlist_out = nlist;
 }
 
-template <class C, class Scaler>
 void IndexIVFFastScan::search_implem_14(
         idx_t n,
         const float* x,
         idx_t k,
         float* distances,
         idx_t* labels,
-        const idx_t* coarse_ids,
-        const float* coarse_dis,
+        const CoarseQuantized& cq,
         int impl,
-        const Scaler& scaler) const {
+        const NormTableScaler* scaler,
+        const IVFSearchParameters* params) const {
     if (n == 0) { // does not work well with reservoir
         return;
     }
     FAISS_THROW_IF_NOT(bbs == 32);
 
-    uint64_t ttg0 = get_cy();
-
-    COARSE_QUANTIZE;
-
-    uint64_t ttg1 = get_cy();
-    uint64_t coarse_search_tt = ttg1 - ttg0;
+    const IDSelector* sel = params ? params->sel : nullptr;
 
     size_t dim12 = ksub * M2;
     AlignedTable<uint8_t> dis_tables;
     AlignedTable<uint16_t> biases;
     std::unique_ptr<float[]> normalizers(new float[2 * n]);
 
-    compute_LUT_uint8(
-            n,
-            x,
-            coarse_ids,
-            coarse_dis,
-            dis_tables,
-            biases,
-            normalizers.get());
-
-    uint64_t ttg2 = get_cy();
-    uint64_t lut_compute_tt = ttg2 - ttg1;
+    compute_LUT_uint8(n, x, cq, dis_tables, biases, normalizers.get());
 
     struct QC {
         int qno;     // sequence number of the query
@@ -1070,14 +1140,15 @@ void IndexIVFFastScan::search_implem_14(
         int rank;    // this is the rank'th result of the coarse quantizer
     };
     bool single_LUT = !lookup_table_is_3d();
+    size_t nprobe = cq.nprobe;
 
     std::vector<QC> qcs;
     {
         int ij = 0;
         for (int i = 0; i < n; i++) {
             for (int j = 0; j < nprobe; j++) {
-                if (coarse_ids[ij] >= 0) {
-                    qcs.push_back(QC{i, int(coarse_ids[ij]), int(j)});
+                if (cq.ids[ij] >= 0) {
+                    qcs.push_back(QC{i, int(cq.ids[ij]), int(j)});
                 }
                 ij++;
             }
@@ -1115,14 +1186,13 @@ void IndexIVFFastScan::search_implem_14(
         ses.push_back(SE{i0_l, i1, list_size});
         i0_l = i1;
     }
-    uint64_t ttg3 = get_cy();
-    uint64_t compute_clusters_tt = ttg3 - ttg2;
 
     // function to handle the global heap
+    bool is_max = !is_similarity_metric(metric_type);
     using HeapForIP = CMin<float, idx_t>;
     using HeapForL2 = CMax<float, idx_t>;
     auto init_result = [&](float* simi, idx_t* idxi) {
-        if (metric_type == METRIC_INNER_PRODUCT) {
+        if (!is_max) {
             heap_heapify<HeapForIP>(k, simi, idxi);
         } else {
             heap_heapify<HeapForL2>(k, simi, idxi);
@@ -1133,7 +1203,7 @@ void IndexIVFFastScan::search_implem_14(
                                  const idx_t* local_idx,
                                  float* simi,
                                  idx_t* idxi) {
-        if (metric_type == METRIC_INNER_PRODUCT) {
+        if (!is_max) {
             heap_addn<HeapForIP>(k, simi, idxi, local_dis, local_idx, k);
         } else {
             heap_addn<HeapForL2>(k, simi, idxi, local_dis, local_idx, k);
@@ -1141,14 +1211,12 @@ void IndexIVFFastScan::search_implem_14(
     };
 
     auto reorder_result = [&](float* simi, idx_t* idxi) {
-        if (metric_type == METRIC_INNER_PRODUCT) {
+        if (!is_max) {
             heap_reorder<HeapForIP>(k, simi, idxi);
         } else {
             heap_reorder<HeapForL2>(k, simi, idxi);
         }
     };
-    uint64_t ttg4 = get_cy();
-    uint64_t fn_tt = ttg4 - ttg3;
 
     size_t ndis = 0;
     size_t nlist_visited = 0;
@@ -1160,22 +1228,9 @@ void IndexIVFFastScan::search_implem_14(
         std::vector<float> local_dis(k * n);
 
         // prepare the result handlers
-        std::unique_ptr<SIMDResultHandler<C, true>> handler;
-        AlignedTable<uint16_t> tmp_distances;
-
-        using HeapHC = HeapHandler<C, true>;
-        using ReservoirHC = ReservoirHandler<C, true>;
-        using SingleResultHC = SingleResultHandler<C, true>;
-
-        if (k == 1) {
-            handler.reset(new SingleResultHC(n, 0));
-        } else if (impl == 14) {
-            tmp_distances.resize(n * k);
-            handler.reset(
-                    new HeapHC(n, tmp_distances.get(), local_idx.data(), k, 0));
-        } else if (impl == 15) {
-            handler.reset(new ReservoirHC(n, 0, k, 2 * k));
-        }
+        std::unique_ptr<SIMDResultHandlerToFloat> handler(make_knn_handler(
+                is_max, impl, n, k, local_dis.data(), local_idx.data(), sel));
+        handler->begin(normalizers.get());
 
         int qbs2 = this->qbs2 ? this->qbs2 : 11;
 
@@ -1185,14 +1240,10 @@ void IndexIVFFastScan::search_implem_14(
             handler->dbias = tmp_bias.data();
         }
 
-        uint64_t ttg5 = get_cy();
-        uint64_t handler_tt = ttg5 - ttg4;
-
         std::set<int> q_set;
         uint64_t t_copy_pack = 0, t_scan = 0;
 #pragma omp for schedule(dynamic)
         for (idx_t cluster = 0; cluster < ses.size(); cluster++) {
-            uint64_t tt0 = get_cy();
             size_t i0 = ses[cluster].start;
             size_t i1 = ses[cluster].end;
             size_t list_size = ses[cluster].list_size;
@@ -1232,28 +1283,21 @@ void IndexIVFFastScan::search_implem_14(
             handler->ntotal = list_size;
             handler->q_map = q_map.data();
             handler->id_map = ids.get();
-            uint64_t tt1 = get_cy();
 
-#define DISPATCH(classHC)                                                  \
-    if (dynamic_cast<classHC*>(handler.get())) {                           \
-        auto* res = static_cast<classHC*>(handler.get());                  \
-        pq4_accumulate_loop_qbs(                                           \
-                qbs, list_size, M2, codes.get(), LUT.get(), *res, scaler); \
-    }
-            DISPATCH(HeapHC)
-            else DISPATCH(ReservoirHC) else DISPATCH(SingleResultHC)
-
-                    uint64_t tt2 = get_cy();
-            t_copy_pack += tt1 - tt0;
-            t_scan += tt2 - tt1;
+            pq4_accumulate_loop_qbs(
+                    qbs,
+                    list_size,
+                    M2,
+                    codes.get(),
+                    LUT.get(),
+                    *handler.get(),
+                    scaler);
         }
 
         // labels is in-place for HeapHC
-        handler->to_flat_arrays(
-                local_dis.data(),
-                local_idx.data(),
-                skip & 16 ? nullptr : normalizers.get());
+        handler->end();
 
+        // merge per-thread results
 #pragma omp single
         {
             // we init the results as a heap
@@ -1276,12 +1320,6 @@ void IndexIVFFastScan::search_implem_14(
 
             IVFFastScan_stats.t_copy_pack += t_copy_pack;
             IVFFastScan_stats.t_scan += t_scan;
-
-            if (auto* rh = dynamic_cast<ReservoirHC*>(handler.get())) {
-                for (int i = 0; i < 4; i++) {
-                    IVFFastScan_stats.reservoir_times[i] += rh->times[i];
-                }
-            }
         }
 #pragma omp barrier
 #pragma omp single
@@ -1351,24 +1389,4 @@ void IndexIVFFastScan::reconstruct_orig_invlists() {
 
 IVFFastScanStats IVFFastScan_stats;
 
-template void IndexIVFFastScan::search_dispatch_implem<true, NormTableScaler>(
-        idx_t n,
-        const float* x,
-        idx_t k,
-        float* distances,
-        idx_t* labels,
-        const idx_t* coarse_ids,
-        const float* coarse_dis,
-        const NormTableScaler& scaler) const;
-
-template void IndexIVFFastScan::search_dispatch_implem<false, NormTableScaler>(
-        idx_t n,
-        const float* x,
-        idx_t k,
-        float* distances,
-        idx_t* labels,
-        const idx_t* coarse_ids,
-        const float* coarse_dis,
-        const NormTableScaler& scaler) const;
-
 } // namespace faiss
diff --git a/faiss/IndexIVFFastScan.h b/faiss/IndexIVFFastScan.h
index 824e63ed28..9d4c4910d3 100644
--- a/faiss/IndexIVFFastScan.h
+++ b/faiss/IndexIVFFastScan.h
@@ -14,6 +14,9 @@
 
 namespace faiss {
 
+struct NormTableScaler;
+struct SIMDResultHandlerToFloat;
+
 /** Fast scan version of IVFPQ and IVFAQ. Works for 4-bit PQ/AQ for now.
  *
  * The codes in the inverted lists are not stored sequentially but
@@ -28,6 +31,12 @@ namespace faiss {
  * 11: idem, collect results in reservoir
  * 12: optimizer int16 search, collect results in heap, uses qbs
  * 13: idem, collect results in reservoir
+ * 14: internally multithreaded implem over nq * nprobe
+ * 15: same with reservoir
+ *
+ * For range search, only 10 and 12 are supported.
+ * Add 100 to the implem to force single-thread scanning (the coarse quantizer
+ * may still use multiple threads).
  */
 
 struct IndexIVFFastScan : IndexIVF {
@@ -80,19 +89,24 @@ struct IndexIVFFastScan : IndexIVF {
 
     virtual bool lookup_table_is_3d() const = 0;
 
+    // compact way of conveying coarse quantization results
+    struct CoarseQuantized {
+        size_t nprobe; // entries per query; ids is indexed as [i * nprobe + j]
+        const float* dis = nullptr; // presumably coarse distances, laid out like ids
+        const idx_t* ids = nullptr; // inverted-list numbers; negative entries are skipped
+    };
+
     virtual void compute_LUT(
             size_t n,
             const float* x,
-            const idx_t* coarse_ids,
-            const float* coarse_dis,
+            const CoarseQuantized& cq,
             AlignedTable<float>& dis_tables,
             AlignedTable<float>& biases) const = 0;
 
     void compute_LUT_uint8(
             size_t n,
             const float* x,
-            const idx_t* coarse_ids,
-            const float* coarse_dis,
+            const CoarseQuantized& cq,
             AlignedTable<uint8_t>& dis_tables,
             AlignedTable<uint16_t>& biases,
             float* normalizers) const;
@@ -117,7 +131,6 @@ struct IndexIVFFastScan : IndexIVF {
             const IVFSearchParameters* params = nullptr,
             IndexIVFStats* stats = nullptr) const override;
 
-    /// will just fail
     void range_search(
             idx_t n,
             const float* x,
@@ -127,81 +140,82 @@ struct IndexIVFFastScan : IndexIVF {
 
     // internal search funcs
 
-    template <bool is_max, class Scaler>
+    // dispatch to implementations and parallelize
     void search_dispatch_implem(
             idx_t n,
             const float* x,
             idx_t k,
             float* distances,
             idx_t* labels,
-            const idx_t* coarse_ids,
-            const float* coarse_dis,
-            const Scaler& scaler) const;
+            const CoarseQuantized& cq,
+            const NormTableScaler* scaler,
+            const IVFSearchParameters* params = nullptr) const;
+
+    void range_search_dispatch_implem(
+            idx_t n,
+            const float* x,
+            float radius,
+            RangeSearchResult& rres,
+            const CoarseQuantized& cq_in,
+            const NormTableScaler* scaler,
+            const IVFSearchParameters* params = nullptr) const;
 
-    template <class C, class Scaler>
+    // impl 1 and 2 are just for verification
+    template <class C>
     void search_implem_1(
             idx_t n,
             const float* x,
             idx_t k,
             float* distances,
             idx_t* labels,
-            const idx_t* coarse_ids,
-            const float* coarse_dis,
-            const Scaler& scaler) const;
+            const CoarseQuantized& cq,
+            const NormTableScaler* scaler,
+            const IVFSearchParameters* params = nullptr) const;
 
-    template <class C, class Scaler>
+    template <class C>
     void search_implem_2(
             idx_t n,
             const float* x,
             idx_t k,
             float* distances,
             idx_t* labels,
-            const idx_t* coarse_ids,
-            const float* coarse_dis,
-            const Scaler& scaler) const;
+            const CoarseQuantized& cq,
+            const NormTableScaler* scaler,
+            const IVFSearchParameters* params = nullptr) const;
 
     // implem 10 and 12 are not multithreaded internally, so
     // export search stats
-    template <class C, class Scaler>
     void search_implem_10(
             idx_t n,
             const float* x,
-            idx_t k,
-            float* distances,
-            idx_t* labels,
-            const idx_t* coarse_ids,
-            const float* coarse_dis,
-            int impl,
+            SIMDResultHandlerToFloat& handler,
+            const CoarseQuantized& cq,
             size_t* ndis_out,
             size_t* nlist_out,
-            const Scaler& scaler) const;
+            const NormTableScaler* scaler,
+            const IVFSearchParameters* params = nullptr) const;
 
-    template <class C, class Scaler>
     void search_implem_12(
             idx_t n,
             const float* x,
-            idx_t k,
-            float* distances,
-            idx_t* labels,
-            const idx_t* coarse_ids,
-            const float* coarse_dis,
-            int impl,
+            SIMDResultHandlerToFloat& handler,
+            const CoarseQuantized& cq,
             size_t* ndis_out,
             size_t* nlist_out,
-            const Scaler& scaler) const;
+            const NormTableScaler* scaler,
+            const IVFSearchParameters* params = nullptr) const;
 
     // implem 14 is multithreaded internally across nprobes and queries
-    template <class C, class Scaler>
     void search_implem_14(
             idx_t n,
             const float* x,
             idx_t k,
             float* distances,
             idx_t* labels,
-            const idx_t* coarse_ids,
-            const float* coarse_dis,
+            const CoarseQuantized& cq,
             int impl,
-            const Scaler& scaler) const;
+            const NormTableScaler* scaler,
+            const IVFSearchParameters* params = nullptr) const;
 
     // reconstruct vectors from packed invlists
     void reconstruct_from_offset(int64_t list_no, int64_t offset, float* recons)
diff --git a/faiss/IndexIVFFlat.cpp b/faiss/IndexIVFFlat.cpp
index e985683eba..1b36fea379 100644
--- a/faiss/IndexIVFFlat.cpp
+++ b/faiss/IndexIVFFlat.cpp
@@ -47,7 +47,8 @@ void IndexIVFFlat::add_core(
         idx_t n,
         const float* x,
         const idx_t* xids,
-        const idx_t* coarse_idx) {
+        const idx_t* coarse_idx,
+        void* inverted_list_context) {
     FAISS_THROW_IF_NOT(is_trained);
     FAISS_THROW_IF_NOT(coarse_idx);
     FAISS_THROW_IF_NOT(!by_residual);
@@ -70,8 +71,8 @@ void IndexIVFFlat::add_core(
             if (list_no >= 0 && list_no % nt == rank) {
                 idx_t id = xids ? xids[i] : ntotal + i;
                 const float* xi = x + i * d;
-                size_t offset =
-                        invlists->add_entry(list_no, id, (const uint8_t*)xi);
+                size_t offset = invlists->add_entry(
+                        list_no, id, (const uint8_t*)xi, inverted_list_context);
                 dm_adder.add(i, list_no, offset);
                 n_add++;
             } else if (rank == 0 && list_no == -1) {
diff --git a/faiss/IndexIVFFlat.h b/faiss/IndexIVFFlat.h
index a0233052fa..8e47547e02 100644
--- a/faiss/IndexIVFFlat.h
+++ b/faiss/IndexIVFFlat.h
@@ -32,7 +32,8 @@ struct IndexIVFFlat : IndexIVF {
             idx_t n,
             const float* x,
             const idx_t* xids,
-            const idx_t* precomputed_idx) override;
+            const idx_t* precomputed_idx,
+            void* inverted_list_context = nullptr) override;
 
     void encode_vectors(
             idx_t n,
diff --git a/faiss/IndexIVFPQ.cpp b/faiss/IndexIVFPQ.cpp
index c433991c9b..0b7f4d05d4 100644
--- a/faiss/IndexIVFPQ.cpp
+++ b/faiss/IndexIVFPQ.cpp
@@ -135,8 +135,9 @@ void IndexIVFPQ::add_core(
         idx_t n,
         const float* x,
         const idx_t* xids,
-        const idx_t* coarse_idx) {
-    add_core_o(n, x, xids, nullptr, coarse_idx);
+        const idx_t* coarse_idx,
+        void* inverted_list_context) {
+    add_core_o(n, x, xids, nullptr, coarse_idx, inverted_list_context);
 }
 
 static std::unique_ptr<float[]> compute_residuals(
@@ -212,7 +213,8 @@ void IndexIVFPQ::add_core_o(
         const float* x,
         const idx_t* xids,
         float* residuals_2,
-        const idx_t* precomputed_idx) {
+        const idx_t* precomputed_idx,
+        void* inverted_list_context) {
     idx_t bs = index_ivfpq_add_core_o_bs;
     if (n > bs) {
         for (idx_t i0 = 0; i0 < n; i0 += bs) {
@@ -229,7 +231,8 @@ void IndexIVFPQ::add_core_o(
                     x + i0 * d,
                     xids ? xids + i0 : nullptr,
                     residuals_2 ? residuals_2 + i0 * d : nullptr,
-                    precomputed_idx ? precomputed_idx + i0 : nullptr);
+                    precomputed_idx ? precomputed_idx + i0 : nullptr,
+                    inverted_list_context);
         }
         return;
     }
@@ -281,7 +284,8 @@ void IndexIVFPQ::add_core_o(
         }
 
         uint8_t* code = xcodes.get() + i * code_size;
-        size_t offset = invlists->add_entry(key, id, code);
+        size_t offset =
+                invlists->add_entry(key, id, code, inverted_list_context);
 
         if (residuals_2) {
             float* res2 = residuals_2 + i * d;
@@ -749,7 +753,7 @@ struct QueryTables {
     }
 };
 
-// This way of handling the sleector is not optimal since all distances
+// This way of handling the selector is not optimal since all distances
 // are computed even if the id would filter it out.
 template <class C, bool use_sel>
 struct KnnSearchResults {
@@ -1042,7 +1046,7 @@ struct IVFPQScannerT : QueryTables {
             const uint8_t* codes,
             SearchResultType& res) const {
         int ht = ivfpq.polysemous_ht;
-        size_t n_hamming_pass = 0, nup = 0;
+        size_t n_hamming_pass = 0;
 
         int code_size = pq.code_size;
 
diff --git a/faiss/IndexIVFPQ.h b/faiss/IndexIVFPQ.h
index ab49f1e549..d5d21da49d 100644
--- a/faiss/IndexIVFPQ.h
+++ b/faiss/IndexIVFPQ.h
@@ -71,7 +71,8 @@ struct IndexIVFPQ : IndexIVF {
             idx_t n,
             const float* x,
             const idx_t* xids,
-            const idx_t* precomputed_idx) override;
+            const idx_t* precomputed_idx,
+            void* inverted_list_context = nullptr) override;
 
     /// same as add_core, also:
     /// - output 2nd level residuals if residuals_2 != NULL
@@ -81,7 +82,8 @@ struct IndexIVFPQ : IndexIVF {
             const float* x,
             const idx_t* xids,
             float* residuals_2,
-            const idx_t* precomputed_idx = nullptr);
+            const idx_t* precomputed_idx = nullptr,
+            void* inverted_list_context = nullptr);
 
     /// trains the product quantizer
     void train_encoder(idx_t n, const float* x, const idx_t* assign) override;
diff --git a/faiss/IndexIVFPQFastScan.cpp b/faiss/IndexIVFPQFastScan.cpp
index b44b71ec67..2844ae4936 100644
--- a/faiss/IndexIVFPQFastScan.cpp
+++ b/faiss/IndexIVFPQFastScan.cpp
@@ -171,7 +171,7 @@ void IndexIVFPQFastScan::encode_vectors(
  * Look-Up Table functions
  *********************************************************/
 
-void fvec_madd_avx(
+void fvec_madd_simd(
         size_t n,
         const float* a,
         float bf,
@@ -202,12 +202,12 @@ bool IndexIVFPQFastScan::lookup_table_is_3d() const {
 void IndexIVFPQFastScan::compute_LUT(
         size_t n,
         const float* x,
-        const idx_t* coarse_ids,
-        const float* coarse_dis,
+        const CoarseQuantized& cq,
         AlignedTable<float>& dis_tables,
         AlignedTable<float>& biases) const {
     size_t dim12 = pq.ksub * pq.M;
     size_t d = pq.d;
+    size_t nprobe = this->nprobe;
 
     if (by_residual) {
         if (metric_type == METRIC_L2) {
@@ -215,7 +215,7 @@ void IndexIVFPQFastScan::compute_LUT(
 
             if (use_precomputed_table == 1) {
                 biases.resize(n * nprobe);
-                memcpy(biases.get(), coarse_dis, sizeof(float) * n * nprobe);
+                memcpy(biases.get(), cq.dis, sizeof(float) * n * nprobe);
 
                 AlignedTable<float> ip_table(n * dim12);
                 pq.compute_inner_prod_tables(n, x, ip_table.get());
@@ -224,10 +224,10 @@ void IndexIVFPQFastScan::compute_LUT(
                 for (idx_t ij = 0; ij < n * nprobe; ij++) {
                     idx_t i = ij / nprobe;
                     float* tab = dis_tables.get() + ij * dim12;
-                    idx_t cij = coarse_ids[ij];
+                    idx_t cij = cq.ids[ij];
 
                     if (cij >= 0) {
-                        fvec_madd_avx(
+                        fvec_madd_simd(
                                 dim12,
                                 precomputed_table.get() + cij * dim12,
                                 -2,
@@ -249,7 +249,7 @@ void IndexIVFPQFastScan::compute_LUT(
                 for (idx_t ij = 0; ij < n * nprobe; ij++) {
                     idx_t i = ij / nprobe;
                     float* xij = &xrel[ij * d];
-                    idx_t cij = coarse_ids[ij];
+                    idx_t cij = cq.ids[ij];
 
                     if (cij >= 0) {
                         quantizer->compute_residual(x + i * d, xij, cij);
@@ -269,7 +269,7 @@ void IndexIVFPQFastScan::compute_LUT(
             // compute_inner_prod_tables(pq, n, x, dis_tables.get());
 
             biases.resize(n * nprobe);
-            memcpy(biases.get(), coarse_dis, sizeof(float) * n * nprobe);
+            memcpy(biases.get(), cq.dis, sizeof(float) * n * nprobe);
         } else {
             FAISS_THROW_FMT("metric %d not supported", metric_type);
         }
@@ -286,9 +286,28 @@ void IndexIVFPQFastScan::compute_LUT(
     }
 }
 
-void IndexIVFPQFastScan::sa_decode(idx_t n, const uint8_t* bytes, float* x)
+void IndexIVFPQFastScan::sa_decode(idx_t n, const uint8_t* codes, float* x)
         const {
-    pq.decode(bytes, x, n);
+    size_t coarse_size = coarse_code_size();
+
+#pragma omp parallel if (n > 1)
+    {
+        std::vector<float> residual(d);
+
+#pragma omp for
+        for (idx_t i = 0; i < n; i++) {
+            const uint8_t* code = codes + i * (code_size + coarse_size);
+            int64_t list_no = decode_listno(code);
+            float* xi = x + i * d;
+            pq.decode(code + coarse_size, xi);
+            if (by_residual) {
+                quantizer->reconstruct(list_no, residual.data());
+                for (size_t j = 0; j < d; j++) {
+                    xi[j] += residual[j];
+                }
+            }
+        }
+    }
 }
 
 } // namespace faiss
diff --git a/faiss/IndexIVFPQFastScan.h b/faiss/IndexIVFPQFastScan.h
index 9a79833591..00dd2f11dd 100644
--- a/faiss/IndexIVFPQFastScan.h
+++ b/faiss/IndexIVFPQFastScan.h
@@ -77,8 +77,7 @@ struct IndexIVFPQFastScan : IndexIVFFastScan {
     void compute_LUT(
             size_t n,
             const float* x,
-            const idx_t* coarse_ids,
-            const float* coarse_dis,
+            const CoarseQuantized& cq,
             AlignedTable<float>& dis_tables,
             AlignedTable<float>& biases) const override;
 
diff --git a/faiss/IndexIVFPQR.cpp b/faiss/IndexIVFPQR.cpp
index 2dd967e829..f55332cddf 100644
--- a/faiss/IndexIVFPQR.cpp
+++ b/faiss/IndexIVFPQR.cpp
@@ -91,7 +91,8 @@ void IndexIVFPQR::add_core(
         idx_t n,
         const float* x,
         const idx_t* xids,
-        const idx_t* precomputed_idx) {
+        const idx_t* precomputed_idx,
+        void* /*inverted_list_context*/) {
     std::unique_ptr<float[]> residual_2(new float[n * d]);
 
     idx_t n0 = ntotal;
diff --git a/faiss/IndexIVFPQR.h b/faiss/IndexIVFPQR.h
index 73502879f2..7642d2f232 100644
--- a/faiss/IndexIVFPQR.h
+++ b/faiss/IndexIVFPQR.h
@@ -48,7 +48,8 @@ struct IndexIVFPQR : IndexIVFPQ {
             idx_t n,
             const float* x,
             const idx_t* xids,
-            const idx_t* precomputed_idx) override;
+            const idx_t* precomputed_idx,
+            void* inverted_list_context = nullptr) override;
 
     void reconstruct_from_offset(int64_t list_no, int64_t offset, float* recons)
             const override;
diff --git a/faiss/IndexIVFSpectralHash.cpp b/faiss/IndexIVFSpectralHash.cpp
index 5fb864c9a9..14741691d0 100644
--- a/faiss/IndexIVFSpectralHash.cpp
+++ b/faiss/IndexIVFSpectralHash.cpp
@@ -157,7 +157,7 @@ void binarize_with_freq(
     }
 }
 
-}; // namespace
+} // namespace
 
 void IndexIVFSpectralHash::encode_vectors(
         idx_t n,
diff --git a/faiss/IndexNNDescent.cpp b/faiss/IndexNNDescent.cpp
index 27bd6e33ee..382e9c41c6 100644
--- a/faiss/IndexNNDescent.cpp
+++ b/faiss/IndexNNDescent.cpp
@@ -58,35 +58,6 @@ using storage_idx_t = NNDescent::storage_idx_t;
 
 namespace {
 
-/* Wrap the distance computer into one that negates the
-   distances. This makes supporting INNER_PRODUCE search easier */
-
-struct NegativeDistanceComputer : DistanceComputer {
-    /// owned by this
-    DistanceComputer* basedis;
-
-    explicit NegativeDistanceComputer(DistanceComputer* basedis)
-            : basedis(basedis) {}
-
-    void set_query(const float* x) override {
-        basedis->set_query(x);
-    }
-
-    /// compute distance of vector i to current query
-    float operator()(idx_t i) override {
-        return -(*basedis)(i);
-    }
-
-    /// compute distance between two stored vectors
-    float symmetric_dis(idx_t i, idx_t j) override {
-        return -basedis->symmetric_dis(i, j);
-    }
-
-    ~NegativeDistanceComputer() override {
-        delete basedis;
-    }
-};
-
 DistanceComputer* storage_distance_computer(const Index* storage) {
     if (is_similarity_metric(storage->metric_type)) {
         return new NegativeDistanceComputer(storage->get_distance_computer());
diff --git a/faiss/IndexNSG.cpp b/faiss/IndexNSG.cpp
index 2d1c3d820f..45a547b93b 100644
--- a/faiss/IndexNSG.cpp
+++ b/faiss/IndexNSG.cpp
@@ -104,7 +104,7 @@ void IndexNSG::search(
     }
 }
 
-void IndexNSG::build(idx_t n, const float* x, idx_t* knn_graph, int GK) {
+void IndexNSG::build(idx_t n, const float* x, idx_t* knn_graph, int GK_2) {
     FAISS_THROW_IF_NOT_MSG(
             storage,
             "Please use IndexNSGFlat (or variants) instead of IndexNSG directly");
@@ -115,9 +115,9 @@ void IndexNSG::build(idx_t n, const float* x, idx_t* knn_graph, int GK) {
     ntotal = storage->ntotal;
 
     // check the knn graph
-    check_knn_graph(knn_graph, n, GK);
+    check_knn_graph(knn_graph, n, GK_2);
 
-    const nsg::Graph<idx_t> knng(knn_graph, n, GK);
+    const nsg::Graph<idx_t> knng(knn_graph, n, GK_2);
     nsg.build(storage, n, knng, verbose);
     is_built = true;
 }
diff --git a/faiss/IndexReplicas.cpp b/faiss/IndexReplicas.cpp
index 8295f34a60..85f3fda746 100644
--- a/faiss/IndexReplicas.cpp
+++ b/faiss/IndexReplicas.cpp
@@ -201,7 +201,7 @@ void IndexReplicasTemplate<IndexT>::syncWithSubIndexes() {
 }
 
 // explicit instantiations
-template struct IndexReplicasTemplate<Index>;
-template struct IndexReplicasTemplate<IndexBinary>;
+template class IndexReplicasTemplate<Index>;
+template class IndexReplicasTemplate<IndexBinary>;
 
 } // namespace faiss
diff --git a/faiss/IndexScalarQuantizer.cpp b/faiss/IndexScalarQuantizer.cpp
index 6b23315735..7ce838db5e 100644
--- a/faiss/IndexScalarQuantizer.cpp
+++ b/faiss/IndexScalarQuantizer.cpp
@@ -32,7 +32,8 @@ IndexScalarQuantizer::IndexScalarQuantizer(
         MetricType metric)
         : IndexFlatCodes(0, d, metric), sq(d, qtype) {
     is_trained = qtype == ScalarQuantizer::QT_fp16 ||
-            qtype == ScalarQuantizer::QT_8bit_direct;
+            qtype == ScalarQuantizer::QT_8bit_direct ||
+            qtype == ScalarQuantizer::QT_bf16;
     code_size = sq.code_size;
 }
 
@@ -207,15 +208,15 @@ void IndexIVFScalarQuantizer::add_core(
         idx_t n,
         const float* x,
         const idx_t* xids,
-        const idx_t* coarse_idx) {
+        const idx_t* coarse_idx,
+        void* inverted_list_context) {
     FAISS_THROW_IF_NOT(is_trained);
 
-    size_t nadd = 0;
     std::unique_ptr<ScalarQuantizer::SQuantizer> squant(sq.select_quantizer());
 
     DirectMapAdd dm_add(direct_map, n, xids);
 
-#pragma omp parallel reduction(+ : nadd)
+#pragma omp parallel
     {
         std::vector<float> residual(d);
         std::vector<uint8_t> one_code(code_size);
@@ -237,10 +238,10 @@ void IndexIVFScalarQuantizer::add_core(
                 memset(one_code.data(), 0, code_size);
                 squant->encode_vector(xi, one_code.data());
 
-                size_t ofs = invlists->add_entry(list_no, id, one_code.data());
+                size_t ofs = invlists->add_entry(
+                        list_no, id, one_code.data(), inverted_list_context);
 
                 dm_add.add(i, list_no, ofs);
-                nadd++;
 
             } else if (rank == 0 && list_no == -1) {
                 dm_add.add(i, -1, 0);
diff --git a/faiss/IndexScalarQuantizer.h b/faiss/IndexScalarQuantizer.h
index c064bbeeb3..27332500c1 100644
--- a/faiss/IndexScalarQuantizer.h
+++ b/faiss/IndexScalarQuantizer.h
@@ -91,7 +91,8 @@ struct IndexIVFScalarQuantizer : IndexIVF {
             idx_t n,
             const float* x,
             const idx_t* xids,
-            const idx_t* precomputed_idx) override;
+            const idx_t* precomputed_idx,
+            void* inverted_list_context = nullptr) override;
 
     InvertedListScanner* get_InvertedListScanner(
             bool store_pairs,
diff --git a/faiss/IndexShardsIVF.cpp b/faiss/IndexShardsIVF.cpp
index fa6272d297..33c8f0e3e6 100644
--- a/faiss/IndexShardsIVF.cpp
+++ b/faiss/IndexShardsIVF.cpp
@@ -137,7 +137,6 @@ void IndexShardsIVF::add_with_ids(
     auto fn = [n, ids, x, nshard, d, Iq](int no, Index* index) {
         idx_t i0 = (idx_t)no * n / nshard;
         idx_t i1 = ((idx_t)no + 1) * n / nshard;
-        const float* x0 = x + i0 * d;
         auto index_ivf = dynamic_cast<IndexIVF*>(index);
 
         if (index->verbose) {
diff --git a/faiss/MatrixStats.cpp b/faiss/MatrixStats.cpp
index cc87e1797c..f2f9a431e5 100644
--- a/faiss/MatrixStats.cpp
+++ b/faiss/MatrixStats.cpp
@@ -181,12 +181,12 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x) : n(n), d(d) {
 
         double max_std = 0, min_std = HUGE_VAL;
 
-        size_t n_dangerous_range = 0, n_0_range = 0, n0 = 0;
+        size_t n_dangerous_range = 0, n_0_range = 0, n0_2 = 0;
 
         for (size_t j = 0; j < d; j++) {
             PerDimStats& st = per_dim_stats[j];
             st.compute_mean_std();
-            n0 += st.n0;
+            n0_2 += st.n0;
 
             if (st.max == st.min) {
                 n_0_range++;
@@ -200,12 +200,12 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x) : n(n), d(d) {
                 min_std = st.stddev;
         }
 
-        if (n0 == 0) {
+        if (n0_2 == 0) {
             do_comment("matrix contains no 0s\n");
         } else {
             do_comment(
                     "matrix contains %.2f %% 0 entries\n",
-                    n0 * 100.0 / (n * d));
+                    n0_2 * 100.0 / (n * d));
         }
 
         if (n_0_range == 0) {
diff --git a/faiss/MetricType.h b/faiss/MetricType.h
index 538b0a8e72..8e889b1a03 100644
--- a/faiss/MetricType.h
+++ b/faiss/MetricType.h
@@ -31,8 +31,13 @@ enum MetricType {
     METRIC_Canberra = 20,
     METRIC_BrayCurtis,
     METRIC_JensenShannon,
-    METRIC_Jaccard, ///< defined as: sum_i(min(a_i, b_i)) / sum_i(max(a_i, b_i))
-                    ///< where a_i, b_i > 0
+
+    /// sum_i(min(a_i, b_i)) / sum_i(max(a_i, b_i)) where a_i, b_i > 0
+    METRIC_Jaccard,
+    /// Squared Euclidean distance, ignoring NaNs
+    METRIC_NaNEuclidean,
+    /// abs(x | y): the distance to a hyperplane
+    METRIC_ABS_INNER_PRODUCT,
 };
 
 /// all vector indices are this type
diff --git a/faiss/clone_index.cpp b/faiss/clone_index.cpp
index 44ab1f7cc3..db16455f92 100644
--- a/faiss/clone_index.cpp
+++ b/faiss/clone_index.cpp
@@ -63,9 +63,10 @@ Index* clone_index(const Index* index) {
 // assumes there is a copy constructor ready. Always try from most
 // specific to most general. Most indexes don't have complicated
 // structs, the default copy constructor often just works.
-#define TRYCLONE(classname, obj)                                      \
-    if (const classname* clo = dynamic_cast<const classname*>(obj)) { \
-        return new classname(*clo);                                   \
+#define TRYCLONE(classname, obj)                       \
+    if (const classname* clo##classname =              \
+                dynamic_cast<const classname*>(obj)) { \
+        return new classname(*clo##classname);         \
     } else
 
 VectorTransform* Cloner::clone_VectorTransform(const VectorTransform* vt) {
@@ -237,13 +238,6 @@ Index* clone_AdditiveQuantizerIndex(const Index* index) {
 
 namespace {
 
-IndexHNSW* clone_HNSW(const IndexHNSW* ihnsw) {
-    TRYCLONE(IndexHNSWFlat, ihnsw)
-    TRYCLONE(IndexHNSWPQ, ihnsw)
-    TRYCLONE(IndexHNSWSQ, ihnsw)
-    return new IndexHNSW(*ihnsw);
-}
-
 InvertedLists* clone_InvertedLists(const InvertedLists* invlists) {
     if (auto* ails = dynamic_cast<const ArrayInvertedLists*>(invlists)) {
         return new ArrayInvertedLists(*ails);
diff --git a/faiss/cppcontrib/detail/UintReader.h b/faiss/cppcontrib/detail/UintReader.h
index 81e600f410..4a64a1a254 100644
--- a/faiss/cppcontrib/detail/UintReader.h
+++ b/faiss/cppcontrib/detail/UintReader.h
@@ -7,6 +7,7 @@
 
 #pragma once
 
+#include <faiss/impl/platform_macros.h>
 #include <cstdint>
 
 namespace faiss {
@@ -31,7 +32,11 @@ struct Uint8Reader {
                 if (N_ELEMENTS > CPOS + 3) {
                     const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
                             codes + ELEMENT_TO_READ * 4);
+#ifdef FAISS_BIG_ENDIAN
+                    return (code32) >> 24;
+#else
                     return (code32 & 0x000000FF);
+#endif
                 } else {
                     return codes[CPOS];
                 }
@@ -40,7 +45,11 @@ struct Uint8Reader {
                 if (N_ELEMENTS > CPOS + 2) {
                     const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
                             codes + ELEMENT_TO_READ * 4);
+#ifdef FAISS_BIG_ENDIAN
+                    return (code32 & 0x00FF0000) >> 16;
+#else
                     return (code32 & 0x0000FF00) >> 8;
+#endif
                 } else {
                     return codes[CPOS];
                 }
@@ -49,7 +58,11 @@ struct Uint8Reader {
                 if (N_ELEMENTS > CPOS + 1) {
                     const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
                             codes + ELEMENT_TO_READ * 4);
+#ifdef FAISS_BIG_ENDIAN
+                    return (code32 & 0x0000FF00) >> 8;
+#else
                     return (code32 & 0x00FF0000) >> 16;
+#endif
                 } else {
                     return codes[CPOS];
                 }
@@ -58,7 +71,11 @@ struct Uint8Reader {
                 if (N_ELEMENTS > CPOS) {
                     const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
                             codes + ELEMENT_TO_READ * 4);
+#ifdef FAISS_BIG_ENDIAN
+                    return (code32 & 0x000000FF);
+#else
                     return (code32) >> 24;
+#endif
                 } else {
                     return codes[CPOS];
                 }
@@ -87,40 +104,61 @@ struct Uint10Reader {
         switch (SUB_ELEMENT) {
             case 0: {
                 if (N_ELEMENTS > CPOS + 2) {
-                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
+                    uint32_t code32 = *reinterpret_cast<const uint32_t*>(
                             codes + ELEMENT_TO_READ * 5);
+#ifdef FAISS_BIG_ENDIAN
+                    code32 = Swap4Bytes(code32);
+#endif
                     return (code32 & 0b0000001111111111);
                 } else {
-                    const uint16_t code16 = *reinterpret_cast<const uint16_t*>(
+                    uint16_t code16 = *reinterpret_cast<const uint16_t*>(
                             codes + ELEMENT_TO_READ * 5 + 0);
+#ifdef FAISS_BIG_ENDIAN
+                    code16 = Swap2Bytes(code16);
+#endif
                     return (code16 & 0b0000001111111111);
                 }
             }
             case 1: {
                 if (N_ELEMENTS > CPOS + 1) {
-                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
+                    uint32_t code32 = *reinterpret_cast<const uint32_t*>(
                             codes + ELEMENT_TO_READ * 5);
+#ifdef FAISS_BIG_ENDIAN
+                    code32 = Swap4Bytes(code32);
+#endif
                     return (code32 & 0b000011111111110000000000) >> 10;
                 } else {
-                    const uint16_t code16 = *reinterpret_cast<const uint16_t*>(
+                    uint16_t code16 = *reinterpret_cast<const uint16_t*>(
                             codes + ELEMENT_TO_READ * 5 + 1);
+#ifdef FAISS_BIG_ENDIAN
+                    code16 = Swap2Bytes(code16);
+#endif
                     return (code16 & 0b0000111111111100) >> 2;
                 }
             }
             case 2: {
                 if (N_ELEMENTS > CPOS) {
-                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
+                    uint32_t code32 = *reinterpret_cast<const uint32_t*>(
                             codes + ELEMENT_TO_READ * 5);
+#ifdef FAISS_BIG_ENDIAN
+                    code32 = Swap4Bytes(code32);
+#endif
                     return (code32 & 0b00111111111100000000000000000000) >> 20;
                 } else {
-                    const uint16_t code16 = *reinterpret_cast<const uint16_t*>(
+                    uint16_t code16 = *reinterpret_cast<const uint16_t*>(
                             codes + ELEMENT_TO_READ * 5 + 2);
+#ifdef FAISS_BIG_ENDIAN
+                    code16 = Swap2Bytes(code16);
+#endif
                     return (code16 & 0b0011111111110000) >> 4;
                 }
             }
             case 3: {
-                const uint16_t code16 = *reinterpret_cast<const uint16_t*>(
+                uint16_t code16 = *reinterpret_cast<const uint16_t*>(
                         codes + ELEMENT_TO_READ * 5 + 3);
+#ifdef FAISS_BIG_ENDIAN
+                code16 = Swap2Bytes(code16);
+#endif
                 return (code16 & 0b1111111111000000) >> 6;
             }
         }
@@ -147,45 +185,69 @@ struct Uint12Reader {
         switch (SUB_ELEMENT) {
             case 0: {
                 if (N_ELEMENTS > CPOS + 2) {
-                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
+                    uint32_t code32 = *reinterpret_cast<const uint32_t*>(
                             codes + ELEMENT_TO_READ * 6);
+#ifdef FAISS_BIG_ENDIAN
+                    code32 = Swap4Bytes(code32);
+#endif
                     return (code32 & 0b0000111111111111);
                 } else {
-                    const uint16_t code16 = *reinterpret_cast<const uint16_t*>(
+                    uint16_t code16 = *reinterpret_cast<const uint16_t*>(
                             codes + ELEMENT_TO_READ * 6 + 0);
+#ifdef FAISS_BIG_ENDIAN
+                    code16 = Swap2Bytes(code16);
+#endif
                     return (code16 & 0b0000111111111111);
                 }
             }
             case 1: {
                 if (N_ELEMENTS > CPOS + 1) {
-                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
+                    uint32_t code32 = *reinterpret_cast<const uint32_t*>(
                             codes + ELEMENT_TO_READ * 6);
+#ifdef FAISS_BIG_ENDIAN
+                    code32 = Swap4Bytes(code32);
+#endif
                     return (code32 & 0b111111111111000000000000) >> 12;
                 } else {
-                    const uint16_t code16 = *reinterpret_cast<const uint16_t*>(
+                    uint16_t code16 = *reinterpret_cast<const uint16_t*>(
                             codes + ELEMENT_TO_READ * 6 + 1);
+#ifdef FAISS_BIG_ENDIAN
+                    code16 = Swap2Bytes(code16);
+#endif
                     return (code16 & 0b1111111111110000) >> 4;
                 }
             }
             case 2: {
                 if (N_ELEMENTS > CPOS + 1) {
-                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
+                    uint32_t code32 = *reinterpret_cast<const uint32_t*>(
                             codes + ELEMENT_TO_READ * 6 + 2);
+#ifdef FAISS_BIG_ENDIAN
+                    code32 = Swap4Bytes(code32);
+#endif
                     return (code32 & 0b000011111111111100000000) >> 8;
                 } else {
-                    const uint16_t code16 = *reinterpret_cast<const uint16_t*>(
+                    uint16_t code16 = *reinterpret_cast<const uint16_t*>(
                             codes + ELEMENT_TO_READ * 6 + 3);
+#ifdef FAISS_BIG_ENDIAN
+                    code16 = Swap2Bytes(code16);
+#endif
                     return (code16 & 0b0000111111111111);
                 }
             }
             case 3: {
                 if (N_ELEMENTS > CPOS) {
-                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
+                    uint32_t code32 = *reinterpret_cast<const uint32_t*>(
                             codes + ELEMENT_TO_READ * 6 + 2);
+#ifdef FAISS_BIG_ENDIAN
+                    code32 = Swap4Bytes(code32);
+#endif
                     return (code32 & 0b11111111111100000000000000000000) >> 20;
                 } else {
-                    const uint16_t code16 = *reinterpret_cast<const uint16_t*>(
+                    uint16_t code16 = *reinterpret_cast<const uint16_t*>(
                             codes + ELEMENT_TO_READ * 6 + 4);
+#ifdef FAISS_BIG_ENDIAN
+                    code16 = Swap2Bytes(code16);
+#endif
                     return (code16 & 0b1111111111110000) >> 4;
                 }
             }
@@ -208,23 +270,39 @@ struct Uint16Reader {
         switch (SUB_ELEMENT) {
             case 0: {
                 if (N_ELEMENTS > CPOS + 1) {
-                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
+                    uint32_t code32 = *reinterpret_cast<const uint32_t*>(
                             codes + ELEMENT_TO_READ * 4);
+#ifdef FAISS_BIG_ENDIAN
+                    code32 = Swap4Bytes(code32);
+#endif
                     return (code32 & 0x0000FFFF);
                 } else {
                     const uint16_t* const __restrict codesFp16 =
                             reinterpret_cast<const uint16_t*>(codes);
+#ifdef FAISS_BIG_ENDIAN
+                    uint16_t rt = codesFp16[CPOS];
+                    rt = Swap2Bytes(rt);
+                    return rt;
+#endif
                     return codesFp16[CPOS];
                 }
             }
             case 1: {
                 if (N_ELEMENTS > CPOS) {
-                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
+                    uint32_t code32 = *reinterpret_cast<const uint32_t*>(
                             codes + ELEMENT_TO_READ * 4);
+#ifdef FAISS_BIG_ENDIAN
+                    code32 = Swap4Bytes(code32);
+#endif
                     return code32 >> 16;
                 } else {
                     const uint16_t* const __restrict codesFp16 =
                             reinterpret_cast<const uint16_t*>(codes);
+#ifdef FAISS_BIG_ENDIAN
+                    uint16_t rt = codesFp16[CPOS];
+                    rt = Swap2Bytes(rt);
+                    return rt;
+#endif
                     return codesFp16[CPOS];
                 }
             }
diff --git a/faiss/cppcontrib/sa_decode/Level2-inl.h b/faiss/cppcontrib/sa_decode/Level2-inl.h
index 36355af001..1eb7767ba8 100644
--- a/faiss/cppcontrib/sa_decode/Level2-inl.h
+++ b/faiss/cppcontrib/sa_decode/Level2-inl.h
@@ -12,10 +12,19 @@
 #include <cstdint>
 
 #include <faiss/cppcontrib/detail/CoarseBitType.h>
+#include <faiss/impl/platform_macros.h>
 
 namespace faiss {
 namespace cppcontrib {
 
+inline bool isBigEndian() { // inline: header-defined, one definition per TU
+#ifdef FAISS_BIG_ENDIAN
+    return true;
+#else
+    return false;
+#endif
+}
+
 ////////////////////////////////////////////////////////////////////////////////////
 /// Index2LevelDecoder
 ////////////////////////////////////////////////////////////////////////////////////
@@ -72,9 +81,14 @@ struct Index2LevelDecoder {
             const intptr_t coarseCentroidOffset = i % COARSE_SIZE;
             const intptr_t fineCentroidIdx = i / FINE_SIZE;
             const intptr_t fineCentroidOffset = i % FINE_SIZE;
-
-            const intptr_t coarseCode = coarse[coarseCentroidIdx];
-            const intptr_t fineCode = fine[fineCentroidIdx];
+            intptr_t coarseCode, fineCode;
+            if (isBigEndian() && sizeof(coarse_storage_type) == 2) {
+                coarseCode = Swap2Bytes(coarse[coarseCentroidIdx]);
+                fineCode = Swap2Bytes(fine[fineCentroidIdx]);
+            } else {
+                coarseCode = coarse[coarseCentroidIdx];
+                fineCode = fine[fineCentroidIdx];
+            }
 
             const float* const __restrict coarsePtr = pqCoarseCentroids +
                     (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode) *
@@ -112,9 +126,14 @@ struct Index2LevelDecoder {
             const intptr_t fineCentroidIdx = i / FINE_SIZE;
             const intptr_t fineCentroidOffset = i % FINE_SIZE;
 
-            const intptr_t coarseCode = coarse[coarseCentroidIdx];
-            const intptr_t fineCode = fine[fineCentroidIdx];
-
+            intptr_t coarseCode, fineCode;
+            if (isBigEndian() && sizeof(coarse_storage_type) == 2) {
+                coarseCode = Swap2Bytes(coarse[coarseCentroidIdx]);
+                fineCode = Swap2Bytes(fine[fineCentroidIdx]);
+            } else {
+                coarseCode = coarse[coarseCentroidIdx];
+                fineCode = fine[fineCentroidIdx];
+            }
             const float* const __restrict coarsePtr = pqCoarseCentroids +
                     (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode) *
                             COARSE_SIZE +
@@ -162,11 +181,18 @@ struct Index2LevelDecoder {
             const intptr_t coarseCentroidOffset = i % COARSE_SIZE;
             const intptr_t fineCentroidIdx = i / FINE_SIZE;
             const intptr_t fineCentroidOffset = i % FINE_SIZE;
-
-            const intptr_t coarseCode0 = coarse0[coarseCentroidIdx];
-            const intptr_t fineCode0 = fine0[fineCentroidIdx];
-            const intptr_t coarseCode1 = coarse1[coarseCentroidIdx];
-            const intptr_t fineCode1 = fine1[fineCentroidIdx];
+            intptr_t coarseCode0, coarseCode1, fineCode0, fineCode1;
+            if (isBigEndian() && sizeof(coarse_storage_type) == 2) {
+                coarseCode0 = Swap2Bytes(coarse0[coarseCentroidIdx]);
+                fineCode0 = Swap2Bytes(fine0[fineCentroidIdx]);
+                coarseCode1 = Swap2Bytes(coarse1[coarseCentroidIdx]);
+                fineCode1 = Swap2Bytes(fine1[fineCentroidIdx]);
+            } else {
+                coarseCode0 = coarse0[coarseCentroidIdx];
+                fineCode0 = fine0[fineCentroidIdx];
+                coarseCode1 = coarse1[coarseCentroidIdx];
+                fineCode1 = fine1[fineCentroidIdx];
+            }
 
             const float* const __restrict coarsePtr0 = pqCoarseCentroids0 +
                     (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
@@ -222,11 +248,18 @@ struct Index2LevelDecoder {
             const intptr_t coarseCentroidOffset = i % COARSE_SIZE;
             const intptr_t fineCentroidIdx = i / FINE_SIZE;
             const intptr_t fineCentroidOffset = i % FINE_SIZE;
-
-            const intptr_t coarseCode0 = coarse0[coarseCentroidIdx];
-            const intptr_t fineCode0 = fine0[fineCentroidIdx];
-            const intptr_t coarseCode1 = coarse1[coarseCentroidIdx];
-            const intptr_t fineCode1 = fine1[fineCentroidIdx];
+            intptr_t coarseCode0, coarseCode1, fineCode0, fineCode1;
+            if (isBigEndian() && sizeof(coarse_storage_type) == 2) {
+                coarseCode0 = Swap2Bytes(coarse0[coarseCentroidIdx]);
+                fineCode0 = Swap2Bytes(fine0[fineCentroidIdx]);
+                coarseCode1 = Swap2Bytes(coarse1[coarseCentroidIdx]);
+                fineCode1 = Swap2Bytes(fine1[fineCentroidIdx]);
+            } else {
+                coarseCode0 = coarse0[coarseCentroidIdx];
+                fineCode0 = fine0[fineCentroidIdx];
+                coarseCode1 = coarse1[coarseCentroidIdx];
+                fineCode1 = fine1[fineCentroidIdx];
+            }
 
             const float* const __restrict coarsePtr0 = pqCoarseCentroids +
                     (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
@@ -292,13 +325,23 @@ struct Index2LevelDecoder {
             const intptr_t coarseCentroidOffset = i % COARSE_SIZE;
             const intptr_t fineCentroidIdx = i / FINE_SIZE;
             const intptr_t fineCentroidOffset = i % FINE_SIZE;
-
-            const intptr_t coarseCode0 = coarse0[coarseCentroidIdx];
-            const intptr_t fineCode0 = fine0[fineCentroidIdx];
-            const intptr_t coarseCode1 = coarse1[coarseCentroidIdx];
-            const intptr_t fineCode1 = fine1[fineCentroidIdx];
-            const intptr_t coarseCode2 = coarse2[coarseCentroidIdx];
-            const intptr_t fineCode2 = fine2[fineCentroidIdx];
+            intptr_t coarseCode0, coarseCode1, fineCode0, fineCode1;
+            intptr_t coarseCode2, fineCode2;
+            if (isBigEndian() && sizeof(coarse_storage_type) == 2) {
+                coarseCode0 = Swap2Bytes(coarse0[coarseCentroidIdx]);
+                fineCode0 = Swap2Bytes(fine0[fineCentroidIdx]);
+                coarseCode1 = Swap2Bytes(coarse1[coarseCentroidIdx]);
+                fineCode1 = Swap2Bytes(fine1[fineCentroidIdx]);
+                coarseCode2 = Swap2Bytes(coarse2[coarseCentroidIdx]);
+                fineCode2 = Swap2Bytes(fine2[fineCentroidIdx]);
+            } else {
+                coarseCode0 = coarse0[coarseCentroidIdx];
+                fineCode0 = fine0[fineCentroidIdx];
+                coarseCode1 = coarse1[coarseCentroidIdx];
+                fineCode1 = fine1[fineCentroidIdx];
+                coarseCode2 = coarse2[coarseCentroidIdx];
+                fineCode2 = fine2[fineCentroidIdx];
+            }
 
             const float* const __restrict coarsePtr0 = pqCoarseCentroids0 +
                     (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
@@ -369,13 +412,23 @@ struct Index2LevelDecoder {
             const intptr_t coarseCentroidOffset = i % COARSE_SIZE;
             const intptr_t fineCentroidIdx = i / FINE_SIZE;
             const intptr_t fineCentroidOffset = i % FINE_SIZE;
-
-            const intptr_t coarseCode0 = coarse0[coarseCentroidIdx];
-            const intptr_t fineCode0 = fine0[fineCentroidIdx];
-            const intptr_t coarseCode1 = coarse1[coarseCentroidIdx];
-            const intptr_t fineCode1 = fine1[fineCentroidIdx];
-            const intptr_t coarseCode2 = coarse2[coarseCentroidIdx];
-            const intptr_t fineCode2 = fine2[fineCentroidIdx];
+            intptr_t coarseCode0, fineCode0, coarseCode1, fineCode1;
+            intptr_t coarseCode2, fineCode2;
+            if (isBigEndian() && sizeof(coarse_storage_type) == 2) {
+                coarseCode0 = Swap2Bytes(coarse0[coarseCentroidIdx]);
+                fineCode0 = Swap2Bytes(fine0[fineCentroidIdx]);
+                coarseCode1 = Swap2Bytes(coarse1[coarseCentroidIdx]);
+                fineCode1 = Swap2Bytes(fine1[fineCentroidIdx]);
+                coarseCode2 = Swap2Bytes(coarse2[coarseCentroidIdx]);
+                fineCode2 = Swap2Bytes(fine2[fineCentroidIdx]);
+            } else {
+                coarseCode0 = coarse0[coarseCentroidIdx];
+                fineCode0 = fine0[fineCentroidIdx];
+                coarseCode1 = coarse1[coarseCentroidIdx];
+                fineCode1 = fine1[fineCentroidIdx];
+                coarseCode2 = coarse2[coarseCentroidIdx];
+                fineCode2 = fine2[fineCentroidIdx];
+            }
 
             const float* const __restrict coarsePtr0 = pqCoarseCentroids +
                     (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt
index 0ab6ff3cea..d20f3b7f8e 100644
--- a/faiss/gpu/CMakeLists.txt
+++ b/faiss/gpu/CMakeLists.txt
@@ -238,21 +238,64 @@ generate_ivf_interleaved_code()
 
 if(FAISS_ENABLE_RAFT)
   list(APPEND FAISS_GPU_HEADERS
+          GpuIndexCagra.h
+          impl/RaftCagra.cuh
+          impl/RaftFlatIndex.cuh
           impl/RaftIVFFlat.cuh
-          impl/RaftFlatIndex.cuh)
+          impl/RaftIVFPQ.cuh
+          utils/RaftUtils.h)
   list(APPEND FAISS_GPU_SRC
+          GpuIndexCagra.cu
+          impl/RaftCagra.cu
           impl/RaftFlatIndex.cu
-	  impl/RaftIVFFlat.cu)
+          impl/RaftIVFFlat.cu
+          impl/RaftIVFPQ.cu
+          utils/RaftUtils.cu)
+endif()
+
+add_library(faiss_gpu STATIC ${FAISS_GPU_SRC})
+set_target_properties(faiss_gpu PROPERTIES
+  POSITION_INDEPENDENT_CODE ON
+  WINDOWS_EXPORT_ALL_SYMBOLS ON
+)
+target_include_directories(faiss_gpu PUBLIC
+  $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}>)
 
+if(FAISS_ENABLE_RAFT)
   target_compile_definitions(faiss PUBLIC USE_NVIDIA_RAFT=1)
   target_compile_definitions(faiss_avx2 PUBLIC USE_NVIDIA_RAFT=1)
+  target_compile_definitions(faiss_avx512 PUBLIC USE_NVIDIA_RAFT=1)
+
+  # Mark all functions as hidden so that we don't generate
+  # global 'public' functions that also exist in libraft.so
+  #
+  # This ensures that faiss functions will call the local version
+  # inside libfaiss.so . This is needed to ensure that things
+  # like raft cublas resources are created and used within the same
+  # dynamic library + CUDA runtime context which are requirements
+  # for valid execution
+  #
+  # To still allow these classes to be used by consumers, the
+  # respective classes/types in the headers are explicitly marked
+  # as 'public' so they can be used by consumers
+  set_source_files_properties(
+    GpuDistance.cu
+    StandardGpuResources.cpp
+    impl/RaftFlatIndex.cu
+    impl/RaftIVFFlat.cu
+    impl/RaftIVFPQ.cu
+    utils/RaftUtils.cu
+    TARGET_DIRECTORY faiss
+    PROPERTIES COMPILE_OPTIONS "-fvisibility=hidden")
+  target_compile_definitions(faiss_gpu PUBLIC USE_NVIDIA_RAFT=1)
 endif()
 
 # Export FAISS_GPU_HEADERS variable to parent scope.
 set(FAISS_GPU_HEADERS ${FAISS_GPU_HEADERS} PARENT_SCOPE)
 
-target_sources(faiss PRIVATE ${FAISS_GPU_SRC})
-target_sources(faiss_avx2 PRIVATE ${FAISS_GPU_SRC})
+target_link_libraries(faiss PRIVATE  "$<LINK_LIBRARY:WHOLE_ARCHIVE,faiss_gpu>")
+target_link_libraries(faiss_avx2 PRIVATE "$<LINK_LIBRARY:WHOLE_ARCHIVE,faiss_gpu>")
+target_link_libraries(faiss_avx512 PRIVATE "$<LINK_LIBRARY:WHOLE_ARCHIVE,faiss_gpu>")
 
 foreach(header ${FAISS_GPU_HEADERS})
   get_filename_component(dir ${header} DIRECTORY )
@@ -274,11 +317,8 @@ __nv_relfatbin : { *(__nv_relfatbin) }
 }
 ]=]
 )
-target_link_options(faiss PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld")
-target_link_options(faiss_avx2 PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld")
+target_link_options(faiss_gpu PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld")
 
 find_package(CUDAToolkit REQUIRED)
-target_link_libraries(faiss PRIVATE CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::compiled> $<$<BOOL:${FAISS_ENABLE_RAFT}>:nvidia::cutlass::cutlass>)
-target_link_libraries(faiss_avx2 PRIVATE CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::compiled>  $<$<BOOL:${FAISS_ENABLE_RAFT}>:nvidia::cutlass::cutlass>)
-target_compile_options(faiss PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr>)
-target_compile_options(faiss_avx2 PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr>)
+target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::compiled> $<$<BOOL:${FAISS_ENABLE_RAFT}>:nvidia::cutlass::cutlass> $<$<BOOL:${FAISS_ENABLE_RAFT}>:OpenMP::OpenMP_CXX>)
+target_compile_options(faiss_gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr $<$<BOOL:${FAISS_ENABLE_RAFT}>:-Xcompiler=${OpenMP_CXX_FLAGS}>>)
diff --git a/faiss/gpu/GpuCloner.cpp b/faiss/gpu/GpuCloner.cpp
index 4dc51f9e83..b6d55a47aa 100644
--- a/faiss/gpu/GpuCloner.cpp
+++ b/faiss/gpu/GpuCloner.cpp
@@ -14,6 +14,9 @@
 
 #include <faiss/IndexBinaryFlat.h>
 #include <faiss/IndexFlat.h>
+#if defined USE_NVIDIA_RAFT
+#include <faiss/IndexHNSW.h>
+#endif
 #include <faiss/IndexIVF.h>
 #include <faiss/IndexIVFFlat.h>
 #include <faiss/IndexIVFPQ.h>
@@ -24,6 +27,9 @@
 #include <faiss/MetaIndexes.h>
 #include <faiss/gpu/GpuIndex.h>
 #include <faiss/gpu/GpuIndexBinaryFlat.h>
+#if defined USE_NVIDIA_RAFT
+#include <faiss/gpu/GpuIndexCagra.h>
+#endif
 #include <faiss/gpu/GpuIndexFlat.h>
 #include <faiss/gpu/GpuIndexIVFFlat.h>
 #include <faiss/gpu/GpuIndexIVFPQ.h>
@@ -85,7 +91,15 @@ Index* ToCPUCloner::clone_Index(const Index* index) {
         // objective is to make a single component out of them
         // (inverse op of ToGpuClonerMultiple)
 
-    } else if (auto ish = dynamic_cast<const IndexShards*>(index)) {
+    }
+#if defined USE_NVIDIA_RAFT
+    else if (auto icg = dynamic_cast<const GpuIndexCagra*>(index)) {
+        IndexHNSWCagra* res = new IndexHNSWCagra();
+        icg->copyTo(res);
+        return res;
+    }
+#endif
+    else if (auto ish = dynamic_cast<const IndexShards*>(index)) {
         int nshard = ish->count();
         FAISS_ASSERT(nshard > 0);
         Index* res = clone_Index(ish->at(0));
@@ -152,8 +166,8 @@ Index* ToGpuCloner::clone_Index(const Index* index) {
         config.device = device;
         config.indicesOptions = indicesOptions;
         config.flatConfig.useFloat16 = useFloat16CoarseQuantizer;
-        FAISS_THROW_IF_NOT_MSG(
-                !use_raft, "this type of index is not implemented for RAFT");
+        config.use_raft = use_raft;
+        config.allowCpuCoarseQuantizer = allowCpuCoarseQuantizer;
 
         GpuIndexIVFFlat* res = new GpuIndexIVFFlat(
                 provider, ifl->d, ifl->nlist, ifl->metric_type, config);
@@ -204,8 +218,9 @@ Index* ToGpuCloner::clone_Index(const Index* index) {
         config.flatConfig.useFloat16 = useFloat16CoarseQuantizer;
         config.useFloat16LookupTables = useFloat16;
         config.usePrecomputedTables = usePrecomputed;
-        FAISS_THROW_IF_NOT_MSG(
-                !use_raft, "this type of index is not implemented for RAFT");
+        config.use_raft = use_raft;
+        config.interleavedLayout = use_raft;
+        config.allowCpuCoarseQuantizer = allowCpuCoarseQuantizer;
 
         GpuIndexIVFPQ* res = new GpuIndexIVFPQ(provider, ipq, config);
 
@@ -214,9 +229,25 @@ Index* ToGpuCloner::clone_Index(const Index* index) {
         }
 
         return res;
-    } else {
-        // default: use CPU cloner
-        return Cloner::clone_Index(index);
+    }
+#if defined USE_NVIDIA_RAFT
+    else if (auto icg = dynamic_cast<const faiss::IndexHNSWCagra*>(index)) {
+        GpuIndexCagraConfig config;
+        config.device = device;
+        GpuIndexCagra* res =
+                new GpuIndexCagra(provider, icg->d, icg->metric_type, config);
+        res->copyFrom(icg);
+        return res;
+    }
+#endif
+    else {
+        // use CPU cloner for IDMap and PreTransform
+        auto index_idmap = dynamic_cast<const IndexIDMap*>(index);
+        auto index_pt = dynamic_cast<const IndexPreTransform*>(index);
+        if (index_idmap || index_pt) {
+            return Cloner::clone_Index(index);
+        }
+        FAISS_THROW_MSG("This index type is not implemented on GPU.");
     }
 }
 
diff --git a/faiss/gpu/GpuClonerOptions.h b/faiss/gpu/GpuClonerOptions.h
index fbde4c4ea4..e643e848fb 100644
--- a/faiss/gpu/GpuClonerOptions.h
+++ b/faiss/gpu/GpuClonerOptions.h
@@ -38,7 +38,17 @@ struct GpuClonerOptions {
     bool verbose = false;
 
     /// use the RAFT implementation
+#if defined USE_NVIDIA_RAFT
+    bool use_raft = true;
+#else
     bool use_raft = false;
+#endif
+
+    /// This flag controls the CPU fallback logic for coarse quantizer
+    /// component of the index. When set to false (default), the cloner will
+    /// throw an exception for indices not implemented on GPU. When set to
+    /// true, it will fall back to a CPU implementation.
+    bool allowCpuCoarseQuantizer = false;
 };
 
 struct GpuMultipleClonerOptions : public GpuClonerOptions {
diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu
index c363aa4bb8..38a62f03bb 100644
--- a/faiss/gpu/GpuDistance.cu
+++ b/faiss/gpu/GpuDistance.cu
@@ -31,7 +31,7 @@
 #include <faiss/gpu/utils/DeviceTensor.cuh>
 
 #if defined USE_NVIDIA_RAFT
-#include <faiss/gpu/impl/RaftUtils.h>
+#include <faiss/gpu/utils/RaftUtils.h>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/device_resources.hpp>
 #include <raft/core/error.hpp>
@@ -51,6 +51,17 @@ using namespace raft::distance;
 using namespace raft::neighbors;
 #endif
 
+bool should_use_raft(GpuDistanceParams args) {
+    // A negative device id means "use whichever device is current".
+    int dev = args.device >= 0 ? args.device : getCurrentDevice();
+    cudaDeviceProp prop;
+    if (cudaGetDeviceProperties(&prop, dev) != cudaSuccess) {
+        return false; // `prop` was not filled in: do not select RAFT
+    }
+    // RAFT is only selected when the compute capability major is >= 7.
+    return prop.major >= 7 && args.use_raft;
+}
+
 template <typename T>
 void bfKnnConvert(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
     // Validate the input data
@@ -228,8 +239,8 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
 
 #if defined USE_NVIDIA_RAFT
     // Note: For now, RAFT bfknn requires queries and vectors to be same layout
-    if (args.use_raft && args.queriesRowMajor == args.vectorsRowMajor) {
-        DistanceType distance = faiss_to_raft(args.metric, false);
+    if (should_use_raft(args) && args.queriesRowMajor == args.vectorsRowMajor) {
+        DistanceType distance = metricFaissToRaft(args.metric, false);
 
         auto resImpl = prov->getResources();
         auto res = resImpl.get();
@@ -316,7 +327,7 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
                     int64_t,
                     raft::col_major>>
                     index_vec = {index.view()};
-            RAFT_LOG_INFO("Invoking flat bfknn");
+
             brute_force::knn(
                     handle,
                     index_vec,
@@ -343,15 +354,12 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
                     [] __device__(const float& a) { return powf(a, 2); });
         }
 
-        RAFT_LOG_INFO("Done.");
-
         handle.sync_stream();
-        RAFT_LOG_INFO("All synced.");
     } else
 #else
-    if (args.use_raft) {
+    if (should_use_raft(args)) {
         FAISS_THROW_IF_NOT_MSG(
-                !args.use_raft,
+                !should_use_raft(args),
                 "RAFT has not been compiled into the current version so it cannot be used.");
     } else
 #endif
diff --git a/faiss/gpu/GpuDistance.h b/faiss/gpu/GpuDistance.h
index c0dde7fd8c..17dbee617b 100644
--- a/faiss/gpu/GpuDistance.h
+++ b/faiss/gpu/GpuDistance.h
@@ -9,6 +9,7 @@
 
 #include <faiss/Index.h>
 
+#pragma GCC visibility push(default)
 namespace faiss {
 namespace gpu {
 
@@ -106,9 +107,14 @@ struct GpuDistanceParams {
     int device = -1;
 
     /// Should the index dispatch down to RAFT?
+    /// TODO: change default to true if RAFT is enabled
     bool use_raft = false;
 };
 
+/// A function that determines whether RAFT should be used based on various
+/// conditions (such as unsupported architecture)
+bool should_use_raft(GpuDistanceParams args);
+
 /// A wrapper for gpu/impl/Distance.cuh to expose direct brute-force k-nearest
 /// neighbor searches on an externally-provided region of memory (e.g., from a
 /// pytorch tensor).
@@ -168,3 +174,4 @@ void bruteForceKnn(
 
 } // namespace gpu
 } // namespace faiss
+#pragma GCC visibility pop
diff --git a/faiss/gpu/GpuIcmEncoder.cu b/faiss/gpu/GpuIcmEncoder.cu
index 434fae9e36..8bd60f91b8 100644
--- a/faiss/gpu/GpuIcmEncoder.cu
+++ b/faiss/gpu/GpuIcmEncoder.cu
@@ -82,7 +82,7 @@ void GpuIcmEncoder::encode(
         size_t n,
         size_t ils_iters) const {
     size_t nshards = shards->size();
-    size_t shard_size = (n + nshards - 1) / nshards;
+    size_t base_shard_size = n / nshards;
 
     auto codebooks = lsq->codebooks.data();
     auto M = lsq->M;
@@ -94,8 +94,14 @@ void GpuIcmEncoder::encode(
 
     // split input data
     auto fn = [=](int idx, IcmEncoderImpl* encoder) {
-        size_t i0 = idx * shard_size;
-        size_t ni = std::min(shard_size, n - i0);
+        size_t i0 = idx * base_shard_size + std::min(size_t(idx), n % nshards);
+        size_t ni = base_shard_size;
+        if (size_t(idx) < n % nshards) { // remainder goes to the first shards
+            ++ni;
+        }
+        if (ni == 0) { // only if n < nshards
+            return;
+        }
         auto xi = x + i0 * d;
         auto ci = codes + i0 * M;
         std::mt19937 geni(idx + seed); // different seed for each shard
diff --git a/faiss/gpu/GpuIndex.cu b/faiss/gpu/GpuIndex.cu
index 89952b1121..d667ae1494 100644
--- a/faiss/gpu/GpuIndex.cu
+++ b/faiss/gpu/GpuIndex.cu
@@ -42,6 +42,16 @@ constexpr idx_t kAddVecSize = (idx_t)512 * 1024;
 // FIXME: parameterize based on algorithm need
 constexpr idx_t kSearchVecSize = (idx_t)32 * 1024;
 
+bool should_use_raft(GpuIndexConfig config_) {
+    cudaDeviceProp prop;
+    if (cudaGetDeviceProperties(&prop, config_.device) != cudaSuccess) {
+        // `prop` was not filled in, so do not select RAFT.
+        return false;
+    }
+    // RAFT is only selected when the compute capability major is >= 7.
+    return prop.major >= 7 && config_.use_raft;
+}
+
 GpuIndex::GpuIndex(
         std::shared_ptr<GpuResources> resources,
         int dims,
@@ -64,7 +74,7 @@ GpuIndex::GpuIndex(
                     (config_.memorySpace == MemorySpace::Unified &&
                      getFullUnifiedMemSupport(config_.device)),
             "Device %d does not support full CUDA 8 Unified Memory (CC 6.0+)",
-            config.device);
+            config_.device);
 
     metric_arg = metricArg;
 
@@ -132,7 +142,8 @@ void GpuIndex::addPaged_(idx_t n, const float* x, const idx_t* ids) {
     if (n > 0) {
         idx_t totalSize = n * this->d * sizeof(float);
 
-        if (totalSize > kAddPageSize || n > kAddVecSize) {
+        if (!should_use_raft(config_) &&
+            (totalSize > kAddPageSize || n > kAddVecSize)) {
             // How many vectors fit into kAddPageSize?
             idx_t maxNumVecsForPageSize =
                     kAddPageSize / (this->d * sizeof(float));
diff --git a/faiss/gpu/GpuIndex.h b/faiss/gpu/GpuIndex.h
index 629a57583d..cc10f21589 100644
--- a/faiss/gpu/GpuIndex.h
+++ b/faiss/gpu/GpuIndex.h
@@ -38,9 +38,17 @@ struct GpuIndexConfig {
     MemorySpace memorySpace = MemorySpace::Device;
 
     /// Should the index dispatch down to RAFT?
+#if defined USE_NVIDIA_RAFT
+    bool use_raft = true;
+#else
     bool use_raft = false;
+#endif
 };
 
+/// A centralized function that determines whether RAFT should
+/// be used based on various conditions (such as unsupported architecture)
+bool should_use_raft(GpuIndexConfig config_);
+
 class GpuIndex : public faiss::Index {
    public:
     GpuIndex(
@@ -76,19 +84,14 @@ class GpuIndex : public faiss::Index {
 
     /// `x` and `labels` can be resident on the CPU or any GPU; copies are
     /// performed as needed
-    void assign(
-            idx_t n,
-            const float* x,
-            idx_t* labels,
-            // faiss::Index has idx_t for k
-            idx_t k = 1) const override;
+    void assign(idx_t n, const float* x, idx_t* labels, idx_t k = 1)
+            const override;
 
     /// `x`, `distances` and `labels` can be resident on the CPU or any
     /// GPU; copies are performed as needed
     void search(
             idx_t n,
             const float* x,
-            // faiss::Index has idx_t for k
             idx_t k,
             float* distances,
             idx_t* labels,
@@ -99,7 +102,6 @@ class GpuIndex : public faiss::Index {
     void search_and_reconstruct(
             idx_t n,
             const float* x,
-            // faiss::Index has idx_t for k
             idx_t k,
             float* distances,
             idx_t* labels,
diff --git a/faiss/gpu/GpuIndexCagra.cu b/faiss/gpu/GpuIndexCagra.cu
new file mode 100644
index 0000000000..4ae56df10d
--- /dev/null
+++ b/faiss/gpu/GpuIndexCagra.cu
@@ -0,0 +1,274 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <faiss/IndexHNSW.h>
+#include <faiss/gpu/GpuIndexCagra.h>
+#include <cstddef>
+#include <faiss/gpu/impl/RaftCagra.cuh>
+#include <optional>
+
+namespace faiss {
+namespace gpu {
+
+GpuIndexCagra::GpuIndexCagra(
+        GpuResourcesProvider* provider,
+        int dims,
+        faiss::MetricType metric,
+        GpuIndexCagraConfig config)
+        : GpuIndex(provider->getResources(), dims, metric, 0.0f, config),
+          cagraConfig_(config) {
+    this->is_trained = false; // set to true by train() and copyFrom()
+}
+
+void GpuIndexCagra::train(idx_t n, const float* x) {
+    if (this->is_trained) { // already trained: only check the invariant
+        FAISS_ASSERT(index_);
+        return;
+    }
+
+    FAISS_ASSERT(!index_);
+    // Optional IVF-PQ parameters: left empty unless set in cagraConfig_.
+    std::optional<raft::neighbors::ivf_pq::index_params> ivf_pq_params =
+            std::nullopt;
+    std::optional<raft::neighbors::ivf_pq::search_params> ivf_pq_search_params =
+            std::nullopt;
+    if (cagraConfig_.ivf_pq_params != nullptr) {
+        ivf_pq_params =
+                std::make_optional<raft::neighbors::ivf_pq::index_params>();
+        ivf_pq_params->n_lists = cagraConfig_.ivf_pq_params->n_lists;
+        ivf_pq_params->kmeans_n_iters =
+                cagraConfig_.ivf_pq_params->kmeans_n_iters;
+        ivf_pq_params->kmeans_trainset_fraction =
+                cagraConfig_.ivf_pq_params->kmeans_trainset_fraction;
+        ivf_pq_params->pq_bits = cagraConfig_.ivf_pq_params->pq_bits;
+        ivf_pq_params->pq_dim = cagraConfig_.ivf_pq_params->pq_dim;
+        ivf_pq_params->codebook_kind =
+                static_cast<raft::neighbors::ivf_pq::codebook_gen>(
+                        cagraConfig_.ivf_pq_params->codebook_kind);
+        ivf_pq_params->force_random_rotation =
+                cagraConfig_.ivf_pq_params->force_random_rotation;
+        ivf_pq_params->conservative_memory_allocation =
+                cagraConfig_.ivf_pq_params->conservative_memory_allocation;
+    }
+    if (cagraConfig_.ivf_pq_search_params != nullptr) {
+        ivf_pq_search_params =
+                std::make_optional<raft::neighbors::ivf_pq::search_params>();
+        ivf_pq_search_params->n_probes =
+                cagraConfig_.ivf_pq_search_params->n_probes;
+        ivf_pq_search_params->lut_dtype =
+                cagraConfig_.ivf_pq_search_params->lut_dtype;
+        ivf_pq_search_params->preferred_shmem_carveout =
+                cagraConfig_.ivf_pq_search_params->preferred_shmem_carveout;
+    }
+    index_ = std::make_shared<RaftCagra>(
+            this->resources_.get(),
+            this->d,
+            cagraConfig_.intermediate_graph_degree,
+            cagraConfig_.graph_degree,
+            static_cast<faiss::cagra_build_algo>(cagraConfig_.build_algo),
+            cagraConfig_.nn_descent_niter,
+            this->metric_type,
+            this->metric_arg,
+            INDICES_64_BIT,
+            ivf_pq_params,
+            ivf_pq_search_params);
+    // Delegate the actual training to the RaftCagra wrapper.
+    index_->train(n, x);
+
+    this->is_trained = true;
+    this->ntotal = n; // index size is the training set size
+}
+
+bool GpuIndexCagra::addImplRequiresIDs_() const {
+    return false; // addImpl_ always throws, so ids are never consumed
+}
+
+void GpuIndexCagra::addImpl_(idx_t n, const float* x, const idx_t* ids) {
+    FAISS_THROW_MSG("adding vectors is not supported by GpuIndexCagra.");
+}
+
+/// Searches the CAGRA index for the `k` nearest neighbors of `n` queries.
+/// `search_params` is either null (defaults) or a SearchParametersCagra.
+void GpuIndexCagra::searchImpl_(
+        idx_t n,
+        const float* x,
+        int k,
+        float* distances,
+        idx_t* labels,
+        const SearchParameters* search_params) const {
+    FAISS_ASSERT(this->is_trained && index_);
+    FAISS_ASSERT(n > 0);
+
+    Tensor<float, 2, true> queries(const_cast<float*>(x), {n, this->d});
+    Tensor<float, 2, true> outDistances(distances, {n, k});
+    Tensor<idx_t, 2, true> outLabels(const_cast<idx_t*>(labels), {n, k});
+
+    // Defaults live on the stack, so nothing leaks if search() throws.
+    SearchParametersCagra defaultParams{};
+    const SearchParametersCagra* params = &defaultParams;
+    if (search_params) {
+        params = dynamic_cast<const SearchParametersCagra*>(search_params);
+        FAISS_THROW_IF_NOT_MSG(
+                params,
+                "search_params must be of type SearchParametersCagra");
+    }
+
+    index_->search(
+            queries,
+            k,
+            outDistances,
+            outLabels,
+            params->max_queries,
+            params->itopk_size,
+            params->max_iterations,
+            static_cast<faiss::cagra_search_algo>(params->algo),
+            params->team_size,
+            params->search_width,
+            params->min_iterations,
+            params->thread_block_size,
+            static_cast<faiss::cagra_hash_mode>(params->hashmap_mode),
+            params->hashmap_min_bitlen,
+            params->hashmap_max_fill_rate,
+            params->num_random_samplings,
+            params->seed);
+}
+
+void GpuIndexCagra::copyFrom(const faiss::IndexHNSWCagra* index) {
+    FAISS_ASSERT(index);
+
+    DeviceScope scope(config_.device);
+
+    GpuIndex::copyFrom(index);
+
+    auto base_index = dynamic_cast<IndexFlat*>(index->storage);
+    FAISS_ASSERT(base_index);
+    auto dataset = base_index->get_xb();
+
+    // Bind by reference: copying the whole HNSW structure is not needed.
+    const auto& hnsw = index->hnsw;
+    const size_t degree = hnsw.nb_neighbors(0);
+    // Copy level 0 to a dense knn graph matrix. The vector must be sized
+    // (not just reserved) because the loop below writes through operator[].
+    std::vector<idx_t> knn_graph(index->ntotal * degree);
+
+#pragma omp parallel for
+    for (size_t i = 0; i < index->ntotal; ++i) {
+        size_t begin, end;
+        hnsw.neighbor_range(i, 0, &begin, &end);
+        for (size_t j = begin; j < end; j++) {
+            knn_graph[i * degree + (j - begin)] = hnsw.neighbors[j];
+        }
+    }
+
+    index_ = std::make_shared<RaftCagra>(
+            this->resources_.get(),
+            this->d,
+            index->ntotal,
+            hnsw.nb_neighbors(0),
+            dataset,
+            knn_graph.data(),
+            this->metric_type,
+            this->metric_arg,
+            INDICES_64_BIT);
+
+    this->is_trained = true;
+}
+
+/// Copies this index into a CPU IndexHNSWCagra (CAGRA graph -> level 0).
+void GpuIndexCagra::copyTo(faiss::IndexHNSWCagra* index) const {
+    FAISS_ASSERT(index_ && this->is_trained && index);
+    FAISS_THROW_IF_NOT_MSG(
+            this->metric_type == METRIC_L2 ||
+                    this->metric_type == METRIC_INNER_PRODUCT,
+            "GpuIndexCagra::copyTo: only L2 and inner product are supported");
+
+    DeviceScope scope(config_.device);
+    GpuIndex::copyTo(index);
+    // This needs to be zeroed out as this implementation adds vectors to the
+    // cpuIndex instead of copying fields
+    index->ntotal = 0;
+
+    auto graph_degree = index_->get_knngraph_degree();
+    auto M = graph_degree / 2;
+    if (index->storage and index->own_fields) {
+        delete index->storage;
+    }
+    // metric_type was validated above, so storage is always re-created.
+    if (this->metric_type == METRIC_L2) {
+        index->storage = new IndexFlatL2(index->d);
+    } else {
+        index->storage = new IndexFlatIP(index->d);
+    }
+    index->own_fields = true;
+    index->keep_max_size_level0 = true;
+    index->hnsw.reset();
+    index->hnsw.assign_probas.clear();
+    index->hnsw.cum_nneighbor_per_level.clear();
+    index->hnsw.set_default_probas(M, 1.0 / log(M));
+
+    auto n_train = this->ntotal;
+    auto train_dataset = index_->get_training_dataset();
+
+    // turn off as level 0 is copied from CAGRA graph
+    index->init_level0 = false;
+    if (!index->base_level_only) {
+        index->add(n_train, train_dataset.data());
+    } else {
+        index->hnsw.prepare_level_tab(n_train, false);
+        index->storage->add(n_train, train_dataset.data());
+        index->ntotal = n_train;
+    }
+
+    auto graph = get_knngraph();
+#pragma omp parallel for
+    for (idx_t i = 0; i < n_train; i++) {
+        size_t begin, end;
+        index->hnsw.neighbor_range(i, 0, &begin, &end);
+        for (size_t j = begin; j < end; j++) {
+            index->hnsw.neighbors[j] = graph[i * graph_degree + (j - begin)];
+        }
+    }
+
+    // turn back on to allow new vectors to be added to level 0
+    index->init_level0 = true;
+}
+
+void GpuIndexCagra::reset() {
+    DeviceScope scope(config_.device);
+    if (!index_) {
+        FAISS_ASSERT(this->ntotal == 0);
+        return;
+    }
+    index_->reset();
+    index_ = nullptr; // train() asserts !index_ once is_trained is false
+    this->ntotal = 0;
+    this->is_trained = false;
+}
+
+std::vector<idx_t> GpuIndexCagra::get_knngraph() const {
+    FAISS_ASSERT(index_ && this->is_trained);
+    // Forward to the underlying RaftCagra object.
+    return index_->get_knngraph();
+}
+
+} // namespace gpu
+} // namespace faiss
diff --git a/faiss/gpu/GpuIndexCagra.h b/faiss/gpu/GpuIndexCagra.h
new file mode 100644
index 0000000000..6ecee3ae03
--- /dev/null
+++ b/faiss/gpu/GpuIndexCagra.h
@@ -0,0 +1,282 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <faiss/IndexIVF.h>
+#include <faiss/gpu/GpuIndex.h>
+#include <faiss/gpu/GpuIndexIVFPQ.h>
+
+namespace faiss {
+struct IndexHNSWCagra;
+}
+
+namespace faiss {
+namespace gpu {
+
+class RaftCagra;
+
+enum class graph_build_algo {
+    /// Use IVF-PQ to build the all-neighbors knn graph (config default)
+    IVF_PQ,
+    /// Experimental: use NN-Descent to build the all-neighbors knn graph
+    NN_DESCENT
+};
+
+/// How PQ codebooks are created; see IVFPQBuildCagraConfig::codebook_kind.
+enum class codebook_gen { // NOLINT
+    PER_SUBSPACE = 0,     // NOLINT
+    PER_CLUSTER = 1,      // NOLINT
+};
+
+/// IVF-PQ build parameters, referenced by GpuIndexCagraConfig::ivf_pq_params.
+/// NOTE(review): presumably only consulted when build_algo is
+/// graph_build_algo::IVF_PQ -- confirm against the implementation.
+struct IVFPQBuildCagraConfig {
+    /// The number of inverted lists (clusters).
+    ///
+    /// Hint: the number of vectors per cluster (`n_rows/n_lists`) should be
+    /// approximately 1,000 to 10,000.
+    uint32_t n_lists = 1024;
+
+    /// The number of iterations searching for kmeans centers (index building).
+    uint32_t kmeans_n_iters = 20;
+
+    /// The fraction of data to use during iterative kmeans building.
+    double kmeans_trainset_fraction = 0.5;
+
+    /// The bit length of the vector element after compression by PQ.
+    ///
+    /// Possible values: [4, 5, 6, 7, 8].
+    ///
+    /// Hint: the smaller the 'pq_bits', the smaller the index size and the
+    /// better the search performance, but the lower the recall.
+    uint32_t pq_bits = 8;
+
+    /// The dimensionality of the vector after compression by PQ. When zero, an
+    /// optimal value is selected using a heuristic.
+    ///
+    /// NB: `pq_dim * pq_bits` must be a multiple of 8.
+    ///
+    /// Hint: a smaller 'pq_dim' results in a smaller index size and better
+    /// search performance, but lower recall. If 'pq_bits' is 8, 'pq_dim' can be
+    /// set to any number, but multiples of 8 are desirable for good
+    /// performance. If 'pq_bits' is not 8, 'pq_dim' should be a multiple of
+    /// 8. For good performance, it is desirable that 'pq_dim' is a multiple of
+    /// 32. Ideally, 'pq_dim' should also be a divisor of the dataset dim.
+    uint32_t pq_dim = 0;
+
+    /// How PQ codebooks are created.
+    codebook_gen codebook_kind = codebook_gen::PER_SUBSPACE;
+
+    /// Apply a random rotation matrix on the input data and queries even if
+    /// `dim % pq_dim == 0`.
+    ///
+    /// Note: if `dim` is not multiple of `pq_dim`, a random rotation is always
+    /// applied to the input data and queries to transform the working space
+    /// from `dim` to `rot_dim`, which may be slightly larger than the original
+    /// space and is a multiple of `pq_dim` (`rot_dim % pq_dim == 0`).
+    /// However, this transform is not necessary when `dim` is a multiple of
+    /// `pq_dim`
+    ///   (`dim == rot_dim`, hence no need to add "extra" data columns /
+    ///   features).
+    ///
+    /// By default, if `dim == rot_dim`, the rotation transform is initialized
+    /// with the identity matrix. When `force_random_rotation == true`, a random
+    /// orthogonal transform matrix is generated regardless of the values of
+    /// `dim` and `pq_dim`.
+    bool force_random_rotation = false;
+
+    /// By default, the algorithm allocates more space than necessary for
+    /// individual clusters
+    /// (`list_data`). This allows to amortize the cost of memory allocation and
+    /// reduce the number of data copies during repeated calls to `extend`
+    /// (extending the database).
+    ///
+    /// The alternative is the conservative allocation behavior; when enabled,
+    /// the algorithm always allocates the minimum amount of memory required to
+    /// store the given number of records. Set this flag to `true` if you prefer
+    /// to use as little GPU memory for the database as possible.
+    bool conservative_memory_allocation = false;
+};
+
+/// IVF-PQ search parameters, referenced by
+/// GpuIndexCagraConfig::ivf_pq_search_params.
+/// NOTE(review): presumably used while building the knn graph -- confirm.
+struct IVFPQSearchCagraConfig {
+    /// The number of clusters to search.
+    uint32_t n_probes = 20;
+
+    /// Data type of look up table to be created dynamically at search time.
+    ///
+    /// Possible values: [CUDA_R_32F, CUDA_R_16F, CUDA_R_8U]
+    ///
+    /// The use of low-precision types reduces the amount of shared memory
+    /// required at search time, so fast shared memory kernels can be used even
+    /// for datasets with large dimensionality. Note that the recall is slightly
+    /// degraded when low-precision type is selected.
+    cudaDataType_t lut_dtype = CUDA_R_32F;
+
+    /// Storage data type for distance/similarity computed at search time.
+    ///
+    /// Possible values: [CUDA_R_16F, CUDA_R_32F]
+    ///
+    /// If the performance limiter at search time is device memory access,
+    /// selecting FP16 will improve performance slightly.
+    cudaDataType_t internal_distance_dtype = CUDA_R_32F;
+
+    /// Preferred fraction of SM's unified memory / L1 cache to be used as
+    /// shared memory.
+    ///
+    /// Possible values: [0.0 - 1.0] as a fraction of the
+    /// `sharedMemPerMultiprocessor`.
+    ///
+    /// One wants to increase the carveout to make sure a good GPU occupancy for
+    /// the main search kernel, but not to keep it too high to leave some memory
+    /// to be used as L1 cache. Note, this value is interpreted only as a hint.
+    /// Moreover, a GPU usually allows only a fixed set of cache configurations,
+    /// so the provided value is rounded up to the nearest configuration. Refer
+    /// to the NVIDIA tuning guide for the target GPU architecture.
+    ///
+    /// Note, this is a low-level tuning parameter that can have drastic
+    /// negative effects on the search performance if tweaked incorrectly.
+    double preferred_shmem_carveout = 1.0;
+};
+
+struct GpuIndexCagraConfig : public GpuIndexConfig {
+    /// Degree of input graph for pruning.
+    size_t intermediate_graph_degree = 128;
+    /// Degree of output graph.
+    size_t graph_degree = 64;
+    /// ANN algorithm to build knn graph.
+    graph_build_algo build_algo = graph_build_algo::IVF_PQ;
+    /// Number of iterations to run if building with NN_DESCENT
+    size_t nn_descent_niter = 20;
+    /// IVF-PQ build/search parameters; default nullptr (owner: TODO confirm)
+    IVFPQBuildCagraConfig* ivf_pq_params = nullptr;
+    IVFPQSearchCagraConfig* ivf_pq_search_params = nullptr;
+};
+
+enum class search_algo {
+    /// For large batch sizes.
+    SINGLE_CTA,
+    /// For small batch sizes.
+    MULTI_CTA,
+    MULTI_KERNEL,
+    AUTO ///< Default value of SearchParametersCagra::algo.
+};
+
+enum class hash_mode { HASH, SMALL, AUTO };
+
+struct SearchParametersCagra : SearchParameters {
+    /// Maximum number of queries to search at the same time (batch size). Auto
+    /// select when 0.
+    size_t max_queries = 0;
+
+    /// Number of intermediate search results retained during the search.
+    /// This is the main knob to adjust the trade-off between accuracy and
+    /// search speed. Higher values improve the search accuracy.
+    size_t itopk_size = 64;
+
+    /// Upper limit of search iterations. Auto select when 0.
+    size_t max_iterations = 0;
+
+    // In the following we list additional search parameters for fine tuning.
+    // Reasonable default values are automatically chosen.
+
+    /// Which search implementation to use.
+    search_algo algo = search_algo::AUTO;
+
+    /// Number of threads used to calculate a single distance. 4, 8, 16, or 32.
+    size_t team_size = 0;
+
+    /// Number of graph nodes selected as search starting points per iteration.
+    size_t search_width = 1;
+
+    /// Lower limit of search iterations.
+    size_t min_iterations = 0;
+
+    /// Thread block size. 0, 64, 128, 256, 512, 1024. Auto selection when 0.
+    size_t thread_block_size = 0;
+
+    /// Hashmap type. Auto selection when AUTO.
+    hash_mode hashmap_mode = hash_mode::AUTO;
+
+    /// Lower limit of hashmap bit length. More than 8.
+    size_t hashmap_min_bitlen = 0;
+
+    /// Upper limit of hashmap fill rate. More than 0.1, less than 0.9.
+    float hashmap_max_fill_rate = 0.5;
+
+    /// Number of iterations of initial random seed node selection. 1 or more.
+    uint32_t num_random_samplings = 1;
+
+    /// Bit mask used for initial random seed node selection.
+    uint64_t seed = 0x128394;
+};
+
+struct GpuIndexCagra : public GpuIndex {
+   public:
+    GpuIndexCagra(
+            GpuResourcesProvider* provider,
+            int dims,
+            faiss::MetricType metric = faiss::METRIC_L2,
+            GpuIndexCagraConfig config = GpuIndexCagraConfig());
+
+    /// Trains CAGRA based on the given vector data
+    void train(idx_t n, const float* x) override;
+
+    /// Initialize ourselves from the given CPU index, overwriting all our data
+    void copyFrom(const faiss::IndexHNSWCagra* index);
+
+    /// Copy ourselves to the given CPU index, overwriting all data in it
+    void copyTo(faiss::IndexHNSWCagra* index) const;
+
+    /// Resets the underlying index; clears ntotal and the is_trained flag
+    void reset() override;
+
+    /// Returns the knn graph of the trained index (asserts if not trained)
+    std::vector<idx_t> get_knngraph() const;
+
+   protected:
+    bool addImplRequiresIDs_() const override;
+
+    void addImpl_(idx_t n, const float* x, const idx_t* ids) override;
+
+    /// Called from GpuIndex for search
+    void searchImpl_(
+            idx_t n,
+            const float* x,
+            int k,
+            float* distances,
+            idx_t* labels,
+            const SearchParameters* search_params) const override;
+
+    /// Our configuration options
+    const GpuIndexCagraConfig cagraConfig_;
+
+    /// Instance that we own; the RAFT-backed CAGRA index implementation
+    std::shared_ptr<RaftCagra> index_;
+};
+
+} // namespace gpu
+} // namespace faiss
diff --git a/faiss/gpu/GpuIndexFlat.cu b/faiss/gpu/GpuIndexFlat.cu
index ef5757fbbd..d361a7182a 100644
--- a/faiss/gpu/GpuIndexFlat.cu
+++ b/faiss/gpu/GpuIndexFlat.cu
@@ -6,18 +6,22 @@
  */
 
 #include <faiss/IndexFlat.h>
+#include <faiss/gpu/GpuIndex.h>
 #include <faiss/gpu/GpuIndexFlat.h>
 #include <faiss/gpu/GpuResources.h>
 #include <faiss/gpu/impl/IndexUtils.h>
 #include <faiss/gpu/utils/DeviceUtils.h>
 #include <faiss/gpu/utils/StaticUtils.h>
 #include <faiss/gpu/impl/FlatIndex.cuh>
-#include <faiss/gpu/impl/RaftFlatIndex.cuh>
 #include <faiss/gpu/utils/ConversionOperators.cuh>
 #include <faiss/gpu/utils/CopyUtils.cuh>
 #include <faiss/gpu/utils/Float16.cuh>
 #include <limits>
 
+#if defined USE_NVIDIA_RAFT
+#include <faiss/gpu/impl/RaftFlatIndex.cuh>
+#endif
+
 namespace faiss {
 namespace gpu {
 
@@ -91,7 +95,7 @@ GpuIndexFlat::~GpuIndexFlat() {}
 void GpuIndexFlat::resetIndex_(int dims) {
 #if defined USE_NVIDIA_RAFT
 
-    if (flatConfig_.use_raft) {
+    if (should_use_raft(config_)) {
         data_.reset(new RaftFlatIndex(
                 resources_.get(),
                 dims,
@@ -99,7 +103,7 @@ void GpuIndexFlat::resetIndex_(int dims) {
                 config_.memorySpace));
     } else
 #else
-    if (flatConfig_.use_raft) {
+    if (should_use_raft(config_)) {
         FAISS_THROW_MSG(
                 "RAFT has not been compiled into the current version so it cannot be used.");
     } else
diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu
index c83008307d..40129a54c5 100644
--- a/faiss/gpu/GpuIndexIVF.cu
+++ b/faiss/gpu/GpuIndexIVF.cu
@@ -7,6 +7,7 @@
 
 #include <faiss/IndexFlat.h>
 #include <faiss/IndexIVF.h>
+#include <faiss/clone_index.h>
 #include <faiss/gpu/GpuCloner.h>
 #include <faiss/gpu/GpuIndexFlat.h>
 #include <faiss/gpu/GpuIndexIVF.h>
@@ -16,11 +17,6 @@
 #include <faiss/gpu/impl/IVFBase.cuh>
 #include <faiss/gpu/utils/CopyUtils.cuh>
 
-#if defined USE_NVIDIA_RAFT
-#include <raft/core/handle.hpp>
-#include <raft/neighbors/ivf_flat.cuh>
-#endif
-
 namespace faiss {
 namespace gpu {
 
@@ -79,9 +75,9 @@ void GpuIndexIVF::init_() {
     }
 
     // here we set a low # iterations because this is typically used
-    // for large clusterings
-    // (copying IndexIVF.cpp's Level1Quantizer
+    // for large clusterings (copying IndexIVF.cpp's Level1Quantizer)
     cp.niter = 10;
+
     cp.verbose = verbose;
 
     if (quantizer) {
@@ -96,6 +92,7 @@ void GpuIndexIVF::init_() {
         GpuIndexFlatConfig config = ivfConfig_.flatConfig;
         // inherit our same device
         config.device = config_.device;
+        config.use_raft = config_.use_raft;
 
         if (metric_type == faiss::METRIC_L2) {
             quantizer = new GpuIndexFlatL2(resources_, d, config);
@@ -176,10 +173,29 @@ void GpuIndexIVF::copyFrom(const faiss::IndexIVF* index) {
         // over to the GPU, on the same device that we are on.
         GpuResourcesProviderFromInstance pfi(getResources());
 
-        GpuClonerOptions options;
-        auto cloner = ToGpuCloner(&pfi, getDevice(), options);
-
-        quantizer = cloner.clone_Index(index->quantizer);
+        // Attempt to clone the index to GPU. If it fails because the coarse
+        // quantizer is not implemented on GPU and the flag to allow CPU
+        // fallback is set, retry it with CPU cloner and re-throw errors.
+        try {
+            GpuClonerOptions options;
+            auto cloner = ToGpuCloner(&pfi, getDevice(), options);
+            quantizer = cloner.clone_Index(index->quantizer);
+        } catch (const std::exception& e) {
+            if (strstr(e.what(), "not implemented on GPU")) {
+                if (ivfConfig_.allowCpuCoarseQuantizer) {
+                    Cloner cpuCloner;
+                    quantizer = cpuCloner.clone_Index(index->quantizer);
+                } else {
+                    FAISS_THROW_MSG(
+                            "This index type is not implemented on "
+                            "GPU and allowCpuCoarseQuantizer is set to false. "
+                            "Please set the flag to true to allow the CPU "
+                            "fallback in cloning.");
+                }
+            } else {
+                throw;
+            }
+        }
         own_fields = true;
     } else {
         // Otherwise, this is a GPU coarse quantizer index instance found in a
@@ -451,43 +467,12 @@ void GpuIndexIVF::trainQuantizer_(idx_t n, const float* x) {
 
     quantizer->reset();
 
-#if defined USE_NVIDIA_RAFT
-
-    if (config_.use_raft) {
-        const raft::device_resources& raft_handle =
-                resources_->getRaftHandleCurrentDevice();
+    // leverage the CPU-side k-means code, which works for the GPU
+    // flat index as well
+    Clustering clus(this->d, nlist, this->cp);
+    clus.verbose = verbose;
+    clus.train(n, x, *quantizer);
 
-        raft::neighbors::ivf_flat::index_params raft_idx_params;
-        raft_idx_params.n_lists = nlist;
-        raft_idx_params.metric = metric_type == faiss::METRIC_L2
-                ? raft::distance::DistanceType::L2Expanded
-                : raft::distance::DistanceType::InnerProduct;
-        raft_idx_params.add_data_on_build = false;
-        raft_idx_params.kmeans_trainset_fraction = 1.0;
-        raft_idx_params.kmeans_n_iters = cp.niter;
-        raft_idx_params.adaptive_centers = !cp.frozen_centroids;
-
-        auto raft_index = raft::neighbors::ivf_flat::build(
-                raft_handle, raft_idx_params, x, n, (idx_t)d);
-
-        raft_handle.sync_stream();
-
-        quantizer->train(nlist, raft_index.centers().data_handle());
-        quantizer->add(nlist, raft_index.centers().data_handle());
-    } else
-#else
-    if (config_.use_raft) {
-        FAISS_THROW_MSG(
-                "RAFT has not been compiled into the current version so it cannot be used.");
-    } else
-#endif
-    {
-        // leverage the CPU-side k-means code, which works for the GPU
-        // flat index as well
-        Clustering clus(this->d, nlist, this->cp);
-        clus.verbose = verbose;
-        clus.train(n, x, *quantizer);
-    }
     quantizer->is_trained = true;
     FAISS_ASSERT(quantizer->ntotal == nlist);
 }
diff --git a/faiss/gpu/GpuIndexIVF.h b/faiss/gpu/GpuIndexIVF.h
index a9f092d35b..65a27aa94e 100644
--- a/faiss/gpu/GpuIndexIVF.h
+++ b/faiss/gpu/GpuIndexIVF.h
@@ -26,6 +26,12 @@ struct GpuIndexIVFConfig : public GpuIndexConfig {
 
     /// Configuration for the coarse quantizer object
     GpuIndexFlatConfig flatConfig;
+
+    /// This flag controls the CPU fallback logic for coarse quantizer
+    /// component of the index. When set to false (default), the cloner will
+    /// throw an exception for indices not implemented on GPU. When set to
+    /// true, it will fall back to a CPU implementation.
+    bool allowCpuCoarseQuantizer = false;
 };
 
 /// Base class of all GPU IVF index types. This (for now) deliberately does not
diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu
index 750096e153..884b5b0fc0 100644
--- a/faiss/gpu/GpuIndexIVFFlat.cu
+++ b/faiss/gpu/GpuIndexIVFFlat.cu
@@ -16,7 +16,9 @@
 #include <faiss/gpu/utils/Float16.cuh>
 
 #if defined USE_NVIDIA_RAFT
+#include <faiss/gpu/utils/RaftUtils.h>
 #include <faiss/gpu/impl/RaftIVFFlat.cuh>
+#include <raft/neighbors/ivf_flat.cuh>
 #endif
 
 #include <limits>
@@ -70,11 +72,14 @@ GpuIndexIVFFlat::GpuIndexIVFFlat(
                   config),
           ivfFlatConfig_(config),
           reserveMemoryVecs_(0) {
+    FAISS_THROW_IF_NOT_MSG(
+            !should_use_raft(config),
+            "GpuIndexIVFFlat: RAFT does not support separate coarseQuantizer");
     // We could have been passed an already trained coarse quantizer. There is
     // no other quantizer that we need to train, so this is sufficient
     if (this->is_trained) {
         FAISS_ASSERT(this->quantizer);
-        set_index_(
+        setIndex_(
                 resources_.get(),
                 this->d,
                 this->nlist,
@@ -92,56 +97,13 @@ GpuIndexIVFFlat::GpuIndexIVFFlat(
 
 GpuIndexIVFFlat::~GpuIndexIVFFlat() {}
 
-void GpuIndexIVFFlat::set_index_(
-        GpuResources* resources,
-        int dim,
-        int nlist,
-        faiss::MetricType metric,
-        float metricArg,
-        bool useResidual,
-        /// Optional ScalarQuantizer
-        faiss::ScalarQuantizer* scalarQ,
-        bool interleavedLayout,
-        IndicesOptions indicesOptions,
-        MemorySpace space) {
-#if defined USE_NVIDIA_RAFT
+void GpuIndexIVFFlat::reserveMemory(size_t numVecs) {
+    DeviceScope scope(config_.device);
 
-    if (config_.use_raft) {
-        index_.reset(new RaftIVFFlat(
-                resources,
-                dim,
-                nlist,
-                metric,
-                metricArg,
-                useResidual,
-                scalarQ,
-                interleavedLayout,
-                indicesOptions,
-                space));
-    } else
-#else
-    if (config_.use_raft) {
+    if (should_use_raft(config_)) {
         FAISS_THROW_MSG(
-                "RAFT has not been compiled into the current version so it cannot be used.");
-    } else
-#endif
-    {
-        index_.reset(new IVFFlat(
-                resources,
-                dim,
-                nlist,
-                metric,
-                metricArg,
-                useResidual,
-                scalarQ,
-                interleavedLayout,
-                indicesOptions,
-                space));
+                "Pre-allocation of IVF lists is not supported with RAFT enabled.");
     }
-}
-
-void GpuIndexIVFFlat::reserveMemory(size_t numVecs) {
-    DeviceScope scope(config_.device);
 
     reserveMemoryVecs_ = numVecs;
     if (index_) {
@@ -157,7 +119,11 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) {
 
     // Clear out our old data
     index_.reset();
-    baseIndex_.reset();
+
+    // skip base class allocations if RAFT is enabled
+    if (!should_use_raft(config_)) {
+        baseIndex_.reset();
+    }
 
     // The other index might not be trained
     if (!index->is_trained) {
@@ -169,7 +135,7 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) {
     FAISS_ASSERT(is_trained);
 
     // Copy our lists as well
-    set_index_(
+    setIndex_(
             resources_.get(),
             d,
             nlist,
@@ -247,23 +213,61 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) {
 
     if (this->is_trained) {
         FAISS_ASSERT(index_);
+        if (should_use_raft(config_)) {
+            // if RAFT is enabled, copy the IVF centroids to the RAFT index in
+            // case it has been reset. This is because reset clears the RAFT
+            // index and its centroids.
+            // TODO: change this once the coarse quantizer is separated from
+            // RAFT index
+            updateQuantizer();
+        };
         return;
     }
 
     FAISS_ASSERT(!index_);
 
+    if (should_use_raft(config_)) {
 #if defined USE_NVIDIA_RAFT
-    if (config_.use_raft) {
-        // No need to copy the data to host
-        trainQuantizer_(n, x);
-    } else
+        setIndex_(
+                resources_.get(),
+                this->d,
+                this->nlist,
+                this->metric_type,
+                this->metric_arg,
+                false,   // no residual
+                nullptr, // no scalar quantizer
+                ivfFlatConfig_.interleavedLayout,
+                ivfFlatConfig_.indicesOptions,
+                config_.memorySpace);
+        const raft::device_resources& raft_handle =
+                resources_->getRaftHandleCurrentDevice();
+
+        raft::neighbors::ivf_flat::index_params raft_idx_params;
+        raft_idx_params.n_lists = nlist;
+        raft_idx_params.metric = metricFaissToRaft(metric_type, false);
+        raft_idx_params.add_data_on_build = false;
+        raft_idx_params.kmeans_trainset_fraction =
+                static_cast<double>(cp.max_points_per_centroid * nlist) /
+                static_cast<double>(n);
+        raft_idx_params.kmeans_n_iters = cp.niter;
+
+        auto raftIndex_ =
+                std::static_pointer_cast<RaftIVFFlat, IVFFlat>(index_);
+
+        raft::neighbors::ivf_flat::index<float, idx_t> raft_ivfflat_index =
+                raft::neighbors::ivf_flat::build<float, idx_t>(
+                        raft_handle, raft_idx_params, x, n, (idx_t)d);
+
+        quantizer->train(nlist, raft_ivfflat_index.centers().data_handle());
+        quantizer->add(nlist, raft_ivfflat_index.centers().data_handle());
+        raft_handle.sync_stream();
+
+        raftIndex_->setRaftIndex(std::move(raft_ivfflat_index));
 #else
-    if (config_.use_raft) {
         FAISS_THROW_MSG(
                 "RAFT has not been compiled into the current version so it cannot be used.");
-    } else
 #endif
-    {
+    } else {
         // FIXME: GPUize more of this
         // First, make sure that the data is resident on the CPU, if it is not
         // on the CPU, as we depend upon parts of the CPU code
@@ -272,29 +276,107 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) {
                 resources_->getDefaultStream(config_.device),
                 {n, this->d});
         trainQuantizer_(n, hostData.data());
+
+        setIndex_(
+                resources_.get(),
+                this->d,
+                this->nlist,
+                this->metric_type,
+                this->metric_arg,
+                false,   // no residual
+                nullptr, // no scalar quantizer
+                ivfFlatConfig_.interleavedLayout,
+                ivfFlatConfig_.indicesOptions,
+                config_.memorySpace);
+        updateQuantizer();
     }
 
     // The quantizer is now trained; construct the IVF index
-    set_index_(
-            resources_.get(),
-            this->d,
-            this->nlist,
-            this->metric_type,
-            this->metric_arg,
-            false,   // no residual
-            nullptr, // no scalar quantizer
-            ivfFlatConfig_.interleavedLayout,
-            ivfFlatConfig_.indicesOptions,
-            config_.memorySpace);
     baseIndex_ = std::static_pointer_cast<IVFBase, IVFFlat>(index_);
-    updateQuantizer();
 
     if (reserveMemoryVecs_) {
-        index_->reserveMemory(reserveMemoryVecs_);
+        if (should_use_raft(config_)) {
+            FAISS_THROW_MSG(
+                    "Pre-allocation of IVF lists is not supported with RAFT enabled.");
+        } else
+            index_->reserveMemory(reserveMemoryVecs_);
     }
 
     this->is_trained = true;
 }
 
+/// (Re)creates index_: a RaftIVFFlat when RAFT is selected, else an IVFFlat.
+void GpuIndexIVFFlat::setIndex_(
+        GpuResources* resources,
+        int dim,
+        int nlist,
+        faiss::MetricType metric,
+        float metricArg,
+        bool useResidual,
+        faiss::ScalarQuantizer* scalarQ, // optional
+        bool interleavedLayout,
+        IndicesOptions indicesOptions,
+        MemorySpace space) {
+    if (!should_use_raft(config_)) {
+        index_.reset(new IVFFlat(
+                resources,
+                dim,
+                nlist,
+                metric,
+                metricArg,
+                useResidual,
+                scalarQ,
+                interleavedLayout,
+                indicesOptions,
+                space));
+        return;
+    }
+#if defined USE_NVIDIA_RAFT
+    FAISS_THROW_IF_NOT_MSG(
+            ivfFlatConfig_.indicesOptions == INDICES_64_BIT,
+            "RAFT only supports INDICES_64_BIT");
+    if (!ivfFlatConfig_.interleavedLayout) {
+        fprintf(stderr,
+                "WARN: interleavedLayout is set to False with RAFT enabled. This will be ignored.\n");
+    }
+    index_.reset(new RaftIVFFlat(
+            resources,
+            dim,
+            nlist,
+            metric,
+            metricArg,
+            useResidual,
+            scalarQ,
+            interleavedLayout,
+            indicesOptions,
+            space));
+#else
+    FAISS_THROW_MSG(
+            "RAFT has not been compiled into the current version so it cannot be used.");
+#endif
+}
+
+/// Bounds-checks [i0, i0 + ni) against ntotal, then forwards to index_.
+void GpuIndexIVFFlat::reconstruct_n(idx_t i0, idx_t ni, float* out) const {
+    FAISS_ASSERT(index_);
+    if (ni == 0) {
+        return; // nothing to do
+    }
+    // idx_t is signed: reject negatives so the size_t casts for %zu are exact
+    FAISS_THROW_IF_NOT_MSG(
+            i0 >= 0 && ni > 0, "start index and count must be non-negative");
+    FAISS_THROW_IF_NOT_FMT(
+            i0 < this->ntotal,
+            "start index (%zu) out of bounds (ntotal %zu)",
+            (size_t)i0,
+            (size_t)this->ntotal);
+    FAISS_THROW_IF_NOT_FMT(
+            i0 + ni - 1 < this->ntotal,
+            "max index requested (%zu) out of bounds (ntotal %zu)",
+            (size_t)(i0 + ni - 1),
+            (size_t)this->ntotal);
+    index_->reconstruct_n(i0, ni, out);
+}
+
 } // namespace gpu
 } // namespace faiss
diff --git a/faiss/gpu/GpuIndexIVFFlat.h b/faiss/gpu/GpuIndexIVFFlat.h
index d7508feef4..1401e2b291 100644
--- a/faiss/gpu/GpuIndexIVFFlat.h
+++ b/faiss/gpu/GpuIndexIVFFlat.h
@@ -87,8 +87,11 @@ class GpuIndexIVFFlat : public GpuIndexIVF {
     /// Trains the coarse quantizer based on the given vector data
     void train(idx_t n, const float* x) override;
 
+    void reconstruct_n(idx_t i0, idx_t n, float* out) const override;
+
    protected:
-    void set_index_(
+    /// Initialize appropriate index
+    void setIndex_(
             GpuResources* resources,
             int dim,
             int nlist,
@@ -101,6 +104,7 @@ class GpuIndexIVFFlat : public GpuIndexIVF {
             IndicesOptions indicesOptions,
             MemorySpace space);
 
+   protected:
     /// Our configuration options
     const GpuIndexIVFFlatConfig ivfFlatConfig_;
 
diff --git a/faiss/gpu/GpuIndexIVFPQ.cu b/faiss/gpu/GpuIndexIVFPQ.cu
index 69c4cf0556..d39f036b89 100644
--- a/faiss/gpu/GpuIndexIVFPQ.cu
+++ b/faiss/gpu/GpuIndexIVFPQ.cu
@@ -15,6 +15,13 @@
 #include <faiss/gpu/impl/IVFPQ.cuh>
 #include <faiss/gpu/utils/CopyUtils.cuh>
 
+#if defined USE_NVIDIA_RAFT
+#include <faiss/gpu/utils/RaftUtils.h>
+#include <faiss/gpu/impl/RaftIVFPQ.cuh>
+#include <raft/neighbors/ivf_pq.cuh>
+#include <raft/neighbors/ivf_pq_helpers.cuh>
+#endif
+
 #include <limits>
 
 namespace faiss {
@@ -87,6 +94,10 @@ GpuIndexIVFPQ::GpuIndexIVFPQ(
     // instance
     this->is_trained = false;
 
+    FAISS_THROW_IF_NOT_MSG(
+            !config.use_raft,
+            "GpuIndexIVFPQ: RAFT does not support separate coarseQuantizer");
+
     verifyPQSettings_();
 }
 
@@ -100,7 +111,11 @@ void GpuIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) {
 
     // Clear out our old data
     index_.reset();
-    baseIndex_.reset();
+
+    // skip base class allocations if RAFT is enabled
+    if (!should_use_raft(config_)) {
+        baseIndex_.reset();
+    }
 
     pq = index->pq;
     subQuantizers_ = index->pq.M;
@@ -127,7 +142,7 @@ void GpuIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) {
     // Copy our lists as well
     // The product quantizer must have data in it
     FAISS_ASSERT(index->pq.centroids.size() > 0);
-    index_.reset(new IVFPQ(
+    setIndex_(
             resources_.get(),
             this->d,
             this->nlist,
@@ -140,7 +155,7 @@ void GpuIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) {
             ivfpqConfig_.interleavedLayout,
             (float*)index->pq.centroids.data(),
             ivfpqConfig_.indicesOptions,
-            config_.memorySpace));
+            config_.memorySpace);
     baseIndex_ = std::static_pointer_cast<IVFBase, IVFPQ>(index_);
 
     // Doesn't make sense to reserve memory here
@@ -169,7 +184,7 @@ void GpuIndexIVFPQ::copyTo(faiss::IndexIVFPQ* index) const {
     //
     index->by_residual = true;
     index->use_precomputed_table = 0;
-    index->code_size = subQuantizers_;
+    index->code_size = utils::divUp(subQuantizers_ * bitsPerCode_, 8);
     index->pq = faiss::ProductQuantizer(this->d, subQuantizers_, bitsPerCode_);
 
     index->do_polysemous_training = false;
@@ -308,6 +323,7 @@ void GpuIndexIVFPQ::trainResidualQuantizer_(idx_t n, const float* x) {
         try {
             GpuIndexFlatConfig config;
             config.device = ivfpqConfig_.device;
+            config.use_raft = false;
             GpuIndexFlatL2 pqIndex(resources_, pq.dsub, config);
 
             pq.assign_index = &pqIndex;
@@ -322,29 +338,6 @@ void GpuIndexIVFPQ::trainResidualQuantizer_(idx_t n, const float* x) {
         // use the currently assigned clustering index
         pq.train(n, residuals.data());
     }
-
-    index_.reset(new IVFPQ(
-            resources_.get(),
-            this->d,
-            this->nlist,
-            metric_type,
-            metric_arg,
-            subQuantizers_,
-            bitsPerCode_,
-            ivfpqConfig_.useFloat16LookupTables,
-            ivfpqConfig_.useMMCodeDistance,
-            ivfpqConfig_.interleavedLayout,
-            pq.centroids.data(),
-            ivfpqConfig_.indicesOptions,
-            config_.memorySpace));
-    baseIndex_ = std::static_pointer_cast<IVFBase, IVFPQ>(index_);
-    updateQuantizer();
-
-    if (reserveMemoryVecs_) {
-        index_->reserveMemory(reserveMemoryVecs_);
-    }
-
-    index_->setPrecomputedCodes(quantizer, usePrecomputedTables_);
 }
 
 void GpuIndexIVFPQ::train(idx_t n, const float* x) {
@@ -356,27 +349,179 @@ void GpuIndexIVFPQ::train(idx_t n, const float* x) {
 
     if (this->is_trained) {
         FAISS_ASSERT(index_);
+        if (should_use_raft(config_)) {
+            // if RAFT is enabled, copy the IVF centroids to the RAFT index in
+            // case it has been reset. This is because reset clears the RAFT
+            // index and its centroids.
+            // TODO: change this once the coarse quantizer is separated from
+            // RAFT index
+            updateQuantizer();
+        }
         return;
     }
 
     FAISS_ASSERT(!index_);
 
-    // FIXME: GPUize more of this
-    // First, make sure that the data is resident on the CPU, if it is not on
-    // the CPU, as we depend upon parts of the CPU code
-    auto hostData = toHost<float, 2>(
-            (float*)x,
-            resources_->getDefaultStream(config_.device),
-            {n, this->d});
+    // RAFT does not support using an external index for assignment, so with
+    // RAFT enabled pq.assign_index is ignored (with a warning) during training
+    if (should_use_raft(config_)) {
+#if defined USE_NVIDIA_RAFT
+        if (pq.assign_index) {
+            fprintf(stderr,
+                    "WARN: The Product Quantizer's assign_index will be ignored with RAFT enabled.\n");
+        }
+        // first initialize the index. The PQ centroids will be updated
+        // retroactively.
+        setIndex_(
+                resources_.get(),
+                this->d,
+                this->nlist,
+                metric_type,
+                metric_arg,
+                subQuantizers_,
+                bitsPerCode_,
+                ivfpqConfig_.useFloat16LookupTables,
+                ivfpqConfig_.useMMCodeDistance,
+                ivfpqConfig_.interleavedLayout,
+                pq.centroids.data(),
+                ivfpqConfig_.indicesOptions,
+                config_.memorySpace);
+        // No need to copy the data to host
+        const raft::device_resources& raft_handle =
+                resources_->getRaftHandleCurrentDevice();
+
+        raft::neighbors::ivf_pq::index_params raft_idx_params;
+        raft_idx_params.n_lists = nlist;
+        raft_idx_params.metric = metricFaissToRaft(metric_type, false);
+        raft_idx_params.kmeans_trainset_fraction =
+                static_cast<double>(cp.max_points_per_centroid * nlist) /
+                static_cast<double>(n);
+        raft_idx_params.kmeans_n_iters = cp.niter;
+        raft_idx_params.pq_bits = bitsPerCode_;
+        raft_idx_params.pq_dim = subQuantizers_;
+        raft_idx_params.conservative_memory_allocation = false;
+        raft_idx_params.add_data_on_build = false;
+
+        auto raftIndex_ = std::static_pointer_cast<RaftIVFPQ, IVFPQ>(index_);
+
+        raft::neighbors::ivf_pq::index<idx_t> raft_ivfpq_index =
+                raft::neighbors::ivf_pq::build<float, idx_t>(
+                        raft_handle, raft_idx_params, x, n, (idx_t)d);
+
+        auto raft_centers = raft::make_device_matrix<float>(
+                raft_handle,
+                raft_ivfpq_index.n_lists(),
+                raft_ivfpq_index.dim());
+        raft::neighbors::ivf_pq::helpers::extract_centers(
+                raft_handle, raft_ivfpq_index, raft_centers.view());
+
+        quantizer->train(nlist, raft_centers.data_handle());
+        quantizer->add(nlist, raft_centers.data_handle());
+
+        raft::copy(
+                pq.get_centroids(0, 0),
+                raft_ivfpq_index.pq_centers().data_handle(),
+                raft_ivfpq_index.pq_centers().size(),
+                raft_handle.get_stream());
+        raft_handle.sync_stream();
+        raftIndex_->setRaftIndex(std::move(raft_ivfpq_index));
+#else
+        FAISS_THROW_MSG(
+                "RAFT has not been compiled into the current version so it cannot be used.");
+#endif
+    } else {
+        // FIXME: GPUize more of this
+        // First, make sure that the data is resident on the CPU, if it is not
+        // on the CPU, as we depend upon parts of the CPU code
+        auto hostData = toHost<float, 2>(
+                (float*)x,
+                resources_->getDefaultStream(config_.device),
+                {n, this->d});
+
+        trainQuantizer_(n, hostData.data());
+        trainResidualQuantizer_(n, hostData.data());
+
+        setIndex_(
+                resources_.get(),
+                this->d,
+                this->nlist,
+                metric_type,
+                metric_arg,
+                subQuantizers_,
+                bitsPerCode_,
+                ivfpqConfig_.useFloat16LookupTables,
+                ivfpqConfig_.useMMCodeDistance,
+                ivfpqConfig_.interleavedLayout,
+                pq.centroids.data(),
+                ivfpqConfig_.indicesOptions,
+                config_.memorySpace);
+        updateQuantizer();
+    }
+    baseIndex_ = std::static_pointer_cast<IVFBase, IVFPQ>(index_);
 
-    trainQuantizer_(n, hostData.data());
-    trainResidualQuantizer_(n, hostData.data());
+    if (reserveMemoryVecs_) {
+        index_->reserveMemory(reserveMemoryVecs_);
+    }
+
+    index_->setPrecomputedCodes(quantizer, usePrecomputedTables_);
 
     FAISS_ASSERT(index_);
 
     this->is_trained = true;
 }
 
+void GpuIndexIVFPQ::setIndex_(
+        GpuResources* resources,
+        int dim,
+        idx_t nlist,
+        faiss::MetricType metric,
+        float metricArg,
+        int numSubQuantizers,
+        int bitsPerSubQuantizer,
+        bool useFloat16LookupTables,
+        bool useMMCodeDistance,
+        bool interleavedLayout,
+        float* pqCentroidData,
+        IndicesOptions indicesOptions,
+        MemorySpace space) {
+    if (!should_use_raft(config_)) { // classic (non-RAFT) implementation
+        index_.reset(new IVFPQ(
+                resources,
+                dim,
+                nlist,
+                metric,
+                metricArg,
+                numSubQuantizers,
+                bitsPerSubQuantizer,
+                useFloat16LookupTables,
+                useMMCodeDistance,
+                interleavedLayout,
+                pqCentroidData,
+                indicesOptions,
+                space));
+        return;
+    }
+#if defined USE_NVIDIA_RAFT
+    index_.reset(new RaftIVFPQ(
+            resources,
+            dim,
+            nlist,
+            metric,
+            metricArg,
+            numSubQuantizers,
+            bitsPerSubQuantizer,
+            useFloat16LookupTables,
+            useMMCodeDistance,
+            interleavedLayout,
+            pqCentroidData,
+            indicesOptions,
+            space));
+#else // RAFT requested by the config but not compiled in
+    FAISS_THROW_MSG(
+            "RAFT has not been compiled into the current version so it cannot be used.");
+#endif
+}
+
 void GpuIndexIVFPQ::verifyPQSettings_() const {
     // Our implementation has these restrictions:
 
@@ -384,28 +529,36 @@ void GpuIndexIVFPQ::verifyPQSettings_() const {
     FAISS_THROW_IF_NOT_MSG(nlist > 0, "nlist must be >0");
 
     // up to a single byte per code
-    if (ivfpqConfig_.interleavedLayout) {
+    if (should_use_raft(config_)) {
+        if (!ivfpqConfig_.interleavedLayout) {
+            fprintf(stderr,
+                    "WARN: interleavedLayout is set to False with RAFT enabled. This will be ignored.\n");
+        }
         FAISS_THROW_IF_NOT_FMT(
-                bitsPerCode_ == 4 || bitsPerCode_ == 5 || bitsPerCode_ == 6 ||
-                        bitsPerCode_ == 8,
-                "Bits per code must be between 4, 5, 6 or 8 (passed %d)",
+                bitsPerCode_ >= 4 && bitsPerCode_ <= 8,
+                "Bits per code must be within closed range [4,8] (passed %d)",
                 bitsPerCode_);
-
-    } else {
         FAISS_THROW_IF_NOT_FMT(
-                bitsPerCode_ == 8,
-                "Bits per code must be 8 (passed %d)",
-                bitsPerCode_);
+                (bitsPerCode_ * subQuantizers_) % 8 == 0,
+                "Bits per code * number of sub-quantizers must be a multiple of 8 (passed %d * %d = %d).",
+                bitsPerCode_,
+                subQuantizers_,
+                bitsPerCode_ * subQuantizers_);
+    } else {
+        if (ivfpqConfig_.interleavedLayout) {
+            FAISS_THROW_IF_NOT_FMT(
+                    bitsPerCode_ == 4 || bitsPerCode_ == 5 ||
+                            bitsPerCode_ == 6 || bitsPerCode_ == 8,
+                    "Bits per code must be between 4, 5, 6 or 8 (passed %d)",
+                    bitsPerCode_);
+        } else {
+            FAISS_THROW_IF_NOT_FMT(
+                    bitsPerCode_ == 8,
+                    "Bits per code must be 8 (passed %d)",
+                    bitsPerCode_);
+        }
     }
 
-    // Sub-quantizers must evenly divide dimensions available
-    FAISS_THROW_IF_NOT_FMT(
-            this->d % subQuantizers_ == 0,
-            "Number of sub-quantizers (%d) must be an "
-            "even divisor of the number of dimensions (%d)",
-            subQuantizers_,
-            this->d);
-
     // The number of bytes per encoded vector must be one we support
     FAISS_THROW_IF_NOT_FMT(
             ivfpqConfig_.interleavedLayout ||
@@ -414,30 +567,40 @@ void GpuIndexIVFPQ::verifyPQSettings_() const {
             "is not supported",
             subQuantizers_);
 
-    // We must have enough shared memory on the current device to store
-    // our lookup distances
-    int lookupTableSize = sizeof(float);
-    if (ivfpqConfig_.useFloat16LookupTables) {
-        lookupTableSize = sizeof(half);
-    }
+    if (!should_use_raft(config_)) {
+        // Sub-quantizers must evenly divide dimensions available
+        FAISS_THROW_IF_NOT_FMT(
+                this->d % subQuantizers_ == 0,
+                "Number of sub-quantizers (%d) must be an "
+                "even divisor of the number of dimensions (%d)",
+                subQuantizers_,
+                this->d);
+
+        // We must have enough shared memory on the current device to store
+        // our lookup distances
+        int lookupTableSize = sizeof(float);
+        if (ivfpqConfig_.useFloat16LookupTables) {
+            lookupTableSize = sizeof(half);
+        }
 
-    // 64 bytes per code is only supported with usage of float16, at 2^8
-    // codes per subquantizer
-    size_t requiredSmemSize =
-            lookupTableSize * subQuantizers_ * utils::pow2(bitsPerCode_);
-    size_t smemPerBlock = getMaxSharedMemPerBlock(config_.device);
+        // 64 bytes per code is only supported with usage of float16, at 2^8
+        // codes per subquantizer
+        size_t requiredSmemSize =
+                lookupTableSize * subQuantizers_ * utils::pow2(bitsPerCode_);
+        size_t smemPerBlock = getMaxSharedMemPerBlock(config_.device);
 
-    FAISS_THROW_IF_NOT_FMT(
-            requiredSmemSize <= getMaxSharedMemPerBlock(config_.device),
-            "Device %d has %zu bytes of shared memory, while "
-            "%d bits per code and %d sub-quantizers requires %zu "
-            "bytes. Consider useFloat16LookupTables and/or "
-            "reduce parameters",
-            config_.device,
-            smemPerBlock,
-            bitsPerCode_,
-            subQuantizers_,
-            requiredSmemSize);
+        FAISS_THROW_IF_NOT_FMT(
+                requiredSmemSize <= getMaxSharedMemPerBlock(config_.device),
+                "Device %d has %zu bytes of shared memory, while "
+                "%d bits per code and %d sub-quantizers requires %zu "
+                "bytes. Consider useFloat16LookupTables and/or "
+                "reduce parameters",
+                config_.device,
+                smemPerBlock,
+                bitsPerCode_,
+                subQuantizers_,
+                requiredSmemSize);
+    }
 }
 
 } // namespace gpu
diff --git a/faiss/gpu/GpuIndexIVFPQ.h b/faiss/gpu/GpuIndexIVFPQ.h
index 22e9961675..1084d4d0d2 100644
--- a/faiss/gpu/GpuIndexIVFPQ.h
+++ b/faiss/gpu/GpuIndexIVFPQ.h
@@ -33,7 +33,8 @@ struct GpuIndexIVFPQConfig : public GpuIndexIVFConfig {
     bool usePrecomputedTables = false;
 
     /// Use the alternative memory layout for the IVF lists
-    /// WARNING: this is a feature under development, do not use!
+    /// WARNING: this is a feature under development, and is only supported with
+    /// RAFT enabled for the index. Do not use if RAFT is not enabled.
     bool interleavedLayout = false;
 
     /// Use GEMM-backed computation of PQ code distances for the no precomputed
@@ -133,6 +134,22 @@ class GpuIndexIVFPQ : public GpuIndexIVF {
     ProductQuantizer pq;
 
    protected:
+    /// Create index_: RaftIVFPQ when RAFT is in use, classic IVFPQ otherwise
+    void setIndex_(
+            GpuResources* resources,
+            int dim,
+            idx_t nlist,
+            faiss::MetricType metric,
+            float metricArg,
+            int numSubQuantizers,
+            int bitsPerSubQuantizer,
+            bool useFloat16LookupTables,
+            bool useMMCodeDistance,
+            bool interleavedLayout,
+            float* pqCentroidData,
+            IndicesOptions indicesOptions,
+            MemorySpace space);
+
     /// Throws errors if configuration settings are improper
     void verifyPQSettings_() const;
 
diff --git a/faiss/gpu/GpuResources.h b/faiss/gpu/GpuResources.h
index 7d0459955b..fc6dd591b4 100644
--- a/faiss/gpu/GpuResources.h
+++ b/faiss/gpu/GpuResources.h
@@ -32,6 +32,7 @@
 
 #if defined USE_NVIDIA_RAFT
 #include <raft/core/device_resources.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
 #endif
 
 namespace faiss {
@@ -159,6 +160,10 @@ struct AllocRequest : public AllocInfo {
 
     /// The size in bytes of the allocation
     size_t size = 0;
+
+#if defined USE_NVIDIA_RAFT
+    rmm::mr::device_memory_resource* mr = nullptr;
+#endif
 };
 
 /// A RAII object that manages a temporary memory request
diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp
index 754025d049..78336b4994 100644
--- a/faiss/gpu/StandardGpuResources.cpp
+++ b/faiss/gpu/StandardGpuResources.cpp
@@ -22,11 +22,10 @@
 
 #if defined USE_NVIDIA_RAFT
 #include <raft/core/device_resources.hpp>
-#include <rmm/mr/device/cuda_memory_resource.hpp>
 #include <rmm/mr/device/managed_memory_resource.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/mr/host/pinned_memory_resource.hpp>
 #include <memory>
-
 #endif
 
 #include <faiss/gpu/StandardGpuResources.h>
@@ -92,9 +91,8 @@ std::string allocsToString(const std::unordered_map<void*, AllocRequest>& map) {
 StandardGpuResourcesImpl::StandardGpuResourcesImpl()
         :
 #if defined USE_NVIDIA_RAFT
-          cmr(new rmm::mr::cuda_memory_resource),
-          mmr(new rmm::mr::managed_memory_resource),
-          pmr(new rmm::mr::pinned_memory_resource),
+          mmr_(new rmm::mr::managed_memory_resource),
+          pmr_(new rmm::mr::pinned_memory_resource),
 #endif
           pinnedMemAlloc_(nullptr),
           pinnedMemAllocSize_(0),
@@ -161,7 +159,7 @@ StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
 
     if (pinnedMemAlloc_) {
 #if defined USE_NVIDIA_RAFT
-        pmr->deallocate(pinnedMemAlloc_, pinnedMemAllocSize_);
+        pmr_->deallocate(pinnedMemAlloc_, pinnedMemAllocSize_);
 #else
         auto err = cudaFreeHost(pinnedMemAlloc_);
         FAISS_ASSERT_FMT(
@@ -259,6 +257,14 @@ void StandardGpuResourcesImpl::setDefaultStream(
         if (prevStream != stream) {
             streamWait({stream}, {prevStream});
         }
+#if defined USE_NVIDIA_RAFT
+        // delete the raft handle for this device, which will be initialized
+        // with the updated stream during any subsequent calls to getRaftHandle
+        auto it2 = raftHandles_.find(device);
+        if (it2 != raftHandles_.end()) {
+            raftHandles_.erase(it2);
+        }
+#endif
     }
 
     userDefaultStreams_[device] = stream;
@@ -277,6 +283,14 @@ void StandardGpuResourcesImpl::revertDefaultStream(int device) {
 
             streamWait({newStream}, {prevStream});
         }
+#if defined USE_NVIDIA_RAFT
+        // delete the raft handle for this device, which will be initialized
+        // with the updated stream during any subsequent calls to getRaftHandle
+        auto it2 = raftHandles_.find(device);
+        if (it2 != raftHandles_.end()) {
+            raftHandles_.erase(it2);
+        }
+#endif
     }
 
     userDefaultStreams_.erase(device);
@@ -314,7 +328,7 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
         // pinned memory allocation
         if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
             try {
-                pinnedMemAlloc_ = pmr->allocate(pinnedMemSize_);
+                pinnedMemAlloc_ = pmr_->allocate(pinnedMemSize_);
             } catch (const std::bad_alloc& rmm_ex) {
                 FAISS_THROW_MSG("CUDA memory allocation error");
             }
@@ -478,8 +492,6 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
     void* p = nullptr;
 
     if (adjReq.space == MemorySpace::Temporary) {
-        // If we don't have enough space in our temporary memory manager, we
-        // need to allocate this request separately
         auto& tempMem = tempMemory_[adjReq.device];
 
         if (adjReq.size > tempMem->getSizeAvailable()) {
@@ -500,11 +512,14 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
 
         // Otherwise, we can handle this locally
         p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size);
-
     } else if (adjReq.space == MemorySpace::Device) {
 #if defined USE_NVIDIA_RAFT
         try {
-            p = cmr->allocate(adjReq.size, adjReq.stream);
+            rmm::mr::device_memory_resource* current_mr =
+                    rmm::mr::get_per_device_resource(
+                            rmm::cuda_device_id{adjReq.device});
+            p = current_mr->allocate_async(adjReq.size, adjReq.stream);
+            adjReq.mr = current_mr;
         } catch (const std::bad_alloc& rmm_ex) {
             FAISS_THROW_MSG("CUDA memory allocation error");
         }
@@ -514,8 +529,8 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
         // Throw if we fail to allocate
         if (err != cudaSuccess) {
             // FIXME: as of CUDA 11, a memory allocation error appears to be
-            // presented via cudaGetLastError as well, and needs to be cleared.
-            // Just call the function to clear it
+            // presented via cudaGetLastError as well, and needs to be
+            // cleared. Just call the function to clear it
             cudaGetLastError();
 
             std::stringstream ss;
@@ -534,7 +549,12 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
     } else if (adjReq.space == MemorySpace::Unified) {
 #if defined USE_NVIDIA_RAFT
         try {
-            p = mmr->allocate(adjReq.size, adjReq.stream);
+            // for now, use our own managed MR to do Unified Memory allocations.
+            // TODO: change this to use the current device resource once RMM has
+            // a way to retrieve a "guaranteed" managed memory resource for a
+            // device.
+            p = mmr_->allocate_async(adjReq.size, adjReq.stream);
+            adjReq.mr = mmr_.get();
         } catch (const std::bad_alloc& rmm_ex) {
             FAISS_THROW_MSG("CUDA memory allocation error");
         }
@@ -593,16 +613,11 @@ void StandardGpuResourcesImpl::deallocMemory(int device, void* p) {
 
     if (req.space == MemorySpace::Temporary) {
         tempMemory_[device]->deallocMemory(device, req.stream, req.size, p);
-
     } else if (
             req.space == MemorySpace::Device ||
             req.space == MemorySpace::Unified) {
 #if defined USE_NVIDIA_RAFT
-        if (req.space == MemorySpace::Device) {
-            cmr->deallocate(p, req.size, req.stream);
-        } else if (req.space == MemorySpace::Unified) {
-            mmr->deallocate(p, req.size, req.stream);
-        }
+        req.mr->deallocate_async(p, req.size, req.stream);
 #else
         auto err = cudaFree(p);
         FAISS_ASSERT_FMT(
diff --git a/faiss/gpu/StandardGpuResources.h b/faiss/gpu/StandardGpuResources.h
index 9113de573c..661c784aee 100644
--- a/faiss/gpu/StandardGpuResources.h
+++ b/faiss/gpu/StandardGpuResources.h
@@ -24,8 +24,6 @@
 
 #if defined USE_NVIDIA_RAFT
 #include <raft/core/device_resources.hpp>
-#include <rmm/mr/device/cuda_memory_resource.hpp>
-#include <rmm/mr/device/managed_memory_resource.hpp>
 #include <rmm/mr/host/pinned_memory_resource.hpp>
 #endif
 
@@ -37,6 +35,7 @@
 #include <unordered_map>
 #include <vector>
 
+#pragma GCC visibility push(default)
 namespace faiss {
 namespace gpu {
 
@@ -166,14 +165,11 @@ class StandardGpuResourcesImpl : public GpuResources {
      * to create a subclass only for the RMM memory resources.
      */
 
-    // cuda_memory_resource
-    std::unique_ptr<rmm::mr::device_memory_resource> cmr;
-
     // managed_memory_resource
-    std::unique_ptr<rmm::mr::device_memory_resource> mmr;
+    std::unique_ptr<rmm::mr::device_memory_resource> mmr_;
 
     // pinned_memory_resource
-    std::unique_ptr<rmm::mr::host_memory_resource> pmr;
+    std::unique_ptr<rmm::mr::host_memory_resource> pmr_;
 #endif
 
     /// Pinned memory allocation for use with this GPU
@@ -260,3 +256,4 @@ class StandardGpuResources : public GpuResourcesProvider {
 
 } // namespace gpu
 } // namespace faiss
+#pragma GCC visibility pop
diff --git a/faiss/gpu/impl/IVFBase.cu b/faiss/gpu/impl/IVFBase.cu
index 890d489440..3b373b8280 100644
--- a/faiss/gpu/impl/IVFBase.cu
+++ b/faiss/gpu/impl/IVFBase.cu
@@ -340,6 +340,10 @@ void IVFBase::copyInvertedListsTo(InvertedLists* ivf) {
     }
 }
 
+void IVFBase::reconstruct_n(idx_t i0, idx_t n, float* out) {
+    FAISS_THROW_MSG("not implemented"); // base default; IVFFlat overrides it
+}
+
 void IVFBase::addEncodedVectorsToList_(
         idx_t listId,
         const void* codes,
diff --git a/faiss/gpu/impl/IVFBase.cuh b/faiss/gpu/impl/IVFBase.cuh
index 2bb319d002..04af9a906e 100644
--- a/faiss/gpu/impl/IVFBase.cuh
+++ b/faiss/gpu/impl/IVFBase.cuh
@@ -41,7 +41,7 @@ class IVFBase {
     virtual ~IVFBase();
 
     /// Reserve GPU memory in our inverted lists for this number of vectors
-    void reserveMemory(idx_t numVecs);
+    virtual void reserveMemory(idx_t numVecs);
 
     /// Clear out all inverted lists, but retain the coarse quantizer
     /// and the product quantizer info
@@ -52,7 +52,7 @@ class IVFBase {
 
     /// After adding vectors, one can call this to reclaim device memory
     /// to exactly the amount needed. Returns space reclaimed in bytes
-    size_t reclaimMemory();
+    virtual size_t reclaimMemory();
 
     /// Returns the number of inverted lists
     idx_t getNumLists() const;
@@ -109,9 +109,18 @@ class IVFBase {
             Tensor<idx_t, 2, true>& outIndices,
             bool storePairs) = 0;
 
+    /** Reconstruct the n vectors with ids in [i0, i0 + n) from this Inverted
+     *  File (IVF) index.
+     *  @param i0          id of the first vector to reconstruct
+     *  @param n           number of vectors to reconstruct
+     *  @param out         buffer that receives the reconstructed vectors; the
+     *                     vector with id i is written at out + (i - i0) * dim
+     */
+    virtual void reconstruct_n(idx_t i0, idx_t n, float* out);
+
    protected:
-    /// Adds a set of codes and indices to a list, with the representation
-    /// coming from the CPU equivalent
+    /// Adds a set of codes and indices to a list, with the
+    /// representation coming from the CPU equivalent
     virtual void addEncodedVectorsToList_(
             idx_t listId,
             // resident on the host
diff --git a/faiss/gpu/impl/IVFFlat.cu b/faiss/gpu/impl/IVFFlat.cu
index 4607e49870..e0ecfd82cf 100644
--- a/faiss/gpu/impl/IVFFlat.cu
+++ b/faiss/gpu/impl/IVFFlat.cu
@@ -283,6 +283,53 @@ void IVFFlat::searchPreassigned(
             storePairs);
 }
 
+void IVFFlat::reconstruct_n(idx_t i0, idx_t ni, float* out) {
+    if (ni == 0) {
+        // nothing to do
+        return;
+    }
+
+    // The addressing below follows the interleaved list layout:
+    // (v0d0 v1d0 ... v31d0) (v0d1 v1d1 ... v31d1)
+    // (v0d(dim - 1) ... v31d(dim-1))
+    // (v32d0 v33d0 ... v63d0) (... v63d(dim-1)) (v64d0 ...)
+    // i.e. vectors are chunked into groups of 32 and each dimension of a
+    // group is contiguous, so successive dimensions of one vector sit 32
+    // floats apart.
+    // NOTE(review): a non-interleaved list (v0d0 v0d1 ... v0d(dim-1) v1d0 ...)
+    // would need different addressing -- confirm callers never reach here
+    // with that layout.
+    constexpr idx_t kChunk = 32;
+
+    auto stream = resources_->getDefaultStreamCurrentDevice();
+
+    for (idx_t list_no = 0; list_no < numLists_; list_no++) {
+        // signed count, so the comparison with `offset` is not mixed-sign
+        auto list_size = static_cast<idx_t>(deviceListData_[list_no]->numVecs);
+        auto idlist = getListIndices(list_no);
+
+        // start of this list's storage; the same for every vector in it
+        auto listBase = (float*)deviceListData_[list_no]->data.data();
+
+        for (idx_t offset = 0; offset < list_size; offset++) {
+            idx_t id = idlist[offset];
+            if (id < i0 || id >= i0 + ni) {
+                continue;
+            }
+
+            // component 0 of the vector stored at position `offset`
+            auto src = listBase + (offset / kChunk) * kChunk * dim_ +
+                    (offset % kChunk);
+            auto dst = out + (id - i0) * dim_;
+
+            // copy one component at a time out of the strided device layout
+            for (int d = 0; d < dim_; ++d) {
+                fromDevice<float>(src + kChunk * d, dst + d, 1, stream);
+            }
+        }
+    }
+}
+
 void IVFFlat::searchImpl_(
         Tensor<float, 2, true>& queries,
         Tensor<float, 2, true>& coarseDistances,
diff --git a/faiss/gpu/impl/IVFFlat.cuh b/faiss/gpu/impl/IVFFlat.cuh
index 246fc18b16..889b510795 100644
--- a/faiss/gpu/impl/IVFFlat.cuh
+++ b/faiss/gpu/impl/IVFFlat.cuh
@@ -51,6 +51,8 @@ class IVFFlat : public IVFBase {
             Tensor<idx_t, 2, true>& outIndices,
             bool storePairs) override;
 
+    void reconstruct_n(idx_t i0, idx_t n, float* out) override;
+
    protected:
     /// Returns the number of bytes in which an IVF list containing numVecs
     /// vectors is encoded on the device. Note that due to padding this is not
diff --git a/faiss/gpu/impl/IVFPQ.cuh b/faiss/gpu/impl/IVFPQ.cuh
index 3670e58edf..0d17b02c9b 100644
--- a/faiss/gpu/impl/IVFPQ.cuh
+++ b/faiss/gpu/impl/IVFPQ.cuh
@@ -39,7 +39,7 @@ class IVFPQ : public IVFBase {
 
     /// Enable or disable pre-computed codes. The quantizer is needed to gather
     /// the IVF centroids for use
-    void setPrecomputedCodes(Index* coarseQuantizer, bool enable);
+    virtual void setPrecomputedCodes(Index* coarseQuantizer, bool enable);
 
     /// Returns our set of sub-quantizers of the form
     /// (sub q)(code id)(sub dim)
@@ -134,7 +134,7 @@ class IVFPQ : public IVFBase {
             Tensor<float, 2, true>& outDistances,
             Tensor<idx_t, 2, true>& outIndices);
 
-   private:
+   protected:
     /// Number of sub-quantizers per vector
     const int numSubQuantizers_;
 
diff --git a/faiss/gpu/impl/RaftCagra.cu b/faiss/gpu/impl/RaftCagra.cu
new file mode 100644
index 0000000000..292079321d
--- /dev/null
+++ b/faiss/gpu/impl/RaftCagra.cu
@@ -0,0 +1,371 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <faiss/gpu/utils/DeviceUtils.h>
+#include <cstddef>
+#include <cstdint>
+#include <faiss/gpu/impl/RaftCagra.cuh>
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/core/resource/thrust_policy.hpp>
+#include <raft_runtime/neighbors/cagra.hpp>
+#include <optional>
+#include <raft/neighbors/cagra.cuh>
+
+namespace faiss {
+namespace gpu {
+
+RaftCagra::RaftCagra(
+        GpuResources* resources,
+        int dim,
+        idx_t intermediate_graph_degree,
+        idx_t graph_degree,
+        faiss::cagra_build_algo graph_build_algo,
+        size_t nn_descent_niter,
+        faiss::MetricType metric,
+        float metricArg,
+        IndicesOptions indicesOptions,
+        std::optional<raft::neighbors::ivf_pq::index_params> ivf_pq_params,
+        std::optional<raft::neighbors::ivf_pq::search_params>
+                ivf_pq_search_params)
+        : resources_(resources),
+          dim_(dim),
+          metric_(metric),
+          metricArg_(metricArg),
+          index_params_(),
+          ivf_pq_params_(ivf_pq_params),
+          ivf_pq_search_params_(ivf_pq_search_params) {
+    FAISS_THROW_IF_NOT_MSG(
+            metric == faiss::METRIC_L2 || metric == faiss::METRIC_INNER_PRODUCT,
+            "CAGRA currently only supports L2 or Inner Product metric.");
+    FAISS_THROW_IF_NOT_MSG(
+            indicesOptions == faiss::gpu::INDICES_64_BIT,
+            "only INDICES_64_BIT is supported for RAFT CAGRA index");
+
+    // The same RAFT metric is stored in both the CAGRA and IVF-PQ parameters.
+    const auto raftMetric = (metric_ == faiss::METRIC_L2)
+            ? raft::distance::DistanceType::L2Expanded
+            : raft::distance::DistanceType::InnerProduct;
+
+    index_params_.intermediate_graph_degree = intermediate_graph_degree;
+    index_params_.graph_degree = graph_degree;
+    index_params_.build_algo =
+            static_cast<raft::neighbors::cagra::graph_build_algo>(
+                    graph_build_algo);
+    index_params_.nn_descent_niter = nn_descent_niter;
+    index_params_.metric = raftMetric;
+
+    // Fall back to default-constructed IVF-PQ parameters when none were given.
+    if (!ivf_pq_params_.has_value()) {
+        ivf_pq_params_.emplace();
+    }
+    if (!ivf_pq_search_params_.has_value()) {
+        ivf_pq_search_params_.emplace();
+    }
+    ivf_pq_params_->metric = raftMetric;
+
+    reset();
+}
+
+RaftCagra::RaftCagra(
+        GpuResources* resources,
+        int dim,
+        idx_t n,
+        int graph_degree,
+        const float* distances,
+        const idx_t* knn_graph,
+        faiss::MetricType metric,
+        float metricArg,
+        IndicesOptions indicesOptions)
+        : resources_(resources),
+          dim_(dim),
+          metric_(metric),
+          metricArg_(metricArg) {
+    // Builds the index from a precomputed n x graph_degree neighbor graph.
+    // `distances` is viewed as n x dim and passed where train() passes data.
+    FAISS_THROW_IF_NOT_MSG(
+            metric == faiss::METRIC_L2 || metric == faiss::METRIC_INNER_PRODUCT,
+            "CAGRA currently only supports L2 or Inner Product metric.");
+    FAISS_THROW_IF_NOT_MSG(
+            indicesOptions == faiss::gpu::INDICES_64_BIT,
+            "only INDICES_64_BIT is supported for RAFT CAGRA index");
+
+    auto distances_on_gpu = getDeviceForAddress(distances) >= 0;
+    auto knn_graph_on_gpu = getDeviceForAddress(knn_graph) >= 0;
+
+    // Mixed host/device inputs are a caller error: report them with an
+    // exception instead of aborting the process via FAISS_ASSERT.
+    FAISS_THROW_IF_NOT_MSG(
+            distances_on_gpu == knn_graph_on_gpu,
+            "distances and knn_graph must both be in device or host memory");
+
+    const raft::device_resources& raft_handle =
+            resources_->getRaftHandleCurrentDevice();
+    const auto raft_metric = metric_ == faiss::METRIC_L2
+            ? raft::distance::DistanceType::L2Expanded
+            : raft::distance::DistanceType::InnerProduct;
+
+    // The idx_t graph is narrowed into a uint32_t host matrix so that
+    // raft::neighbors::cagra::index creates an owning copy of it on device.
+    auto knn_graph_copy =
+            raft::make_host_matrix<uint32_t, int64_t>(n, graph_degree);
+
+    if (knn_graph_on_gpu) {
+        raft_handle.sync_stream();
+        thrust::copy(
+                thrust::device_ptr<const idx_t>(knn_graph),
+                thrust::device_ptr<const idx_t>(knn_graph + (n * graph_degree)),
+                knn_graph_copy.data_handle());
+
+        auto distances_mds =
+                raft::make_device_matrix_view<const float, int64_t>(
+                        distances, n, dim);
+
+        raft_knn_index = raft::neighbors::cagra::index<float, uint32_t>(
+                raft_handle,
+                raft_metric,
+                distances_mds,
+                raft::make_const_mdspan(knn_graph_copy.view()));
+    } else {
+        std::copy(
+                knn_graph,
+                knn_graph + (n * graph_degree),
+                knn_graph_copy.data_handle());
+
+        auto distances_mds = raft::make_host_matrix_view<const float, int64_t>(
+                distances, n, dim);
+
+        raft_knn_index = raft::neighbors::cagra::index<float, uint32_t>(
+                raft_handle,
+                raft_metric,
+                distances_mds,
+                raft::make_const_mdspan(knn_graph_copy.view()));
+    }
+}
+
+/// Builds the CAGRA index over the n x dim_ matrix of vectors at x, which may
+/// be host or device memory. With the IVF_PQ build algorithm the kNN graph is
+/// built and reduced explicitly below; every other algorithm is delegated to
+/// raft::runtime::neighbors::cagra::build().
+void RaftCagra::train(idx_t n, const float* x) {
+    const raft::device_resources& raft_handle =
+            resources_->getRaftHandleCurrentDevice();
+    // Probe the pointer once; every step below branches on the result.
+    const bool x_on_gpu = getDeviceForAddress(x) >= 0;
+
+    if (index_params_.build_algo !=
+        raft::neighbors::cagra::graph_build_algo::IVF_PQ) {
+        if (x_on_gpu) {
+            raft_knn_index = raft::runtime::neighbors::cagra::build(
+                    raft_handle,
+                    index_params_,
+                    raft::make_device_matrix_view<const float, int64_t>(
+                            x, n, dim_));
+        } else {
+            raft_knn_index = raft::runtime::neighbors::cagra::build(
+                    raft_handle,
+                    index_params_,
+                    raft::make_host_matrix_view<const float, int64_t>(
+                            x, n, dim_));
+        }
+        return;
+    }
+
+    const auto raft_metric = metric_ == faiss::METRIC_L2
+            ? raft::distance::DistanceType::L2Expanded
+            : raft::distance::DistanceType::InnerProduct;
+
+    // Intermediate graph; optional so it can be released before indexing.
+    std::optional<raft::host_matrix<uint32_t, int64_t>> knn_graph(
+            raft::make_host_matrix<uint32_t, int64_t>(
+                    n, index_params_.intermediate_graph_degree));
+    if (x_on_gpu) {
+        raft::neighbors::cagra::build_knn_graph(
+                raft_handle,
+                raft::make_device_matrix_view<const float, int64_t>(x, n, dim_),
+                knn_graph->view(),
+                1.0f,
+                ivf_pq_params_,
+                ivf_pq_search_params_);
+    } else {
+        raft::neighbors::cagra::build_knn_graph(
+                raft_handle,
+                raft::make_host_matrix_view<const float, int64_t>(x, n, dim_),
+                knn_graph->view(),
+                1.0f,
+                ivf_pq_params_,
+                ivf_pq_search_params_);
+    }
+
+    // Reduce the intermediate graph to the final n x graph_degree graph.
+    auto cagra_graph = raft::make_host_matrix<uint32_t, int64_t>(
+            n, index_params_.graph_degree);
+    raft::neighbors::cagra::optimize<uint32_t>(
+            raft_handle, knn_graph->view(), cagra_graph.view());
+
+    // free intermediate graph before trying to create the index
+    knn_graph.reset();
+
+    // Construct the index from the dataset and the final graph.
+    if (x_on_gpu) {
+        raft_knn_index = raft::neighbors::cagra::index<float, uint32_t>(
+                raft_handle,
+                raft_metric,
+                raft::make_device_matrix_view<const float, int64_t>(x, n, dim_),
+                raft::make_const_mdspan(cagra_graph.view()));
+    } else {
+        raft_knn_index = raft::neighbors::cagra::index<float, uint32_t>(
+                raft_handle,
+                raft_metric,
+                raft::make_host_matrix_view<const float, int64_t>(x, n, dim_),
+                raft::make_const_mdspan(cagra_graph.view()));
+    }
+}
+
+void RaftCagra::search(
+        Tensor<float, 2, true>& queries,
+        int k,
+        Tensor<float, 2, true>& outDistances,
+        Tensor<idx_t, 2, true>& outIndices,
+        idx_t max_queries,
+        idx_t itopk_size,
+        idx_t max_iterations,
+        faiss::cagra_search_algo graph_search_algo,
+        idx_t team_size,
+        idx_t search_width,
+        idx_t min_iterations,
+        idx_t thread_block_size,
+        faiss::cagra_hash_mode hash_mode,
+        idx_t hashmap_min_bitlen,
+        float hashmap_max_fill_rate,
+        idx_t num_random_samplings,
+        idx_t rand_xor_mask) {
+    const raft::device_resources& raft_handle =
+            resources_->getRaftHandleCurrentDevice();
+    const idx_t nq = queries.getSize(0);
+    const idx_t queryDim = queries.getSize(1);
+    const idx_t numNeighbors = k;
+
+    FAISS_ASSERT(raft_knn_index.has_value());
+    FAISS_ASSERT(nq > 0);
+    FAISS_ASSERT(queryDim == dim_);
+
+    // Translate the FAISS-level arguments into RAFT's search parameters.
+    raft::neighbors::cagra::search_params params;
+    params.max_queries = max_queries;
+    params.itopk_size = itopk_size;
+    params.max_iterations = max_iterations;
+    params.algo =
+            static_cast<raft::neighbors::cagra::search_algo>(graph_search_algo);
+    params.team_size = team_size;
+    params.search_width = search_width;
+    params.min_iterations = min_iterations;
+    params.thread_block_size = thread_block_size;
+    params.hashmap_mode =
+            static_cast<raft::neighbors::cagra::hash_mode>(hash_mode);
+    params.hashmap_min_bitlen = hashmap_min_bitlen;
+    params.hashmap_max_fill_rate = hashmap_max_fill_rate;
+    params.num_random_samplings = num_random_samplings;
+    params.rand_xor_mask = rand_xor_mask;
+
+    // The index uses uint32_t ids, so search into a scratch buffer and
+    // widen the ids into the caller's idx_t tensor afterwards.
+    auto raft_indices = raft::make_device_matrix<uint32_t, int64_t>(
+            raft_handle, nq, numNeighbors);
+
+    raft::runtime::neighbors::cagra::search(
+            raft_handle,
+            params,
+            raft_knn_index.value(),
+            raft::make_device_matrix_view<const float, int64_t>(
+                    queries.data(), nq, queryDim),
+            raft_indices.view(),
+            raft::make_device_matrix_view<float, int64_t>(
+                    outDistances.data(), nq, numNeighbors));
+
+    const uint32_t* first = raft_indices.data_handle();
+    thrust::copy(
+            raft::resource::get_thrust_policy(raft_handle),
+            first,
+            first + raft_indices.size(),
+            outIndices.data());
+}
+
+void RaftCagra::reset() {
+    raft_knn_index = std::nullopt; // drops the held index, if any
+}
+
+idx_t RaftCagra::get_knngraph_degree() const {
+    FAISS_ASSERT(raft_knn_index.has_value()); // an index must be present
+    return static_cast<idx_t>(raft_knn_index->graph_degree());
+}
+
+/// Copies the index's graph to the host, one idx_t entry per stored neighbor.
+std::vector<idx_t> RaftCagra::get_knngraph() const {
+    FAISS_ASSERT(raft_knn_index.has_value());
+    const raft::device_resources& raft_handle =
+            resources_->getRaftHandleCurrentDevice();
+
+    auto device_graph = raft_knn_index.value().graph();
+    std::vector<idx_t> host_graph(
+            device_graph.extent(0) * device_graph.extent(1));
+
+    raft_handle.sync_stream();
+
+    // The device graph holds uint32_t ids; the copy widens them to idx_t.
+    thrust::copy(
+            thrust::device_ptr<const uint32_t>(device_graph.data_handle()),
+            thrust::device_ptr<const uint32_t>(
+                    device_graph.data_handle() + device_graph.size()),
+            host_graph.data());
+
+    return host_graph;
+}
+
+/// Copies the dataset held by the index to the host, dim_ floats per row.
+std::vector<float> RaftCagra::get_training_dataset() const {
+    FAISS_ASSERT(raft_knn_index.has_value());
+    const raft::device_resources& raft_handle =
+            resources_->getRaftHandleCurrentDevice();
+    auto stream = raft_handle.get_stream();
+
+    auto device_dataset = raft_knn_index.value().dataset();
+    std::vector<float> host_dataset(
+            device_dataset.extent(0) * device_dataset.extent(1));
+
+    // 2D copy: the source row pitch comes from stride(0) rather than dim_.
+    RAFT_CUDA_TRY(cudaMemcpy2DAsync(
+            host_dataset.data(),
+            sizeof(float) * dim_,
+            device_dataset.data_handle(),
+            sizeof(float) * device_dataset.stride(0),
+            sizeof(float) * dim_,
+            device_dataset.extent(0),
+            cudaMemcpyDefault,
+            stream));
+    raft_handle.sync_stream();
+    return host_dataset;
+}
+
+} // namespace gpu
+} // namespace faiss
diff --git a/faiss/gpu/impl/RaftCagra.cuh b/faiss/gpu/impl/RaftCagra.cuh
new file mode 100644
index 0000000000..95f6c03fca
--- /dev/null
+++ b/faiss/gpu/impl/RaftCagra.cuh
@@ -0,0 +1,132 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <faiss/gpu/GpuIndicesOptions.h>
+#include <faiss/gpu/GpuResources.h>
+#include <cstddef>
+#include <faiss/gpu/utils/Tensor.cuh>
+#include <optional>
+
+#include <faiss/MetricType.h>
+
+#include <raft/neighbors/cagra_types.hpp>
+#include <raft/neighbors/ivf_pq_types.hpp>
+
+namespace faiss {
+
+/// Algorithm used to build underlying CAGRA graph
+enum class cagra_build_algo { IVF_PQ, NN_DESCENT };
+/// Search algorithm; static_cast to raft::neighbors::cagra::search_algo
+enum class cagra_search_algo { SINGLE_CTA, MULTI_CTA };
+/// Hash map mode; static_cast to raft::neighbors::cagra::hash_mode
+enum class cagra_hash_mode { HASH, SMALL, AUTO };
+
+namespace gpu {
+
+class RaftCagra {
+   public:
+    RaftCagra(
+            GpuResources* resources,
+            int dim,
+            idx_t intermediate_graph_degree,
+            idx_t graph_degree,
+            faiss::cagra_build_algo graph_build_algo,
+            size_t nn_descent_niter,
+            faiss::MetricType metric,
+            float metricArg,
+            IndicesOptions indicesOptions,
+            std::optional<raft::neighbors::ivf_pq::index_params> ivf_pq_params =
+                    std::nullopt,
+            std::optional<raft::neighbors::ivf_pq::search_params>
+                    ivf_pq_search_params = std::nullopt);
+    /// Builds the index from a precomputed n x graph_degree neighbor graph
+    RaftCagra(
+            GpuResources* resources,
+            int dim,
+            idx_t n,
+            int graph_degree,
+            const float* distances,
+            const idx_t* knn_graph,
+            faiss::MetricType metric,
+            float metricArg,
+            IndicesOptions indicesOptions);
+
+    ~RaftCagra() = default;
+    /// Builds the index from n vectors of dim_ floats at x (host or device)
+    void train(idx_t n, const float* x);
+    /// Searches k neighbors per query row; fills outDistances and outIndices
+    void search(
+            Tensor<float, 2, true>& queries,
+            int k,
+            Tensor<float, 2, true>& outDistances,
+            Tensor<idx_t, 2, true>& outIndices,
+            idx_t max_queries,
+            idx_t itopk_size,
+            idx_t max_iterations,
+            faiss::cagra_search_algo graph_search_algo,
+            idx_t team_size,
+            idx_t search_width,
+            idx_t min_iterations,
+            idx_t thread_block_size,
+            faiss::cagra_hash_mode hash_mode,
+            idx_t hashmap_min_bitlen,
+            float hashmap_max_fill_rate,
+            idx_t num_random_samplings,
+            idx_t rand_xor_mask);
+    /// Discards the held RAFT index, if any
+    void reset();
+    /// Graph degree of the held index; asserts that an index is present
+    idx_t get_knngraph_degree() const;
+    /// Host copy of the graph's neighbor ids, widened from uint32_t to idx_t
+    std::vector<idx_t> get_knngraph() const;
+    /// Host copy of the dataset stored in the index, dim_ floats per row
+    std::vector<float> get_training_dataset() const;
+
+   private:
+    /// Collection of GPU resources that we use
+    GpuResources* resources_;
+
+    /// Expected dimensionality of the vectors
+    const int dim_;
+
+    /// Metric type of the index
+    faiss::MetricType metric_;
+
+    /// Metric arg
+    float metricArg_;
+
+    /// Parameters to build RAFT CAGRA index
+    raft::neighbors::cagra::index_params index_params_;
+
+    /// Parameters to build CAGRA graph using IVF PQ
+    std::optional<raft::neighbors::ivf_pq::index_params> ivf_pq_params_;
+    std::optional<raft::neighbors::ivf_pq::search_params> ivf_pq_search_params_;
+
+    /// Instance of trained RAFT CAGRA index
+    std::optional<raft::neighbors::cagra::index<float, uint32_t>>
+            raft_knn_index{std::nullopt};
+};
+
+} // namespace gpu
+} // namespace faiss
diff --git a/faiss/gpu/impl/RaftFlatIndex.cu b/faiss/gpu/impl/RaftFlatIndex.cu
index 8f5c491163..24a6d39604 100644
--- a/faiss/gpu/impl/RaftFlatIndex.cu
+++ b/faiss/gpu/impl/RaftFlatIndex.cu
@@ -20,7 +20,7 @@
  * limitations under the License.
  */
 
-#include <faiss/gpu/impl/RaftUtils.h>
+#include <faiss/gpu/utils/RaftUtils.h>
 #include <faiss/gpu/impl/RaftFlatIndex.cuh>
 #include <faiss/gpu/utils/ConversionOperators.cuh>
 
@@ -91,7 +91,7 @@ void RaftFlatIndex::query(
                 outDistances.getSize(0),
                 outDistances.getSize(1));
 
-        DistanceType distance = faiss_to_raft(metric, exactDistance);
+        DistanceType distance = metricFaissToRaft(metric, exactDistance);
 
         std::optional<raft::device_vector_view<const float, int64_t>>
                 norms_view = raft::make_device_vector_view(
diff --git a/faiss/gpu/impl/RaftFlatIndex.cuh b/faiss/gpu/impl/RaftFlatIndex.cuh
index 010c5aebce..d3823bbf58 100644
--- a/faiss/gpu/impl/RaftFlatIndex.cuh
+++ b/faiss/gpu/impl/RaftFlatIndex.cuh
@@ -28,6 +28,7 @@
 #include <faiss/gpu/utils/DeviceTensor.cuh>
 #include <faiss/gpu/utils/DeviceVector.cuh>
 
+#pragma GCC visibility push(default)
 namespace faiss {
 namespace gpu {
 
@@ -67,3 +68,4 @@ class RaftFlatIndex : public FlatIndex {
 
 } // namespace gpu
 } // namespace faiss
+#pragma GCC visibility pop
diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu
index 2c6afb795c..0906a60f46 100644
--- a/faiss/gpu/impl/RaftIVFFlat.cu
+++ b/faiss/gpu/impl/RaftIVFFlat.cu
@@ -23,31 +23,19 @@
 #include <cstddef>
 #include <cstdint>
 
-#include <faiss/gpu/GpuIndex.h>
-#include <faiss/gpu/GpuResources.h>
-#include <faiss/gpu/impl/InterleavedCodes.h>
-#include <faiss/gpu/impl/RemapIndices.h>
-#include <faiss/gpu/utils/DeviceUtils.h>
-#include <thrust/host_vector.h>
+#include <faiss/gpu/GpuIndexFlat.h>
+#include <faiss/gpu/utils/RaftUtils.h>
 #include <faiss/gpu/impl/FlatIndex.cuh>
-#include <faiss/gpu/impl/IVFAppend.cuh>
 #include <faiss/gpu/impl/IVFFlat.cuh>
-#include <faiss/gpu/impl/IVFFlatScan.cuh>
-#include <faiss/gpu/impl/IVFInterleaved.cuh>
 #include <faiss/gpu/impl/RaftIVFFlat.cuh>
-#include <faiss/gpu/utils/ConversionOperators.cuh>
-#include <faiss/gpu/utils/CopyUtils.cuh>
-#include <faiss/gpu/utils/DeviceDefs.cuh>
-#include <faiss/gpu/utils/Float16.cuh>
-#include <faiss/gpu/utils/HostTensor.cuh>
 #include <faiss/gpu/utils/Transpose.cuh>
-#include <limits>
-#include <unordered_map>
 
-#include <raft/core/device_mdspan.hpp>
-#include <raft/core/handle.hpp>
 #include <raft/neighbors/ivf_flat_codepacker.hpp>
 #include <raft/neighbors/ivf_flat.cuh>
+#include <raft/neighbors/ivf_flat_helpers.cuh>
+
+#include <limits>
+#include <memory>
 
 namespace faiss {
 namespace gpu {
@@ -71,18 +59,31 @@ RaftIVFFlat::RaftIVFFlat(
                   useResidual,
                   scalarQ,
                   interleavedLayout,
+                  // skip ptr allocations in base class (handled by RAFT
+                  // internally)
                   indicesOptions,
                   space) {
     FAISS_THROW_IF_NOT_MSG(
             indicesOptions == INDICES_64_BIT,
             "only INDICES_64_BIT is supported for RAFT index");
-    reset();
 }
 
 RaftIVFFlat::~RaftIVFFlat() {}
 
-/// Find the approximate k nearest neighbors for `queries` against
-/// our database
+void RaftIVFFlat::reserveMemory(idx_t numVecs) {
+    // numVecs is unused: the only effect is this warning on stderr.
+    fputs("WARN: reserveMemory is NOP. Pre-allocation of IVF lists is not supported with RAFT enabled.\n", stderr);
+}
+
+void RaftIVFFlat::reset() {
+    raft_knn_index = std::nullopt; // drops the held RAFT index, if any
+}
+
+void RaftIVFFlat::setRaftIndex(
+        raft::neighbors::ivf_flat::index<float, idx_t>&& idx) {
+    raft_knn_index.emplace(std::move(idx)); // move-constructs from idx
+}
+
 void RaftIVFFlat::search(
         Index* coarseQuantizer,
         Tensor<float, 2, true>& queries,
@@ -90,7 +91,9 @@ void RaftIVFFlat::search(
         int k,
         Tensor<float, 2, true>& outDistances,
         Tensor<idx_t, 2, true>& outIndices) {
-    // TODO: We probably don't want to ignore the coarse quantizer here...
+    /// NB: The coarse quantizer is ignored here. The user is assumed to have
+    /// called updateQuantizer() to modify the RAFT index if the quantizer was
+    /// modified externally
 
     uint32_t numQueries = queries.getSize(0);
     uint32_t cols = queries.getSize(1);
@@ -125,7 +128,7 @@ void RaftIVFFlat::search(
     /// Identify NaN rows and mask their nearest neighbors
     auto nan_flag = raft::make_device_vector<bool>(raft_handle, numQueries);
 
-    validRowIndices_(queries, nan_flag.data_handle());
+    validRowIndices(resources_, queries, nan_flag.data_handle());
 
     raft::linalg::map_offset(
             raft_handle,
@@ -154,65 +157,22 @@ void RaftIVFFlat::search(
             });
 }
 
-/// Classify and encode/add vectors to our IVF lists.
-/// The input data must be on our current device.
-/// Returns the number of vectors successfully added. Vectors may
-/// not be able to be added because they contain NaNs.
 idx_t RaftIVFFlat::addVectors(
         Index* coarseQuantizer,
         Tensor<float, 2, true>& vecs,
         Tensor<idx_t, 1, true>& indices) {
-    /// TODO: We probably don't want to ignore the coarse quantizer here
+    /// NB: The coarse quantizer is ignored here. The user is assumed to have
+    /// called updateQuantizer() to update the RAFT index if the quantizer was
+    /// modified externally
 
-    idx_t n_rows = vecs.getSize(0);
+    FAISS_ASSERT(raft_knn_index.has_value());
 
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
 
-    /// Remove NaN values
-    auto nan_flag = raft::make_device_vector<bool, idx_t>(raft_handle, n_rows);
-
-    validRowIndices_(vecs, nan_flag.data_handle());
-
-    idx_t n_rows_valid = thrust::reduce(
-            raft_handle.get_thrust_policy(),
-            nan_flag.data_handle(),
-            nan_flag.data_handle() + n_rows,
-            0);
-
-    if (n_rows_valid < n_rows) {
-        auto gather_indices = raft::make_device_vector<idx_t, idx_t>(
-                raft_handle, n_rows_valid);
-
-        auto count = thrust::make_counting_iterator(0);
+    /// Remove rows containing NaNs
+    idx_t n_rows_valid = inplaceGatherFilteredRows(resources_, vecs, indices);
 
-        thrust::copy_if(
-                raft_handle.get_thrust_policy(),
-                count,
-                count + n_rows,
-                gather_indices.data_handle(),
-                [nan_flag = nan_flag.data_handle()] __device__(auto i) {
-                    return nan_flag[i];
-                });
-
-        raft::matrix::gather(
-                raft_handle,
-                raft::make_device_matrix_view<float, idx_t>(
-                        vecs.data(), n_rows, dim_),
-                raft::make_const_mdspan(gather_indices.view()),
-                (idx_t)16);
-
-        auto valid_indices = raft::make_device_vector<idx_t, idx_t>(
-                raft_handle, n_rows_valid);
-
-        raft::matrix::gather(
-                raft_handle,
-                raft::make_device_matrix_view<idx_t>(
-                        indices.data(), n_rows, (idx_t)1),
-                raft::make_const_mdspan(gather_indices.view()));
-    }
-
-    FAISS_ASSERT(raft_knn_index.has_value());
     raft_knn_index.emplace(raft::neighbors::ivf_flat::extend(
             raft_handle,
             raft::make_device_matrix_view<const float, idx_t>(
@@ -225,10 +185,6 @@ idx_t RaftIVFFlat::addVectors(
     return n_rows_valid;
 }
 
-void RaftIVFFlat::reset() {
-    raft_knn_index.reset();
-}
-
 idx_t RaftIVFFlat::getListLength(idx_t listId) const {
     FAISS_ASSERT(raft_knn_index.has_value());
     const raft::device_resources& raft_handle =
@@ -259,10 +215,11 @@ std::vector<idx_t> RaftIVFFlat::getListIndices(idx_t listId) const {
     // fetch the list indices ptr on host
     idx_t* list_indices_ptr;
 
-    // fetch the list indices ptr on host
     raft::update_host(
             &list_indices_ptr,
-            raft_knn_index.value().inds_ptrs().data_handle() + listId,
+            const_cast<idx_t**>(
+                    raft_knn_index.value().inds_ptrs().data_handle()) +
+                    listId,
             1,
             stream);
     raft_handle.sync_stream();
@@ -278,7 +235,7 @@ std::vector<uint8_t> RaftIVFFlat::getListVectorData(
         idx_t listId,
         bool gpuFormat) const {
     if (gpuFormat) {
-        FAISS_THROW_MSG("gpuFormat is not suppported for raft indices");
+        FAISS_THROW_MSG("gpuFormat should be false for RAFT indices");
     }
     FAISS_ASSERT(raft_knn_index.has_value());
 
@@ -334,59 +291,71 @@ void RaftIVFFlat::searchPreassigned(
 }
 
 void RaftIVFFlat::updateQuantizer(Index* quantizer) {
-    idx_t quantizer_ntotal = quantizer->ntotal;
+    FAISS_THROW_IF_NOT(quantizer->is_trained);
 
-    const raft::device_resources& raft_handle =
-            resources_->getRaftHandleCurrentDevice();
-    auto stream = raft_handle.get_stream();
+    // Must match our basic IVF parameters
+    FAISS_THROW_IF_NOT(quantizer->d == getDim());
+    FAISS_THROW_IF_NOT(quantizer->ntotal == getNumLists());
 
-    auto total_elems = size_t(quantizer_ntotal) * size_t(quantizer->d);
+    size_t total_elems = quantizer->ntotal * quantizer->d;
 
-    raft::logger::get().set_level(RAFT_LEVEL_TRACE);
+    auto stream = resources_->getDefaultStreamCurrentDevice();
+    const raft::device_resources& raft_handle =
+            resources_->getRaftHandleCurrentDevice();
 
     raft::neighbors::ivf_flat::index_params pams;
     pams.add_data_on_build = false;
-
-    pams.n_lists = this->numLists_;
-
-    switch (this->metric_) {
-        case faiss::METRIC_L2:
-            pams.metric = raft::distance::DistanceType::L2Expanded;
-            break;
-        case faiss::METRIC_INNER_PRODUCT:
-            pams.metric = raft::distance::DistanceType::InnerProduct;
-            break;
-        default:
-            FAISS_THROW_MSG("Metric is not supported.");
+    pams.metric = metricFaissToRaft(metric_, false);
+    pams.n_lists = numLists_;
+    raft_knn_index.emplace(raft_handle, pams, static_cast<uint32_t>(dim_));
+
+    raft::neighbors::ivf_flat::helpers::reset_index(
+            raft_handle, &raft_knn_index.value());
+
+    // If the index instance is a GpuIndexFlat, then we can use direct access to
+    // the centroids within.
+    auto gpuQ = dynamic_cast<GpuIndexFlat*>(quantizer);
+    if (gpuQ) {
+        auto gpuData = gpuQ->getGpuData();
+
+        if (gpuData->getUseFloat16()) {
+            // The FlatIndex keeps its data in float16; we need to reconstruct
+            // as float32 and store locally
+            DeviceTensor<float, 2, true> centroids(
+                    resources_,
+                    makeSpaceAlloc(AllocType::FlatData, space_, stream),
+                    {getNumLists(), getDim()});
+
+            gpuData->reconstruct(0, gpuData->getSize(), centroids);
+
+            raft::update_device(
+                    raft_knn_index.value().centers().data_handle(),
+                    centroids.data(),
+                    total_elems,
+                    stream);
+        } else {
+            /// No reconstruct needed since the centers are already in float32
+            auto centroids = gpuData->getVectorsFloat32Ref();
+
+            raft::update_device(
+                    raft_knn_index.value().centers().data_handle(),
+                    centroids.data(),
+                    total_elems,
+                    stream);
+        }
+    } else {
+        // Otherwise, we need to reconstruct all vectors from the index and copy
+        // them to the GPU, in order to have access as needed for residual
+        // computation
+        auto vecs = std::vector<float>(getNumLists() * getDim());
+        quantizer->reconstruct_n(0, quantizer->ntotal, vecs.data());
+
+        raft::update_device(
+                raft_knn_index.value().centers().data_handle(),
+                vecs.data(),
+                total_elems,
+                stream);
     }
-
-    raft_knn_index.emplace(raft_handle, pams, (uint32_t)this->dim_);
-
-    cudaMemsetAsync(
-            raft_knn_index.value().list_sizes().data_handle(),
-            0,
-            raft_knn_index.value().list_sizes().size() * sizeof(uint32_t),
-            stream);
-    cudaMemsetAsync(
-            raft_knn_index.value().data_ptrs().data_handle(),
-            0,
-            raft_knn_index.value().data_ptrs().size() * sizeof(float*),
-            stream);
-    cudaMemsetAsync(
-            raft_knn_index.value().inds_ptrs().data_handle(),
-            0,
-            raft_knn_index.value().inds_ptrs().size() * sizeof(idx_t*),
-            stream);
-
-    /// Copy (reconstructed) centroids over, rather than re-training
-    std::vector<float> buf_host(total_elems);
-    quantizer->reconstruct_n(0, quantizer_ntotal, buf_host.data());
-
-    raft::update_device(
-            raft_knn_index.value().centers().data_handle(),
-            buf_host.data(),
-            total_elems,
-            stream);
 }
 
 void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) {
@@ -422,6 +391,9 @@ void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) {
         // store the list size
         list_sizes_[i] = static_cast<uint32_t>(listSize);
 
+        // This RAFT list must currently be empty
+        FAISS_ASSERT(getListLength(i) == 0);
+
         raft::neighbors::ivf::resize_list(
                 raft_handle,
                 raft_lists[i],
@@ -431,7 +403,8 @@ void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) {
     }
 
     // Update the pointers and the sizes
-    raft_knn_index.value().recompute_internal_state(raft_handle);
+    raft::neighbors::ivf_flat::helpers::recompute_internal_state(
+            raft_handle, &(raft_knn_index.value()));
 
     for (size_t i = 0; i < nlist; ++i) {
         size_t listSize = ivf->list_size(i);
@@ -483,12 +456,6 @@ void RaftIVFFlat::addEncodedVectorsToList_(
         idx_t numVecs) {
     auto stream = resources_->getDefaultStreamCurrentDevice();
 
-    // This list must already exist
-    FAISS_ASSERT(raft_knn_index.has_value());
-
-    // This list must currently be empty
-    FAISS_ASSERT(getListLength(listId) == 0);
-
     // If there's nothing to add, then there's nothing we have to do
     if (numVecs == 0) {
         return;
@@ -496,7 +463,6 @@ void RaftIVFFlat::addEncodedVectorsToList_(
 
     // The GPU might have a different layout of the memory
     auto gpuListSizeInBytes = getGpuVectorsEncodingSize_(numVecs);
-    auto cpuListSizeInBytes = getCpuVectorsEncodingSize_(numVecs);
 
     // We only have int32 length representations on the GPU per each
     // list; the length is in sizeof(char)
@@ -541,27 +507,6 @@ void RaftIVFFlat::addEncodedVectorsToList_(
     raft::update_device(list_indices_ptr, indices, numVecs, stream);
 }
 
-void RaftIVFFlat::validRowIndices_(
-        Tensor<float, 2, true>& vecs,
-        bool* nan_flag) {
-    raft::device_resources& raft_handle =
-            resources_->getRaftHandleCurrentDevice();
-    idx_t n_rows = vecs.getSize(0);
-
-    thrust::fill_n(raft_handle.get_thrust_policy(), nan_flag, n_rows, true);
-    raft::linalg::map_offset(
-            raft_handle,
-            raft::make_device_vector_view<bool, idx_t>(nan_flag, n_rows),
-            [vecs = vecs.data(), dim_ = this->dim_] __device__(idx_t i) {
-                for (idx_t col = 0; col < dim_; col++) {
-                    if (!isfinite(vecs[i * dim_ + col])) {
-                        return false;
-                    }
-                }
-                return true;
-            });
-}
-
 RaftIVFFlatCodePackerInterleaved::RaftIVFFlatCodePackerInterleaved(
         size_t list_size,
         uint32_t dim,
diff --git a/faiss/gpu/impl/RaftIVFFlat.cuh b/faiss/gpu/impl/RaftIVFFlat.cuh
index 3aba501c9f..4f8c89ecb0 100644
--- a/faiss/gpu/impl/RaftIVFFlat.cuh
+++ b/faiss/gpu/impl/RaftIVFFlat.cuh
@@ -22,16 +22,15 @@
 
 #pragma once
 
+#include <faiss/impl/CodePacker.h>
 #include <faiss/gpu/impl/GpuScalarQuantizer.cuh>
-#include <faiss/gpu/impl/IVFBase.cuh>
 #include <faiss/gpu/impl/IVFFlat.cuh>
 
-#include <faiss/impl/CodePacker.h>
-
 #include <raft/neighbors/ivf_flat.cuh>
 
 #include <optional>
 
+#pragma GCC visibility push(default)
 namespace faiss {
 namespace gpu {
 
@@ -52,6 +51,9 @@ class RaftIVFFlat : public IVFFlat {
 
     ~RaftIVFFlat() override;
 
+    /// Reserve GPU memory in our inverted lists for this number of vectors
+    void reserveMemory(idx_t numVecs) override;
+
     /// Find the approximate k nearest neigbors for `queries` against
     /// our database
     void search(
@@ -83,11 +85,7 @@ class RaftIVFFlat : public IVFFlat {
             Tensor<float, 2, true>& vecs,
             Tensor<idx_t, 1, true>& indices) override;
 
-    /// Reserve GPU memory in our inverted lists for this number of vectors
-    //     void reserveMemory(idx_t numVecs) override;
-
-    /// Clear out all inverted lists, but retain the coarse quantizer
-    /// and the product quantizer info
+    /// Clear out the Raft index
     void reset() override;
 
     /// For debugging purposes, return the list length of a particular
@@ -101,15 +99,17 @@ class RaftIVFFlat : public IVFFlat {
     std::vector<uint8_t> getListVectorData(idx_t listId, bool gpuFormat)
             const override;
 
+    /// Update our Raft index with this quantizer instance; may be a CPU
+    /// or GPU quantizer
     void updateQuantizer(Index* quantizer) override;
 
     /// Copy all inverted lists from a CPU representation to ourselves
     void copyInvertedListsFrom(const InvertedLists* ivf) override;
 
-    /// Filter out matrix rows containing NaN values
-    void validRowIndices_(Tensor<float, 2, true>& vecs, bool* nan_flag);
+    /// Replace the Raft index
+    void setRaftIndex(raft::neighbors::ivf_flat::index<float, idx_t>&& idx);
 
-   protected:
+   private:
     /// Adds a set of codes and indices to a list, with the representation
     /// coming from the CPU equivalent
     void addEncodedVectorsToList_(
@@ -147,3 +147,4 @@ struct RaftIVFFlatCodePackerInterleaved : CodePacker {
 
 } // namespace gpu
 } // namespace faiss
+#pragma GCC visibility pop
diff --git a/faiss/gpu/impl/RaftIVFPQ.cu b/faiss/gpu/impl/RaftIVFPQ.cu
new file mode 100644
index 0000000000..3a2a0a4218
--- /dev/null
+++ b/faiss/gpu/impl/RaftIVFPQ.cu
@@ -0,0 +1,546 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <faiss/gpu/GpuIndexFlat.h>
+#include <faiss/gpu/utils/RaftUtils.h>
+#include <faiss/gpu/impl/FlatIndex.cuh>
+#include <faiss/gpu/impl/RaftIVFPQ.cuh>
+#include <faiss/gpu/utils/Transpose.cuh>
+
+#include <raft/neighbors/ivf_pq.cuh>
+#include <raft/neighbors/ivf_pq_helpers.cuh>
+
+#include <limits>
+#include <memory>
+
+namespace faiss {
+namespace gpu {
+
+RaftIVFPQ::RaftIVFPQ(
+        GpuResources* resources,
+        int dim,
+        idx_t nlist,
+        faiss::MetricType metric,
+        float metricArg,
+        int numSubQuantizers,
+        int bitsPerSubQuantizer,
+        bool useFloat16LookupTables,
+        bool useMMCodeDistance,
+        bool interleavedLayout,
+        float* pqCentroidData,
+        IndicesOptions indicesOptions,
+        MemorySpace space)
+        : IVFPQ(resources,
+                dim,
+                nlist,
+                metric,
+                metricArg,
+                numSubQuantizers,
+                bitsPerSubQuantizer,
+                useFloat16LookupTables,
+                useMMCodeDistance,
+                interleavedLayout,
+                // NOTE(review): ptr allocations for the lists are presumably
+                // handled by RAFT internally, not the base class -- confirm
+                pqCentroidData,
+                indicesOptions,
+                space) {
+    FAISS_THROW_IF_NOT_MSG(
+            indicesOptions == INDICES_64_BIT,
+            "only INDICES_64_BIT is supported for RAFT index");
+}
+
+RaftIVFPQ::~RaftIVFPQ() {}
+
+void RaftIVFPQ::reserveMemory(idx_t numVecs) {
+    fprintf(stderr,
+            "WARN: reserveMemory is NOP. Pre-allocation of IVF lists is not supported with RAFT enabled.\n");
+}
+
+void RaftIVFPQ::reset() {
+    raft_knn_index.reset();
+}
+
+size_t RaftIVFPQ::reclaimMemory() {
+    fprintf(stderr,
+            "WARN: reclaimMemory is NOP. reclaimMemory is not supported with RAFT enabled.\n");
+    return 0;
+}
+
+void RaftIVFPQ::setPrecomputedCodes(Index* quantizer, bool enable) {}
+
+idx_t RaftIVFPQ::getListLength(idx_t listId) const {
+    FAISS_ASSERT(raft_knn_index.has_value());
+    const raft::device_resources& raft_handle =
+            resources_->getRaftHandleCurrentDevice();
+    // Read the size entry for `listId` from list_sizes() back to the host
+    uint32_t size;
+    raft::update_host(
+            &size,
+            raft_knn_index.value().list_sizes().data_handle() + listId,
+            1,
+            raft_handle.get_stream());
+    raft_handle.sync_stream();
+    // Widen to idx_t; a cast through int would wrap sizes above INT_MAX
+    return static_cast<idx_t>(size);
+}
+
+void RaftIVFPQ::updateQuantizer(Index* quantizer) {
+    FAISS_THROW_IF_NOT(quantizer->is_trained);
+
+    // Must match our basic IVF parameters
+    FAISS_THROW_IF_NOT(quantizer->d == getDim());
+    FAISS_THROW_IF_NOT(quantizer->ntotal == getNumLists());
+
+    auto stream = resources_->getDefaultStreamCurrentDevice();
+    const raft::device_resources& raft_handle =
+            resources_->getRaftHandleCurrentDevice();
+    // Construct a new RAFT index from our IVF-PQ parameters
+    raft::neighbors::ivf_pq::index_params pams;
+    pams.metric = metricFaissToRaft(metric_, false);
+    pams.codebook_kind = raft::neighbors::ivf_pq::codebook_gen::PER_SUBSPACE;
+    pams.n_lists = numLists_;
+    pams.pq_bits = bitsPerSubQuantizer_;
+    pams.pq_dim = numSubQuantizers_;
+    raft_knn_index.emplace(raft_handle, pams, static_cast<uint32_t>(dim_));
+    // Reset the new index and initialize its rotation matrix
+    raft::neighbors::ivf_pq::helpers::reset_index(
+            raft_handle, &raft_knn_index.value());
+    raft::neighbors::ivf_pq::helpers::make_rotation_matrix(
+            raft_handle, &(raft_knn_index.value()), false);
+
+    // If the index instance is a GpuIndexFlat, then we can use direct access to
+    // the centroids within.
+    auto gpuQ = dynamic_cast<GpuIndexFlat*>(quantizer);
+
+    if (gpuQ) {
+        auto gpuData = gpuQ->getGpuData();
+
+        if (gpuData->getUseFloat16()) {
+            DeviceTensor<float, 2, true> centroids(
+                    resources_,
+                    makeSpaceAlloc(AllocType::FlatData, space_, stream),
+                    {getNumLists(), getDim()});
+
+            // The FlatIndex keeps its data in float16; we need to reconstruct
+            // as float32 and store locally
+            gpuData->reconstruct(0, gpuData->getSize(), centroids);
+
+            raft::neighbors::ivf_pq::helpers::set_centers(
+                    raft_handle,
+                    &(raft_knn_index.value()),
+                    raft::make_device_matrix_view<float, uint32_t>(
+                            centroids.data(), numLists_, dim_));
+        } else {
+            // The FlatIndex keeps its data in float32, so no reconstruction
+            // is needed: we can merely reference the centroids and pass
+            // them to set_centers directly
+            auto centroids = gpuData->getVectorsFloat32Ref();
+
+            raft::neighbors::ivf_pq::helpers::set_centers(
+                    raft_handle,
+                    &(raft_knn_index.value()),
+                    raft::make_device_matrix_view<float, uint32_t>(
+                            centroids.data(), numLists_, dim_));
+        }
+    } else {
+        DeviceTensor<float, 2, true> centroids(
+                resources_,
+                makeSpaceAlloc(AllocType::FlatData, space_, stream),
+                {getNumLists(), getDim()});
+
+        // Otherwise, we need to reconstruct all vectors from the index and copy
+        // them to the GPU, in order to have access as needed for residual
+        // computation
+        auto vecs = std::vector<float>(getNumLists() * getDim());
+        quantizer->reconstruct_n(0, quantizer->ntotal, vecs.data());
+
+        centroids.copyFrom(vecs, stream);
+
+        raft::neighbors::ivf_pq::helpers::set_centers(
+                raft_handle,
+                &(raft_knn_index.value()),
+                raft::make_device_matrix_view<float, uint32_t>(
+                        centroids.data(), numLists_, dim_));
+    }
+    // Finally, copy our PQ centroids into the new RAFT index
+    setPQCentroids_();
+}
+
+/// Return the list indices of a particular list back to the CPU
+std::vector<idx_t> RaftIVFPQ::getListIndices(idx_t listId) const {
+    FAISS_ASSERT(raft_knn_index.has_value());
+    const raft::device_resources& raft_handle =
+            resources_->getRaftHandleCurrentDevice();
+    auto stream = raft_handle.get_stream();
+
+    idx_t listSize = getListLength(listId);
+
+    std::vector<idx_t> vec(listSize);
+
+    // Fetch the pointer stored for this list in inds_ptrs() to the host
+    idx_t* list_indices_ptr;
+
+    raft::update_host(
+            &list_indices_ptr,
+            const_cast<idx_t**>(
+                    raft_knn_index.value().inds_ptrs().data_handle()) +
+                    listId,
+            1,
+            stream);
+    raft_handle.sync_stream();
+    // Then copy the `listSize` indices found at that pointer into `vec`
+    raft::update_host(vec.data(), list_indices_ptr, listSize, stream);
+    raft_handle.sync_stream();
+
+    return vec;
+}
+
+/// Performs search when we are already given the IVF cells to look at
+/// (GpuIndexIVF::search_preassigned implementation)
+void RaftIVFPQ::searchPreassigned(
+        Index* coarseQuantizer,
+        Tensor<float, 2, true>& vecs,
+        Tensor<float, 2, true>& ivfDistances,
+        Tensor<idx_t, 2, true>& ivfAssignments,
+        int k,
+        Tensor<float, 2, true>& outDistances,
+        Tensor<idx_t, 2, true>& outIndices,
+        bool storePairs) {
+    // TODO: Fill this in! Currently a no-op: no output tensor is written
+}
+
+size_t RaftIVFPQ::getGpuListEncodingSize_(idx_t listId) {
+    return static_cast<size_t>(
+            raft_knn_index.value().get_list_size_in_bytes(listId));
+}
+
+/// Return the encoded vectors of a particular list back to the CPU
+std::vector<uint8_t> RaftIVFPQ::getListVectorData(idx_t listId, bool gpuFormat)
+        const {
+    if (gpuFormat) {
+        FAISS_THROW_MSG(
+                "gpuFormat should be false for RAFT indices. Unpacked codes are flat.");
+    }
+    FAISS_ASSERT(raft_knn_index.has_value());
+
+    const raft::device_resources& raft_handle =
+            resources_->getRaftHandleCurrentDevice();
+    auto stream = raft_handle.get_stream();
+
+    idx_t listSize = getListLength(listId);
+
+    auto cpuListSizeInBytes = getCpuVectorsEncodingSize_(listSize);
+
+    std::vector<uint8_t> flat_codes(
+            cpuListSizeInBytes, static_cast<uint8_t>(0));
+    // Unpack batch-wise; the host offset is size_t so >4 GiB lists do not wrap
+    idx_t maxBatchSize = 65536;
+    for (idx_t offset_b = 0; offset_b < listSize; offset_b += maxBatchSize) {
+        uint32_t batchSize = min(maxBatchSize, listSize - offset_b);
+        uint32_t bufferSize = getCpuVectorsEncodingSize_(batchSize);
+        size_t codesOffset = getCpuVectorsEncodingSize_(offset_b);
+
+        // Fetch flat PQ codes for the current batch
+        auto codes_d = raft::make_device_vector<uint8_t>(
+                raft_handle, static_cast<uint32_t>(bufferSize));
+
+        raft::neighbors::ivf_pq::helpers::unpack_contiguous_list_data(
+                raft_handle,
+                raft_knn_index.value(),
+                codes_d.data_handle(),
+                batchSize,
+                listId,
+                offset_b);
+
+        // Copy the flat PQ codes to host
+        raft::update_host(
+                flat_codes.data() + codesOffset,
+                codes_d.data_handle(),
+                bufferSize,
+                stream);
+        raft_handle.sync_stream();
+    }
+
+    return flat_codes;
+}
+
+/// Find the approximate k nearest neighbors for `queries` against
+/// our database
+void RaftIVFPQ::search(
+        Index* coarseQuantizer,
+        Tensor<float, 2, true>& queries,
+        int nprobe,
+        int k,
+        Tensor<float, 2, true>& outDistances,
+        Tensor<idx_t, 2, true>& outIndices) {
+    // Device is already set in GpuIndex::search
+    FAISS_ASSERT(raft_knn_index.has_value());
+
+    uint32_t numQueries = queries.getSize(0);
+    uint32_t cols = queries.getSize(1);
+    idx_t k_ = std::min(static_cast<idx_t>(k), raft_knn_index.value().size());
+    FAISS_ASSERT(numQueries > 0);
+    FAISS_ASSERT(cols == dim_);
+    FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= numLists_);
+
+    const raft::device_resources& raft_handle =
+            resources_->getRaftHandleCurrentDevice();
+    raft::neighbors::ivf_pq::search_params pams;
+    pams.n_probes = nprobe;
+    pams.lut_dtype = useFloat16LookupTables_ ? CUDA_R_16F : CUDA_R_32F;
+
+    auto queries_view = raft::make_device_matrix_view<const float, idx_t>(
+            queries.data(), (idx_t)numQueries, (idx_t)cols);
+    auto out_inds_view = raft::make_device_matrix_view<idx_t, idx_t>(
+            outIndices.data(), (idx_t)numQueries, (idx_t)k_);
+    auto out_dists_view = raft::make_device_matrix_view<float, idx_t>(
+            outDistances.data(), (idx_t)numQueries, (idx_t)k_);
+
+    raft::neighbors::ivf_pq::search<float, idx_t>(
+            raft_handle,
+            pams,
+            raft_knn_index.value(),
+            queries_view,
+            out_inds_view,
+            out_dists_view);
+
+    /// Identify NaN rows and mask their nearest neighbors
+    auto nan_flag = raft::make_device_vector<bool>(raft_handle, numQueries);
+
+    validRowIndices(resources_, queries, nan_flag.data_handle());
+
+    raft::linalg::map_offset(
+            raft_handle,
+            raft::make_device_vector_view(outIndices.data(), numQueries * k_),
+            [nan_flag = nan_flag.data_handle(),
+             out_inds = outIndices.data(),
+             k_] __device__(uint32_t i) {
+                uint32_t row = i / k_;
+                if (!nan_flag[row])
+                    return idx_t(-1);
+                return out_inds[i];
+            });
+
+    float max_val = std::numeric_limits<float>::max();
+    raft::linalg::map_offset(
+            raft_handle,
+            raft::make_device_vector_view(outDistances.data(), numQueries * k_),
+            [nan_flag = nan_flag.data_handle(),
+             out_dists = outDistances.data(),
+             max_val,
+             k_] __device__(uint32_t i) {
+                uint32_t row = i / k_;
+                if (!nan_flag[row])
+                    return max_val;
+                return out_dists[i];
+            });
+    raft_handle.sync_stream();
+}
+
+idx_t RaftIVFPQ::addVectors(
+        Index* coarseQuantizer,
+        Tensor<float, 2, true>& vecs,
+        Tensor<idx_t, 1, true>& indices) {
+    /// NB: The coarse quantizer is ignored here. The user is assumed to have
+    /// called updateQuantizer() to update the RAFT index if the quantizer was
+    /// modified externally
+
+    FAISS_ASSERT(raft_knn_index.has_value());
+
+    const raft::device_resources& raft_handle =
+            resources_->getRaftHandleCurrentDevice();
+
+    /// Remove rows containing NaNs
+    idx_t n_rows_valid = inplaceGatherFilteredRows(resources_, vecs, indices);
+    // Extend with the first n_rows_valid rows/ids; keep the returned index
+    raft_knn_index.emplace(raft::neighbors::ivf_pq::extend(
+            raft_handle,
+            raft::make_device_matrix_view<const float, idx_t>(
+                    vecs.data(), n_rows_valid, dim_),
+            std::make_optional<raft::device_vector_view<const idx_t, idx_t>>(
+                    raft::make_device_vector_view<const idx_t, idx_t>(
+                            indices.data(), n_rows_valid)),
+            raft_knn_index.value()));
+
+    return n_rows_valid;
+}
+
+void RaftIVFPQ::copyInvertedListsFrom(const InvertedLists* ivf) {
+    // A null `ivf` is treated as having no inverted lists to copy
+    size_t nlist = ivf ? ivf->nlist : 0;
+
+    const raft::device_resources& raft_handle =
+            resources_->getRaftHandleCurrentDevice();
+
+    // Host-side staging of the per-list sizes, uploaded to the device below
+    std::vector<uint32_t> list_sizes_(nlist);
+
+    // the index must already exist
+    FAISS_ASSERT(raft_knn_index.has_value());
+
+    auto& raft_lists = raft_knn_index.value().lists();
+
+    // conservative memory alloc for cloning cpu inverted lists
+    raft::neighbors::ivf_pq::list_spec<uint32_t, idx_t> raft_list_spec{
+            static_cast<uint32_t>(bitsPerSubQuantizer_),
+            static_cast<uint32_t>(numSubQuantizers_),
+            true};
+
+    for (size_t i = 0; i < nlist; ++i) {
+        size_t listSize = ivf->list_size(i);
+
+        // GPU index can only support max int entries per list
+        FAISS_THROW_IF_NOT_FMT(
+                listSize <= (size_t)std::numeric_limits<int>::max(),
+                "GPU inverted list can only support "
+                "%zu entries; %zu found",
+                (size_t)std::numeric_limits<int>::max(),
+                listSize);
+
+        // store the list size
+        list_sizes_[i] = static_cast<uint32_t>(listSize);
+
+        // This RAFT list must currently be empty
+        FAISS_ASSERT(getListLength(i) == 0);
+
+        raft::neighbors::ivf::resize_list(
+                raft_handle,
+                raft_lists[i],
+                raft_list_spec,
+                static_cast<uint32_t>(listSize),
+                static_cast<uint32_t>(0));
+    }
+
+    raft::update_device(
+            raft_knn_index.value().list_sizes().data_handle(),
+            list_sizes_.data(),
+            nlist,
+            raft_handle.get_stream());
+
+    // Update the pointers and the sizes
+    raft::neighbors::ivf_pq::helpers::recompute_internal_state(
+            raft_handle, &(raft_knn_index.value()));
+
+    for (size_t i = 0; i < nlist; ++i) {
+        size_t listSize = ivf->list_size(i);
+        addEncodedVectorsToList_(
+                i, ivf->get_codes(i), ivf->get_ids(i), listSize);
+    }
+}
+
+void RaftIVFPQ::setRaftIndex(raft::neighbors::ivf_pq::index<idx_t>&& idx) {
+    raft_knn_index.emplace(std::move(idx));
+    setBasePQCentroids_();
+}
+
+void RaftIVFPQ::addEncodedVectorsToList_(
+        idx_t listId,
+        const void* codes,
+        const idx_t* indices,
+        idx_t numVecs) {
+    auto stream = resources_->getDefaultStreamCurrentDevice();
+    const raft::device_resources& raft_handle =
+            resources_->getRaftHandleCurrentDevice();
+
+    // If there's nothing to add, then there's nothing we have to do
+    if (numVecs == 0) {
+        return;
+    }
+
+    // The GPU might have a different layout of the memory
+    auto gpuListSizeInBytes = getGpuListEncodingSize_(listId);
+
+    // We only have int32 length representations on the GPU per each
+    // list; the length is in sizeof(char)
+    FAISS_ASSERT(gpuListSizeInBytes <= (size_t)std::numeric_limits<int>::max());
+    // Upload and pack the codes in batches of at most maxBatchSize vectors
+    idx_t maxBatchSize = 4096;
+    for (idx_t offset_b = 0; offset_b < numVecs; offset_b += maxBatchSize) {
+        uint32_t batchSize = min(maxBatchSize, numVecs - offset_b);
+        uint32_t bufferSize = getCpuVectorsEncodingSize_(batchSize);
+        uint32_t codesOffset = getCpuVectorsEncodingSize_(offset_b);
+
+        // Stage this batch of flat host codes in a temporary device buffer
+        auto codes_d = raft::make_device_vector<uint8_t>(
+                raft_handle, static_cast<uint32_t>(bufferSize));
+        raft::update_device(
+                codes_d.data_handle(),
+                static_cast<const uint8_t*>(codes) + codesOffset,
+                bufferSize,
+                stream);
+        // Pack the staged codes into list `listId` at row offset offset_b
+        raft::neighbors::ivf_pq::helpers::pack_contiguous_list_data(
+                raft_handle,
+                &(raft_knn_index.value()),
+                codes_d.data_handle(),
+                batchSize,
+                listId,
+                offset_b);
+    }
+
+    /// Handle the indices as well
+    idx_t* list_indices_ptr;
+
+    // fetch the list indices ptr on host
+    raft::update_host(
+            &list_indices_ptr,
+            raft_knn_index.value().inds_ptrs().data_handle() + listId,
+            1,
+            stream);
+    raft_handle.sync_stream();
+
+    raft::update_device(list_indices_ptr, indices, numVecs, stream);
+}
+
+void RaftIVFPQ::setPQCentroids_() {
+    auto stream = resources_->getDefaultStreamCurrentDevice();
+    // Copy pqCentroidsInnermostCode_ into the RAFT index's pq_centers()
+    raft::copy(
+            raft_knn_index.value().pq_centers().data_handle(),
+            pqCentroidsInnermostCode_.data(),
+            pqCentroidsInnermostCode_.numElements(),
+            stream);
+}
+
+void RaftIVFPQ::setBasePQCentroids_() {
+    auto stream = resources_->getDefaultStreamCurrentDevice();
+    // Copy the RAFT index's pq_centers() into pqCentroidsInnermostCode_
+    raft::copy(
+            pqCentroidsInnermostCode_.data(),
+            raft_knn_index.value().pq_centers().data_handle(),
+            raft_knn_index.value().pq_centers().size(),
+            stream);
+    // Rebuild pqCentroidsMiddleCode_ by transposing dimensions 1 and 2
+    DeviceTensor<float, 3, true> pqCentroidsMiddleCode(
+            resources_,
+            makeDevAlloc(AllocType::Quantizer, stream),
+            {numSubQuantizers_, numSubQuantizerCodes_, dimPerSubQuantizer_});
+
+    runTransposeAny(
+            pqCentroidsInnermostCode_, 1, 2, pqCentroidsMiddleCode, stream);
+
+    pqCentroidsMiddleCode_ = std::move(pqCentroidsMiddleCode);
+}
+
+} // namespace gpu
+} // namespace faiss
diff --git a/faiss/gpu/impl/RaftIVFPQ.cuh b/faiss/gpu/impl/RaftIVFPQ.cuh
new file mode 100644
index 0000000000..a79db3c40d
--- /dev/null
+++ b/faiss/gpu/impl/RaftIVFPQ.cuh
@@ -0,0 +1,150 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <faiss/gpu/impl/GpuScalarQuantizer.cuh>
+#include <faiss/gpu/impl/IVFPQ.cuh>
+
+#include <raft/neighbors/ivf_pq.cuh>
+
+#include <optional>
+
+#pragma GCC visibility push(default)
+namespace faiss {
+namespace gpu {
+/// Implementing class for IVFPQ on the GPU, backed by a RAFT IVF-PQ index
+class RaftIVFPQ : public IVFPQ {
+   public:
+    RaftIVFPQ(
+            GpuResources* resources,
+            int dim,
+            idx_t nlist,
+            faiss::MetricType metric,
+            float metricArg,
+            int numSubQuantizers,
+            int bitsPerSubQuantizer,
+            bool useFloat16LookupTables,
+            bool useMMCodeDistance,
+            bool interleavedLayout,
+            float* pqCentroidData,
+            IndicesOptions indicesOptions,
+            MemorySpace space);
+
+    ~RaftIVFPQ() override;
+
+    /// No-op with RAFT (prints a warning): lists cannot be pre-allocated
+    void reserveMemory(idx_t numVecs) override;
+
+    /// Clear out the RAFT index
+    void reset() override;
+
+    /// No-op with RAFT (prints a warning): reclaiming device memory is
+    /// not supported. Always returns 0
+    size_t reclaimMemory() override;
+
+    /// No-op in this class: the RAFT-backed implementation has an empty
+    /// body, so `enable` has no effect
+    void setPrecomputedCodes(Index* coarseQuantizer, bool enable) override;
+
+    /// Find the approximate k nearest neighbors for `queries` against
+    /// our database
+    void search(
+            Index* coarseQuantizer,
+            Tensor<float, 2, true>& queries,
+            int nprobe,
+            int k,
+            Tensor<float, 2, true>& outDistances,
+            Tensor<idx_t, 2, true>& outIndices) override;
+
+    /// Search with preassigned IVF cells (GpuIndexIVF::search_preassigned).
+    /// NOTE: not implemented yet for RAFT; the current body is empty
+    void searchPreassigned(
+            Index* coarseQuantizer,
+            Tensor<float, 2, true>& vecs,
+            Tensor<float, 2, true>& ivfDistances,
+            Tensor<idx_t, 2, true>& ivfAssignments,
+            int k,
+            Tensor<float, 2, true>& outDistances,
+            Tensor<idx_t, 2, true>& outIndices,
+            bool storePairs) override;
+
+    /// Return a list's encoded vectors to the CPU; gpuFormat must be false
+    std::vector<uint8_t> getListVectorData(idx_t listId, bool gpuFormat)
+            const override;
+
+    /// Update our Raft index with this quantizer instance; may be a CPU
+    /// or GPU quantizer
+    void updateQuantizer(Index* quantizer) override;
+
+    /// Copy all inverted lists from a CPU representation to ourselves
+    void copyInvertedListsFrom(const InvertedLists* ivf) override;
+
+    /// Replace the Raft index
+    void setRaftIndex(raft::neighbors::ivf_pq::index<idx_t>&& idx);
+
+    /// Classify and encode/add vectors to our IVF lists.
+    /// The input data must be on our current device.
+    /// Returns the number of vectors successfully added. Vectors may
+    /// not be able to be added because they contain NaNs.
+    idx_t addVectors(
+            Index* coarseQuantizer,
+            Tensor<float, 2, true>& vecs,
+            Tensor<idx_t, 1, true>& indices) override;
+
+    /// For debugging purposes, return the list length of a particular
+    /// list
+    idx_t getListLength(idx_t listId) const override;
+
+    /// Return the list indices of a particular list back to the CPU
+    std::vector<idx_t> getListIndices(idx_t listId) const override;
+
+   private:
+    /// Adds a set of codes and indices to a list, with the representation
+    /// coming from the CPU equivalent
+    void addEncodedVectorsToList_(
+            idx_t listId,
+            // resident on the host
+            const void* codes,
+            // resident on the host
+            const idx_t* indices,
+            idx_t numVecs) override;
+
+    /// Returns the encoding size for a PQ-encoded IVF list
+    size_t getGpuListEncodingSize_(idx_t listId);
+
+    /// Copy the PQ centroids to the Raft index. The data is already in the
+    /// preferred format with the transpose performed by the IVFPQ class helper.
+    void setPQCentroids_();
+
+    /// Update the product quantizer centroids buffer held in the IVFPQ class.
+    /// Used when the RAFT index was updated externally.
+    void setBasePQCentroids_();
+
+    /// optional around the Raft IVF-PQ index
+    std::optional<raft::neighbors::ivf_pq::index<idx_t>> raft_knn_index{
+            std::nullopt};
+};
+
+} // namespace gpu
+} // namespace faiss
+#pragma GCC visibility pop
diff --git a/faiss/gpu/perf/PerfClustering.cpp b/faiss/gpu/perf/PerfClustering.cpp
index 0322f0e490..532557fe20 100644
--- a/faiss/gpu/perf/PerfClustering.cpp
+++ b/faiss/gpu/perf/PerfClustering.cpp
@@ -17,6 +17,7 @@
 #include <vector>
 
 #include <cuda_profiler_api.h>
+#include <faiss/impl/AuxIndexStructures.h>
 
 DEFINE_int32(num, 10000, "# of vecs");
 DEFINE_int32(k, 100, "# of clusters");
@@ -34,6 +35,7 @@ DEFINE_int64(
         "minimum size to use CPU -> GPU paged copies");
 DEFINE_int64(pinned_mem, -1, "pinned memory allocation to use");
 DEFINE_int32(max_points, -1, "max points per centroid");
+DEFINE_double(timeout, 0, "timeout in seconds");
 
 using namespace faiss::gpu;
 
@@ -99,10 +101,14 @@ int main(int argc, char** argv) {
         cp.max_points_per_centroid = FLAGS_max_points;
     }
 
+    auto tc = new faiss::TimeoutCallback();
+    faiss::InterruptCallback::instance.reset(tc);
+
     faiss::Clustering kmeans(FLAGS_dim, FLAGS_k, cp);
 
     // Time k-means
     {
+        tc->set_timeout(FLAGS_timeout);
         CpuTimer timer;
 
         kmeans.train(FLAGS_num, vecs.data(), *(gpuIndex.getIndex()));
diff --git a/faiss/gpu/test/CMakeLists.txt b/faiss/gpu/test/CMakeLists.txt
index 9300deead9..60f78ef74f 100644
--- a/faiss/gpu/test/CMakeLists.txt
+++ b/faiss/gpu/test/CMakeLists.txt
@@ -21,7 +21,6 @@ find_package(CUDAToolkit REQUIRED)
 
 # Defines `gtest_discover_tests()`.
 include(GoogleTest)
-
 add_library(faiss_gpu_test_helper TestUtils.cpp)
 target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest CUDA::cudart $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::compiled>)
 
@@ -42,6 +41,9 @@ faiss_gpu_test(TestGpuIndexIVFPQ.cpp)
 faiss_gpu_test(TestGpuIndexIVFScalarQuantizer.cpp)
 faiss_gpu_test(TestGpuDistance.cu)
 faiss_gpu_test(TestGpuSelect.cu)
+if(FAISS_ENABLE_RAFT)
+  faiss_gpu_test(TestGpuIndexCagra.cu)
+endif()
 
 add_executable(demo_ivfpq_indexing_gpu EXCLUDE_FROM_ALL
   demo_ivfpq_indexing_gpu.cpp)
diff --git a/faiss/gpu/test/TestGpuIndexCagra.cu b/faiss/gpu/test/TestGpuIndexCagra.cu
new file mode 100644
index 0000000000..8d330a81cb
--- /dev/null
+++ b/faiss/gpu/test/TestGpuIndexCagra.cu
@@ -0,0 +1,474 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <faiss/IndexHNSW.h>
+#include <faiss/MetricType.h>
+#include <faiss/gpu/GpuIndexCagra.h>
+#include <faiss/gpu/GpuResources.h>
+#include <faiss/gpu/StandardGpuResources.h>
+#include <faiss/gpu/test/TestUtils.h>
+#include <faiss/utils/distances.h>
+#include <cstddef>
+#include <faiss/gpu/utils/CopyUtils.cuh>
+#include <faiss/gpu/utils/DeviceTensor.cuh>
+#include <optional>
+#include <vector>
+
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/stats/neighborhood_recall.cuh>
+
+// Randomized parameters for one iteration of the CAGRA tests in this file.
+struct Options {
+    Options() {
+        numTrain = 2 * faiss::gpu::randVal(2000, 5000);
+        dim = faiss::gpu::randVal(4, 10);
+        numAdd = faiss::gpu::randVal(1000, 3000);
+        // graph construction settings, forwarded to GpuIndexCagraConfig
+        graphDegree = faiss::gpu::randSelect({32, 64});
+        intermediateGraphDegree = faiss::gpu::randSelect({64, 98});
+        buildAlgo = faiss::gpu::randSelect(
+                {faiss::gpu::graph_build_algo::IVF_PQ,
+                 faiss::gpu::graph_build_algo::NN_DESCENT});
+        // search workload and target device
+        numQuery = faiss::gpu::randVal(32, 100);
+        k = faiss::gpu::randVal(10, 30);
+        device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1);
+    }
+
+    // Space-separated "label value" dump of the parameters (numAdd omitted).
+    std::string toString() const {
+        std::stringstream str;
+        str << "CAGRA device " << device << " numVecs " << numTrain << " dim "
+            << dim << " graphDegree " << graphDegree
+            << " intermediateGraphDegree " << intermediateGraphDegree
+            << " buildAlgo " << static_cast<int>(buildAlgo) << " numQuery "
+            << numQuery << " k " << k;
+        return str.str();
+    }
+
+    int numTrain; // vectors used to build the indexes; "numVecs" in toString()
+    int numAdd; // extra vectors added after the build (copyTo test only)
+    int dim; // vector dimensionality
+    size_t graphDegree; // config.graph_degree; CPU HNSW uses half of it as M
+    size_t intermediateGraphDegree; // config.intermediate_graph_degree
+    faiss::gpu::graph_build_algo buildAlgo; // config.build_algo
+    int numQuery; // number of query vectors
+    int k; // neighbors requested per query
+    int device; // GPU ordinal in [0, getNumDevices() - 1]
+};
+
+// Builds a GPU CAGRA index and a CPU HNSW reference on the same random data;
+// the recall of GPU vs. CPU search results must exceed expected_recall.
+void queryTest(faiss::MetricType metric, double expected_recall) {
+    for (int tries = 0; tries < 5; ++tries) {
+        Options opt;
+        if (opt.buildAlgo == faiss::gpu::graph_build_algo::NN_DESCENT &&
+            metric == faiss::METRIC_INNER_PRODUCT) {
+            continue; // this combination is not exercised
+        }
+
+        std::vector<float> trainVecs =
+                faiss::gpu::randVecs(opt.numTrain, opt.dim);
+        if (metric == faiss::METRIC_INNER_PRODUCT) {
+            faiss::fvec_renorm_L2(opt.numTrain, opt.dim, trainVecs.data());
+        }
+
+        // train cpu index (the reference)
+        faiss::IndexHNSWFlat cpuIndex(opt.dim, opt.graphDegree / 2, metric);
+        cpuIndex.hnsw.efConstruction = opt.k * 2;
+        cpuIndex.add(opt.numTrain, trainVecs.data());
+
+        // train gpu index
+        faiss::gpu::StandardGpuResources res;
+        res.noTempMemory();
+
+        faiss::gpu::GpuIndexCagraConfig config;
+        config.device = opt.device;
+        config.graph_degree = opt.graphDegree;
+        config.intermediate_graph_degree = opt.intermediateGraphDegree;
+        config.build_algo = opt.buildAlgo;
+
+        faiss::gpu::GpuIndexCagra gpuIndex(&res, cpuIndex.d, metric, config);
+        gpuIndex.train(opt.numTrain, trainVecs.data());
+
+        // query the cpu reference
+        auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim);
+        if (metric == faiss::METRIC_INNER_PRODUCT) {
+            faiss::fvec_renorm_L2(opt.numQuery, opt.dim, queryVecs.data());
+        }
+
+        std::vector<float> refDistance(opt.numQuery * opt.k, 0);
+        std::vector<faiss::idx_t> refIndices(opt.numQuery * opt.k, -1);
+        faiss::SearchParametersHNSW cpuSearchParams;
+        cpuSearchParams.efSearch = opt.k * 2;
+        cpuIndex.search(
+                opt.numQuery,
+                queryVecs.data(),
+                opt.k,
+                refDistance.data(),
+                refIndices.data(),
+                &cpuSearchParams);
+
+        // query the gpu index; its results are written to device tensors
+        auto gpuRes = res.getResources();
+        auto devAlloc = faiss::gpu::makeDevAlloc(
+                faiss::gpu::AllocType::FlatData,
+                gpuRes->getDefaultStreamCurrentDevice());
+        faiss::gpu::DeviceTensor<float, 2, true> testDistance(
+                gpuRes.get(), devAlloc, {opt.numQuery, opt.k});
+        faiss::gpu::DeviceTensor<faiss::idx_t, 2, true> testIndices(
+                gpuRes.get(), devAlloc, {opt.numQuery, opt.k});
+        gpuIndex.search(
+                opt.numQuery,
+                queryVecs.data(),
+                opt.k,
+                testDistance.data(),
+                testIndices.data());
+
+        auto refDistanceDev = faiss::gpu::toDeviceTemporary(
+                gpuRes.get(),
+                refDistance,
+                gpuRes->getDefaultStreamCurrentDevice());
+        auto refIndicesDev = faiss::gpu::toDeviceTemporary(
+                gpuRes.get(),
+                refIndices,
+                gpuRes->getDefaultStreamCurrentDevice());
+
+        auto raft_handle = gpuRes->getRaftHandleCurrentDevice();
+        auto ref_dis_mds = raft::make_device_matrix_view<const float, int>(
+                refDistanceDev.data(), opt.numQuery, opt.k);
+        auto ref_dis_mds_opt =
+                std::optional<raft::device_matrix_view<const float, int>>(
+                        ref_dis_mds);
+        auto ref_ind_mds =
+                raft::make_device_matrix_view<const faiss::idx_t, int>(
+                        refIndicesDev.data(), opt.numQuery, opt.k);
+
+        auto test_dis_mds = raft::make_device_matrix_view<const float, int>(
+                testDistance.data(), opt.numQuery, opt.k);
+        auto test_dis_mds_opt =
+                std::optional<raft::device_matrix_view<const float, int>>(
+                        test_dis_mds);
+        auto test_ind_mds =
+                raft::make_device_matrix_view<const faiss::idx_t, int>(
+                        testIndices.data(), opt.numQuery, opt.k);
+
+        double scalar_init = 0;
+        auto recall_score = raft::make_host_scalar(scalar_init);
+        raft::stats::neighborhood_recall(
+                raft_handle,
+                test_ind_mds,
+                ref_ind_mds,
+                recall_score.view(),
+                test_dis_mds_opt,
+                ref_dis_mds_opt);
+        ASSERT_GT(*recall_score.data_handle(), expected_recall)
+                << opt.toString();
+    }
+}
+
+TEST(TestGpuIndexCagra, Float32_Query_L2) {
+    queryTest(faiss::METRIC_L2, 0.98); // recall must exceed 0.98
+}
+
+TEST(TestGpuIndexCagra, Float32_Query_IP) {
+    queryTest(faiss::METRIC_INNER_PRODUCT, 0.98); // recall must exceed 0.98
+}
+
+// Copies a trained GPU CAGRA index to a CPU IndexHNSWCagra; its recall w.r.t.
+// an IndexHNSWFlat built on the same data must exceed expected_recall.
+void copyToTest(
+        faiss::MetricType metric,
+        double expected_recall,
+        bool base_level_only) {
+    for (int tries = 0; tries < 5; ++tries) {
+        Options opt;
+        if (opt.buildAlgo == faiss::gpu::graph_build_algo::NN_DESCENT &&
+            metric == faiss::METRIC_INNER_PRODUCT) {
+            continue; // this combination is not exercised
+        }
+
+        std::vector<float> trainVecs =
+                faiss::gpu::randVecs(opt.numTrain, opt.dim);
+        if (metric == faiss::METRIC_INNER_PRODUCT) {
+            faiss::fvec_renorm_L2(opt.numTrain, opt.dim, trainVecs.data());
+        }
+        std::vector<float> addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
+        if (metric == faiss::METRIC_INNER_PRODUCT) {
+            faiss::fvec_renorm_L2(opt.numAdd, opt.dim, addVecs.data());
+        }
+
+        faiss::gpu::StandardGpuResources res;
+        res.noTempMemory();
+
+        // train gpu index and copy to cpu index
+        faiss::gpu::GpuIndexCagraConfig config;
+        config.device = opt.device;
+        config.graph_degree = opt.graphDegree;
+        config.intermediate_graph_degree = opt.intermediateGraphDegree;
+        config.build_algo = opt.buildAlgo;
+
+        faiss::gpu::GpuIndexCagra gpuIndex(&res, opt.dim, metric, config);
+        gpuIndex.train(opt.numTrain, trainVecs.data());
+
+        faiss::IndexHNSWCagra copiedCpuIndex(
+                opt.dim, opt.graphDegree / 2, metric);
+        copiedCpuIndex.base_level_only = base_level_only;
+        gpuIndex.copyTo(&copiedCpuIndex);
+        copiedCpuIndex.hnsw.efConstruction = opt.k * 2;
+
+        // add more vecs to copied cpu index (skipped when base_level_only)
+        if (!base_level_only) {
+            copiedCpuIndex.add(opt.numAdd, addVecs.data());
+        }
+
+        // train cpu index (the reference)
+        faiss::IndexHNSWFlat cpuIndex(opt.dim, opt.graphDegree / 2, metric);
+        cpuIndex.hnsw.efConstruction = opt.k * 2;
+        cpuIndex.add(opt.numTrain, trainVecs.data());
+
+        // add more vecs to cpu index (skipped when base_level_only)
+        if (!base_level_only) {
+            cpuIndex.add(opt.numAdd, addVecs.data());
+        }
+
+        // query indexes
+        auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim);
+        if (metric == faiss::METRIC_INNER_PRODUCT) {
+            faiss::fvec_renorm_L2(opt.numQuery, opt.dim, queryVecs.data());
+        }
+
+        std::vector<float> refDistance(opt.numQuery * opt.k, 0);
+        std::vector<faiss::idx_t> refIndices(opt.numQuery * opt.k, -1);
+        faiss::SearchParametersHNSW cpuSearchParams;
+        cpuSearchParams.efSearch = opt.k * 2;
+        cpuIndex.search(
+                opt.numQuery,
+                queryVecs.data(),
+                opt.k,
+                refDistance.data(),
+                refIndices.data(),
+                &cpuSearchParams);
+
+        std::vector<float> copyRefDistance(opt.numQuery * opt.k, 0);
+        std::vector<faiss::idx_t> copyRefIndices(opt.numQuery * opt.k, -1);
+        faiss::SearchParametersHNSW cpuSearchParamstwo;
+        cpuSearchParamstwo.efSearch = opt.k * 2;
+        copiedCpuIndex.search(
+                opt.numQuery,
+                queryVecs.data(),
+                opt.k,
+                copyRefDistance.data(),
+                copyRefIndices.data(),
+                &cpuSearchParamstwo);
+
+        // test quality of search: both result sets are copied to the device
+        auto gpuRes = res.getResources();
+        auto refDistanceDev = faiss::gpu::toDeviceTemporary(
+                gpuRes.get(),
+                refDistance,
+                gpuRes->getDefaultStreamCurrentDevice());
+        auto refIndicesDev = faiss::gpu::toDeviceTemporary(
+                gpuRes.get(),
+                refIndices,
+                gpuRes->getDefaultStreamCurrentDevice());
+
+        auto copyRefDistanceDev = faiss::gpu::toDeviceTemporary(
+                gpuRes.get(),
+                copyRefDistance,
+                gpuRes->getDefaultStreamCurrentDevice());
+        auto copyRefIndicesDev = faiss::gpu::toDeviceTemporary(
+                gpuRes.get(),
+                copyRefIndices,
+                gpuRes->getDefaultStreamCurrentDevice());
+
+        auto raft_handle = gpuRes->getRaftHandleCurrentDevice();
+        auto ref_dis_mds = raft::make_device_matrix_view<const float, int>(
+                refDistanceDev.data(), opt.numQuery, opt.k);
+        auto ref_dis_mds_opt =
+                std::optional<raft::device_matrix_view<const float, int>>(
+                        ref_dis_mds);
+        auto ref_ind_mds =
+                raft::make_device_matrix_view<const faiss::idx_t, int>(
+                        refIndicesDev.data(), opt.numQuery, opt.k);
+
+        auto copy_ref_dis_mds = raft::make_device_matrix_view<const float, int>(
+                copyRefDistanceDev.data(), opt.numQuery, opt.k);
+        auto copy_ref_dis_mds_opt =
+                std::optional<raft::device_matrix_view<const float, int>>(
+                        copy_ref_dis_mds);
+        auto copy_ref_ind_mds =
+                raft::make_device_matrix_view<const faiss::idx_t, int>(
+                        copyRefIndicesDev.data(), opt.numQuery, opt.k);
+
+        double scalar_init = 0;
+        auto recall_score = raft::make_host_scalar(scalar_init);
+        raft::stats::neighborhood_recall(
+                raft_handle,
+                copy_ref_ind_mds,
+                ref_ind_mds,
+                recall_score.view(),
+                copy_ref_dis_mds_opt,
+                ref_dis_mds_opt);
+        ASSERT_GT(*recall_score.data_handle(), expected_recall)
+                << opt.toString();
+    }
+}
+
+TEST(TestGpuIndexCagra, Float32_CopyTo_L2) {
+    copyToTest(faiss::METRIC_L2, 0.98, false); // base_level_only = false
+}
+
+TEST(TestGpuIndexCagra, Float32_CopyTo_L2_BaseLevelOnly) {
+    copyToTest(faiss::METRIC_L2, 0.98, true); // base_level_only = true
+}
+
+TEST(TestGpuIndexCagra, Float32_CopyTo_IP) {
+    copyToTest(faiss::METRIC_INNER_PRODUCT, 0.98, false); // not base-level-only
+}
+
+TEST(TestGpuIndexCagra, Float32_CopyTo_IP_BaseLevelOnly) {
+    copyToTest(faiss::METRIC_INNER_PRODUCT, 0.98, true); // base_level_only
+}
+
+// Copies a CPU IndexHNSWCagra into a GPU CAGRA index; its recall w.r.t. a GPU
+// CAGRA index trained directly on the same data must exceed expected_recall.
+void copyFromTest(faiss::MetricType metric, double expected_recall) {
+    for (int tries = 0; tries < 5; ++tries) {
+        Options opt;
+        if (opt.buildAlgo == faiss::gpu::graph_build_algo::NN_DESCENT &&
+            metric == faiss::METRIC_INNER_PRODUCT) {
+            continue; // this combination is not exercised
+        }
+
+        std::vector<float> trainVecs =
+                faiss::gpu::randVecs(opt.numTrain, opt.dim);
+        if (metric == faiss::METRIC_INNER_PRODUCT) {
+            faiss::fvec_renorm_L2(opt.numTrain, opt.dim, trainVecs.data());
+        }
+
+        // train cpu index
+        faiss::IndexHNSWCagra cpuIndex(opt.dim, opt.graphDegree / 2, metric);
+        cpuIndex.hnsw.efConstruction = opt.k * 2;
+        cpuIndex.add(opt.numTrain, trainVecs.data());
+
+        faiss::gpu::StandardGpuResources res;
+        res.noTempMemory();
+
+        // convert to gpu index (default config; opt.device is not applied)
+        faiss::gpu::GpuIndexCagra copiedGpuIndex(&res, cpuIndex.d, metric);
+        copiedGpuIndex.copyFrom(&cpuIndex);
+
+        // train gpu index (the reference)
+        faiss::gpu::GpuIndexCagraConfig config;
+        config.device = opt.device;
+        config.graph_degree = opt.graphDegree;
+        config.intermediate_graph_degree = opt.intermediateGraphDegree;
+        config.build_algo = opt.buildAlgo;
+
+        faiss::gpu::GpuIndexCagra gpuIndex(&res, opt.dim, metric, config);
+        gpuIndex.train(opt.numTrain, trainVecs.data());
+
+        // query both gpu indexes; results are written to device tensors
+        auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim);
+        if (metric == faiss::METRIC_INNER_PRODUCT) {
+            faiss::fvec_renorm_L2(opt.numQuery, opt.dim, queryVecs.data());
+        }
+
+        auto gpuRes = res.getResources();
+        auto devAlloc = faiss::gpu::makeDevAlloc(
+                faiss::gpu::AllocType::FlatData,
+                gpuRes->getDefaultStreamCurrentDevice());
+        faiss::gpu::DeviceTensor<float, 2, true> copyTestDistance(
+                gpuRes.get(), devAlloc, {opt.numQuery, opt.k});
+        faiss::gpu::DeviceTensor<faiss::idx_t, 2, true> copyTestIndices(
+                gpuRes.get(), devAlloc, {opt.numQuery, opt.k});
+        copiedGpuIndex.search(
+                opt.numQuery,
+                queryVecs.data(),
+                opt.k,
+                copyTestDistance.data(),
+                copyTestIndices.data());
+
+        faiss::gpu::DeviceTensor<float, 2, true> testDistance(
+                gpuRes.get(), devAlloc, {opt.numQuery, opt.k});
+        faiss::gpu::DeviceTensor<faiss::idx_t, 2, true> testIndices(
+                gpuRes.get(), devAlloc, {opt.numQuery, opt.k});
+        gpuIndex.search(
+                opt.numQuery,
+                queryVecs.data(),
+                opt.k,
+                testDistance.data(),
+                testIndices.data());
+
+        // test quality of searches
+        auto raft_handle = gpuRes->getRaftHandleCurrentDevice();
+        auto test_dis_mds = raft::make_device_matrix_view<const float, int>(
+                testDistance.data(), opt.numQuery, opt.k);
+        auto test_dis_mds_opt =
+                std::optional<raft::device_matrix_view<const float, int>>(
+                        test_dis_mds);
+        auto test_ind_mds =
+                raft::make_device_matrix_view<const faiss::idx_t, int>(
+                        testIndices.data(), opt.numQuery, opt.k);
+
+        auto copy_test_dis_mds =
+                raft::make_device_matrix_view<const float, int>(
+                        copyTestDistance.data(), opt.numQuery, opt.k);
+        auto copy_test_dis_mds_opt =
+                std::optional<raft::device_matrix_view<const float, int>>(
+                        copy_test_dis_mds);
+        auto copy_test_ind_mds =
+                raft::make_device_matrix_view<const faiss::idx_t, int>(
+                        copyTestIndices.data(), opt.numQuery, opt.k);
+
+        double scalar_init = 0;
+        auto recall_score = raft::make_host_scalar(scalar_init);
+
+        raft::stats::neighborhood_recall(
+                raft_handle,
+                copy_test_ind_mds,
+                test_ind_mds,
+                recall_score.view(),
+                copy_test_dis_mds_opt,
+                test_dis_mds_opt);
+        ASSERT_GT(*recall_score.data_handle(), expected_recall)
+                << opt.toString();
+    }
+}
+
+TEST(TestGpuIndexCagra, Float32_CopyFrom_L2) {
+    copyFromTest(faiss::METRIC_L2, 0.98); // recall must exceed 0.98
+}
+
+TEST(TestGpuIndexCagra, Float32_CopyFrom_IP) {
+    copyFromTest(faiss::METRIC_INNER_PRODUCT, 0.98); // recall must exceed 0.98
+}
+
+int main(int argc, char** argv) {
+    testing::InitGoogleTest(&argc, argv);
+
+    // seed the test utilities with the same fixed value (100) on every run
+    faiss::gpu::setTestSeed(100);
+
+    return RUN_ALL_TESTS();
+}
diff --git a/faiss/gpu/test/TestGpuIndexFlat.cpp b/faiss/gpu/test/TestGpuIndexFlat.cpp
index 6d9c83e547..06b860ded4 100644
--- a/faiss/gpu/test/TestGpuIndexFlat.cpp
+++ b/faiss/gpu/test/TestGpuIndexFlat.cpp
@@ -268,6 +268,7 @@ TEST(TestGpuIndexFlat, QueryEmpty) {
     faiss::gpu::GpuIndexFlatConfig config;
     config.device = 0;
     config.useFloat16 = false;
+    config.use_raft = false;
     int dim = 128;
     faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config);
 
@@ -766,4 +767,4 @@ int main(int argc, char** argv) {
     faiss::gpu::setTestSeed(100);
 
     return RUN_ALL_TESTS();
-}
\ No newline at end of file
+}
diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp
index 9fb88e2687..28eefec308 100644
--- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp
+++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp
@@ -23,6 +23,7 @@
 #include <faiss/IndexFlat.h>
 #include <faiss/IndexIVFFlat.h>
 #include <faiss/gpu/GpuIndexIVFFlat.h>
+#include <faiss/gpu/GpuIndicesOptions.h>
 #include <faiss/gpu/StandardGpuResources.h>
 #include <faiss/gpu/test/TestUtils.h>
 #include <faiss/gpu/utils/DeviceUtils.h>
@@ -30,7 +31,6 @@
 #include <cmath>
 #include <sstream>
 #include <vector>
-#include "faiss/gpu/GpuIndicesOptions.h"
 
 // FIXME: figure out a better way to test fp16
 constexpr float kF16MaxRelErr = 0.3f;
@@ -57,7 +57,7 @@ struct Options {
 
         device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1);
 
-        use_raft = false;
+        useRaft = false;
     }
 
     std::string toString() const {
@@ -65,7 +65,7 @@ struct Options {
         str << "IVFFlat device " << device << " numVecs " << numAdd << " dim "
             << dim << " numCentroids " << numCentroids << " nprobe " << nprobe
             << " numQuery " << numQuery << " k " << k << " indicesOpt "
-            << indicesOpt << " use_raft " << use_raft;
+            << indicesOpt << " useRaft " << useRaft;
 
         return str.str();
     }
@@ -79,7 +79,7 @@ struct Options {
     int k;
     int device;
     faiss::gpu::IndicesOptions indicesOpt;
-    bool use_raft;
+    bool useRaft;
 };
 
 void queryTest(
@@ -110,7 +110,7 @@ void queryTest(
         config.device = opt.device;
         config.indicesOptions = opt.indicesOpt;
         config.flatConfig.useFloat16 = useFloat16CoarseQuantizer;
-        config.use_raft = opt.use_raft;
+        config.use_raft = opt.useRaft;
 
         faiss::gpu::GpuIndexIVFFlat gpuIndex(
                 &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config);
@@ -137,7 +137,7 @@ void queryTest(
 void addTest(
         faiss::MetricType metricType,
         bool useFloat16CoarseQuantizer,
-        bool use_raft) {
+        bool useRaft) {
     for (int tries = 0; tries < 2; ++tries) {
         Options opt;
 
@@ -162,9 +162,9 @@ void addTest(
         faiss::gpu::GpuIndexIVFFlatConfig config;
         config.device = opt.device;
         config.indicesOptions =
-                use_raft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt;
+                useRaft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt;
         config.flatConfig.useFloat16 = useFloat16CoarseQuantizer;
-        config.use_raft = use_raft;
+        config.use_raft = useRaft;
 
         faiss::gpu::GpuIndexIVFFlat gpuIndex(
                 &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config);
@@ -188,7 +188,7 @@ void addTest(
     }
 }
 
-void copyToTest(bool useFloat16CoarseQuantizer, bool use_raft) {
+void copyToTest(bool useFloat16CoarseQuantizer, bool useRaft) {
     Options opt;
     std::vector<float> trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim);
     std::vector<float> addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
@@ -199,9 +199,9 @@ void copyToTest(bool useFloat16CoarseQuantizer, bool use_raft) {
     faiss::gpu::GpuIndexIVFFlatConfig config;
     config.device = opt.device;
     config.indicesOptions =
-            use_raft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt;
+            useRaft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt;
     config.flatConfig.useFloat16 = useFloat16CoarseQuantizer;
-    config.use_raft = use_raft;
+    config.use_raft = useRaft;
 
     faiss::gpu::GpuIndexIVFFlat gpuIndex(
             &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config);
@@ -241,7 +241,7 @@ void copyToTest(bool useFloat16CoarseQuantizer, bool use_raft) {
             compFloat16 ? 0.30f : 0.015f);
 }
 
-void copyFromTest(bool useFloat16CoarseQuantizer, bool use_raft) {
+void copyFromTest(bool useFloat16CoarseQuantizer, bool useRaft) {
     Options opt;
     std::vector<float> trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim);
     std::vector<float> addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
@@ -260,9 +260,9 @@ void copyFromTest(bool useFloat16CoarseQuantizer, bool use_raft) {
     faiss::gpu::GpuIndexIVFFlatConfig config;
     config.device = opt.device;
     config.indicesOptions =
-            use_raft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt;
+            useRaft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt;
     config.flatConfig.useFloat16 = useFloat16CoarseQuantizer;
-    config.use_raft = use_raft;
+    config.use_raft = useRaft;
 
     faiss::gpu::GpuIndexIVFFlat gpuIndex(&res, 1, 1, faiss::METRIC_L2, config);
     gpuIndex.nprobe = 1;
@@ -334,7 +334,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_L2) {
     queryTest(opt, faiss::METRIC_L2, false);
 
 #if defined USE_NVIDIA_RAFT
-    opt.use_raft = true;
+    opt.useRaft = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_L2, false);
 #endif
@@ -345,7 +345,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_IP) {
     queryTest(opt, faiss::METRIC_INNER_PRODUCT, false);
 
 #if defined USE_NVIDIA_RAFT
-    opt.use_raft = true;
+    opt.useRaft = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_INNER_PRODUCT, false);
 #endif
@@ -358,7 +358,7 @@ TEST(TestGpuIndexIVFFlat, LargeBatch) {
     queryTest(opt, faiss::METRIC_L2, false);
 
 #if defined USE_NVIDIA_RAFT
-    opt.use_raft = true;
+    opt.useRaft = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_L2, false);
 #endif
@@ -371,7 +371,7 @@ TEST(TestGpuIndexIVFFlat, Float16_32_Query_L2) {
     queryTest(opt, faiss::METRIC_L2, true);
 
 #if defined USE_NVIDIA_RAFT
-    opt.use_raft = true;
+    opt.useRaft = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_L2, true);
 #endif
@@ -382,7 +382,7 @@ TEST(TestGpuIndexIVFFlat, Float16_32_Query_IP) {
     queryTest(opt, faiss::METRIC_INNER_PRODUCT, true);
 
 #if defined USE_NVIDIA_RAFT
-    opt.use_raft = true;
+    opt.useRaft = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_INNER_PRODUCT, true);
 #endif
@@ -399,7 +399,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_L2_64) {
     queryTest(opt, faiss::METRIC_L2, false);
 
 #if defined USE_NVIDIA_RAFT
-    opt.use_raft = true;
+    opt.useRaft = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_L2, false);
 #endif
@@ -411,7 +411,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_IP_64) {
     queryTest(opt, faiss::METRIC_INNER_PRODUCT, false);
 
 #if defined USE_NVIDIA_RAFT
-    opt.use_raft = true;
+    opt.useRaft = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_INNER_PRODUCT, false);
 #endif
@@ -423,7 +423,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_L2_128) {
     queryTest(opt, faiss::METRIC_L2, false);
 
 #if defined USE_NVIDIA_RAFT
-    opt.use_raft = true;
+    opt.useRaft = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_L2, false);
 #endif
@@ -435,7 +435,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_IP_128) {
     queryTest(opt, faiss::METRIC_INNER_PRODUCT, false);
 
 #if defined USE_NVIDIA_RAFT
-    opt.use_raft = true;
+    opt.useRaft = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_INNER_PRODUCT, false);
 #endif
@@ -499,6 +499,7 @@ TEST(TestGpuIndexIVFFlat, Float32_negative) {
     faiss::gpu::GpuIndexIVFFlatConfig config;
     config.device = opt.device;
     config.indicesOptions = opt.indicesOpt;
+    config.use_raft = false;
 
     faiss::gpu::GpuIndexIVFFlat gpuIndex(
             &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config);
@@ -571,6 +572,7 @@ TEST(TestGpuIndexIVFFlat, QueryNaN) {
     config.device = opt.device;
     config.indicesOptions = opt.indicesOpt;
     config.flatConfig.useFloat16 = faiss::gpu::randBool();
+    config.use_raft = false;
 
     faiss::gpu::GpuIndexIVFFlat gpuIndex(
             &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config);
@@ -639,6 +641,7 @@ TEST(TestGpuIndexIVFFlat, AddNaN) {
     config.device = opt.device;
     config.indicesOptions = opt.indicesOpt;
     config.flatConfig.useFloat16 = faiss::gpu::randBool();
+    config.use_raft = false;
     faiss::gpu::GpuIndexIVFFlat gpuIndex(
             &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config);
     gpuIndex.nprobe = opt.nprobe;
@@ -720,6 +723,7 @@ TEST(TestGpuIndexIVFFlat, UnifiedMemory) {
     faiss::gpu::GpuIndexIVFFlatConfig config;
     config.device = device;
     config.memorySpace = faiss::gpu::MemorySpace::Unified;
+    config.use_raft = false;
 
     faiss::gpu::GpuIndexIVFFlat gpuIndex(
             &res, dim, numCentroids, faiss::METRIC_L2, config);
@@ -797,6 +801,7 @@ TEST(TestGpuIndexIVFFlat, LongIVFList) {
 
     faiss::gpu::GpuIndexIVFFlatConfig config;
     config.device = device;
+    config.use_raft = false;
 
     faiss::gpu::GpuIndexIVFFlat gpuIndex(
             &res, dim, numCentroids, faiss::METRIC_L2, config);
@@ -837,6 +842,71 @@ TEST(TestGpuIndexIVFFlat, LongIVFList) {
 #endif
 }
 
+TEST(TestGpuIndexIVFFlat, Reconstruct_n) {
+    // reconstruct_n() on a non-RAFT GpuIndexIVFFlat must return exactly what the
+    // CPU IndexIVFFlat does, with INDICES_64_BIT, INDICES_32_BIT, INDICES_CPU.
+    Options opt;
+
+    std::vector<float> trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim);
+    std::vector<float> addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
+
+    faiss::IndexFlatL2 cpuQuantizer(opt.dim);
+    faiss::IndexIVFFlat cpuIndex(
+            &cpuQuantizer, opt.dim, opt.numCentroids, faiss::METRIC_L2);
+    cpuIndex.nprobe = opt.nprobe;
+    cpuIndex.train(opt.numTrain, trainVecs.data());
+    cpuIndex.add(opt.numAdd, addVecs.data());
+
+    faiss::gpu::StandardGpuResources res;
+    res.noTempMemory();
+
+    faiss::gpu::GpuIndexIVFFlatConfig config;
+    config.device = opt.device;
+    config.indicesOptions = faiss::gpu::INDICES_64_BIT;
+    config.use_raft = false;
+
+    faiss::gpu::GpuIndexIVFFlat gpuIndex(
+            &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config);
+    gpuIndex.nprobe = opt.nprobe;
+
+    gpuIndex.train(opt.numTrain, trainVecs.data());
+    gpuIndex.add(opt.numAdd, addVecs.data());
+
+    std::vector<float> gpuVals(opt.numAdd * opt.dim);
+    gpuIndex.reconstruct_n(0, gpuIndex.ntotal, gpuVals.data());
+
+    std::vector<float> cpuVals(opt.numAdd * opt.dim);
+    cpuIndex.reconstruct_n(0, cpuIndex.ntotal, cpuVals.data());
+
+    EXPECT_EQ(gpuVals, cpuVals) << "INDICES_64_BIT";
+
+    config.indicesOptions = faiss::gpu::INDICES_32_BIT;
+
+    faiss::gpu::GpuIndexIVFFlat gpuIndex1(
+            &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config);
+    gpuIndex1.nprobe = opt.nprobe;
+
+    gpuIndex1.train(opt.numTrain, trainVecs.data());
+    gpuIndex1.add(opt.numAdd, addVecs.data());
+    // reset the shared buffer: stale values from the last run must not pass
+    gpuVals.assign(gpuVals.size(), 0.0f);
+    gpuIndex1.reconstruct_n(0, gpuIndex1.ntotal, gpuVals.data());
+    EXPECT_EQ(gpuVals, cpuVals) << "INDICES_32_BIT";
+
+    config.indicesOptions = faiss::gpu::INDICES_CPU;
+
+    faiss::gpu::GpuIndexIVFFlat gpuIndex2(
+            &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config);
+    gpuIndex2.nprobe = opt.nprobe;
+
+    gpuIndex2.train(opt.numTrain, trainVecs.data());
+    gpuIndex2.add(opt.numAdd, addVecs.data());
+    // reset the shared buffer: stale values from the last run must not pass
+    gpuVals.assign(gpuVals.size(), 0.0f);
+    gpuIndex2.reconstruct_n(0, gpuIndex2.ntotal, gpuVals.data());
+    EXPECT_EQ(gpuVals, cpuVals) << "INDICES_CPU";
+}
+
 int main(int argc, char** argv) {
     testing::InitGoogleTest(&argc, argv);
 
@@ -844,4 +914,4 @@ int main(int argc, char** argv) {
     faiss::gpu::setTestSeed(100);
 
     return RUN_ALL_TESTS();
-}
\ No newline at end of file
+}
diff --git a/faiss/gpu/test/TestGpuIndexIVFPQ.cpp b/faiss/gpu/test/TestGpuIndexIVFPQ.cpp
index 1bdef31914..9cc52bc788 100644
--- a/faiss/gpu/test/TestGpuIndexIVFPQ.cpp
+++ b/faiss/gpu/test/TestGpuIndexIVFPQ.cpp
@@ -35,6 +35,22 @@ void pickEncoding(int& codes, int& dim) {
     }
 }
 
+void pickRaftEncoding(int& codes, int& dim, int bitsPerCode) {
+    // Above 32 doesn't work with no precomputed codes
+    const std::vector<int> subDims{4, 8, 10, 12, 16, 20, 24, 28, 32};
+    const int lastIdx = subDims.size() - 1;
+
+    // Redraw until dim lies in [64, 256) and codes * bitsPerCode is a
+    // multiple of 8; for such a small test, super-low or high dim is more
+    // likely to generate comparison errors
+    bool accepted = false;
+    do {
+        codes = faiss::gpu::randVal(0, 96);
+        dim = codes * subDims[faiss::gpu::randVal(0, lastIdx)];
+        accepted = dim >= 64 && dim < 256 && (codes * bitsPerCode) % 8 == 0;
+    } while (!accepted);
+}
+
 struct Options {
     Options() {
         numAdd = faiss::gpu::randVal(2000, 5000);
@@ -43,9 +59,10 @@ struct Options {
 
         pickEncoding(codes, dim);
 
-        // TODO: Change back to `faiss::gpu::randVal(3, 7)` when we officially
-        //   support non-multiple of 8 subcodes for IVFPQ.
+        // TODO: Change back to `faiss::gpu::randVal(3, 7)` when we
+        // officially support non-multiple of 8 subcodes for IVFPQ.
         bitsPerCode = 8;
+
         nprobe = std::min(faiss::gpu::randVal(40, 1000), numCentroids);
         numQuery = faiss::gpu::randVal(4, 8);
 
@@ -66,6 +83,9 @@ struct Options {
         }
 
         device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1);
+
+        interleavedLayout = false;
+        useRaft = false;
     }
 
     std::string toString() const {
@@ -105,50 +125,66 @@ struct Options {
     faiss::gpu::IndicesOptions indicesOpt;
     bool useFloat16;
     int device;
+    bool interleavedLayout;
+    bool useRaft;
 };
 
-TEST(TestGpuIndexIVFPQ, Query_L2) {
-    for (int tries = 0; tries < 2; ++tries) {
-        Options opt;
+void queryTest(Options opt, faiss::MetricType metricType) {
+    std::vector<float> trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim);
+    std::vector<float> addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
 
-        std::vector<float> trainVecs =
-                faiss::gpu::randVecs(opt.numTrain, opt.dim);
-        std::vector<float> addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
+    faiss::IndexFlatL2 coarseQuantizerL2(opt.dim);
+    faiss::IndexFlatIP coarseQuantizerIP(opt.dim);
+    faiss::Index* quantizer = metricType == faiss::METRIC_L2
+            ? (faiss::Index*)&coarseQuantizerL2
+            : (faiss::Index*)&coarseQuantizerIP;
 
-        faiss::IndexFlatL2 coarseQuantizer(opt.dim);
-        faiss::IndexIVFPQ cpuIndex(
-                &coarseQuantizer,
-                opt.dim,
-                opt.numCentroids,
-                opt.codes,
-                opt.bitsPerCode);
-        cpuIndex.nprobe = opt.nprobe;
-        cpuIndex.train(opt.numTrain, trainVecs.data());
-        cpuIndex.add(opt.numAdd, addVecs.data());
+    faiss::IndexIVFPQ cpuIndex(
+            quantizer, opt.dim, opt.numCentroids, opt.codes, opt.bitsPerCode);
+    cpuIndex.metric_type = metricType;
+    cpuIndex.nprobe = opt.nprobe;
+    cpuIndex.train(opt.numTrain, trainVecs.data());
+    cpuIndex.add(opt.numAdd, addVecs.data());
 
-        // Use the default temporary memory management to test the memory
-        // manager
-        faiss::gpu::StandardGpuResources res;
+    // Use the default temporary memory management to test the memory
+    // manager
+    faiss::gpu::StandardGpuResources res;
 
-        faiss::gpu::GpuIndexIVFPQConfig config;
-        config.device = opt.device;
-        config.usePrecomputedTables = (tries % 2 == 0);
-        config.indicesOptions = opt.indicesOpt;
-        config.useFloat16LookupTables = opt.useFloat16;
+    faiss::gpu::GpuIndexIVFPQConfig config;
+    config.device = opt.device;
+    config.usePrecomputedTables = opt.usePrecomputed;
+    config.indicesOptions = opt.indicesOpt;
+    config.useFloat16LookupTables = opt.useFloat16;
+    config.interleavedLayout = opt.interleavedLayout;
+    config.use_raft = opt.useRaft;
 
-        faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config);
-        gpuIndex.nprobe = opt.nprobe;
+    faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config);
+    gpuIndex.nprobe = opt.nprobe;
 
-        faiss::gpu::compareIndices(
-                cpuIndex,
-                gpuIndex,
-                opt.numQuery,
-                opt.dim,
-                opt.k,
-                opt.toString(),
-                opt.getCompareEpsilon(),
-                opt.getPctMaxDiff1(),
-                opt.getPctMaxDiffN());
+    faiss::gpu::compareIndices(
+            cpuIndex,
+            gpuIndex,
+            opt.numQuery,
+            opt.dim,
+            opt.k,
+            opt.toString(),
+            opt.getCompareEpsilon(),
+            opt.getPctMaxDiff1(),
+            opt.getPctMaxDiffN());
+}
+
+TEST(TestGpuIndexIVFPQ, Query_L2) {
+    for (int tries = 0; tries < 2; ++tries) {
+        Options opt;
+        opt.usePrecomputed = (tries % 2 == 0);
+        queryTest(opt, faiss::MetricType::METRIC_L2);
+    }
+}
+
+TEST(TestGpuIndexIVFPQ, Query_IP) {
+    for (int tries = 0; tries < 2; ++tries) {
+        Options opt;
+        queryTest(opt, faiss::MetricType::METRIC_INNER_PRODUCT);
     }
 }
 
@@ -161,45 +197,10 @@ TEST(TestGpuIndexIVFPQ, LargeBatch) {
         opt.dim = 4;
         opt.numQuery = 100000;
         opt.codes = 2;
+        opt.usePrecomputed = usePrecomputed;
+        opt.useFloat16 = false;
 
-        std::vector<float> trainVecs =
-                faiss::gpu::randVecs(opt.numTrain, opt.dim);
-        std::vector<float> addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
-
-        faiss::IndexFlatL2 coarseQuantizer(opt.dim);
-        faiss::IndexIVFPQ cpuIndex(
-                &coarseQuantizer,
-                opt.dim,
-                opt.numCentroids,
-                opt.codes,
-                opt.bitsPerCode);
-        cpuIndex.nprobe = opt.nprobe;
-        cpuIndex.train(opt.numTrain, trainVecs.data());
-        cpuIndex.add(opt.numAdd, addVecs.data());
-
-        // Use the default temporary memory management to test the memory
-        // manager
-        faiss::gpu::StandardGpuResources res;
-
-        faiss::gpu::GpuIndexIVFPQConfig config;
-        config.device = opt.device;
-        config.usePrecomputedTables = usePrecomputed;
-        config.indicesOptions = opt.indicesOpt;
-        config.useFloat16LookupTables = false;
-
-        faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config);
-        gpuIndex.nprobe = opt.nprobe;
-
-        faiss::gpu::compareIndices(
-                cpuIndex,
-                gpuIndex,
-                opt.numQuery,
-                opt.dim,
-                opt.k,
-                opt.toString(),
-                opt.getCompareEpsilon(),
-                opt.getPctMaxDiff1(),
-                opt.getPctMaxDiffN());
+        queryTest(opt, faiss::MetricType::METRIC_L2);
     }
 }
 
@@ -234,6 +235,7 @@ void testMMCodeDistance(faiss::MetricType mt) {
         config.usePrecomputedTables = false;
         config.useMMCodeDistance = true;
         config.indicesOptions = opt.indicesOpt;
+        config.use_raft = false;
 
         // Make sure that the float16 version works as well
         config.useFloat16LookupTables = (tries % 2 == 0);
@@ -284,6 +286,7 @@ void testMMCodeDistance(faiss::MetricType mt) {
         config.device = opt.device;
         config.usePrecomputedTables = false;
         config.indicesOptions = opt.indicesOpt;
+        config.use_raft = false;
 
         // Make sure that the float16 version works as well
         config.useFloat16LookupTables = (dimPerSubQ == 7);
@@ -312,53 +315,6 @@ TEST(TestGpuIndexIVFPQ, Query_IP_MMCodeDistance) {
     testMMCodeDistance(faiss::MetricType::METRIC_INNER_PRODUCT);
 }
 
-TEST(TestGpuIndexIVFPQ, Query_IP) {
-    for (int tries = 0; tries < 2; ++tries) {
-        Options opt;
-
-        std::vector<float> trainVecs =
-                faiss::gpu::randVecs(opt.numTrain, opt.dim);
-        std::vector<float> addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
-
-        faiss::IndexFlatIP coarseQuantizer(opt.dim);
-        faiss::IndexIVFPQ cpuIndex(
-                &coarseQuantizer,
-                opt.dim,
-                opt.numCentroids,
-                opt.codes,
-                opt.bitsPerCode);
-        cpuIndex.metric_type = faiss::MetricType::METRIC_INNER_PRODUCT;
-
-        cpuIndex.nprobe = opt.nprobe;
-        cpuIndex.train(opt.numTrain, trainVecs.data());
-        cpuIndex.add(opt.numAdd, addVecs.data());
-
-        // Use the default temporary memory management to test the memory
-        // manager
-        faiss::gpu::StandardGpuResources res;
-
-        faiss::gpu::GpuIndexIVFPQConfig config;
-        config.device = opt.device;
-        config.usePrecomputedTables = false; // not supported/required for IP
-        config.indicesOptions = opt.indicesOpt;
-        config.useFloat16LookupTables = opt.useFloat16;
-
-        faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config);
-        gpuIndex.nprobe = opt.nprobe;
-
-        faiss::gpu::compareIndices(
-                cpuIndex,
-                gpuIndex,
-                opt.numQuery,
-                opt.dim,
-                opt.k,
-                opt.toString(),
-                opt.getCompareEpsilon(),
-                opt.getPctMaxDiff1(),
-                opt.getPctMaxDiffN());
-    }
-}
-
 TEST(TestGpuIndexIVFPQ, Float16Coarse) {
     Options opt;
 
@@ -384,6 +340,7 @@ TEST(TestGpuIndexIVFPQ, Float16Coarse) {
     config.usePrecomputedTables = opt.usePrecomputed;
     config.indicesOptions = opt.indicesOpt;
     config.useFloat16LookupTables = opt.useFloat16;
+    config.use_raft = false;
 
     faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config);
     gpuIndex.nprobe = opt.nprobe;
@@ -403,104 +360,68 @@ TEST(TestGpuIndexIVFPQ, Float16Coarse) {
             opt.getPctMaxDiffN());
 }
 
-TEST(TestGpuIndexIVFPQ, Add_L2) {
-    for (int tries = 0; tries < 2; ++tries) {
-        Options opt;
+void addTest(Options opt, faiss::MetricType metricType) {
+    std::vector<float> trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim);
+    std::vector<float> addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
 
-        std::vector<float> trainVecs =
-                faiss::gpu::randVecs(opt.numTrain, opt.dim);
-        std::vector<float> addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
+    faiss::IndexFlatL2 coarseQuantizerL2(opt.dim);
+    faiss::IndexFlatIP coarseQuantizerIP(opt.dim);
+    faiss::Index* quantizer = metricType == faiss::METRIC_L2
+            ? (faiss::Index*)&coarseQuantizerL2
+            : (faiss::Index*)&coarseQuantizerIP;
 
-        faiss::IndexFlatL2 coarseQuantizer(opt.dim);
-        faiss::IndexIVFPQ cpuIndex(
-                &coarseQuantizer,
-                opt.dim,
-                opt.numCentroids,
-                opt.codes,
-                opt.bitsPerCode);
-        cpuIndex.nprobe = opt.nprobe;
-        cpuIndex.train(opt.numTrain, trainVecs.data());
+    faiss::IndexIVFPQ cpuIndex(
+            quantizer, opt.dim, opt.numCentroids, opt.codes, opt.bitsPerCode);
+    cpuIndex.nprobe = opt.nprobe;
+    cpuIndex.metric_type = metricType;
+    cpuIndex.train(opt.numTrain, trainVecs.data());
 
-        // Use the default temporary memory management to test the memory
-        // manager
-        faiss::gpu::StandardGpuResources res;
+    // Use the default temporary memory management to test the memory
+    // manager
+    faiss::gpu::StandardGpuResources res;
 
-        faiss::gpu::GpuIndexIVFPQConfig config;
-        config.device = opt.device;
-        config.usePrecomputedTables = opt.usePrecomputed;
-        config.indicesOptions = opt.indicesOpt;
-        config.useFloat16LookupTables = opt.useFloat16;
+    faiss::gpu::GpuIndexIVFPQConfig config;
+    config.device = opt.device;
+    config.usePrecomputedTables = opt.usePrecomputed;
+    config.indicesOptions = opt.indicesOpt;
+    config.useFloat16LookupTables = opt.useFloat16;
+    config.interleavedLayout = opt.interleavedLayout;
+    config.use_raft = opt.useRaft;
 
-        faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config);
-        gpuIndex.nprobe = opt.nprobe;
+    faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config);
+    gpuIndex.nprobe = opt.nprobe;
 
-        gpuIndex.add(opt.numAdd, addVecs.data());
-        cpuIndex.add(opt.numAdd, addVecs.data());
+    gpuIndex.add(opt.numAdd, addVecs.data());
+    cpuIndex.add(opt.numAdd, addVecs.data());
 
-        faiss::gpu::compareIndices(
-                cpuIndex,
-                gpuIndex,
-                opt.numQuery,
-                opt.dim,
-                opt.k,
-                opt.toString(),
-                opt.getCompareEpsilon(),
-                opt.getPctMaxDiff1(),
-                opt.getPctMaxDiffN());
+    faiss::gpu::compareIndices(
+            cpuIndex,
+            gpuIndex,
+            opt.numQuery,
+            opt.dim,
+            opt.k,
+            opt.toString(),
+            opt.getCompareEpsilon(),
+            opt.getPctMaxDiff1(),
+            opt.getPctMaxDiffN());
+}
+
+TEST(TestGpuIndexIVFPQ, Add_L2) {
+    for (int tries = 0; tries < 2; ++tries) {
+        Options opt;
+        addTest(opt, faiss::METRIC_L2);
     }
 }
 
 TEST(TestGpuIndexIVFPQ, Add_IP) {
     for (int tries = 0; tries < 2; ++tries) {
         Options opt;
-
-        std::vector<float> trainVecs =
-                faiss::gpu::randVecs(opt.numTrain, opt.dim);
-        std::vector<float> addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
-
-        faiss::IndexFlatIP coarseQuantizer(opt.dim);
-        faiss::IndexIVFPQ cpuIndex(
-                &coarseQuantizer,
-                opt.dim,
-                opt.numCentroids,
-                opt.codes,
-                opt.bitsPerCode);
-        cpuIndex.metric_type = faiss::MetricType::METRIC_INNER_PRODUCT;
-        cpuIndex.nprobe = opt.nprobe;
-        cpuIndex.train(opt.numTrain, trainVecs.data());
-
-        // Use the default temporary memory management to test the memory
-        // manager
-        faiss::gpu::StandardGpuResources res;
-
-        faiss::gpu::GpuIndexIVFPQConfig config;
-        config.device = opt.device;
-        config.usePrecomputedTables = opt.usePrecomputed;
-        config.indicesOptions = opt.indicesOpt;
-        config.useFloat16LookupTables = opt.useFloat16;
-
-        faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config);
-        gpuIndex.nprobe = opt.nprobe;
-
-        gpuIndex.add(opt.numAdd, addVecs.data());
-        cpuIndex.add(opt.numAdd, addVecs.data());
-
-        faiss::gpu::compareIndices(
-                cpuIndex,
-                gpuIndex,
-                opt.numQuery,
-                opt.dim,
-                opt.k,
-                opt.toString(),
-                opt.getCompareEpsilon(),
-                opt.getPctMaxDiff1(),
-                opt.getPctMaxDiffN());
+        addTest(opt, faiss::METRIC_INNER_PRODUCT);
     }
 }
 
-TEST(TestGpuIndexIVFPQ, CopyTo) {
+void copyToTest(Options opt) {
     for (int tries = 0; tries < 2; ++tries) {
-        Options opt;
         std::vector<float> trainVecs =
                 faiss::gpu::randVecs(opt.numTrain, opt.dim);
         std::vector<float> addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
@@ -511,9 +432,11 @@ TEST(TestGpuIndexIVFPQ, CopyTo) {
 
         faiss::gpu::GpuIndexIVFPQConfig config;
         config.device = opt.device;
-        config.usePrecomputedTables = (tries % 2 == 0);
+        config.usePrecomputedTables = false;
         config.indicesOptions = opt.indicesOpt;
         config.useFloat16LookupTables = opt.useFloat16;
+        config.interleavedLayout = opt.interleavedLayout;
+        config.use_raft = opt.useRaft;
 
         faiss::gpu::GpuIndexIVFPQ gpuIndex(
                 &res,
@@ -561,8 +484,12 @@ TEST(TestGpuIndexIVFPQ, CopyTo) {
     }
 }
 
-TEST(TestGpuIndexIVFPQ, CopyFrom) {
+TEST(TestGpuIndexIVFPQ, CopyTo) {
     Options opt;
+    copyToTest(opt);
+}
+
+void copyFromTest(Options opt) {
     std::vector<float> trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim);
     std::vector<float> addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
 
@@ -585,6 +512,8 @@ TEST(TestGpuIndexIVFPQ, CopyFrom) {
     config.usePrecomputedTables = opt.usePrecomputed;
     config.indicesOptions = opt.indicesOpt;
     config.useFloat16LookupTables = opt.useFloat16;
+    config.interleavedLayout = opt.interleavedLayout;
+    config.use_raft = opt.useRaft;
 
     // Use garbage values to see if we overwrite them
     faiss::gpu::GpuIndexIVFPQ gpuIndex(
@@ -621,9 +550,12 @@ TEST(TestGpuIndexIVFPQ, CopyFrom) {
             opt.getPctMaxDiffN());
 }
 
-TEST(TestGpuIndexIVFPQ, QueryNaN) {
+TEST(TestGpuIndexIVFPQ, CopyFrom) {
     Options opt;
+    copyFromTest(opt);
+}
 
+void queryNaNTest(Options opt) {
     std::vector<float> trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim);
     std::vector<float> addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
 
@@ -635,6 +567,8 @@ TEST(TestGpuIndexIVFPQ, QueryNaN) {
     config.usePrecomputedTables = opt.usePrecomputed;
     config.indicesOptions = opt.indicesOpt;
     config.useFloat16LookupTables = opt.useFloat16;
+    config.use_raft = opt.useRaft;
+    config.interleavedLayout = opt.useRaft ? true : opt.interleavedLayout;
 
     faiss::gpu::GpuIndexIVFPQ gpuIndex(
             &res,
@@ -670,9 +604,13 @@ TEST(TestGpuIndexIVFPQ, QueryNaN) {
     }
 }
 
-TEST(TestGpuIndexIVFPQ, AddNaN) {
+TEST(TestGpuIndexIVFPQ, QueryNaN) {
     Options opt;
+    opt.useRaft = false;
+    queryNaNTest(opt);
+}
 
+void addNaNTest(Options opt) {
     // Use the default temporary memory management to test the memory manager
     faiss::gpu::StandardGpuResources res;
 
@@ -681,6 +619,8 @@ TEST(TestGpuIndexIVFPQ, AddNaN) {
     config.usePrecomputedTables = opt.usePrecomputed;
     config.indicesOptions = opt.indicesOpt;
     config.useFloat16LookupTables = opt.useFloat16;
+    config.interleavedLayout = opt.interleavedLayout;
+    config.use_raft = opt.useRaft;
 
     faiss::gpu::GpuIndexIVFPQ gpuIndex(
             &res,
@@ -722,6 +662,128 @@ TEST(TestGpuIndexIVFPQ, AddNaN) {
             indices.data());
 }
 
+TEST(TestGpuIndexIVFPQ, AddNaN) {
+    Options opt;
+    opt.useRaft = false;
+    addNaNTest(opt);
+}
+
+#if defined USE_NVIDIA_RAFT
+TEST(TestGpuIndexIVFPQ, Query_L2_Raft) {
+    for (int tries = 0; tries < 2; ++tries) {
+        Options opt;
+        opt.bitsPerCode = faiss::gpu::randVal(4, 8);
+        opt.useRaft = true;
+        opt.interleavedLayout = true;
+        opt.usePrecomputed = false;
+        opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
+        pickRaftEncoding(opt.codes, opt.dim, opt.bitsPerCode);
+        queryTest(opt, faiss::MetricType::METRIC_L2);
+    }
+}
+
+TEST(TestGpuIndexIVFPQ, Query_IP_Raft) {
+    for (int tries = 0; tries < 2; ++tries) {
+        Options opt;
+        opt.bitsPerCode = faiss::gpu::randVal(4, 8);
+        opt.useRaft = true;
+        opt.interleavedLayout = true;
+        opt.usePrecomputed = false;
+        opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
+        pickRaftEncoding(opt.codes, opt.dim, opt.bitsPerCode);
+        queryTest(opt, faiss::MetricType::METRIC_INNER_PRODUCT);
+    }
+}
+
+// Large batch sizes (>= 65536) should also work
+TEST(TestGpuIndexIVFPQ, LargeBatch_Raft) {
+    Options opt;
+
+    // override for large sizes
+    opt.dim = 4;
+    opt.numQuery = 100000;
+    opt.codes = 2;
+    opt.useRaft = true;
+    opt.interleavedLayout = true;
+    opt.usePrecomputed = false;
+    opt.useFloat16 = false;
+    opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
+    opt.bitsPerCode = 8;
+
+    queryTest(opt, faiss::MetricType::METRIC_L2);
+}
+
+TEST(TestGpuIndexIVFPQ, CopyFrom_Raft) {
+    Options opt;
+    opt.useRaft = true;
+    opt.interleavedLayout = true;
+    opt.bitsPerCode = faiss::gpu::randVal(4, 8);
+    opt.usePrecomputed = false;
+    opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
+    pickRaftEncoding(opt.codes, opt.dim, opt.bitsPerCode);
+    copyFromTest(opt);
+}
+
+TEST(TestGpuIndexIVFPQ, Add_L2_Raft) {
+    for (int tries = 0; tries < 2; ++tries) {
+        Options opt;
+        opt.useRaft = true;
+        opt.interleavedLayout = true;
+        opt.bitsPerCode = faiss::gpu::randVal(4, 8);
+        opt.usePrecomputed = false;
+        opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
+        pickRaftEncoding(opt.codes, opt.dim, opt.bitsPerCode);
+        addTest(opt, faiss::METRIC_L2);
+    }
+}
+
+TEST(TestGpuIndexIVFPQ, Add_IP_Raft) {
+    for (int tries = 0; tries < 2; ++tries) {
+        Options opt;
+        opt.useRaft = true;
+        opt.interleavedLayout = true;
+        opt.bitsPerCode = faiss::gpu::randVal(4, 8);
+        opt.usePrecomputed = false;
+        opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
+        pickRaftEncoding(opt.codes, opt.dim, opt.bitsPerCode);
+        addTest(opt, faiss::METRIC_INNER_PRODUCT);
+    }
+}
+
+TEST(TestGpuIndexIVFPQ, QueryNaN_Raft) {
+    Options opt;
+    opt.useRaft = true;
+    opt.interleavedLayout = true;
+    opt.bitsPerCode = faiss::gpu::randVal(4, 8);
+    opt.usePrecomputed = false;
+    opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
+    pickRaftEncoding(opt.codes, opt.dim, opt.bitsPerCode);
+    queryNaNTest(opt);
+}
+
+TEST(TestGpuIndexIVFPQ, AddNaN_Raft) {
+    Options opt;
+    opt.useRaft = true;
+    opt.interleavedLayout = true;
+    opt.bitsPerCode = faiss::gpu::randVal(4, 8);
+    opt.usePrecomputed = false;
+    opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
+    pickRaftEncoding(opt.codes, opt.dim, opt.bitsPerCode);
+    addNaNTest(opt);
+}
+
+TEST(TestGpuIndexIVFPQ, CopyTo_Raft) {
+    Options opt;
+    opt.useRaft = true;
+    opt.interleavedLayout = true;
+    opt.bitsPerCode = faiss::gpu::randVal(4, 8);
+    opt.usePrecomputed = false;
+    opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
+    pickRaftEncoding(opt.codes, opt.dim, opt.bitsPerCode);
+    copyToTest(opt);
+}
+#endif
+
 TEST(TestGpuIndexIVFPQ, UnifiedMemory) {
     // Construct on a random device to test multi-device, if we have
     // multiple devices
@@ -762,6 +824,7 @@ TEST(TestGpuIndexIVFPQ, UnifiedMemory) {
     faiss::gpu::GpuIndexIVFPQConfig config;
     config.device = device;
     config.memorySpace = faiss::gpu::MemorySpace::Unified;
+    config.use_raft = false;
 
     faiss::gpu::GpuIndexIVFPQ gpuIndex(
             &res,
@@ -784,6 +847,34 @@ TEST(TestGpuIndexIVFPQ, UnifiedMemory) {
             0.015f,
             0.1f,
             0.015f);
+
+#if defined USE_NVIDIA_RAFT
+    config.interleavedLayout = true;
+    config.use_raft = true;
+    config.indicesOptions = faiss::gpu::INDICES_64_BIT;
+
+    faiss::gpu::GpuIndexIVFPQ raftGpuIndex(
+            &res,
+            dim,
+            numCentroids,
+            codes,
+            bitsPerCode,
+            faiss::METRIC_L2,
+            config);
+    raftGpuIndex.copyFrom(&cpuIndex);
+    raftGpuIndex.nprobe = nprobe;
+
+    faiss::gpu::compareIndices(
+            cpuIndex,
+            raftGpuIndex,
+            numQuery,
+            dim,
+            k,
+            "Unified Memory",
+            0.015f,
+            0.1f,
+            0.015f);
+#endif
 }
 
 int main(int argc, char** argv) {
diff --git a/faiss/gpu/test/TestGpuMemoryException.cpp b/faiss/gpu/test/TestGpuMemoryException.cpp
index c6f6e9bdeb..ff4be0893e 100644
--- a/faiss/gpu/test/TestGpuMemoryException.cpp
+++ b/faiss/gpu/test/TestGpuMemoryException.cpp
@@ -31,6 +31,7 @@ TEST(TestGpuMemoryException, AddException) {
 
     faiss::gpu::GpuIndexFlatConfig config;
     config.device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1);
+    config.use_raft = false;
 
     faiss::gpu::GpuIndexFlatL2 gpuIndexL2Broken(
             &res, (int)brokenAddDims, config);
diff --git a/faiss/gpu/test/test_cagra.py b/faiss/gpu/test/test_cagra.py
new file mode 100644
index 0000000000..4c7e532c2b
--- /dev/null
+++ b/faiss/gpu/test/test_cagra.py
@@ -0,0 +1,78 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import faiss
+
+from faiss.contrib import datasets, evaluation
+
+
+@unittest.skipIf(
+    "RAFT" not in faiss.get_compile_options(),
+    "only if RAFT is compiled in")
+class TestComputeGT(unittest.TestCase):
+    """Compare GpuIndexCagra search results with a faiss.knn reference."""
+
+    def do_compute_GT(self, metric):
+        """Build a GpuIndexCagra for `metric` and check it against faiss.knn."""
+        d = 64
+        k = 12
+        ds = datasets.SyntheticDataset(d, 0, 10000, 100)
+        Dref, Iref = faiss.knn(ds.get_queries(), ds.get_database(), k, metric)
+
+        res = faiss.StandardGpuResources()
+
+        index = faiss.GpuIndexCagra(res, d, metric)
+        index.train(ds.get_database())
+        Dnew, Inew = index.search(ds.get_queries(), k)
+
+        evaluation.check_ref_knn_with_draws(Dref, Iref, Dnew, Inew, k)
+
+    def test_compute_GT_L2(self):
+        self.do_compute_GT(faiss.METRIC_L2)
+
+    def test_compute_GT_IP(self):
+        self.do_compute_GT(faiss.METRIC_INNER_PRODUCT)
+
+
+@unittest.skipIf(
+    "RAFT" not in faiss.get_compile_options(),
+    "only if RAFT is compiled in")
+class TestInterop(unittest.TestCase):
+    """Compare GpuIndexCagra results across GPU -> CPU -> GPU copies."""
+
+    def do_interop(self, metric):
+        """Search on the GPU index, its CPU copy and a re-uploaded copy."""
+        d = 64
+        k = 12
+        ds = datasets.SyntheticDataset(d, 0, 10000, 100)
+
+        res = faiss.StandardGpuResources()
+
+        index = faiss.GpuIndexCagra(res, d, metric)
+        index.train(ds.get_database())
+        Dnew, Inew = index.search(ds.get_queries(), k)
+
+        # results of the CPU copy are checked against the GPU index results
+        cpu_index = faiss.index_gpu_to_cpu(index)
+        Dref, Iref = cpu_index.search(ds.get_queries(), k)
+
+        evaluation.check_ref_knn_with_draws(Dref, Iref, Dnew, Inew, k)
+
+        # serialize / deserialize the CPU copy, then move it back to the GPU
+        deserialized_index = faiss.deserialize_index(
+            faiss.serialize_index(cpu_index))
+
+        gpu_index = faiss.index_cpu_to_gpu(res, 0, deserialized_index)
+        Dnew2, Inew2 = gpu_index.search(ds.get_queries(), k)
+
+        evaluation.check_ref_knn_with_draws(Dnew2, Inew2, Dnew, Inew, k)
+
+    def test_interop_L2(self):
+        self.do_interop(faiss.METRIC_L2)
+
+    def test_interop_IP(self):
+        self.do_interop(faiss.METRIC_INNER_PRODUCT)
diff --git a/faiss/gpu/test/test_gpu_basics.py b/faiss/gpu/test/test_gpu_basics.py
index f3f0a525d4..4b4024d236 100755
--- a/faiss/gpu/test/test_gpu_basics.py
+++ b/faiss/gpu/test/test_gpu_basics.py
@@ -11,6 +11,7 @@
 import random
 from common_faiss_tests import get_dataset_2
 
+
 class ReferencedObject(unittest.TestCase):
 
     d = 16
diff --git a/faiss/gpu/test/test_gpu_index.py b/faiss/gpu/test/test_gpu_index.py
index 36a1f8a64b..28572ebcb4 100755
--- a/faiss/gpu/test/test_gpu_index.py
+++ b/faiss/gpu/test/test_gpu_index.py
@@ -24,7 +24,9 @@ def test_ivfflat_search_preassigned(self):
         nprobe = 10
         k = 50
 
-        idx_gpu = faiss.GpuIndexIVFFlat(res, d, nlist)
+        config = faiss.GpuIndexIVFFlatConfig()
+        config.use_raft = False
+        idx_gpu = faiss.GpuIndexIVFFlat(res, d, nlist, faiss.METRIC_L2, config)
         idx_gpu.nprobe = nprobe
 
         rs = np.random.RandomState(567)
@@ -56,7 +58,9 @@ def test_ivfpq_search_preassigned(self):
         nprobe = 5
         k = 50
 
-        idx_gpu = faiss.GpuIndexIVFPQ(res, d, nlist, 4, 8)
+        config = faiss.GpuIndexIVFPQConfig()
+        config.use_raft = False
+        idx_gpu = faiss.GpuIndexIVFPQ(res, d, nlist, 4, 8, faiss.METRIC_L2, config)
         idx_gpu.nprobe = nprobe
 
         rs = np.random.RandomState(567)
@@ -136,7 +140,9 @@ def test_ivfflat_cpu_coarse(self):
 
         # construct a GPU index using the same trained coarse quantizer
         # from the CPU index
-        idx_gpu = faiss.GpuIndexIVFFlat(res, q, d, nlist, faiss.METRIC_L2)
+        config = faiss.GpuIndexIVFFlatConfig()
+        config.use_raft = False
+        idx_gpu = faiss.GpuIndexIVFFlat(res, q, d, nlist, faiss.METRIC_L2, config)
         assert(idx_gpu.is_trained)
         idx_gpu.add(xb)
 
@@ -150,7 +156,7 @@ def test_ivfflat_cpu_coarse(self):
         self.assertGreaterEqual((i_g == i_c).sum(), i_g.size * 0.9)
         self.assertTrue(np.allclose(d_g, d_c, rtol=5e-5, atol=5e-5))
 
-    def test_ivfsq_cpu_coarse(self):
+    def test_ivfsq_cpu_coarse(self):
         res = faiss.StandardGpuResources()
         d = 128
         nb = 5000
@@ -226,8 +232,10 @@ def test_ivfpq_cpu_coarse(self):
 
         # construct a GPU index using the same trained coarse quantizer
         # from the CPU index
+        config = faiss.GpuIndexIVFPQConfig()
+        config.use_raft = False
         idx_gpu = faiss.GpuIndexIVFPQ(
-            res, idx_coarse_cpu, d, nlist_lvl_2, 4, 8)
+            res, idx_coarse_cpu, d, nlist_lvl_2, 4, 8, faiss.METRIC_L2, config)
         assert(not idx_gpu.is_trained)
 
         idx_gpu.train(xb)
@@ -406,6 +414,7 @@ def test_indices_ivfflat(self):
 
         # Store values using 32-bit indices instead
         config.indicesOptions = faiss.INDICES_32_BIT
+        config.use_raft = False
         idx = faiss.GpuIndexIVFFlat(res, d, nlist, faiss.METRIC_L2, config)
         idx.train(xb)
         idx.add_with_ids(xb, xb_indices)
@@ -430,6 +439,7 @@ def test_indices_ivfpq(self):
         xb_indices = (xb_indices_base + 4294967296).astype('int64')
 
         config = faiss.GpuIndexIVFPQConfig()
+        config.use_raft = False
         idx = faiss.GpuIndexIVFPQ(res, d, nlist, M, nbits,
                                   faiss.METRIC_L2, config)
         idx.train(xb)
@@ -490,7 +500,9 @@ def test_sq_cpu_to_gpu(self):
         res = faiss.StandardGpuResources()
         index = faiss.index_factory(32, "SQfp16")
         index.add(np.random.rand(1000, 32).astype(np.float32))
-        gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
+        config = faiss.GpuClonerOptions()
+        config.use_raft = False
+        gpu_index = faiss.index_cpu_to_gpu(res, 0, index, config)
         self.assertIsInstance(gpu_index, faiss.GpuIndexFlat)
 
 
@@ -577,7 +589,10 @@ class TestGpuAutoTune(unittest.TestCase):
 
     def test_params(self):
         index = faiss.index_factory(32, "IVF65536_HNSW,PQ16")
-        index = faiss.index_cpu_to_gpu(faiss.StandardGpuResources(), 0, index)
+        res = faiss.StandardGpuResources()
+        options = faiss.GpuClonerOptions()
+        options.allowCpuCoarseQuantizer = True
+        index = faiss.index_cpu_to_gpu(res, 0, index, options)
         ps = faiss.GpuParameterSpace()
         ps.initialize(index)
         for i in range(ps.parameter_ranges.size()):
diff --git a/faiss/gpu/test/test_gpu_index_ivfflat.py b/faiss/gpu/test/test_gpu_index_ivfflat.py
new file mode 100644
index 0000000000..099615aff5
--- /dev/null
+++ b/faiss/gpu/test/test_gpu_index_ivfflat.py
@@ -0,0 +1,25 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import faiss
+import numpy as np
+
+
+class TestGpuIndexIvfflat(unittest.TestCase):
+    def test_reconstruct_n(self):
+        index = faiss.index_factory(4, "IVF10,Flat")
+        x = np.random.RandomState(123).rand(10, 4).astype('float32')
+        index.train(x)
+        index.add(x)
+        res = faiss.StandardGpuResources()
+        res.noTempMemory()
+        config = faiss.GpuIndexIVFFlatConfig()
+        config.use_raft = False
+        index2 = faiss.GpuIndexIVFFlat(res, index, config)
+        recons = index2.reconstruct_n(0, 10)
+
+        np.testing.assert_array_equal(recons, x)
diff --git a/faiss/gpu/test/test_gpu_index_ivfsq.py b/faiss/gpu/test/test_gpu_index_ivfsq.py
index af56316509..09dcdae079 100755
--- a/faiss/gpu/test/test_gpu_index_ivfsq.py
+++ b/faiss/gpu/test/test_gpu_index_ivfsq.py
@@ -27,7 +27,9 @@ def make_indices_copy_from_cpu(nlist, d, qtype, by_residual, metric, clamp):
 
     res = faiss.StandardGpuResources()
     res.noTempMemory()
-    idx_gpu = faiss.GpuIndexIVFScalarQuantizer(res, idx_cpu)
+    config = faiss.GpuIndexIVFScalarQuantizerConfig()
+    config.use_raft = False
+    idx_gpu = faiss.GpuIndexIVFScalarQuantizer(res, idx_cpu, config)
 
     return idx_cpu, idx_gpu
 
@@ -37,8 +39,10 @@ def make_indices_copy_from_gpu(nlist, d, qtype, by_residual, metric, clamp):
 
     res = faiss.StandardGpuResources()
     res.noTempMemory()
+    config = faiss.GpuIndexIVFScalarQuantizerConfig()
+    config.use_raft = False
     idx_gpu = faiss.GpuIndexIVFScalarQuantizer(res, d, nlist,
-                                               qtype, metric, by_residual)
+                                               qtype, metric, by_residual, config)
     idx_gpu.train(to_train)
     idx_gpu.add(to_train)
 
@@ -63,8 +67,10 @@ def make_indices_train(nlist, d, qtype, by_residual, metric, clamp):
 
     res = faiss.StandardGpuResources()
     res.noTempMemory()
+    config = faiss.GpuIndexIVFScalarQuantizerConfig()
+    config.use_raft = False
     idx_gpu = faiss.GpuIndexIVFScalarQuantizer(res, d, nlist,
-                                               qtype, metric, by_residual)
+                                               qtype, metric, by_residual, config)
     assert(by_residual == idx_gpu.by_residual)
 
     idx_gpu.train(to_train)
diff --git a/faiss/gpu/test/test_gpu_index_serialize.py b/faiss/gpu/test/test_gpu_index_serialize.py
index 82cbe577c8..49e51af8b4 100644
--- a/faiss/gpu/test/test_gpu_index_serialize.py
+++ b/faiss/gpu/test/test_gpu_index_serialize.py
@@ -34,7 +34,9 @@ def test_serialize(self):
         indexes.append(faiss.GpuIndexIVFFlat(res, d, nlist, faiss.METRIC_L2))
 
         # IVFSQ
-        indexes.append(faiss.GpuIndexIVFScalarQuantizer(res, d, nlist, faiss.ScalarQuantizer.QT_fp16))
+        config = faiss.GpuIndexIVFScalarQuantizerConfig()
+        config.use_raft = False
+        indexes.append(faiss.GpuIndexIVFScalarQuantizer(res, d, nlist, faiss.ScalarQuantizer.QT_fp16, faiss.METRIC_L2, True, config))
 
         # IVFPQ
         indexes.append(faiss.GpuIndexIVFPQ(res, d, nlist, 4, 8, faiss.METRIC_L2))
@@ -47,8 +49,11 @@ def test_serialize(self):
 
             ser = faiss.serialize_index(faiss.index_gpu_to_cpu(index))
             cpu_index = faiss.deserialize_index(ser)
-
-            gpu_index_restore = faiss.index_cpu_to_gpu(res, 0, cpu_index)
+             
+            gpu_cloner_options = faiss.GpuClonerOptions()
+            if isinstance(index, faiss.GpuIndexIVFScalarQuantizer):
+                gpu_cloner_options.use_raft = False
+            gpu_index_restore = faiss.index_cpu_to_gpu(res, 0, cpu_index, gpu_cloner_options)
 
             restore_d, restore_i = gpu_index_restore.search(query, k)
 
diff --git a/faiss/gpu/test/test_index_cpu_to_gpu.py b/faiss/gpu/test/test_index_cpu_to_gpu.py
new file mode 100644
index 0000000000..088ea2bf74
--- /dev/null
+++ b/faiss/gpu/test/test_index_cpu_to_gpu.py
@@ -0,0 +1,89 @@
+import numpy as np
+import unittest
+import faiss
+
+
+class TestMoveToGpu(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.res = faiss.StandardGpuResources()
+
+    def create_index(self, factory_string):
+        dimension = 128
+        n = 2500
+        db_vectors = np.random.random((n, dimension)).astype('float32')
+        index = faiss.index_factory(dimension, factory_string)
+        index.train(db_vectors)
+        if factory_string.startswith("IDMap"):
+            index.add_with_ids(db_vectors, np.arange(n))
+        else:
+            index.add(db_vectors)
+        return index
+
+    def create_and_clone(self, factory_string,
+                         allowCpuCoarseQuantizer=None,
+                         use_raft=None):
+        idx = self.create_index(factory_string)
+        config = faiss.GpuClonerOptions()
+        if allowCpuCoarseQuantizer is not None:
+            config.allowCpuCoarseQuantizer = allowCpuCoarseQuantizer
+        if use_raft is not None:
+            config.use_raft = use_raft
+        faiss.index_cpu_to_gpu(self.res, 0, idx, config)
+
+    def verify_throws_not_implemented_exception(self, factory_string):
+        # Cloning must fail, and the error must say "not implemented".
+        with self.assertRaises(Exception) as ctx:
+            self.create_and_clone(factory_string)
+        self.assertIn(
+            "not implemented", str(ctx.exception),
+            "Wrong error for factory_string: %s." % factory_string)
+
+    def verify_clones_successfully(self, factory_string,
+                                   allowCpuCoarseQuantizer=None,
+                                   use_raft=None):
+        try:
+            self.create_and_clone(
+                factory_string,
+                allowCpuCoarseQuantizer=allowCpuCoarseQuantizer,
+                use_raft=use_raft)
+        except Exception as e:
+            self.fail("Unexpected exception thrown factory_string: "
+                      "%s; error message: %s." % (factory_string, str(e)))
+
+    def test_not_implemented_indices(self):
+        self.verify_throws_not_implemented_exception("PQ16")
+        self.verify_throws_not_implemented_exception("LSHrt")
+        self.verify_throws_not_implemented_exception("HNSW")
+        self.verify_throws_not_implemented_exception("HNSW,PQ16")
+        self.verify_throws_not_implemented_exception("IDMap,PQ16")
+        self.verify_throws_not_implemented_exception("IVF256,ITQ64,SH1.2")
+
+    def test_implemented_indices(self):
+        self.verify_clones_successfully("Flat")
+        self.verify_clones_successfully("IVF1,Flat")
+        self.verify_clones_successfully("IVF32,PQ8")
+        self.verify_clones_successfully("IDMap,Flat")
+        self.verify_clones_successfully("PCA12,IVF32,Flat")
+        self.verify_clones_successfully("PCA32,IVF32,PQ8")
+        self.verify_clones_successfully("PCA32,IVF32,PQ8np")
+
+        # set use_raft to false, these index types are not supported on RAFT
+        self.verify_clones_successfully("IVF32,SQ8", use_raft=False)
+        self.verify_clones_successfully(
+            "PCA32,IVF32,SQ8", use_raft=False)
+
+    def test_with_flag(self):
+        self.verify_clones_successfully("IVF32_HNSW,Flat",
+                                        allowCpuCoarseQuantizer=True)
+        self.verify_clones_successfully("IVF256(PQ2x4fs),Flat",
+                                        allowCpuCoarseQuantizer=True)
+
+    def test_with_flag_set_to_false(self):
+        # With the flag off, cloning must be refused with the fallback hint.
+        with self.assertRaises(Exception) as ctx:
+            self.create_and_clone("IVF32_HNSW,Flat",
+                                  allowCpuCoarseQuantizer=False)
+        self.assertIn("set the flag to true to allow the CPU fallback",
+                      str(ctx.exception))
diff --git a/faiss/gpu/test/test_multi_gpu.py b/faiss/gpu/test/test_multi_gpu.py
index 4a63025969..e341f5715a 100644
--- a/faiss/gpu/test/test_multi_gpu.py
+++ b/faiss/gpu/test/test_multi_gpu.py
@@ -29,6 +29,7 @@ def test_sharded(self):
 
         co = faiss.GpuMultipleClonerOptions()
         co.shard = True
+        co.use_raft = False
         index = faiss.index_cpu_to_all_gpus(index_cpu, co, ngpu=2)
 
         index.add(xb)
@@ -71,6 +72,7 @@ def do_test_sharded_ivf(self, index_key):
         co = faiss.GpuMultipleClonerOptions()
         co.shard = True
         co.common_ivf_quantizer = True
+        co.use_raft = False
         index = faiss.index_cpu_to_all_gpus(index, co, ngpu=2)
 
         index.quantizer  # make sure there is indeed a quantizer
@@ -111,6 +113,7 @@ def test_binary_clone(self, ngpu=1, shard=False):
 
         co = faiss.GpuMultipleClonerOptions()
         co.shard = shard
+        co.use_raft = False
 
         # index2 = faiss.index_cpu_to_all_gpus(index, ngpu=ngpu)
         res = faiss.StandardGpuResources()
@@ -188,7 +191,9 @@ def do_cpu_to_gpu(self, index_key):
         ts.append(time.time())
 
         res = faiss.StandardGpuResources()
-        gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
+        co = faiss.GpuClonerOptions()
+        co.use_raft = False
+        gpu_index = faiss.index_cpu_to_gpu(res, 0, index, co)
         ts.append(time.time())
 
         # Validate the layout of the memory info
@@ -217,6 +222,7 @@ def do_cpu_to_gpu(self, index_key):
             res = [faiss.StandardGpuResources() for i in range(2)]
             co = faiss.GpuMultipleClonerOptions()
             co.shard = shard
+            co.use_raft = False
 
             gpu_index = faiss.index_cpu_to_gpu_multiple_py(res, index, co)
 
diff --git a/faiss/gpu/test/torch_test_contrib_gpu.py b/faiss/gpu/test/torch_test_contrib_gpu.py
index 1510b10f1d..f7444337f1 100644
--- a/faiss/gpu/test/torch_test_contrib_gpu.py
+++ b/faiss/gpu/test/torch_test_contrib_gpu.py
@@ -108,7 +108,7 @@ def test_train_add_with_ids(self):
         self.assertTrue(np.array_equal(I.reshape(10), ids_np[10:20]))
 
     # tests reconstruct, reconstruct_n
-    def test_reconstruct(self):
+    def test_flat_reconstruct(self):
         d = 32
         res = faiss.StandardGpuResources()
         res.noTempMemory()
@@ -157,6 +157,40 @@ def test_reconstruct(self):
         index.reconstruct_n(50, 10, y)
         self.assertTrue(torch.equal(xb[50:60], y))
 
+    def test_ivfflat_reconstruct(self):
+        d = 32
+        nlist = 5
+        res = faiss.StandardGpuResources()
+        res.noTempMemory()
+        config = faiss.GpuIndexIVFFlatConfig()
+        config.use_raft = False
+
+        index = faiss.GpuIndexIVFFlat(res, d, nlist, faiss.METRIC_L2, config)
+
+        xb = torch.rand(100, d, device=torch.device('cuda', 0), dtype=torch.float32)
+        index.train(xb)
+        index.add(xb)
+
+        # Test reconstruct_n with torch gpu (native return)
+        y = index.reconstruct_n(10, 10)
+        self.assertTrue(y.is_cuda)
+        self.assertTrue(torch.equal(xb[10:20], y))
+
+        # Test reconstruct with numpy output provided
+        y = np.empty((10, d), dtype='float32')
+        index.reconstruct_n(20, 10, y)
+        self.assertTrue(np.array_equal(xb.cpu().numpy()[20:30], y))
+
+        # Test reconstruct_n with torch cpu output provided
+        y = torch.empty(10, d, dtype=torch.float32)
+        index.reconstruct_n(40, 10, y)
+        self.assertTrue(torch.equal(xb[40:50].cpu(), y))
+
+        # Test reconstruct_n with torch gpu output provided
+        y = torch.empty(10, d, device=torch.device('cuda', 0), dtype=torch.float32)
+        index.reconstruct_n(50, 10, y)
+        self.assertTrue(torch.equal(xb[50:60], y))
+
     # tests assign
     def test_assign(self):
         d = 32
@@ -215,7 +249,7 @@ def test_sa_encode_decode(self):
         return
 
 class TestTorchUtilsKnnGpu(unittest.TestCase):
-    def test_knn_gpu(self):
+    def test_knn_gpu(self, use_raft=False):
         torch.manual_seed(10)
         d = 32
         nb = 1024
@@ -252,7 +286,7 @@ def test_knn_gpu(self):
                     else:
                         xb_c = xb_np
 
-                    D, I = faiss.knn_gpu(res, xq_c, xb_c, k)
+                    D, I = faiss.knn_gpu(res, xq_c, xb_c, k, use_raft=use_raft)
 
                     self.assertTrue(torch.equal(torch.from_numpy(I), gt_I))
                     self.assertLess((torch.from_numpy(D) - gt_D).abs().max(), 1e-4)
@@ -278,7 +312,7 @@ def test_knn_gpu(self):
                             xb_c = to_column_major_torch(xb)
                             assert not xb_c.is_contiguous()
 
-                        D, I = faiss.knn_gpu(res, xq_c, xb_c, k)
+                        D, I = faiss.knn_gpu(res, xq_c, xb_c, k, use_raft=use_raft)
 
                         self.assertTrue(torch.equal(I.cpu(), gt_I))
                         self.assertLess((D.cpu() - gt_D).abs().max(), 1e-4)
@@ -286,7 +320,7 @@ def test_knn_gpu(self):
                         # test on subset
                         try:
                             # This internally uses the current pytorch stream
-                            D, I = faiss.knn_gpu(res, xq_c[6:8], xb_c, k)
+                            D, I = faiss.knn_gpu(res, xq_c[6:8], xb_c, k, use_raft=use_raft)
                         except TypeError:
                             if not xq_row_major:
                                 # then it is expected
@@ -297,7 +331,13 @@ def test_knn_gpu(self):
                         self.assertTrue(torch.equal(I.cpu(), gt_I[6:8]))
                         self.assertLess((D.cpu() - gt_D[6:8]).abs().max(), 1e-4)
 
-    def test_knn_gpu_datatypes(self):
+    @unittest.skipUnless(
+        "RAFT" in faiss.get_compile_options(),
+        "only if RAFT is compiled in")
+    def test_knn_gpu_raft(self):
+        self.test_knn_gpu(use_raft=True)
+
+    def test_knn_gpu_datatypes(self, use_raft=False):
         torch.manual_seed(10)
         d = 10
         nb = 1024
@@ -320,7 +360,7 @@ def test_knn_gpu_datatypes(self):
         D = torch.zeros(nq, k, device=xb_c.device, dtype=torch.float32)
         I = torch.zeros(nq, k, device=xb_c.device, dtype=torch.int32)
 
-        faiss.knn_gpu(res, xq_c, xb_c, k, D, I)
+        faiss.knn_gpu(res, xq_c, xb_c, k, D, I, use_raft=use_raft)
 
         self.assertTrue(torch.equal(I.long().cpu(), gt_I))
         self.assertLess((D.float().cpu() - gt_D).abs().max(), 1.5e-3)
@@ -332,7 +372,7 @@ def test_knn_gpu_datatypes(self):
         xb_c = xb.half().numpy()
         xq_c = xq.half().numpy()
 
-        faiss.knn_gpu(res, xq_c, xb_c, k, D, I)
+        faiss.knn_gpu(res, xq_c, xb_c, k, D, I, use_raft=use_raft)
 
         self.assertTrue(torch.equal(torch.from_numpy(I).long(), gt_I))
         self.assertLess((torch.from_numpy(D) - gt_D).abs().max(), 1.5e-3)
diff --git a/faiss/gpu/utils/CopyUtils.cuh b/faiss/gpu/utils/CopyUtils.cuh
index 637a46cbee..8ff600a049 100644
--- a/faiss/gpu/utils/CopyUtils.cuh
+++ b/faiss/gpu/utils/CopyUtils.cuh
@@ -119,6 +119,7 @@ inline void fromDevice(T* src, T* dst, size_t num, cudaStream_t stream) {
     if (dev == -1) {
         CUDA_VERIFY(cudaMemcpyAsync(
                 dst, src, num * sizeof(T), cudaMemcpyDeviceToHost, stream));
+        cudaStreamSynchronize(stream);
     } else {
         CUDA_VERIFY(cudaMemcpyAsync(
                 dst, src, num * sizeof(T), cudaMemcpyDeviceToDevice, stream));
diff --git a/faiss/gpu/utils/DeviceVector.cuh b/faiss/gpu/utils/DeviceVector.cuh
index fff5a79086..51cb7c8d37 100644
--- a/faiss/gpu/utils/DeviceVector.cuh
+++ b/faiss/gpu/utils/DeviceVector.cuh
@@ -132,7 +132,7 @@ class DeviceVector {
     bool resize(size_t newSize, cudaStream_t stream) {
         bool mem = false;
 
-        if (num_ < newSize) {
+        if (newSize > capacity_) {
             mem = reserve(getNewCapacity_(newSize), stream);
         }
 
@@ -169,6 +169,8 @@ class DeviceVector {
         T out;
         CUDA_VERIFY(cudaMemcpyAsync(
                 &out, data() + idx, sizeof(T), cudaMemcpyDeviceToHost, stream));
+
+        return out;
     }
 
     // Clean up after oversized allocations, while leaving some space to
@@ -249,7 +251,7 @@ class DeviceVector {
         if (preferredSize <= kDeviceVector_2x_Limit) {
             return utils::nextHighestPowerOf2(preferredSize);
         } else if (preferredSize <= kDeviceVector_1_25x_Limit) {
-            return preferredSize + (preferredSize << 2);
+            return preferredSize + (preferredSize >> 2);
         } else {
             return preferredSize;
         }
diff --git a/faiss/gpu/utils/RaftUtils.cu b/faiss/gpu/utils/RaftUtils.cu
new file mode 100644
index 0000000000..ba40c54c26
--- /dev/null
+++ b/faiss/gpu/utils/RaftUtils.cu
@@ -0,0 +1,117 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <faiss/gpu/GpuIndex.h>
+#include <faiss/gpu/utils/RaftUtils.h>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/linalg/coalesced_reduction.cuh>
+#include <raft/linalg/map.cuh>
+#include <raft/matrix/gather.cuh>
+
+#include <thrust/copy.h>
+#include <thrust/gather.h>
+#include <thrust/reduce.h>
+
+namespace faiss {
+namespace gpu {
+
+void validRowIndices(
+        GpuResources* res,
+        Tensor<float, 2, true>& vecs,
+        bool* validRows) {
+    idx_t n_rows = vecs.getSize(0);
+    idx_t dim = vecs.getSize(1);
+
+    raft::linalg::coalescedReduction(
+            validRows,
+            vecs.data(),
+            dim,
+            n_rows,
+            true,
+            res->getDefaultStreamCurrentDevice(),
+            false,
+            [] __device__(float v, idx_t i) { return isfinite(v); },
+            raft::mul_op());
+}
+
+idx_t inplaceGatherFilteredRows(
+        GpuResources* res,
+        Tensor<float, 2, true>& vecs,
+        Tensor<idx_t, 1, true>& indices) {
+    raft::device_resources& raft_handle = res->getRaftHandleCurrentDevice();
+    idx_t n_rows = vecs.getSize(0);
+    idx_t dim = vecs.getSize(1);
+
+    auto valid_rows =
+            raft::make_device_vector<bool, idx_t>(raft_handle, n_rows);
+
+    validRowIndices(res, vecs, valid_rows.data_handle());
+
+    idx_t n_rows_valid = thrust::reduce(
+            raft_handle.get_thrust_policy(),
+            valid_rows.data_handle(),
+            valid_rows.data_handle() + n_rows,
+            0);
+
+    if (n_rows_valid < n_rows) {
+        auto gather_indices = raft::make_device_vector<idx_t, idx_t>(
+                raft_handle, n_rows_valid);
+
+        auto count = thrust::make_counting_iterator(0);
+
+        thrust::copy_if(
+                raft_handle.get_thrust_policy(),
+                count,
+                count + n_rows,
+                gather_indices.data_handle(),
+                [valid_rows = valid_rows.data_handle()] __device__(auto i) {
+                    return valid_rows[i];
+                });
+
+        raft::matrix::gather(
+                raft_handle,
+                raft::make_device_matrix_view<float, idx_t>(
+                        vecs.data(), n_rows, dim),
+                raft::make_const_mdspan(gather_indices.view()),
+                (idx_t)16);
+
+        auto validIndices = raft::make_device_vector<idx_t, idx_t>(
+                raft_handle, n_rows_valid);
+
+        thrust::gather(
+                raft_handle.get_thrust_policy(),
+                gather_indices.data_handle(),
+                gather_indices.data_handle() + gather_indices.size(),
+                indices.data(),
+                validIndices.data_handle());
+        thrust::copy(
+                raft_handle.get_thrust_policy(),
+                validIndices.data_handle(),
+                validIndices.data_handle() + n_rows_valid,
+                indices.data());
+    }
+    return n_rows_valid;
+}
+
+} // namespace gpu
+} // namespace faiss
diff --git a/faiss/gpu/impl/RaftUtils.h b/faiss/gpu/utils/RaftUtils.h
similarity index 73%
rename from faiss/gpu/impl/RaftUtils.h
rename to faiss/gpu/utils/RaftUtils.h
index f1ea19ed33..4dfafa4ec5 100644
--- a/faiss/gpu/impl/RaftUtils.h
+++ b/faiss/gpu/utils/RaftUtils.h
@@ -23,13 +23,16 @@
 #pragma once
 
 #include <faiss/MetricType.h>
-#include <raft/core/error.hpp>
+#include <faiss/gpu/GpuResources.h>
+#include <faiss/gpu/utils/Tensor.cuh>
+
 #include <raft/distance/distance_types.hpp>
 
+#pragma GCC visibility push(default)
 namespace faiss {
 namespace gpu {
 
-inline raft::distance::DistanceType faiss_to_raft(
+inline raft::distance::DistanceType metricFaissToRaft(
         MetricType metric,
         bool exactDistance) {
     switch (metric) {
@@ -53,5 +56,20 @@ inline raft::distance::DistanceType faiss_to_raft(
             RAFT_FAIL("Distance type not supported");
     }
 }
+
+/// Identify matrix rows containing only finite values. validRows[i] is false
+/// if row i contains a NaN or infinite value and true otherwise.
+void validRowIndices(
+        GpuResources* res,
+        Tensor<float, 2, true>& vecs,
+        bool* validRows);
+
+/// Filter out matrix rows containing NaN or infinite values. The vectors and
+/// indices are updated in-place. Returns the number of rows kept.
+idx_t inplaceGatherFilteredRows(
+        GpuResources* res,
+        Tensor<float, 2, true>& vecs,
+        Tensor<idx_t, 1, true>& indices);
 } // namespace gpu
 } // namespace faiss
+#pragma GCC visibility pop
diff --git a/faiss/gpu/utils/Tensor.cuh b/faiss/gpu/utils/Tensor.cuh
index b13d0e1496..0fbb2417b3 100644
--- a/faiss/gpu/utils/Tensor.cuh
+++ b/faiss/gpu/utils/Tensor.cuh
@@ -232,13 +232,12 @@ class Tensor {
     }
 
     /// Returns a read/write view of a portion of our tensor.
-    __host__ __device__ inline detail::SubTensor<TensorType, Dim - 1, PtrTraits>
-    operator[](IndexT);
+    __host__ __device__ inline detail::
+            SubTensor<TensorType, Dim - 1, PtrTraits> operator[](IndexT);
 
     /// Returns a read/write view of a portion of our tensor (const).
     __host__ __device__ inline const detail::
-            SubTensor<TensorType, Dim - 1, PtrTraits>
-            operator[](IndexT) const;
+            SubTensor<TensorType, Dim - 1, PtrTraits> operator[](IndexT) const;
 
     /// Returns the size of a given dimension, `[0, Dim - 1]`. No bounds
     /// checking.
diff --git a/faiss/impl/AuxIndexStructures.cpp b/faiss/impl/AuxIndexStructures.cpp
index cebe8a1e23..e2b2791e55 100644
--- a/faiss/impl/AuxIndexStructures.cpp
+++ b/faiss/impl/AuxIndexStructures.cpp
@@ -236,4 +236,29 @@ size_t InterruptCallback::get_period_hint(size_t flops) {
     return std::max((size_t)10 * 10 * 1000 * 1000 / (flops + 1), (size_t)1);
 }
 
+void TimeoutCallback::set_timeout(double timeout_in_seconds) {
+    timeout = timeout_in_seconds;
+    start = std::chrono::steady_clock::now();
+}
+
+bool TimeoutCallback::want_interrupt() {
+    if (timeout == 0) {
+        return false;
+    }
+    auto end = std::chrono::steady_clock::now();
+    std::chrono::duration<float, std::milli> duration = end - start;
+    float elapsed_in_seconds = duration.count() / 1000.0;
+    if (elapsed_in_seconds > timeout) {
+        timeout = 0;
+        return true;
+    }
+    return false;
+}
+
+void TimeoutCallback::reset(double timeout_in_seconds) {
+    auto tc(new faiss::TimeoutCallback());
+    tc->set_timeout(timeout_in_seconds); // configure before installing it
+    faiss::InterruptCallback::instance.reset(tc);
+}
+
 } // namespace faiss
diff --git a/faiss/impl/AuxIndexStructures.h b/faiss/impl/AuxIndexStructures.h
index 344a708b78..7e12a1a3af 100644
--- a/faiss/impl/AuxIndexStructures.h
+++ b/faiss/impl/AuxIndexStructures.h
@@ -41,7 +41,6 @@ struct RangeSearchResult {
 
     /// called when lims contains the nb of elements result entries
     /// for each query
-
     virtual void do_allocation();
 
     virtual ~RangeSearchResult();
@@ -162,6 +161,14 @@ struct FAISS_API InterruptCallback {
     static size_t get_period_hint(size_t flops);
 };
 
+struct TimeoutCallback : InterruptCallback {
+    std::chrono::time_point<std::chrono::steady_clock> start;
+    double timeout = 0; // seconds; 0 makes want_interrupt() return false
+    bool want_interrupt() override; // true once the timeout has elapsed
+    void set_timeout(double timeout_in_seconds); // also restarts `start`
+    static void reset(double timeout_in_seconds);
+};
+
 /// set implementation optimized for fast access.
 struct VisitedTable {
     std::vector<uint8_t> visited;
diff --git a/faiss/impl/DistanceComputer.h b/faiss/impl/DistanceComputer.h
index dc46d113fb..5ac3a702c9 100644
--- a/faiss/impl/DistanceComputer.h
+++ b/faiss/impl/DistanceComputer.h
@@ -59,6 +59,52 @@ struct DistanceComputer {
     virtual ~DistanceComputer() {}
 };
 
+/* Wrap the distance computer into one that negates the
+   distances. This makes supporting INNER_PRODUCT search easier */
+
+struct NegativeDistanceComputer : DistanceComputer {
+    /// owned by this
+    DistanceComputer* basedis;
+
+    explicit NegativeDistanceComputer(DistanceComputer* basedis)
+            : basedis(basedis) {}
+
+    void set_query(const float* x) override {
+        basedis->set_query(x);
+    }
+
+    /// compute distance of vector i to current query
+    float operator()(idx_t i) override {
+        return -(*basedis)(i);
+    }
+
+    void distances_batch_4(
+            const idx_t idx0,
+            const idx_t idx1,
+            const idx_t idx2,
+            const idx_t idx3,
+            float& dis0,
+            float& dis1,
+            float& dis2,
+            float& dis3) override {
+        basedis->distances_batch_4(
+                idx0, idx1, idx2, idx3, dis0, dis1, dis2, dis3);
+        dis0 = -dis0;
+        dis1 = -dis1;
+        dis2 = -dis2;
+        dis3 = -dis3;
+    }
+
+    /// compute distance between two stored vectors
+    float symmetric_dis(idx_t i, idx_t j) override {
+        return -basedis->symmetric_dis(i, j);
+    }
+
+    virtual ~NegativeDistanceComputer() {
+        delete basedis;
+    }
+};
+
 /*************************************************************
  * Specialized version of the DistanceComputer when we know that codes are
  * laid out in a flat index.
diff --git a/faiss/impl/FaissAssert.h b/faiss/impl/FaissAssert.h
index 6f666f684c..9d357823d0 100644
--- a/faiss/impl/FaissAssert.h
+++ b/faiss/impl/FaissAssert.h
@@ -94,13 +94,15 @@
         }                                              \
     } while (false)
 
-#define FAISS_THROW_IF_NOT_MSG(X, MSG)                       \
+#define FAISS_THROW_IF_MSG(X, MSG)                           \
     do {                                                     \
-        if (!(X)) {                                          \
+        if (X) {                                             \
             FAISS_THROW_FMT("Error: '%s' failed: " MSG, #X); \
         }                                                    \
     } while (false)
 
+#define FAISS_THROW_IF_NOT_MSG(X, MSG) FAISS_THROW_IF_MSG(!(X), MSG)
+
 #define FAISS_THROW_IF_NOT_FMT(X, FMT, ...)                               \
     do {                                                                  \
         if (!(X)) {                                                       \
diff --git a/faiss/impl/HNSW.cpp b/faiss/impl/HNSW.cpp
index 9fc201ea39..3ba5f72f68 100644
--- a/faiss/impl/HNSW.cpp
+++ b/faiss/impl/HNSW.cpp
@@ -5,15 +5,15 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-// -*- c++ -*-
-
 #include <faiss/impl/HNSW.h>
 
+#include <cstddef>
 #include <string>
 
 #include <faiss/impl/AuxIndexStructures.h>
 #include <faiss/impl/DistanceComputer.h>
 #include <faiss/impl/IDSelector.h>
+#include <faiss/impl/ResultHandler.h>
 #include <faiss/utils/prefetch.h>
 
 #include <faiss/impl/platform_macros.h>
@@ -111,8 +111,8 @@ void HNSW::print_neighbor_stats(int level) const {
            level,
            nb_neighbors(level));
     size_t tot_neigh = 0, tot_common = 0, tot_reciprocal = 0, n_node = 0;
-#pragma omp parallel for reduction(+: tot_neigh) reduction(+: tot_common) \
-  reduction(+: tot_reciprocal) reduction(+: n_node)
+#pragma omp parallel for reduction(+ : tot_neigh) reduction(+ : tot_common) \
+        reduction(+ : tot_reciprocal) reduction(+ : n_node)
     for (int i = 0; i < levels.size(); i++) {
         if (levels[i] > level) {
             n_node++;
@@ -216,8 +216,8 @@ int HNSW::prepare_level_tab(size_t n, bool preset_levels) {
         if (pt_level > max_level)
             max_level = pt_level;
         offsets.push_back(offsets.back() + cum_nb_neighbors(pt_level + 1));
-        neighbors.resize(offsets.back(), -1);
     }
+    neighbors.resize(offsets.back(), -1);
 
     return max_level;
 }
@@ -230,7 +230,14 @@ void HNSW::shrink_neighbor_list(
         DistanceComputer& qdis,
         std::priority_queue<NodeDistFarther>& input,
         std::vector<NodeDistFarther>& output,
-        int max_size) {
+        int max_size,
+        bool keep_max_size_level0) {
+    // This prevents number of neighbors at
+    // level 0 from being shrunk to less than 2 * M.
+    // This is essential in making sure
+    // `faiss::gpu::GpuIndexCagra::copyFrom(IndexHNSWCagra*)` is functional
+    std::vector<NodeDistFarther> outsiders;
+
     while (input.size() > 0) {
         NodeDistFarther v1 = input.top();
         input.pop();
@@ -251,8 +258,15 @@ void HNSW::shrink_neighbor_list(
             if (output.size() >= max_size) {
                 return;
             }
+        } else if (keep_max_size_level0) {
+            outsiders.push_back(v1);
         }
     }
+    size_t idx = 0;
+    while (keep_max_size_level0 && (output.size() < max_size) &&
+           (idx < outsiders.size())) {
+        output.push_back(outsiders[idx++]);
+    }
 }
 
 namespace {
@@ -269,7 +283,8 @@ using NodeDistFarther = HNSW::NodeDistFarther;
 void shrink_neighbor_list(
         DistanceComputer& qdis,
         std::priority_queue<NodeDistCloser>& resultSet1,
-        int max_size) {
+        int max_size,
+        bool keep_max_size_level0 = false) {
     if (resultSet1.size() < max_size) {
         return;
     }
@@ -281,7 +296,8 @@ void shrink_neighbor_list(
         resultSet1.pop();
     }
 
-    HNSW::shrink_neighbor_list(qdis, resultSet, returnlist, max_size);
+    HNSW::shrink_neighbor_list(
+            qdis, resultSet, returnlist, max_size, keep_max_size_level0);
 
     for (NodeDistFarther curen2 : returnlist) {
         resultSet1.emplace(curen2.d, curen2.id);
@@ -295,7 +311,8 @@ void add_link(
         DistanceComputer& qdis,
         storage_idx_t src,
         storage_idx_t dest,
-        int level) {
+        int level,
+        bool keep_max_size_level0 = false) {
     size_t begin, end;
     hnsw.neighbor_range(src, level, &begin, &end);
     if (hnsw.neighbors[end - 1] == -1) {
@@ -320,7 +337,7 @@ void add_link(
         resultSet.emplace(qdis.symmetric_dis(src, neigh), neigh);
     }
 
-    shrink_neighbor_list(qdis, resultSet, end - begin);
+    shrink_neighbor_list(qdis, resultSet, end - begin, keep_max_size_level0);
 
     // ...and back
     size_t i = begin;
@@ -430,7 +447,8 @@ void HNSW::add_links_starting_from(
         float d_nearest,
         int level,
         omp_lock_t* locks,
-        VisitedTable& vt) {
+        VisitedTable& vt,
+        bool keep_max_size_level0) {
     std::priority_queue<NodeDistCloser> link_targets;
 
     search_neighbors_to_add(
@@ -439,13 +457,13 @@ void HNSW::add_links_starting_from(
     // but we can afford only this many neighbors
     int M = nb_neighbors(level);
 
-    ::faiss::shrink_neighbor_list(ptdis, link_targets, M);
+    ::faiss::shrink_neighbor_list(ptdis, link_targets, M, keep_max_size_level0);
 
     std::vector<storage_idx_t> neighbors;
     neighbors.reserve(link_targets.size());
     while (!link_targets.empty()) {
         storage_idx_t other_id = link_targets.top().id;
-        add_link(*this, ptdis, pt_id, other_id, level);
+        add_link(*this, ptdis, pt_id, other_id, level, keep_max_size_level0);
         neighbors.push_back(other_id);
         link_targets.pop();
     }
@@ -453,7 +471,7 @@ void HNSW::add_links_starting_from(
     omp_unset_lock(&locks[pt_id]);
     for (storage_idx_t other_id : neighbors) {
         omp_set_lock(&locks[other_id]);
-        add_link(*this, ptdis, other_id, pt_id, level);
+        add_link(*this, ptdis, other_id, pt_id, level, keep_max_size_level0);
         omp_unset_lock(&locks[other_id]);
     }
     omp_set_lock(&locks[pt_id]);
@@ -468,7 +486,8 @@ void HNSW::add_with_locks(
         int pt_level,
         int pt_id,
         std::vector<omp_lock_t>& locks,
-        VisitedTable& vt) {
+        VisitedTable& vt,
+        bool keep_max_size_level0) {
     //  greedy search on upper levels
 
     storage_idx_t nearest;
@@ -497,7 +516,14 @@ void HNSW::add_with_locks(
 
     for (; level >= 0; level--) {
         add_links_starting_from(
-                ptdis, pt_id, nearest, d_nearest, level, locks.data(), vt);
+                ptdis,
+                pt_id,
+                nearest,
+                d_nearest,
+                level,
+                locks.data(),
+                vt,
+                keep_max_size_level0);
     }
 
     omp_unset_lock(&locks[pt_id]);
@@ -513,17 +539,15 @@ void HNSW::add_with_locks(
  **************************************************************/
 
 namespace {
-
 using MinimaxHeap = HNSW::MinimaxHeap;
 using Node = HNSW::Node;
+using C = HNSW::C;
 /** Do a BFS on the candidates list */
 
 int search_from_candidates(
         const HNSW& hnsw,
         DistanceComputer& qdis,
-        int k,
-        idx_t* I,
-        float* D,
+        ResultHandler<C>& res,
         MinimaxHeap& candidates,
         VisitedTable& vt,
         HNSWStats& stats,
@@ -539,15 +563,16 @@ int search_from_candidates(
     int efSearch = params ? params->efSearch : hnsw.efSearch;
     const IDSelector* sel = params ? params->sel : nullptr;
 
+    C::T threshold = res.threshold;
     for (int i = 0; i < candidates.size(); i++) {
         idx_t v1 = candidates.ids[i];
         float d = candidates.dis[i];
         FAISS_ASSERT(v1 >= 0);
         if (!sel || sel->is_member(v1)) {
-            if (nres < k) {
-                faiss::maxheap_push(++nres, D, I, d, v1);
-            } else if (d < D[0]) {
-                faiss::maxheap_replace_top(nres, D, I, d, v1);
+            if (d < threshold) {
+                if (res.add_result(d, v1)) {
+                    threshold = res.threshold;
+                }
             }
         }
         vt.set(v1);
@@ -609,13 +634,14 @@ int search_from_candidates(
         size_t saved_j[4];
 
         ndis += jmax - begin;
+        threshold = res.threshold;
 
         auto add_to_heap = [&](const size_t idx, const float dis) {
             if (!sel || sel->is_member(idx)) {
-                if (nres < k) {
-                    faiss::maxheap_push(++nres, D, I, dis, idx);
-                } else if (dis < D[0]) {
-                    faiss::maxheap_replace_top(nres, D, I, dis, idx);
+                if (dis < threshold) {
+                    if (res.add_result(dis, idx)) {
+                        threshold = res.threshold;
+                    }
                 }
             }
             candidates.push(idx, dis);
@@ -665,7 +691,7 @@ int search_from_candidates(
         if (candidates.size() == 0) {
             stats.n2++;
         }
-        stats.n3 += ndis;
+        stats.ndis += ndis;
     }
 
     return nres;
@@ -794,24 +820,33 @@ std::priority_queue<HNSW::Node> search_from_candidate_unbounded(
     if (candidates.size() == 0) {
         ++stats.n2;
     }
-    stats.n3 += ndis;
+    stats.ndis += ndis;
 
     return top_candidates;
 }
 
+// k is a lower bound for the MinimaxHeap size; only heap handlers expose it
+int extract_k_from_ResultHandler(ResultHandler<C>& res) {
+    using RH = HeapBlockResultHandler<C>;
+    if (auto hres = dynamic_cast<RH::SingleResultHandler*>(&res)) {
+        return hres->k;
+    }
+    return 1; // not a heap handler: no k available, use 1
+}
+
 } // anonymous namespace
 
 HNSWStats HNSW::search(
         DistanceComputer& qdis,
-        int k,
-        idx_t* I,
-        float* D,
+        ResultHandler<C>& res,
         VisitedTable& vt,
         const SearchParametersHNSW* params) const {
     HNSWStats stats;
     if (entry_point == -1) {
         return stats;
     }
+    int k = extract_k_from_ResultHandler(res);
+
     if (upper_beam == 1) {
         //  greedy search on upper levels
         storage_idx_t nearest = entry_point;
@@ -821,14 +856,14 @@ HNSWStats HNSW::search(
             greedy_update_nearest(*this, qdis, level, nearest, d_nearest);
         }
 
-        int ef = std::max(efSearch, k);
+        int ef = std::max(params ? params->efSearch : efSearch, k);
         if (search_bounded_queue) { // this is the most common branch
             MinimaxHeap candidates(ef);
 
             candidates.push(nearest, d_nearest);
 
             search_from_candidates(
-                    *this, qdis, k, I, D, candidates, vt, stats, 0, 0, params);
+                    *this, qdis, res, candidates, vt, stats, 0, 0, params);
         } else {
             std::priority_queue<Node> top_candidates =
                     search_from_candidate_unbounded(
@@ -843,12 +878,11 @@ HNSWStats HNSW::search(
                 top_candidates.pop();
             }
 
-            int nres = 0;
             while (!top_candidates.empty()) {
                 float d;
                 storage_idx_t label;
                 std::tie(d, label) = top_candidates.top();
-                faiss::maxheap_push(++nres, D, I, d, label);
+                res.add_result(d, label);
                 top_candidates.pop();
             }
         }
@@ -862,6 +896,10 @@ HNSWStats HNSW::search(
         std::vector<idx_t> I_to_next(candidates_size);
         std::vector<float> D_to_next(candidates_size);
 
+        HeapBlockResultHandler<C> block_resh(
+                1, D_to_next.data(), I_to_next.data(), candidates_size);
+        HeapBlockResultHandler<C>::SingleResultHandler resh(block_resh);
+
         int nres = 1;
         I_to_next[0] = entry_point;
         D_to_next[0] = qdis(entry_point);
@@ -877,18 +915,12 @@ HNSWStats HNSW::search(
 
             if (level == 0) {
                 nres = search_from_candidates(
-                        *this, qdis, k, I, D, candidates, vt, stats, 0);
+                        *this, qdis, res, candidates, vt, stats, 0);
             } else {
+                resh.begin(0);
                 nres = search_from_candidates(
-                        *this,
-                        qdis,
-                        candidates_size,
-                        I_to_next.data(),
-                        D_to_next.data(),
-                        candidates,
-                        vt,
-                        stats,
-                        level);
+                        *this, qdis, resh, candidates, vt, stats, level);
+                resh.end();
             }
             vt.advance();
         }
@@ -899,16 +931,17 @@ HNSWStats HNSW::search(
 
 void HNSW::search_level_0(
         DistanceComputer& qdis,
-        int k,
-        idx_t* idxi,
-        float* simi,
+        ResultHandler<C>& res,
         idx_t nprobe,
         const storage_idx_t* nearest_i,
         const float* nearest_d,
         int search_type,
         HNSWStats& search_stats,
-        VisitedTable& vt) const {
+        VisitedTable& vt,
+        const SearchParametersHNSW* params) const {
     const HNSW& hnsw = *this;
+    auto efSearch = params ? params->efSearch : hnsw.efSearch;
+    int k = extract_k_from_ResultHandler(res);
 
     if (search_type == 1) {
         int nres = 0;
@@ -922,7 +955,7 @@ void HNSW::search_level_0(
             if (vt.get(cj))
                 continue;
 
-            int candidates_size = std::max(hnsw.efSearch, int(k));
+            int candidates_size = std::max(efSearch, k);
             MinimaxHeap candidates(candidates_size);
 
             candidates.push(cj, nearest_d[j]);
@@ -930,17 +963,16 @@ void HNSW::search_level_0(
             nres = search_from_candidates(
                     hnsw,
                     qdis,
-                    k,
-                    idxi,
-                    simi,
+                    res,
                     candidates,
                     vt,
                     search_stats,
                     0,
-                    nres);
+                    nres,
+                    params);
         }
     } else if (search_type == 2) {
-        int candidates_size = std::max(hnsw.efSearch, int(k));
+        int candidates_size = std::max(efSearch, int(k));
         candidates_size = std::max(candidates_size, int(nprobe));
 
         MinimaxHeap candidates(candidates_size);
@@ -953,7 +985,7 @@ void HNSW::search_level_0(
         }
 
         search_from_candidates(
-                hnsw, qdis, k, idxi, simi, candidates, vt, search_stats, 0);
+                hnsw, qdis, res, candidates, vt, search_stats, 0, 0, params);
     }
 }
 
diff --git a/faiss/impl/HNSW.h b/faiss/impl/HNSW.h
index c923e0a6ae..f3aacf8a5b 100644
--- a/faiss/impl/HNSW.h
+++ b/faiss/impl/HNSW.h
@@ -5,8 +5,6 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-// -*- c++ -*-
-
 #pragma once
 
 #include <queue>
@@ -42,6 +40,8 @@ namespace faiss {
 struct VisitedTable;
 struct DistanceComputer; // from AuxIndexStructures
 struct HNSWStats;
+template <class C>
+struct ResultHandler;
 
 struct SearchParametersHNSW : SearchParameters {
     int efSearch = 16;
@@ -54,6 +54,9 @@ struct HNSW {
     /// internal storage of vectors (32 bits: this is expensive)
     using storage_idx_t = int32_t;
 
+    // for now we do only these distances
+    using C = CMax<float, int64_t>;
+
     typedef std::pair<float, storage_idx_t> Node;
 
     /** Heap structure that allows fast
@@ -181,7 +184,8 @@ struct HNSW {
             float d_nearest,
             int level,
             omp_lock_t* locks,
-            VisitedTable& vt);
+            VisitedTable& vt,
+            bool keep_max_size_level0 = false);
 
     /** add point pt_id on all levels <= pt_level and build the link
      * structure for them. */
@@ -190,29 +194,27 @@ struct HNSW {
             int pt_level,
             int pt_id,
             std::vector<omp_lock_t>& locks,
-            VisitedTable& vt);
+            VisitedTable& vt,
+            bool keep_max_size_level0 = false);
 
     /// search interface for 1 point, single thread
     HNSWStats search(
             DistanceComputer& qdis,
-            int k,
-            idx_t* I,
-            float* D,
+            ResultHandler<C>& res,
             VisitedTable& vt,
             const SearchParametersHNSW* params = nullptr) const;
 
     /// search only in level 0 from a given vertex
     void search_level_0(
             DistanceComputer& qdis,
-            int k,
-            idx_t* idxi,
-            float* simi,
+            ResultHandler<C>& res,
             idx_t nprobe,
             const storage_idx_t* nearest_i,
             const float* nearest_d,
             int search_type,
             HNSWStats& search_stats,
-            VisitedTable& vt) const;
+            VisitedTable& vt,
+            const SearchParametersHNSW* params = nullptr) const;
 
     void reset();
 
@@ -225,36 +227,27 @@ struct HNSW {
             DistanceComputer& qdis,
             std::priority_queue<NodeDistFarther>& input,
             std::vector<NodeDistFarther>& output,
-            int max_size);
+            int max_size,
+            bool keep_max_size_level0 = false);
 
     void permute_entries(const idx_t* map);
 };
 
 struct HNSWStats {
-    size_t n1, n2, n3;
-    size_t ndis;
-    size_t nreorder;
-
-    HNSWStats(
-            size_t n1 = 0,
-            size_t n2 = 0,
-            size_t n3 = 0,
-            size_t ndis = 0,
-            size_t nreorder = 0)
-            : n1(n1), n2(n2), n3(n3), ndis(ndis), nreorder(nreorder) {}
+    size_t n1 = 0; /// number of vectors searched
+    size_t n2 =
+            0; /// number of queries for which the candidate list is exhausted
+    size_t ndis = 0; /// number of distances computed
 
     void reset() {
-        n1 = n2 = n3 = 0;
+        n1 = n2 = 0;
         ndis = 0;
-        nreorder = 0;
     }
 
     void combine(const HNSWStats& other) {
         n1 += other.n1;
         n2 += other.n2;
-        n3 += other.n3;
         ndis += other.ndis;
-        nreorder += other.nreorder;
     }
 };
 
diff --git a/faiss/impl/LocalSearchQuantizer.cpp b/faiss/impl/LocalSearchQuantizer.cpp
index abbfe74901..943fe32c9d 100644
--- a/faiss/impl/LocalSearchQuantizer.cpp
+++ b/faiss/impl/LocalSearchQuantizer.cpp
@@ -104,10 +104,10 @@ int dgemm_(
 
 namespace {
 
-void fmat_inverse(float* a, int n) {
-    int info;
-    int lwork = n * n;
-    std::vector<int> ipiv(n);
+void fmat_inverse(float* a, FINTEGER n) {
+    FINTEGER info;
+    FINTEGER lwork = n * n;
+    std::vector<FINTEGER> ipiv(n);
     std::vector<float> workspace(lwork);
 
     sgetrf_(&n, &n, a, &n, ipiv.data(), &info);
@@ -123,10 +123,10 @@ void dfvec_add(size_t d, const double* a, const float* b, double* c) {
     }
 }
 
-void dmat_inverse(double* a, int n) {
-    int info;
-    int lwork = n * n;
-    std::vector<int> ipiv(n);
+void dmat_inverse(double* a, FINTEGER n) {
+    FINTEGER info;
+    FINTEGER lwork = n * n;
+    std::vector<FINTEGER> ipiv(n);
     std::vector<double> workspace(lwork);
 
     dgetrf_(&n, &n, a, &n, ipiv.data(), &info);
@@ -628,7 +628,9 @@ void LocalSearchQuantizer::icm_encode_step(
                         {
                             size_t binary_idx = (other_m + 1) * M * K * K +
                                     m * K * K + code2 * K + code;
-                            _mm_prefetch(binaries + binary_idx, _MM_HINT_T0);
+                            _mm_prefetch(
+                                    (const char*)(binaries + binary_idx),
+                                    _MM_HINT_T0);
                         }
                     }
 #endif
diff --git a/faiss/impl/LookupTableScaler.h b/faiss/impl/LookupTableScaler.h
index c553a0f14d..b6438307fb 100644
--- a/faiss/impl/LookupTableScaler.h
+++ b/faiss/impl/LookupTableScaler.h
@@ -38,6 +38,23 @@ struct DummyScaler {
         return simd16uint16(0);
     }
 
+#ifdef __AVX512F__
+    inline simd64uint8 lookup(const simd64uint8&, const simd64uint8&) const {
+        FAISS_THROW_MSG("DummyScaler::lookup should not be called.");
+        return simd64uint8(0);
+    }
+
+    inline simd32uint16 scale_lo(const simd64uint8&) const {
+        FAISS_THROW_MSG("DummyScaler::scale_lo should not be called.");
+        return simd32uint16(0);
+    }
+
+    inline simd32uint16 scale_hi(const simd64uint8&) const {
+        FAISS_THROW_MSG("DummyScaler::scale_hi should not be called.");
+        return simd32uint16(0);
+    }
+#endif
+
     template <class dist_t>
     inline dist_t scale_one(const dist_t&) const {
         FAISS_THROW_MSG("DummyScaler::scale_one should not be called.");
@@ -67,6 +84,23 @@ struct NormTableScaler {
         return (simd16uint16(res) >> 8) * scale_simd;
     }
 
+#ifdef __AVX512F__
+    inline simd64uint8 lookup(const simd64uint8& lut, const simd64uint8& c)
+            const {
+        return lut.lookup_4_lanes(c);
+    }
+
+    inline simd32uint16 scale_lo(const simd64uint8& res) const {
+        auto scale_simd_wide = simd32uint16(scale_simd, scale_simd);
+        return simd32uint16(res) * scale_simd_wide;
+    }
+
+    inline simd32uint16 scale_hi(const simd64uint8& res) const {
+        auto scale_simd_wide = simd32uint16(scale_simd, scale_simd);
+        return (simd32uint16(res) >> 8) * scale_simd_wide;
+    }
+#endif
+
     // for non-SIMD implem 2, 3, 4
     template <class dist_t>
     inline dist_t scale_one(const dist_t& x) const {
diff --git a/faiss/impl/NNDescent.cpp b/faiss/impl/NNDescent.cpp
index 8878349ff6..5afcdaf5b7 100644
--- a/faiss/impl/NNDescent.cpp
+++ b/faiss/impl/NNDescent.cpp
@@ -154,15 +154,20 @@ NNDescent::NNDescent(const int d, const int K) : K(K), d(d) {
 NNDescent::~NNDescent() {}
 
 void NNDescent::join(DistanceComputer& qdis) {
+    idx_t check_period = InterruptCallback::get_period_hint(d * search_L);
+    for (idx_t i0 = 0; i0 < (idx_t)ntotal; i0 += check_period) {
+        idx_t i1 = std::min(i0 + check_period, (idx_t)ntotal);
 #pragma omp parallel for default(shared) schedule(dynamic, 100)
-    for (int n = 0; n < ntotal; n++) {
-        graph[n].join([&](int i, int j) {
-            if (i != j) {
-                float dist = qdis.symmetric_dis(i, j);
-                graph[i].insert(j, dist);
-                graph[j].insert(i, dist);
-            }
-        });
+        for (idx_t n = i0; n < i1; n++) {
+            graph[n].join([&](int i, int j) {
+                if (i != j) {
+                    float dist = qdis.symmetric_dis(i, j);
+                    graph[i].insert(j, dist);
+                    graph[j].insert(i, dist);
+                }
+            });
+        }
+        InterruptCallback::check();
     }
 }
 
@@ -195,8 +200,9 @@ void NNDescent::update() {
         int l = 0;
 
         while ((l < maxl) && (c < S)) {
-            if (nn.pool[l].flag)
+            if (nn.pool[l].flag) {
                 ++c;
+            }
             ++l;
         }
         nn.M = l;
@@ -305,8 +311,9 @@ void NNDescent::generate_eval_set(
     for (int i = 0; i < c.size(); i++) {
         std::vector<Neighbor> tmp;
         for (int j = 0; j < N; j++) {
-            if (c[i] == j)
+            if (c[i] == j) {
                 continue; // skip itself
+            }
             float dist = qdis.symmetric_dis(c[i], j);
             tmp.push_back(Neighbor(j, dist, true));
         }
@@ -360,8 +367,9 @@ void NNDescent::init_graph(DistanceComputer& qdis) {
 
             for (int j = 0; j < S; j++) {
                 int id = tmp[j];
-                if (id == i)
+                if (id == i) {
                     continue;
+                }
                 float dist = qdis.symmetric_dis(i, id);
 
                 graph[i].pool.push_back(Neighbor(id, dist, true));
@@ -418,30 +426,30 @@ void NNDescent::search(
         float* dists,
         VisitedTable& vt) const {
     FAISS_THROW_IF_NOT_MSG(has_built, "The index is not build yet.");
-    int L = std::max(search_L, topk);
+    int L_2 = std::max(search_L, topk);
 
     // candidate pool, the K best items is the result.
-    std::vector<Neighbor> retset(L + 1);
+    std::vector<Neighbor> retset(L_2 + 1);
 
-    // Randomly choose L points to initialize the candidate pool
-    std::vector<int> init_ids(L);
+    // Randomly choose L_2 points to initialize the candidate pool
+    std::vector<int> init_ids(L_2);
     std::mt19937 rng(random_seed);
 
-    gen_random(rng, init_ids.data(), L, ntotal);
-    for (int i = 0; i < L; i++) {
+    gen_random(rng, init_ids.data(), L_2, ntotal);
+    for (int i = 0; i < L_2; i++) {
         int id = init_ids[i];
         float dist = qdis(id);
         retset[i] = Neighbor(id, dist, true);
     }
 
     // Maintain the candidate pool in ascending order
-    std::sort(retset.begin(), retset.begin() + L);
+    std::sort(retset.begin(), retset.begin() + L_2);
 
     int k = 0;
 
-    // Stop until the smallest position updated is >= L
-    while (k < L) {
-        int nk = L;
+    // Stop until the smallest position updated is >= L_2
+    while (k < L_2) {
+        int nk = L_2;
 
         if (retset[k].flag) {
             retset[k].flag = false;
@@ -449,25 +457,28 @@ void NNDescent::search(
 
             for (int m = 0; m < K; ++m) {
                 int id = final_graph[n * K + m];
-                if (vt.get(id))
+                if (vt.get(id)) {
                     continue;
+                }
 
                 vt.set(id);
                 float dist = qdis(id);
-                if (dist >= retset[L - 1].distance)
+                if (dist >= retset[L_2 - 1].distance) {
                     continue;
+                }
 
                 Neighbor nn(id, dist, true);
-                int r = insert_into_pool(retset.data(), L, nn);
+                int r = insert_into_pool(retset.data(), L_2, nn);
 
                 if (r < nk)
                     nk = r;
             }
         }
-        if (nk <= k)
+        if (nk <= k) {
             k = nk;
-        else
+        } else {
             ++k;
+        }
     }
     for (size_t i = 0; i < topk; i++) {
         indices[i] = retset[i].id;
diff --git a/faiss/impl/NSG.cpp b/faiss/impl/NSG.cpp
index 1f30b576b9..c974943343 100644
--- a/faiss/impl/NSG.cpp
+++ b/faiss/impl/NSG.cpp
@@ -25,35 +25,6 @@ namespace {
 // It needs to be smaller than 0
 constexpr int EMPTY_ID = -1;
 
-/* Wrap the distance computer into one that negates the
-   distances. This makes supporting INNER_PRODUCE search easier */
-
-struct NegativeDistanceComputer : DistanceComputer {
-    /// owned by this
-    DistanceComputer* basedis;
-
-    explicit NegativeDistanceComputer(DistanceComputer* basedis)
-            : basedis(basedis) {}
-
-    void set_query(const float* x) override {
-        basedis->set_query(x);
-    }
-
-    /// compute distance of vector i to current query
-    float operator()(idx_t i) override {
-        return -(*basedis)(i);
-    }
-
-    /// compute distance between two stored vectors
-    float symmetric_dis(idx_t i, idx_t j) override {
-        return -basedis->symmetric_dis(i, j);
-    }
-
-    ~NegativeDistanceComputer() override {
-        delete basedis;
-    }
-};
-
 } // namespace
 
 DistanceComputer* storage_distance_computer(const Index* storage) {
diff --git a/faiss/impl/PolysemousTraining.cpp b/faiss/impl/PolysemousTraining.cpp
index 1f01fc9dcf..a3bd400fb6 100644
--- a/faiss/impl/PolysemousTraining.cpp
+++ b/faiss/impl/PolysemousTraining.cpp
@@ -683,18 +683,21 @@ struct RankingScore2 : Score3Computer<float, double> {
     double accum_gt_weight_diff(
             const std::vector<int>& a,
             const std::vector<int>& b) {
-        int nb = b.size(), na = a.size();
+        const auto nb_2 = b.size();
+        const auto na = a.size();
 
         double accu = 0;
-        int j = 0;
-        for (int i = 0; i < na; i++) {
-            int ai = a[i];
-            while (j < nb && ai >= b[j])
+        size_t j = 0;
+        for (size_t i = 0; i < na; i++) {
+            const auto ai = a[i];
+            while (j < nb_2 && ai >= b[j]) {
                 j++;
+            }
 
             double accu_i = 0;
-            for (int k = j; k < b.size(); k++)
+            for (auto k = j; k < b.size(); k++) {
                 accu_i += rank_weight(b[k] - ai);
+            }
 
             accu += rank_weight(ai) * accu_i;
         }
diff --git a/faiss/impl/ProductAdditiveQuantizer.h b/faiss/impl/ProductAdditiveQuantizer.h
index 163d341cf2..a161180b16 100644
--- a/faiss/impl/ProductAdditiveQuantizer.h
+++ b/faiss/impl/ProductAdditiveQuantizer.h
@@ -151,4 +151,4 @@ struct ProductResidualQuantizer : ProductAdditiveQuantizer {
     ProductResidualQuantizer();
 };
 
-}; // namespace faiss
\ No newline at end of file
+} // namespace faiss
diff --git a/faiss/impl/ProductQuantizer.cpp b/faiss/impl/ProductQuantizer.cpp
index 8ae033ca8f..afcb7cbb9d 100644
--- a/faiss/impl/ProductQuantizer.cpp
+++ b/faiss/impl/ProductQuantizer.cpp
@@ -780,10 +780,6 @@ void ProductQuantizer::search_ip(
             init_finalize_heap);
 }
 
-static float sqr(float x) {
-    return x * x;
-}
-
 void ProductQuantizer::compute_sdc_table() {
     sdc_table.resize(M * ksub * ksub);
 
diff --git a/faiss/impl/Quantizer.h b/faiss/impl/Quantizer.h
index 34673211d7..9171448ef5 100644
--- a/faiss/impl/Quantizer.h
+++ b/faiss/impl/Quantizer.h
@@ -11,7 +11,7 @@
 
 namespace faiss {
 
-/** Product Quantizer. Implemented only for METRIC_L2 */
+/** General interface for quantizer objects */
 struct Quantizer {
     size_t d;         ///< size of the input vectors
     size_t code_size; ///< bytes per indexed vector
diff --git a/faiss/impl/ResultHandler.h b/faiss/impl/ResultHandler.h
index 53ed520826..713fe8e49f 100644
--- a/faiss/impl/ResultHandler.h
+++ b/faiss/impl/ResultHandler.h
@@ -12,28 +12,177 @@
 #pragma once
 
 #include <faiss/impl/AuxIndexStructures.h>
+#include <faiss/impl/FaissException.h>
 #include <faiss/utils/Heap.h>
 #include <faiss/utils/partitioning.h>
+#include <iostream>
 
 namespace faiss {
 
+/*****************************************************************
+ * The classes below are intended to be used as template arguments;
+ * they handle results for batches of queries (size nq).
+ * They can be called in two ways:
+ * - by instantiating a SingleResultHandler that tracks results for a single
+ *   query
+ * - with begin_multiple/add_results/end_multiple calls where a whole block of
+ *   results is submitted
+ * All classes are templated on C, which defines whether the min or the max of
+ * results is to be kept.
+ *****************************************************************/
+
+template <class C>
+struct BlockResultHandler {
+    size_t nq; // number of queries for which we search
+
+    explicit BlockResultHandler(size_t nq) : nq(nq) {}
+
+    // currently handled query range
+    size_t i0 = 0, i1 = 0;
+
+    // start collecting results for queries [i0, i1)
+    virtual void begin_multiple(size_t i0_2, size_t i1_2) {
+        this->i0 = i0_2;
+        this->i1 = i1_2;
+    }
+
+    // add results for queries [i0, i1) and database [j0, j1)
+    virtual void add_results(size_t, size_t, const typename C::T*) {}
+
+    // series of results for queries i0..i1 is done
+    virtual void end_multiple() {}
+
+    virtual ~BlockResultHandler() {}
+};
+
+// handler for a single query
+template <class C>
+struct ResultHandler {
+    // if not better than threshold, then not necessary to call add_result
+    typename C::T threshold = 0;
+
+    // return whether threshold was updated
+    virtual bool add_result(typename C::T dis, typename C::TI idx) = 0;
+
+    virtual ~ResultHandler() {}
+};
+
+/*****************************************************************
+ * Single best result handler.
+ * Tracks the only best result, thus avoiding storing
+ * some temporary data in memory.
+ *****************************************************************/
+
+template <class C>
+struct Top1BlockResultHandler : BlockResultHandler<C> {
+    using T = typename C::T;
+    using TI = typename C::TI;
+    using BlockResultHandler<C>::i0;
+    using BlockResultHandler<C>::i1;
+
+    // best distance per query; contains exactly nq elements
+    T* dis_tab;
+    // id of the best result per query; contains exactly nq elements
+    TI* ids_tab;
+
+    Top1BlockResultHandler(size_t nq, T* dis_tab, TI* ids_tab)
+            : BlockResultHandler<C>(nq), dis_tab(dis_tab), ids_tab(ids_tab) {}
+
+    struct SingleResultHandler : ResultHandler<C> {
+        Top1BlockResultHandler& hr;
+        using ResultHandler<C>::threshold;
+
+        TI min_idx; // id of the best result so far, -1 if none
+        size_t current_idx = 0; // query being processed
+
+        explicit SingleResultHandler(Top1BlockResultHandler& hr) : hr(hr) {}
+
+        /// begin results for query # current_idx_2
+        void begin(const size_t current_idx_2) {
+            this->current_idx = current_idx_2;
+            threshold = C::neutral();
+            min_idx = -1;
+        }
+
+        /// add one result for the current query; true if it is the new best
+        bool add_result(T dis, TI idx) final {
+            if (C::cmp(this->threshold, dis)) {
+                threshold = dis;
+                min_idx = idx;
+                return true;
+            }
+            return false;
+        }
+
+        /// series of results is done: write the best one to the output tables
+        void end() {
+            hr.dis_tab[current_idx] = threshold;
+            hr.ids_tab[current_idx] = min_idx;
+        }
+    };
+
+    /// begin: reset best distances of [i0, i1); ids_tab is left untouched
+    void begin_multiple(size_t i0, size_t i1) final {
+        this->i0 = i0;
+        this->i1 = i1;
+
+        for (size_t i = i0; i < i1; i++) {
+            this->dis_tab[i] = C::neutral();
+        }
+    }
+
+    /// add results for queries [i0, i1) x database [j0, j1), row-major input
+    void add_results(size_t j0, size_t j1, const T* dis_tab_2) final {
+        for (int64_t i = i0; i < i1; i++) {
+            const T* dis_tab_i = dis_tab_2 + (j1 - j0) * (i - i0) - j0;
+
+            auto& min_distance = this->dis_tab[i];
+            auto& min_index = this->ids_tab[i];
+
+            for (size_t j = j0; j < j1; j++) {
+                const T distance = dis_tab_i[j];
+
+                if (C::cmp(min_distance, distance)) {
+                    min_distance = distance;
+                    min_index = j;
+                }
+            }
+        }
+    }
+
+    void add_result(const size_t i, const T dis, const TI idx) {
+        auto& min_distance = this->dis_tab[i];
+        auto& min_index = this->ids_tab[i];
+
+        if (C::cmp(min_distance, dis)) {
+            min_distance = dis;
+            min_index = idx;
+        }
+    }
+};
+
 /*****************************************************************
  * Heap based result handler
  *****************************************************************/
 
 template <class C>
-struct HeapResultHandler {
+struct HeapBlockResultHandler : BlockResultHandler<C> {
     using T = typename C::T;
     using TI = typename C::TI;
+    using BlockResultHandler<C>::i0;
+    using BlockResultHandler<C>::i1;
 
-    int nq;
     T* heap_dis_tab;
     TI* heap_ids_tab;
 
     int64_t k; // number of results to keep
 
-    HeapResultHandler(size_t nq, T* heap_dis_tab, TI* heap_ids_tab, size_t k)
-            : nq(nq),
+    HeapBlockResultHandler(
+            size_t nq,
+            T* heap_dis_tab,
+            TI* heap_ids_tab,
+            size_t k)
+            : BlockResultHandler<C>(nq),
               heap_dis_tab(heap_dis_tab),
               heap_ids_tab(heap_ids_tab),
               k(k) {}
@@ -43,30 +192,33 @@ struct HeapResultHandler {
      * called from 1 thread)
      */
 
-    struct SingleResultHandler {
-        HeapResultHandler& hr;
+    struct SingleResultHandler : ResultHandler<C> {
+        HeapBlockResultHandler& hr;
+        using ResultHandler<C>::threshold;
         size_t k;
 
         T* heap_dis;
         TI* heap_ids;
-        T thresh;
 
-        SingleResultHandler(HeapResultHandler& hr) : hr(hr), k(hr.k) {}
+        explicit SingleResultHandler(HeapBlockResultHandler& hr)
+                : hr(hr), k(hr.k) {}
 
         /// begin results for query # i
         void begin(size_t i) {
             heap_dis = hr.heap_dis_tab + i * k;
             heap_ids = hr.heap_ids_tab + i * k;
             heap_heapify<C>(k, heap_dis, heap_ids);
-            thresh = heap_dis[0];
+            threshold = heap_dis[0];
         }
 
         /// add one result for query i
-        void add_result(T dis, TI idx) {
-            if (C::cmp(heap_dis[0], dis)) {
+        bool add_result(T dis, TI idx) final {
+            if (C::cmp(threshold, dis)) {
                 heap_replace_top<C>(k, heap_dis, heap_ids, dis, idx);
-                thresh = heap_dis[0];
+                threshold = heap_dis[0];
+                return true;
             }
+            return false;
         }
 
         /// series of results for query i is done
@@ -79,19 +231,17 @@ struct HeapResultHandler {
      * API for multiple results (called from 1 thread)
      */
 
-    size_t i0, i1;
-
     /// begin
-    void begin_multiple(size_t i0, size_t i1) {
-        this->i0 = i0;
-        this->i1 = i1;
+    void begin_multiple(size_t i0_2, size_t i1_2) final {
+        this->i0 = i0_2;
+        this->i1 = i1_2;
         for (size_t i = i0; i < i1; i++) {
             heap_heapify<C>(k, heap_dis_tab + i * k, heap_ids_tab + i * k);
         }
     }
 
     /// add results for query i0..i1 and j0..j1
-    void add_results(size_t j0, size_t j1, const T* dis_tab) {
+    void add_results(size_t j0, size_t j1, const T* dis_tab) final {
 #pragma omp parallel for
         for (int64_t i = i0; i < i1; i++) {
             T* heap_dis = heap_dis_tab + i * k;
@@ -109,7 +259,7 @@ struct HeapResultHandler {
     }
 
     /// series of results for queries i0..i1 is done
-    void end_multiple() {
+    void end_multiple() final {
         // maybe parallel for
         for (size_t i = i0; i < i1; i++) {
             heap_reorder<C>(k, heap_dis_tab + i * k, heap_ids_tab + i * k);
@@ -128,9 +278,10 @@ struct HeapResultHandler {
 
 /// Reservoir for a single query
 template <class C>
-struct ReservoirTopN {
+struct ReservoirTopN : ResultHandler<C> {
     using T = typename C::T;
     using TI = typename C::TI;
+    using ResultHandler<C>::threshold;
 
     T* vals;
     TI* ids;
@@ -139,8 +290,6 @@ struct ReservoirTopN {
     size_t n;        // number of requested elements
     size_t capacity; // size of storage
 
-    T threshold; // current threshold
-
     ReservoirTopN() {}
 
     ReservoirTopN(size_t n, size_t capacity, T* vals, TI* ids)
@@ -149,15 +298,22 @@ struct ReservoirTopN {
         threshold = C::neutral();
     }
 
-    void add(T val, TI id) {
+    bool add_result(T val, TI id) final {
+        bool updated_threshold = false;
         if (C::cmp(threshold, val)) {
             if (i == capacity) {
                 shrink_fuzzy();
+                updated_threshold = true;
             }
             vals[i] = val;
             ids[i] = id;
             i++;
         }
+        return updated_threshold;
+    }
+
+    void add(T val, TI id) {
+        add_result(val, id);
     }
 
     // reduce storage from capacity to anything
@@ -169,6 +325,11 @@ struct ReservoirTopN {
                 vals, ids, capacity, n, (capacity + n) / 2, &i);
     }
 
+    void shrink() {
+        threshold = partition<C>(vals, ids, i, n);
+        i = n;
+    }
+
     void to_result(T* heap_dis, TI* heap_ids) const {
         for (int j = 0; j < std::min(i, n); j++) {
             heap_push<C>(j + 1, heap_dis, heap_ids, vals[j], ids[j]);
@@ -187,23 +348,24 @@ struct ReservoirTopN {
 };
 
 template <class C>
-struct ReservoirResultHandler {
+struct ReservoirBlockResultHandler : BlockResultHandler<C> {
     using T = typename C::T;
     using TI = typename C::TI;
+    using BlockResultHandler<C>::i0;
+    using BlockResultHandler<C>::i1;
 
-    int nq;
     T* heap_dis_tab;
     TI* heap_ids_tab;
 
     int64_t k;       // number of results to keep
     size_t capacity; // capacity of the reservoirs
 
-    ReservoirResultHandler(
+    ReservoirBlockResultHandler(
             size_t nq,
             T* heap_dis_tab,
             TI* heap_ids_tab,
             size_t k)
-            : nq(nq),
+            : BlockResultHandler<C>(nq),
               heap_dis_tab(heap_dis_tab),
               heap_ids_tab(heap_ids_tab),
               k(k) {
@@ -216,40 +378,34 @@ struct ReservoirResultHandler {
      * called from 1 thread)
      */
 
-    struct SingleResultHandler {
-        ReservoirResultHandler& hr;
+    struct SingleResultHandler : ReservoirTopN<C> {
+        ReservoirBlockResultHandler& hr;
 
         std::vector<T> reservoir_dis;
         std::vector<TI> reservoir_ids;
-        ReservoirTopN<C> res1;
 
-        SingleResultHandler(ReservoirResultHandler& hr)
-                : hr(hr),
-                  reservoir_dis(hr.capacity),
-                  reservoir_ids(hr.capacity) {}
+        explicit SingleResultHandler(ReservoirBlockResultHandler& hr)
+                : ReservoirTopN<C>(hr.k, hr.capacity, nullptr, nullptr),
+                  hr(hr) {}
 
-        size_t i;
+        size_t qno;
 
         /// begin results for query # i
-        void begin(size_t i) {
-            res1 = ReservoirTopN<C>(
-                    hr.k,
-                    hr.capacity,
-                    reservoir_dis.data(),
-                    reservoir_ids.data());
-            this->i = i;
+        void begin(size_t qno_2) {
+            reservoir_dis.resize(hr.capacity);
+            reservoir_ids.resize(hr.capacity);
+            this->vals = reservoir_dis.data();
+            this->ids = reservoir_ids.data();
+            this->i = 0; // size of reservoir
+            this->threshold = C::neutral();
+            this->qno = qno_2;
         }
 
-        /// add one result for query i
-        void add_result(T dis, TI idx) {
-            res1.add(dis, idx);
-        }
-
-        /// series of results for query i is done
+        /// series of results for query qno is done
         void end() {
-            T* heap_dis = hr.heap_dis_tab + i * hr.k;
-            TI* heap_ids = hr.heap_ids_tab + i * hr.k;
-            res1.to_result(heap_dis, heap_ids);
+            T* heap_dis = hr.heap_dis_tab + qno * hr.k;
+            TI* heap_ids = hr.heap_ids_tab + qno * hr.k;
+            this->to_result(heap_dis, heap_ids);
         }
     };
 
@@ -257,44 +413,41 @@ struct ReservoirResultHandler {
      * API for multiple results (called from 1 thread)
      */
 
-    size_t i0, i1;
-
     std::vector<T> reservoir_dis;
     std::vector<TI> reservoir_ids;
     std::vector<ReservoirTopN<C>> reservoirs;
 
     /// begin
-    void begin_multiple(size_t i0, size_t i1) {
-        this->i0 = i0;
-        this->i1 = i1;
+    void begin_multiple(size_t i0_2, size_t i1_2) {
+        this->i0 = i0_2;
+        this->i1 = i1_2;
         reservoir_dis.resize((i1 - i0) * capacity);
         reservoir_ids.resize((i1 - i0) * capacity);
         reservoirs.clear();
-        for (size_t i = i0; i < i1; i++) {
+        for (size_t i = i0_2; i < i1_2; i++) {
             reservoirs.emplace_back(
                     k,
                     capacity,
-                    reservoir_dis.data() + (i - i0) * capacity,
-                    reservoir_ids.data() + (i - i0) * capacity);
+                    reservoir_dis.data() + (i - i0_2) * capacity,
+                    reservoir_ids.data() + (i - i0_2) * capacity);
         }
     }
 
     /// add results for query i0..i1 and j0..j1
     void add_results(size_t j0, size_t j1, const T* dis_tab) {
-        // maybe parallel for
 #pragma omp parallel for
         for (int64_t i = i0; i < i1; i++) {
             ReservoirTopN<C>& reservoir = reservoirs[i - i0];
             const T* dis_tab_i = dis_tab + (j1 - j0) * (i - i0) - j0;
             for (size_t j = j0; j < j1; j++) {
                 T dis = dis_tab_i[j];
-                reservoir.add(dis, j);
+                reservoir.add_result(dis, j);
             }
         }
     }
 
     /// series of results for queries i0..i1 is done
-    void end_multiple() {
+    void end_multiple() final {
         // maybe parallel for
         for (size_t i = i0; i < i1; i++) {
             reservoirs[i - i0].to_result(
@@ -308,29 +461,33 @@ struct ReservoirResultHandler {
  *****************************************************************/
 
 template <class C>
-struct RangeSearchResultHandler {
+struct RangeSearchBlockResultHandler : BlockResultHandler<C> {
     using T = typename C::T;
     using TI = typename C::TI;
+    using BlockResultHandler<C>::i0;
+    using BlockResultHandler<C>::i1;
 
     RangeSearchResult* res;
-    float radius;
+    T radius;
 
-    RangeSearchResultHandler(RangeSearchResult* res, float radius)
-            : res(res), radius(radius) {}
+    RangeSearchBlockResultHandler(RangeSearchResult* res, float radius)
+            : BlockResultHandler<C>(res->nq), res(res), radius(radius) {}
 
     /******************************************************
      * API for 1 result at a time (each SingleResultHandler is
      * called from 1 thread)
      ******************************************************/
 
-    struct SingleResultHandler {
+    struct SingleResultHandler : ResultHandler<C> {
         // almost the same interface as RangeSearchResultHandler
+        using ResultHandler<C>::threshold;
         RangeSearchPartialResult pres;
-        float radius;
         RangeQueryResult* qr = nullptr;
 
-        SingleResultHandler(RangeSearchResultHandler& rh)
-                : pres(rh.res), radius(rh.radius) {}
+        explicit SingleResultHandler(RangeSearchBlockResultHandler& rh)
+                : pres(rh.res) {
+            threshold = rh.radius;
+        }
 
         /// begin results for query # i
         void begin(size_t i) {
@@ -338,17 +495,26 @@ struct RangeSearchResultHandler {
         }
 
         /// add one result for query i
-        void add_result(T dis, TI idx) {
-            if (C::cmp(radius, dis)) {
+        bool add_result(T dis, TI idx) final {
+            if (C::cmp(threshold, dis)) {
                 qr->add(dis, idx);
             }
+            return false;
         }
 
         /// series of results for query i is done
         void end() {}
 
         ~SingleResultHandler() {
-            pres.finalize();
+            try {
+                // finalize the partial result
+                pres.finalize();
+            } catch (const faiss::FaissException& e) {
+                // Do nothing if allocation fails in finalizing partial results.
+#ifndef NDEBUG
+                std::cerr << e.what() << std::endl;
+#endif
+            }
         }
     };
 
@@ -356,16 +522,14 @@ struct RangeSearchResultHandler {
      * API for multiple results (called from 1 thread)
      ******************************************************/
 
-    size_t i0, i1;
-
     std::vector<RangeSearchPartialResult*> partial_results;
     std::vector<size_t> j0s;
     int pr = 0;
 
     /// begin
-    void begin_multiple(size_t i0, size_t i1) {
-        this->i0 = i0;
-        this->i1 = i1;
+    void begin_multiple(size_t i0_2, size_t i1_2) {
+        this->i0 = i0_2;
+        this->i1 = i1_2;
     }
 
     /// add results for query i0..i1 and j0..j1
@@ -404,109 +568,18 @@ struct RangeSearchResultHandler {
         }
     }
 
-    void end_multiple() {}
-
-    ~RangeSearchResultHandler() {
-        if (partial_results.size() > 0) {
-            RangeSearchPartialResult::merge(partial_results);
-        }
-    }
-};
-
-/*****************************************************************
- * Single best result handler.
- * Tracks the only best result, thus avoiding storing
- * some temporary data in memory.
- *****************************************************************/
-
-template <class C>
-struct SingleBestResultHandler {
-    using T = typename C::T;
-    using TI = typename C::TI;
-
-    int nq;
-    // contains exactly nq elements
-    T* dis_tab;
-    // contains exactly nq elements
-    TI* ids_tab;
-
-    SingleBestResultHandler(size_t nq, T* dis_tab, TI* ids_tab)
-            : nq(nq), dis_tab(dis_tab), ids_tab(ids_tab) {}
-
-    struct SingleResultHandler {
-        SingleBestResultHandler& hr;
-
-        T min_dis;
-        TI min_idx;
-        size_t current_idx = 0;
-
-        SingleResultHandler(SingleBestResultHandler& hr) : hr(hr) {}
-
-        /// begin results for query # i
-        void begin(const size_t current_idx) {
-            this->current_idx = current_idx;
-            min_dis = HUGE_VALF;
-            min_idx = -1;
-        }
-
-        /// add one result for query i
-        void add_result(T dis, TI idx) {
-            if (C::cmp(min_dis, dis)) {
-                min_dis = dis;
-                min_idx = idx;
-            }
-        }
-
-        /// series of results for query i is done
-        void end() {
-            hr.dis_tab[current_idx] = min_dis;
-            hr.ids_tab[current_idx] = min_idx;
-        }
-    };
-
-    size_t i0, i1;
-
-    /// begin
-    void begin_multiple(size_t i0, size_t i1) {
-        this->i0 = i0;
-        this->i1 = i1;
-
-        for (size_t i = i0; i < i1; i++) {
-            this->dis_tab[i] = HUGE_VALF;
-        }
-    }
-
-    /// add results for query i0..i1 and j0..j1
-    void add_results(size_t j0, size_t j1, const T* dis_tab) {
-        for (int64_t i = i0; i < i1; i++) {
-            const T* dis_tab_i = dis_tab + (j1 - j0) * (i - i0) - j0;
-
-            auto& min_distance = this->dis_tab[i];
-            auto& min_index = this->ids_tab[i];
-
-            for (size_t j = j0; j < j1; j++) {
-                const T distance = dis_tab_i[j];
-
-                if (C::cmp(min_distance, distance)) {
-                    min_distance = distance;
-                    min_index = j;
-                }
+    ~RangeSearchBlockResultHandler() {
+        try {
+            if (partial_results.size() > 0) {
+                RangeSearchPartialResult::merge(partial_results);
             }
+        } catch (const faiss::FaissException& e) {
+            // Do nothing if allocation fails in merge.
+#ifndef NDEBUG
+            std::cerr << e.what() << std::endl;
+#endif
         }
     }
-
-    void add_result(const size_t i, const T dis, const TI idx) {
-        auto& min_distance = this->dis_tab[i];
-        auto& min_index = this->ids_tab[i];
-
-        if (C::cmp(min_distance, dis)) {
-            min_distance = dis;
-            min_index = idx;
-        }
-    }
-
-    /// series of results for queries i0..i1 is done
-    void end_multiple() {}
 };
 
 } // namespace faiss
diff --git a/faiss/impl/ScalarQuantizer.cpp b/faiss/impl/ScalarQuantizer.cpp
index b6fc6183d9..7ad50189e4 100644
--- a/faiss/impl/ScalarQuantizer.cpp
+++ b/faiss/impl/ScalarQuantizer.cpp
@@ -23,6 +23,7 @@
 #include <faiss/impl/AuxIndexStructures.h>
 #include <faiss/impl/FaissAssert.h>
 #include <faiss/impl/IDSelector.h>
+#include <faiss/utils/bf16.h>
 #include <faiss/utils/fp16.h>
 #include <faiss/utils/utils.h>
 
@@ -91,6 +92,19 @@ struct Codec8bit {
         return _mm256_fmadd_ps(f8, one_255, half_one_255);
     }
 #endif
+
+#ifdef __aarch64__
+    /// NEON: decode components i..i+7 one by one, then pack into two vectors.
+    static FAISS_ALWAYS_INLINE float32x4x2_t
+    decode_8_components(const uint8_t* code, int i) {
+        float32_t decoded[8] = {};
+        for (size_t lane = 0; lane < 8; lane++) {
+            decoded[lane] = decode_component(code, i + lane);
+        }
+        // lanes 0..3 go to val[0], lanes 4..7 to val[1]
+        return {vld1q_f32(decoded), vld1q_f32(decoded + 4)};
+    }
+#endif
 };
 
 struct Codec4bit {
@@ -129,6 +143,19 @@ struct Codec4bit {
         return _mm256_mul_ps(f8, one_255);
     }
 #endif
+
+#ifdef __aarch64__
+    /// NEON: decode components i..i+7 one by one, then pack into two vectors.
+    static FAISS_ALWAYS_INLINE float32x4x2_t
+    decode_8_components(const uint8_t* code, int i) {
+        float32_t decoded[8] = {};
+        for (size_t lane = 0; lane < 8; lane++) {
+            decoded[lane] = decode_component(code, i + lane);
+        }
+        // lanes 0..3 go to val[0], lanes 4..7 to val[1]
+        return {vld1q_f32(decoded), vld1q_f32(decoded + 4)};
+    }
+#endif
 };
 
 struct Codec6bit {
@@ -228,6 +255,19 @@ struct Codec6bit {
     }
 
 #endif
+
+#ifdef __aarch64__
+    /// NEON: decode components i..i+7 one by one, then pack into two vectors.
+    static FAISS_ALWAYS_INLINE float32x4x2_t
+    decode_8_components(const uint8_t* code, int i) {
+        float32_t decoded[8] = {};
+        for (size_t lane = 0; lane < 8; lane++) {
+            decoded[lane] = decode_component(code, i + lane);
+        }
+        // lanes 0..3 go to val[0], lanes 4..7 to val[1]
+        return {vld1q_f32(decoded), vld1q_f32(decoded + 4)};
+    }
+#endif
 };
 
 /*******************************************************************
@@ -293,6 +333,29 @@ struct QuantizerTemplate<Codec, true, 8> : QuantizerTemplate<Codec, true, 1> {
 
 #endif
 
+#ifdef __aarch64__
+
+/// NEON specialization with scalar vmin / vdiff: the same offset and scale
+/// apply to every component, 8 components are reconstructed per call.
+template <class Codec>
+struct QuantizerTemplate<Codec, true, 8> : QuantizerTemplate<Codec, true, 1> {
+    QuantizerTemplate(size_t d, const std::vector<float>& trained)
+            : QuantizerTemplate<Codec, true, 1>(d, trained) {}
+
+    /// Returns vmin + decoded * vdiff for components i..i+7.
+    FAISS_ALWAYS_INLINE float32x4x2_t
+    reconstruct_8_components(const uint8_t* code, int i) const {
+        const float32x4x2_t decoded = Codec::decode_8_components(code, i);
+        // broadcast the scalar offset and scale once, reuse for both halves
+        const float32x4_t offset = vdupq_n_f32(this->vmin);
+        const float32x4_t scale = vdupq_n_f32(this->vdiff);
+        return {vfmaq_f32(offset, decoded.val[0], scale),
+                vfmaq_f32(offset, decoded.val[1], scale)};
+    }
+};
+
+#endif
+
 template <class Codec>
 struct QuantizerTemplate<Codec, false, 1> : ScalarQuantizer::SQuantizer {
     const size_t d;
@@ -350,6 +413,27 @@ struct QuantizerTemplate<Codec, false, 8> : QuantizerTemplate<Codec, false, 1> {
 
 #endif
 
+#ifdef __aarch64__
+
+/// NEON specialization with per-component vmin / vdiff arrays.
+template <class Codec>
+struct QuantizerTemplate<Codec, false, 8> : QuantizerTemplate<Codec, false, 1> {
+    QuantizerTemplate(size_t d, const std::vector<float>& trained)
+            : QuantizerTemplate<Codec, false, 1>(d, trained) {}
+
+    /// Returns vmin[c] + decoded[c] * vdiff[c] for c in i..i+7.
+    FAISS_ALWAYS_INLINE float32x4x2_t
+    reconstruct_8_components(const uint8_t* code, int i) const {
+        const float32x4x2_t decoded = Codec::decode_8_components(code, i);
+        const float32x4x2_t offset = vld1q_f32_x2(this->vmin + i);
+        const float32x4x2_t scale = vld1q_f32_x2(this->vdiff + i);
+        return {vfmaq_f32(offset.val[0], decoded.val[0], scale.val[0]),
+                vfmaq_f32(offset.val[1], decoded.val[1], scale.val[1])};
+    }
+};
+
+#endif
+
 /*******************************************************************
  * FP16 quantizer
  *******************************************************************/
@@ -397,6 +481,88 @@ struct QuantizerFP16<8> : QuantizerFP16<1> {
 
 #endif
 
+#ifdef __aarch64__
+
+template <>
+struct QuantizerFP16<8> : QuantizerFP16<1> {
+    QuantizerFP16(size_t d, const std::vector<float>& trained)
+            : QuantizerFP16<1>(d, trained) {}
+    /// NEON: load the 8 codes of components i..i+7, convert fp16 -> fp32.
+    FAISS_ALWAYS_INLINE float32x4x2_t
+    reconstruct_8_components(const uint8_t* code, int i) const {
+        uint16x4x2_t codei = vld1_u16_x2((const uint16_t*)(code + 2 * i));
+        return {vcvt_f32_f16(vreinterpret_f16_u16(codei.val[0])),
+                vcvt_f32_f16(vreinterpret_f16_u16(codei.val[1]))};
+    }
+};
+#endif
+
+/*******************************************************************
+ * BF16 quantizer
+ *******************************************************************/
+
+template <int SIMDWIDTH>
+struct QuantizerBF16 {};
+
+template <>
+struct QuantizerBF16<1> : ScalarQuantizer::SQuantizer {
+    const size_t d; // number of 16-bit codes per vector
+
+    QuantizerBF16(size_t d, const std::vector<float>& /* unused */) : d(d) {}
+    /// encode the d floats of x into d consecutive 16-bit codes
+    void encode_vector(const float* x, uint8_t* code) const final {
+        for (size_t i = 0; i < d; i++) {
+            ((uint16_t*)code)[i] = encode_bf16(x[i]);
+        }
+    }
+    /// decode d consecutive 16-bit codes into x; code is only read
+    void decode_vector(const uint8_t* code, float* x) const final {
+        for (size_t i = 0; i < d; i++) {
+            x[i] = decode_bf16(((const uint16_t*)code)[i]);
+        }
+    }
+    /// decode the single component i (the cast keeps code const)
+    FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i)
+            const {
+        return decode_bf16(((const uint16_t*)code)[i]);
+    }
+};
+
+#ifdef __AVX2__
+
+template <>
+struct QuantizerBF16<8> : QuantizerBF16<1> {
+    QuantizerBF16(size_t d, const std::vector<float>& trained)
+            : QuantizerBF16<1>(d, trained) {}
+
+    /// AVX2: zero-extend 8 codes to 32 bits, shift into the high half, cast.
+    FAISS_ALWAYS_INLINE __m256
+    reconstruct_8_components(const uint8_t* code, int i) const {
+        const __m128i packed = _mm_loadu_si128((const __m128i*)(code + 2 * i));
+        const __m256i widened = _mm256_cvtepu16_epi32(packed);
+        return _mm256_castsi256_ps(_mm256_slli_epi32(widened, 16));
+    }
+};
+
+#endif
+
+#ifdef __aarch64__
+
+template <>
+struct QuantizerBF16<8> : QuantizerBF16<1> {
+    QuantizerBF16(size_t d, const std::vector<float>& trained)
+            : QuantizerBF16<1>(d, trained) {}
+    /// NEON: widen 8 codes to 32 bits, shift left by 16, reinterpret as fp32.
+    FAISS_ALWAYS_INLINE float32x4x2_t
+    reconstruct_8_components(const uint8_t* code, int i) const {
+        const uint16x4x2_t raw = vld1_u16_x2((const uint16_t*)(code + 2 * i));
+        const uint32x4_t lo = vshlq_n_u32(vmovl_u16(raw.val[0]), 16);
+        const uint32x4_t hi = vshlq_n_u32(vmovl_u16(raw.val[1]), 16);
+        return {vreinterpretq_f32_u32(lo), vreinterpretq_f32_u32(hi)};
+    }
+};
+#endif
+
 /*******************************************************************
  * 8bit_direct quantizer
  *******************************************************************/
@@ -446,6 +612,27 @@ struct Quantizer8bitDirect<8> : Quantizer8bitDirect<1> {
 
 #endif
 
+#ifdef __aarch64__
+
+template <>
+struct Quantizer8bitDirect<8> : Quantizer8bitDirect<1> {
+    Quantizer8bitDirect(size_t d, const std::vector<float>& trained)
+            : Quantizer8bitDirect<1>(d, trained) {}
+
+    /// NEON: convert the 8 uint8 codes at code[i..i+7] to floats. The values
+    /// are widened in registers (u8 -> u16 -> u32 -> f32) instead of going
+    /// through a scalar loop and a stack buffer; every uint8 is exact in fp32.
+    FAISS_ALWAYS_INLINE float32x4x2_t
+    reconstruct_8_components(const uint8_t* code, int i) const {
+        const uint16x8_t wide = vmovl_u8(vld1_u8(code + i));
+        const uint32x4_t lo = vmovl_u16(vget_low_u16(wide));
+        const uint32x4_t hi = vmovl_u16(vget_high_u16(wide));
+        return {vcvtq_f32_u32(lo), vcvtq_f32_u32(hi)};
+    }
+};
+
+#endif
+
 template <int SIMDWIDTH>
 ScalarQuantizer::SQuantizer* select_quantizer_1(
         QuantizerType qtype,
@@ -469,6 +656,8 @@ ScalarQuantizer::SQuantizer* select_quantizer_1(
                     d, trained);
         case ScalarQuantizer::QT_fp16:
             return new QuantizerFP16<SIMDWIDTH>(d, trained);
+        case ScalarQuantizer::QT_bf16:
+            return new QuantizerBF16<SIMDWIDTH>(d, trained);
         case ScalarQuantizer::QT_8bit_direct:
             return new Quantizer8bitDirect<SIMDWIDTH>(d, trained);
     }
@@ -710,8 +899,8 @@ struct SimilarityL2<8> {
         accu8 = _mm256_fmadd_ps(tmp, tmp, accu8);
     }
 
-    FAISS_ALWAYS_INLINE void add_8_components_2(__m256 x, __m256 y) {
-        __m256 tmp = _mm256_sub_ps(y, x);
+    FAISS_ALWAYS_INLINE void add_8_components_2(__m256 x, __m256 y_2) {
+        __m256 tmp = _mm256_sub_ps(y_2, x);
         accu8 = _mm256_fmadd_ps(tmp, tmp, accu8);
     }
 
@@ -728,6 +917,57 @@ struct SimilarityL2<8> {
 
 #endif
 
+#ifdef __aarch64__
+/// NEON L2 similarity: sums squared differences, 8 floats per step.
+template <>
+struct SimilarityL2<8> {
+    static constexpr int simdwidth = 8;
+    static constexpr MetricType metric_type = METRIC_L2;
+
+    const float *y, *yi; // reference vector and read cursor into it
+    explicit SimilarityL2(const float* y) : y(y) {}
+    float32x4x2_t accu8; // two 4-lane partial sums
+
+    /// Reset the accumulators and rewind the cursor to the start of y.
+    FAISS_ALWAYS_INLINE void begin_8() {
+        accu8 = {vdupq_n_f32(0.0f), vdupq_n_f32(0.0f)};
+        yi = y;
+    }
+
+    /// Accumulate (y - x)^2 for the next 8 components of y.
+    FAISS_ALWAYS_INLINE void add_8_components(float32x4x2_t x) {
+        float32x4x2_t yiv = vld1q_f32_x2(yi);
+        yi += 8;
+
+        float32x4_t sub0 = vsubq_f32(yiv.val[0], x.val[0]);
+        float32x4_t sub1 = vsubq_f32(yiv.val[1], x.val[1]);
+
+        accu8 = {vfmaq_f32(accu8.val[0], sub0, sub0),
+                 vfmaq_f32(accu8.val[1], sub1, sub1)};
+    }
+
+    /// Accumulate (y_2 - x)^2; named y_2 so it does not shadow the member y.
+    FAISS_ALWAYS_INLINE void add_8_components_2(
+            float32x4x2_t x,
+            float32x4x2_t y_2) {
+        float32x4_t sub0 = vsubq_f32(y_2.val[0], x.val[0]);
+        float32x4_t sub1 = vsubq_f32(y_2.val[1], x.val[1]);
+
+        accu8 = {vfmaq_f32(accu8.val[0], sub0, sub0),
+                 vfmaq_f32(accu8.val[1], sub1, sub1)};
+    }
+
+    /// Horizontal sum of the 8 accumulator lanes.
+    FAISS_ALWAYS_INLINE float result_8() {
+        float32x4_t sum_0 = vpaddq_f32(accu8.val[0], accu8.val[0]);
+        float32x4_t sum_1 = vpaddq_f32(accu8.val[1], accu8.val[1]);
+        float32x4_t sum2_0 = vpaddq_f32(sum_0, sum_0);
+        float32x4_t sum2_1 = vpaddq_f32(sum_1, sum_1);
+        return vgetq_lane_f32(sum2_0, 0) + vgetq_lane_f32(sum2_1, 0);
+    }
+};
+#endif
+
 template <int SIMDWIDTH>
 struct SimilarityIP {};
 
@@ -801,6 +1041,53 @@ struct SimilarityIP<8> {
 };
 #endif
 
+#ifdef __aarch64__
+
+/// NEON inner-product similarity: multiply-accumulates 8 floats per step.
+template <>
+struct SimilarityIP<8> {
+    static constexpr int simdwidth = 8;
+    static constexpr MetricType metric_type = METRIC_INNER_PRODUCT;
+
+    const float *y, *yi; // reference vector and read cursor into it
+
+    explicit SimilarityIP(const float* y) : y(y) {}
+    float32x4x2_t accu8; // two 4-lane partial sums
+
+    /// Reset the accumulators and rewind the cursor to the start of y.
+    FAISS_ALWAYS_INLINE void begin_8() {
+        accu8.val[0] = vdupq_n_f32(0.0f);
+        accu8.val[1] = vdupq_n_f32(0.0f);
+        yi = y;
+    }
+
+    /// Accumulate x * y for the next 8 components of y.
+    FAISS_ALWAYS_INLINE void add_8_components(float32x4x2_t x) {
+        const float32x4x2_t chunk = vld1q_f32_x2(yi);
+        yi += 8;
+        add_8_components_2(chunk, x);
+    }
+
+    /// Accumulate x1 * x2 lane by lane.
+    FAISS_ALWAYS_INLINE void add_8_components_2(
+            float32x4x2_t x1,
+            float32x4x2_t x2) {
+        accu8.val[0] = vfmaq_f32(accu8.val[0], x1.val[0], x2.val[0]);
+        accu8.val[1] = vfmaq_f32(accu8.val[1], x1.val[1], x2.val[1]);
+    }
+
+    /// Horizontal sum of the 8 accumulator lanes.
+    FAISS_ALWAYS_INLINE float result_8() {
+        // two pairwise adds leave the sum of all 4 lanes in lane 0
+        float32x4_t half0 = vpaddq_f32(accu8.val[0], accu8.val[0]);
+        float32x4_t half1 = vpaddq_f32(accu8.val[1], accu8.val[1]);
+        half0 = vpaddq_f32(half0, half0);
+        half1 = vpaddq_f32(half1, half1);
+        return vgetq_lane_f32(half0, 0) + vgetq_lane_f32(half1, 0);
+    }
+};
+#endif
+
 /*******************************************************************
  * DistanceComputer: combines a similarity and a quantizer to do
  * code-to-vector or code-to-code comparisons
@@ -903,6 +1190,53 @@ struct DCTemplate<Quantizer, Similarity, 8> : SQDistanceComputer {
 
 #endif
 
+#ifdef __aarch64__
+
+template <class Quantizer, class Similarity>
+struct DCTemplate<Quantizer, Similarity, 8> : SQDistanceComputer {
+    using Sim = Similarity;
+
+    Quantizer quant;
+
+    DCTemplate(size_t d, const std::vector<float>& trained)
+            : quant(d, trained) {}
+    /// Both distance routines decode and accumulate 8 components per step.
+    float compute_distance(const float* x, const uint8_t* code) const {
+        Similarity acc(x);
+        acc.begin_8();
+        for (size_t c = 0; c < quant.d; c += 8) {
+            acc.add_8_components(quant.reconstruct_8_components(code, c));
+        }
+        return acc.result_8();
+    }
+
+    float compute_code_distance(const uint8_t* code1, const uint8_t* code2)
+            const {
+        Similarity acc(nullptr);
+        acc.begin_8();
+        for (size_t c = 0; c < quant.d; c += 8) {
+            acc.add_8_components_2(
+                    quant.reconstruct_8_components(code1, c),
+                    quant.reconstruct_8_components(code2, c));
+        }
+        return acc.result_8();
+    }
+
+    void set_query(const float* x) final {
+        q = x;
+    }
+
+    float symmetric_dis(idx_t i, idx_t j) override {
+        return compute_code_distance(
+                codes + i * code_size, codes + j * code_size);
+    }
+
+    float query_to_code(const uint8_t* code) const final {
+        return compute_distance(q, code);
+    }
+};
+#endif
+
 /*******************************************************************
  * DistanceComputerByte: computes distances in the integer domain
  *******************************************************************/
@@ -1019,6 +1353,54 @@ struct DistanceComputerByte<Similarity, 8> : SQDistanceComputer {
 
 #endif
 
+#ifdef __aarch64__
+
+/// aarch64 byte-domain distance computer (scalar loops, no NEON intrinsics).
+template <class Similarity>
+struct DistanceComputerByte<Similarity, 8> : SQDistanceComputer {
+    using Sim = Similarity;
+
+    int d;
+    std::vector<uint8_t> tmp; // query converted to bytes by set_query()
+
+    DistanceComputerByte(int d, const std::vector<float>&) : d(d), tmp(d) {}
+
+    /// Integer inner product or squared L2 distance between two byte codes.
+    int compute_code_distance(const uint8_t* code1, const uint8_t* code2)
+            const {
+        const bool is_ip = Sim::metric_type == METRIC_INNER_PRODUCT;
+        int total = 0;
+        for (int k = 0; k < d; k++) {
+            const int a = code1[k];
+            const int b = code2[k];
+            total += is_ip ? a * b : (a - b) * (a - b);
+        }
+        return total;
+    }
+
+    void set_query(const float* x) final {
+        for (int k = 0; k < d; k++) {
+            tmp[k] = int(x[k]);
+        }
+    }
+
+    int compute_distance(const float* x, const uint8_t* code) {
+        set_query(x);
+        return compute_code_distance(tmp.data(), code);
+    }
+
+    float symmetric_dis(idx_t i, idx_t j) override {
+        return compute_code_distance(
+                codes + i * code_size, codes + j * code_size);
+    }
+
+    float query_to_code(const uint8_t* code) const final {
+        return compute_code_distance(tmp.data(), code);
+    }
+};
+
+#endif
+
 /*******************************************************************
  * select_distance_computer: runtime selection of template
  * specialization
@@ -1065,6 +1447,10 @@ SQDistanceComputer* select_distance_computer(
             return new DCTemplate<QuantizerFP16<SIMDWIDTH>, Sim, SIMDWIDTH>(
                     d, trained);
 
+        case ScalarQuantizer::QT_bf16:
+            return new DCTemplate<QuantizerBF16<SIMDWIDTH>, Sim, SIMDWIDTH>(
+                    d, trained);
+
         case ScalarQuantizer::QT_8bit_direct:
             if (d % 16 == 0) {
                 return new DistanceComputerByte<Sim, SIMDWIDTH>(d, trained);
@@ -1113,6 +1499,10 @@ void ScalarQuantizer::set_derived_sizes() {
             code_size = d * 2;
             bits = 16;
             break;
+        case QT_bf16:
+            code_size = d * 2;
+            bits = 16;
+            break;
     }
 }
 
@@ -1149,13 +1539,14 @@ void ScalarQuantizer::train(size_t n, const float* x) {
             break;
         case QT_fp16:
         case QT_8bit_direct:
+        case QT_bf16:
             // no training necessary
             break;
     }
 }
 
 ScalarQuantizer::SQuantizer* ScalarQuantizer::select_quantizer() const {
-#ifdef USE_F16C
+#if defined(USE_F16C) || defined(__aarch64__)
     if (d % 8 == 0) {
         return select_quantizer_1<8>(qtype, d, trained);
     } else
@@ -1186,7 +1577,7 @@ void ScalarQuantizer::decode(const uint8_t* codes, float* x, size_t n) const {
 SQDistanceComputer* ScalarQuantizer::get_distance_computer(
         MetricType metric) const {
     FAISS_THROW_IF_NOT(metric == METRIC_L2 || metric == METRIC_INNER_PRODUCT);
-#ifdef USE_F16C
+#if defined(USE_F16C) || defined(__aarch64__)
     if (d % 8 == 0) {
         if (metric == METRIC_L2) {
             return select_distance_computer<SimilarityL2<8>>(qtype, d, trained);
@@ -1478,6 +1869,11 @@ InvertedListScanner* sel1_InvertedListScanner(
                     QuantizerFP16<SIMDWIDTH>,
                     Similarity,
                     SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r);
+        case ScalarQuantizer::QT_bf16:
+            return sel2_InvertedListScanner<DCTemplate<
+                    QuantizerBF16<SIMDWIDTH>,
+                    Similarity,
+                    SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r);
         case ScalarQuantizer::QT_8bit_direct:
             if (sq->d % 16 == 0) {
                 return sel2_InvertedListScanner<
@@ -1522,7 +1918,7 @@ InvertedListScanner* ScalarQuantizer::select_InvertedListScanner(
         bool store_pairs,
         const IDSelector* sel,
         bool by_residual) const {
-#ifdef USE_F16C
+#if defined(USE_F16C) || defined(__aarch64__)
     if (d % 8 == 0) {
         return sel0_InvertedListScanner<8>(
                 mt, this, quantizer, store_pairs, sel, by_residual);
diff --git a/faiss/impl/ScalarQuantizer.h b/faiss/impl/ScalarQuantizer.h
index 550a979092..49fd42cc31 100644
--- a/faiss/impl/ScalarQuantizer.h
+++ b/faiss/impl/ScalarQuantizer.h
@@ -32,6 +32,7 @@ struct ScalarQuantizer : Quantizer {
         QT_fp16,
         QT_8bit_direct, ///< fast indexing of uint8s
         QT_6bit,        ///< 6 bits per component
+        QT_bf16,
     };
 
     QuantizerType qtype = QT_8bit;
diff --git a/faiss/impl/code_distance/code_distance-avx2.h b/faiss/impl/code_distance/code_distance-avx2.h
index 0aa1535b28..d37b022441 100644
--- a/faiss/impl/code_distance/code_distance-avx2.h
+++ b/faiss/impl/code_distance/code_distance-avx2.h
@@ -16,6 +16,11 @@
 #include <faiss/impl/ProductQuantizer.h>
 #include <faiss/impl/code_distance/code_distance-generic.h>
 
+// fallback, see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=78782
+#if defined(__GNUC__) && __GNUC__ < 9
+#define _mm_loadu_si64(x) (_mm_loadl_epi64((__m128i_u*)(x)))
+#endif
+
 namespace {
 
 inline float horizontal_sum(const __m128 v) {
diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp
index 3253100369..aa041c0fac 100644
--- a/faiss/impl/index_read.cpp
+++ b/faiss/impl/index_read.cpp
@@ -5,8 +5,6 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-// -*- c++ -*-
-
 #include <faiss/index_io.h>
 
 #include <faiss/impl/io_macros.h>
@@ -398,15 +396,12 @@ static void read_NSG(NSG* nsg, IOReader* f) {
     graph = std::make_shared<nsg::Graph<int>>(N, R);
     std::fill_n(graph->data, N * R, EMPTY_ID);
 
-    int size = 0;
-
     for (int i = 0; i < N; i++) {
         for (int j = 0; j < R + 1; j++) {
             int id;
             READ1(id);
             if (id != EMPTY_ID) {
                 graph->at(i, j) = id;
-                size += 1;
             } else {
                 break;
             }
@@ -534,7 +529,11 @@ Index* read_index(IOReader* f, int io_flags) {
     Index* idx = nullptr;
     uint32_t h;
     READ1(h);
-    if (h == fourcc("IxFI") || h == fourcc("IxF2") || h == fourcc("IxFl")) {
+    if (h == fourcc("null")) {
+        // denotes a missing index, useful for some cases
+        return nullptr;
+    } else if (
+            h == fourcc("IxFI") || h == fourcc("IxF2") || h == fourcc("IxFl")) {
         IndexFlat* idxf;
         if (h == fourcc("IxFI")) {
             idxf = new IndexFlatIP();
@@ -951,7 +950,7 @@ Index* read_index(IOReader* f, int io_flags) {
         idx = idxp;
     } else if (
             h == fourcc("IHNf") || h == fourcc("IHNp") || h == fourcc("IHNs") ||
-            h == fourcc("IHN2")) {
+            h == fourcc("IHN2") || h == fourcc("IHNc")) {
         IndexHNSW* idxhnsw = nullptr;
         if (h == fourcc("IHNf"))
             idxhnsw = new IndexHNSWFlat();
@@ -961,11 +960,19 @@ Index* read_index(IOReader* f, int io_flags) {
             idxhnsw = new IndexHNSWSQ();
         if (h == fourcc("IHN2"))
             idxhnsw = new IndexHNSW2Level();
+        if (h == fourcc("IHNc"))
+            idxhnsw = new IndexHNSWCagra();
         read_index_header(idxhnsw, f);
+        if (h == fourcc("IHNc")) {
+            READ1(idxhnsw->keep_max_size_level0);
+            auto idx_hnsw_cagra = dynamic_cast<IndexHNSWCagra*>(idxhnsw);
+            READ1(idx_hnsw_cagra->base_level_only);
+            READ1(idx_hnsw_cagra->num_base_level_search_entrypoints);
+        }
         read_HNSW(&idxhnsw->hnsw, f);
         idxhnsw->storage = read_index(f, io_flags);
-        idxhnsw->own_fields = true;
-        if (h == fourcc("IHNp")) {
+        idxhnsw->own_fields = idxhnsw->storage != nullptr;
+        if (h == fourcc("IHNp") && !(io_flags & IO_FLAG_PQ_SKIP_SDC_TABLE)) {
             dynamic_cast<IndexPQ*>(idxhnsw->storage)->pq.compute_sdc_table();
         }
         idx = idxhnsw;
diff --git a/faiss/impl/index_write.cpp b/faiss/impl/index_write.cpp
index 84484e799c..0a924d0225 100644
--- a/faiss/impl/index_write.cpp
+++ b/faiss/impl/index_write.cpp
@@ -5,8 +5,6 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-// -*- c++ -*-
-
 #include <faiss/index_io.h>
 
 #include <faiss/impl/io.h>
@@ -338,13 +336,11 @@ static void write_NSG(const NSG* nsg, IOWriter* f) {
     FAISS_THROW_IF_NOT(K == nsg->R);
     FAISS_THROW_IF_NOT(true == graph->own_fields);
 
-    int size = 0;
     for (int i = 0; i < N; i++) {
         for (int j = 0; j < K; j++) {
             int id = graph->at(i, j);
             if (id != EMPTY_ID) {
                 WRITE1(id);
-                size += 1;
             } else {
                 break;
             }
@@ -392,8 +388,12 @@ static void write_ivf_header(const IndexIVF* ivf, IOWriter* f) {
     write_direct_map(&ivf->direct_map, f);
 }
 
-void write_index(const Index* idx, IOWriter* f) {
-    if (const IndexFlat* idxf = dynamic_cast<const IndexFlat*>(idx)) {
+void write_index(const Index* idx, IOWriter* f, int io_flags) {
+    if (idx == nullptr) {
+        // e.g. for a storage component of HNSW that is set to nullptr
+        uint32_t h = fourcc("null");
+        WRITE1(h);
+    } else if (const IndexFlat* idxf = dynamic_cast<const IndexFlat*>(idx)) {
         uint32_t h =
                 fourcc(idxf->metric_type == METRIC_INNER_PRODUCT ? "IxFI"
                                : idxf->metric_type == METRIC_L2  ? "IxF2"
@@ -433,13 +433,14 @@ void write_index(const Index* idx, IOWriter* f) {
         WRITE1(idxr->code_size);
         WRITEVECTOR(idxr->codes);
     } else if (
-            auto* idxr = dynamic_cast<const IndexLocalSearchQuantizer*>(idx)) {
+            auto* idxr_2 =
+                    dynamic_cast<const IndexLocalSearchQuantizer*>(idx)) {
         uint32_t h = fourcc("IxLS");
         WRITE1(h);
         write_index_header(idx, f);
-        write_LocalSearchQuantizer(&idxr->lsq, f);
-        WRITE1(idxr->code_size);
-        WRITEVECTOR(idxr->codes);
+        write_LocalSearchQuantizer(&idxr_2->lsq, f);
+        WRITE1(idxr_2->code_size);
+        WRITEVECTOR(idxr_2->codes);
     } else if (
             const IndexProductResidualQuantizer* idxpr =
                     dynamic_cast<const IndexProductResidualQuantizer*>(idx)) {
@@ -572,26 +573,26 @@ void write_index(const Index* idx, IOWriter* f) {
 
         write_InvertedLists(ivaqfs->invlists, f);
     } else if (
-            const ResidualCoarseQuantizer* idxr =
+            const ResidualCoarseQuantizer* idxr_2 =
                     dynamic_cast<const ResidualCoarseQuantizer*>(idx)) {
         uint32_t h = fourcc("ImRQ");
         WRITE1(h);
         write_index_header(idx, f);
-        write_ResidualQuantizer(&idxr->rq, f);
-        WRITE1(idxr->beam_factor);
+        write_ResidualQuantizer(&idxr_2->rq, f);
+        WRITE1(idxr_2->beam_factor);
     } else if (
-            const Index2Layer* idxp = dynamic_cast<const Index2Layer*>(idx)) {
+            const Index2Layer* idxp_2 = dynamic_cast<const Index2Layer*>(idx)) {
         uint32_t h = fourcc("Ix2L");
         WRITE1(h);
         write_index_header(idx, f);
-        write_index(idxp->q1.quantizer, f);
-        WRITE1(idxp->q1.nlist);
-        WRITE1(idxp->q1.quantizer_trains_alone);
-        write_ProductQuantizer(&idxp->pq, f);
-        WRITE1(idxp->code_size_1);
-        WRITE1(idxp->code_size_2);
-        WRITE1(idxp->code_size);
-        WRITEVECTOR(idxp->codes);
+        write_index(idxp_2->q1.quantizer, f);
+        WRITE1(idxp_2->q1.nlist);
+        WRITE1(idxp_2->q1.quantizer_trains_alone);
+        write_ProductQuantizer(&idxp_2->pq, f);
+        WRITE1(idxp_2->code_size_1);
+        WRITE1(idxp_2->code_size_2);
+        WRITE1(idxp_2->code_size);
+        WRITEVECTOR(idxp_2->codes);
     } else if (
             const IndexScalarQuantizer* idxs =
                     dynamic_cast<const IndexScalarQuantizer*>(idx)) {
@@ -601,15 +602,16 @@ void write_index(const Index* idx, IOWriter* f) {
         write_ScalarQuantizer(&idxs->sq, f);
         WRITEVECTOR(idxs->codes);
     } else if (
-            const IndexLattice* idxl = dynamic_cast<const IndexLattice*>(idx)) {
+            const IndexLattice* idxl_2 =
+                    dynamic_cast<const IndexLattice*>(idx)) {
         uint32_t h = fourcc("IxLa");
         WRITE1(h);
-        WRITE1(idxl->d);
-        WRITE1(idxl->nsq);
-        WRITE1(idxl->scale_nbit);
-        WRITE1(idxl->zn_sphere_codec.r2);
+        WRITE1(idxl_2->d);
+        WRITE1(idxl_2->nsq);
+        WRITE1(idxl_2->scale_nbit);
+        WRITE1(idxl_2->zn_sphere_codec.r2);
         write_index_header(idx, f);
-        WRITEVECTOR(idxl->trained);
+        WRITEVECTOR(idxl_2->trained);
     } else if (
             const IndexIVFFlatDedup* ivfl =
                     dynamic_cast<const IndexIVFFlatDedup*>(idx)) {
@@ -628,11 +630,12 @@ void write_index(const Index* idx, IOWriter* f) {
         }
         write_InvertedLists(ivfl->invlists, f);
     } else if (
-            const IndexIVFFlat* ivfl = dynamic_cast<const IndexIVFFlat*>(idx)) {
+            const IndexIVFFlat* ivfl_2 =
+                    dynamic_cast<const IndexIVFFlat*>(idx)) {
         uint32_t h = fourcc("IwFl");
         WRITE1(h);
-        write_ivf_header(ivfl, f);
-        write_InvertedLists(ivfl->invlists, f);
+        write_ivf_header(ivfl_2, f);
+        write_InvertedLists(ivfl_2->invlists, f);
     } else if (
             const IndexIVFScalarQuantizer* ivsc =
                     dynamic_cast<const IndexIVFScalarQuantizer*>(idx)) {
@@ -759,12 +762,24 @@ void write_index(const Index* idx, IOWriter* f) {
                 : dynamic_cast<const IndexHNSWPQ*>(idx)      ? fourcc("IHNp")
                 : dynamic_cast<const IndexHNSWSQ*>(idx)      ? fourcc("IHNs")
                 : dynamic_cast<const IndexHNSW2Level*>(idx)  ? fourcc("IHN2")
+                : dynamic_cast<const IndexHNSWCagra*>(idx)   ? fourcc("IHNc")
                                                              : 0;
         FAISS_THROW_IF_NOT(h != 0);
         WRITE1(h);
         write_index_header(idxhnsw, f);
+        if (h == fourcc("IHNc")) {
+            WRITE1(idxhnsw->keep_max_size_level0);
+            auto idx_hnsw_cagra = dynamic_cast<const IndexHNSWCagra*>(idxhnsw);
+            WRITE1(idx_hnsw_cagra->base_level_only);
+            WRITE1(idx_hnsw_cagra->num_base_level_search_entrypoints);
+        }
         write_HNSW(&idxhnsw->hnsw, f);
-        write_index(idxhnsw->storage, f);
+        if (io_flags & IO_FLAG_SKIP_STORAGE) {
+            uint32_t n4 = fourcc("null");
+            WRITE1(n4);
+        } else {
+            write_index(idxhnsw->storage, f);
+        }
     } else if (const IndexNSG* idxnsg = dynamic_cast<const IndexNSG*>(idx)) {
         uint32_t h = dynamic_cast<const IndexNSGFlat*>(idx) ? fourcc("INSf")
                 : dynamic_cast<const IndexNSGPQ*>(idx)      ? fourcc("INSp")
@@ -806,19 +821,19 @@ void write_index(const Index* idx, IOWriter* f) {
         WRITE1(idxpqfs->M2);
         WRITEVECTOR(idxpqfs->codes);
     } else if (
-            const IndexIVFPQFastScan* ivpq =
+            const IndexIVFPQFastScan* ivpq_2 =
                     dynamic_cast<const IndexIVFPQFastScan*>(idx)) {
         uint32_t h = fourcc("IwPf");
         WRITE1(h);
-        write_ivf_header(ivpq, f);
-        WRITE1(ivpq->by_residual);
-        WRITE1(ivpq->code_size);
-        WRITE1(ivpq->bbs);
-        WRITE1(ivpq->M2);
-        WRITE1(ivpq->implem);
-        WRITE1(ivpq->qbs2);
-        write_ProductQuantizer(&ivpq->pq, f);
-        write_InvertedLists(ivpq->invlists, f);
+        write_ivf_header(ivpq_2, f);
+        WRITE1(ivpq_2->by_residual);
+        WRITE1(ivpq_2->code_size);
+        WRITE1(ivpq_2->bbs);
+        WRITE1(ivpq_2->M2);
+        WRITE1(ivpq_2->implem);
+        WRITE1(ivpq_2->qbs2);
+        write_ProductQuantizer(&ivpq_2->pq, f);
+        write_InvertedLists(ivpq_2->invlists, f);
     } else if (
             const IndexRowwiseMinMax* imm =
                     dynamic_cast<const IndexRowwiseMinMax*>(idx)) {
@@ -828,26 +843,26 @@ void write_index(const Index* idx, IOWriter* f) {
         write_index_header(imm, f);
         write_index(imm->index, f);
     } else if (
-            const IndexRowwiseMinMaxFP16* imm =
+            const IndexRowwiseMinMaxFP16* imm_2 =
                     dynamic_cast<const IndexRowwiseMinMaxFP16*>(idx)) {
         // IndexRowwiseMinmaxHalf
         uint32_t h = fourcc("IRMh");
         WRITE1(h);
-        write_index_header(imm, f);
-        write_index(imm->index, f);
+        write_index_header(imm_2, f);
+        write_index(imm_2->index, f);
     } else {
         FAISS_THROW_MSG("don't know how to serialize this type of index");
     }
 }
 
-void write_index(const Index* idx, FILE* f) {
+void write_index(const Index* idx, FILE* f, int io_flags) {
     FileIOWriter writer(f);
-    write_index(idx, &writer);
+    write_index(idx, &writer, io_flags);
 }
 
-void write_index(const Index* idx, const char* fname) {
+void write_index(const Index* idx, const char* fname, int io_flags) {
     FileIOWriter writer(fname);
-    write_index(idx, &writer);
+    write_index(idx, &writer, io_flags);
 }
 
 void write_VectorTransform(const VectorTransform* vt, const char* fname) {
diff --git a/faiss/impl/io.cpp b/faiss/impl/io.cpp
index 3837525138..5f5b2d5ebd 100644
--- a/faiss/impl/io.cpp
+++ b/faiss/impl/io.cpp
@@ -20,11 +20,11 @@ namespace faiss {
  * IO functions
  ***********************************************************************/
 
-int IOReader::fileno() {
+int IOReader::filedescriptor() {
     FAISS_THROW_MSG("IOReader does not support memory mapping");
 }
 
-int IOWriter::fileno() {
+int IOWriter::filedescriptor() {
     FAISS_THROW_MSG("IOWriter does not support memory mapping");
 }
 
@@ -85,8 +85,12 @@ size_t FileIOReader::operator()(void* ptr, size_t size, size_t nitems) {
     return fread(ptr, size, nitems, f);
 }
 
-int FileIOReader::fileno() {
+int FileIOReader::filedescriptor() {
+#ifdef _AIX
+    return fileno(f);
+#else
     return ::fileno(f);
+#endif
 }
 
 FileIOWriter::FileIOWriter(FILE* wf) : f(wf) {}
@@ -116,8 +120,12 @@ size_t FileIOWriter::operator()(const void* ptr, size_t size, size_t nitems) {
     return fwrite(ptr, size, nitems, f);
 }
 
-int FileIOWriter::fileno() {
+int FileIOWriter::filedescriptor() {
+#ifdef _AIX
+    return fileno(f);
+#else
     return ::fileno(f);
+#endif
 }
 
 /***********************************************************************
@@ -196,13 +204,13 @@ size_t BufferedIOWriter::operator()(
     while (size > 0) {
         assert(b0 == bsz);
         // now we need to flush to add more bytes
-        size_t ofs = 0;
+        size_t ofs_2 = 0;
         do {
-            assert(ofs < 10000000);
-            size_t written = (*writer)(buffer.data() + ofs, 1, bsz - ofs);
+            assert(ofs_2 < 10000000);
+            size_t written = (*writer)(buffer.data() + ofs_2, 1, bsz - ofs_2);
             FAISS_THROW_IF_NOT(written > 0);
-            ofs += written;
-        } while (ofs != bsz);
+            ofs_2 += written;
+        } while (ofs_2 != bsz);
 
         // copy src to buffer
         size_t nb1 = std::min(bsz, size);
@@ -217,12 +225,12 @@ size_t BufferedIOWriter::operator()(
 }
 
 BufferedIOWriter::~BufferedIOWriter() {
-    size_t ofs = 0;
-    while (ofs != b0) {
-        // printf("Destructor write %zd \n", b0 - ofs);
-        size_t written = (*writer)(buffer.data() + ofs, 1, b0 - ofs);
+    size_t ofs_2 = 0;
+    while (ofs_2 != b0) {
+        // printf("Destructor write %zd \n", b0 - ofs_2);
+        size_t written = (*writer)(buffer.data() + ofs_2, 1, b0 - ofs_2);
         FAISS_THROW_IF_NOT(written > 0);
-        ofs += written;
+        ofs_2 += written;
     }
 }
 
@@ -259,7 +267,7 @@ std::string fourcc_inv_printable(uint32_t x) {
             str += c;
         } else {
             char buf[10];
-            sprintf(buf, "\\x%02x", c);
+            snprintf(buf, sizeof(buf), "\\x%02x", c);
             str += buf;
         }
     }
diff --git a/faiss/impl/io.h b/faiss/impl/io.h
index 8d0605a5a6..59c2e31539 100644
--- a/faiss/impl/io.h
+++ b/faiss/impl/io.h
@@ -32,7 +32,7 @@ struct IOReader {
     virtual size_t operator()(void* ptr, size_t size, size_t nitems) = 0;
 
     // return a file number that can be memory-mapped
-    virtual int fileno();
+    virtual int filedescriptor();
 
     virtual ~IOReader() {}
 };
@@ -45,7 +45,7 @@ struct IOWriter {
     virtual size_t operator()(const void* ptr, size_t size, size_t nitems) = 0;
 
     // return a file number that can be memory-mapped
-    virtual int fileno();
+    virtual int filedescriptor();
 
     virtual ~IOWriter() noexcept(false) {}
 };
@@ -73,7 +73,7 @@ struct FileIOReader : IOReader {
 
     size_t operator()(void* ptr, size_t size, size_t nitems) override;
 
-    int fileno() override;
+    int filedescriptor() override;
 };
 
 struct FileIOWriter : IOWriter {
@@ -88,7 +88,7 @@ struct FileIOWriter : IOWriter {
 
     size_t operator()(const void* ptr, size_t size, size_t nitems) override;
 
-    int fileno() override;
+    int filedescriptor() override;
 };
 
 /*******************************************************
diff --git a/faiss/impl/platform_macros.h b/faiss/impl/platform_macros.h
index aeafb9531a..3fc328535b 100644
--- a/faiss/impl/platform_macros.h
+++ b/faiss/impl/platform_macros.h
@@ -40,11 +40,13 @@
 
 #include <intrin.h>
 
+#ifndef __clang__
 inline int __builtin_ctzll(uint64_t x) {
     unsigned long ret;
     _BitScanForward64(&ret, x);
     return (int)ret;
 }
+#endif
 
 // cudatoolkit provides __builtin_ctz for NVCC >= 11.0
 #if !defined(__CUDACC__) || __CUDACC_VER_MAJOR__ < 11
@@ -55,13 +57,20 @@ inline int __builtin_ctz(unsigned long x) {
 }
 #endif
 
+#ifndef __clang__
 inline int __builtin_clzll(uint64_t x) {
     return (int)__lzcnt64(x);
 }
+#endif
 
 #define __builtin_popcount __popcnt
 #define __builtin_popcountl __popcnt64
 
+#ifndef __clang__
+#define __m128i_u __m128i
+#define __m256i_u __m256i
+#endif
+
 // MSVC does not define __SSEx__, and _M_IX86_FP is only defined on 32-bit
 // processors cf.
 // https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros
@@ -118,6 +127,13 @@ inline int __builtin_clzll(uint64_t x) {
     __pragma(float_control(precise, off, push))
 #define FAISS_PRAGMA_IMPRECISE_FUNCTION_END __pragma(float_control(pop))
 #elif defined(__clang__)
+#if defined(__PPC__)
+#define FAISS_PRAGMA_IMPRECISE_LOOP \
+    _Pragma("clang loop vectorize_width(4) interleave_count(8)")
+#define FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN \
+    _Pragma("float_control(precise, off, push)")
+#define FAISS_PRAGMA_IMPRECISE_FUNCTION_END _Pragma("float_control(pop)")
+#else
 #define FAISS_PRAGMA_IMPRECISE_LOOP \
     _Pragma("clang loop vectorize(enable) interleave(enable)")
 
@@ -135,6 +151,7 @@ inline int __builtin_clzll(uint64_t x) {
 #define FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
 #define FAISS_PRAGMA_IMPRECISE_FUNCTION_END
 #endif
+#endif
 #elif defined(__GNUC__)
 // Unfortunately, GCC does not provide a pragma for detecting it.
 // So, we have to stick to GNUC, which is defined by MANY compilers.
@@ -156,3 +173,17 @@ inline int __builtin_clzll(uint64_t x) {
 #endif
 
 // clang-format on
+
+/*******************************************************
+ * Big-endian detection and byte-swapping macros
+ *******************************************************/
+#if !defined(_MSC_VER) && \
+        (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__))
+#define FAISS_BIG_ENDIAN
+#endif
+
+#define Swap2Bytes(val) ((((val) >> 8) & 0x00FF) | (((val) << 8) & 0xFF00))
+
+#define Swap4Bytes(val)                                           \
+    ((((val) >> 24) & 0x000000FF) | (((val) >> 8) & 0x0000FF00) | \
+     (((val) << 8) & 0x00FF0000) | (((val) << 24) & 0xFF000000))
diff --git a/faiss/impl/pq4_fast_scan.cpp b/faiss/impl/pq4_fast_scan.cpp
index d2cca15de3..127646e0eb 100644
--- a/faiss/impl/pq4_fast_scan.cpp
+++ b/faiss/impl/pq4_fast_scan.cpp
@@ -6,6 +6,7 @@
  */
 
 #include <faiss/impl/FaissAssert.h>
+#include <faiss/impl/platform_macros.h>
 #include <faiss/impl/pq4_fast_scan.h>
 #include <faiss/impl/simd_result_handlers.h>
 
@@ -54,9 +55,17 @@ void pq4_pack_codes(
     FAISS_THROW_IF_NOT(nb % bbs == 0);
     FAISS_THROW_IF_NOT(nsq % 2 == 0);
 
+    if (nb == 0) {
+        return;
+    }
     memset(blocks, 0, nb * nsq / 2);
+#ifdef FAISS_BIG_ENDIAN
+    const uint8_t perm0[16] = {
+            8, 0, 9, 1, 10, 2, 11, 3, 12, 4, 13, 5, 14, 6, 15, 7};
+#else
     const uint8_t perm0[16] = {
             0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15};
+#endif
 
     uint8_t* codes2 = blocks;
     for (size_t i0 = 0; i0 < nb; i0 += bbs) {
@@ -90,8 +99,13 @@ void pq4_pack_codes_range(
         size_t bbs,
         size_t nsq,
         uint8_t* blocks) {
+#ifdef FAISS_BIG_ENDIAN
+    const uint8_t perm0[16] = {
+            8, 0, 9, 1, 10, 2, 11, 3, 12, 4, 13, 5, 14, 6, 15, 7};
+#else
     const uint8_t perm0[16] = {
             0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15};
+#endif
 
     // range of affected blocks
     size_t block0 = i0 / bbs;
diff --git a/faiss/impl/pq4_fast_scan.h b/faiss/impl/pq4_fast_scan.h
index 2e6931f8d3..9f95f76cc1 100644
--- a/faiss/impl/pq4_fast_scan.h
+++ b/faiss/impl/pq4_fast_scan.h
@@ -24,6 +24,9 @@
 
 namespace faiss {
 
+struct NormTableScaler;
+struct SIMDResultHandler;
+
 /** Pack codes for consumption by the SIMD kernels.
  *  The unused bytes are set to 0.
  *
@@ -117,7 +120,6 @@ void pq4_pack_LUT(int nq, int nsq, const uint8_t* src, uint8_t* dest);
  * @param LUT     packed look-up table
  * @param scaler  scaler to scale the encoded norm
  */
-template <class ResultHandler, class Scaler>
 void pq4_accumulate_loop(
         int nq,
         size_t nb,
@@ -125,8 +127,8 @@ void pq4_accumulate_loop(
         int nsq,
         const uint8_t* codes,
         const uint8_t* LUT,
-        ResultHandler& res,
-        const Scaler& scaler);
+        SIMDResultHandler& res,
+        const NormTableScaler* scaler);
 
 /* qbs versions, supported only for bbs=32.
  *
@@ -178,14 +180,13 @@ int pq4_pack_LUT_qbs_q_map(
  * @param res     call-back for the resutls
  * @param scaler  scaler to scale the encoded norm
  */
-template <class ResultHandler, class Scaler>
 void pq4_accumulate_loop_qbs(
         int qbs,
         size_t nb,
         int nsq,
         const uint8_t* codes,
         const uint8_t* LUT,
-        ResultHandler& res,
-        const Scaler& scaler);
+        SIMDResultHandler& res,
+        const NormTableScaler* scaler = nullptr);
 
 } // namespace faiss
diff --git a/faiss/impl/pq4_fast_scan_search_1.cpp b/faiss/impl/pq4_fast_scan_search_1.cpp
index 6197c2be78..ca41f287f2 100644
--- a/faiss/impl/pq4_fast_scan_search_1.cpp
+++ b/faiss/impl/pq4_fast_scan_search_1.cpp
@@ -134,10 +134,8 @@ void accumulate_fixed_blocks(
     }
 }
 
-} // anonymous namespace
-
 template <class ResultHandler, class Scaler>
-void pq4_accumulate_loop(
+void pq4_accumulate_loop_fixed_scaler(
         int nq,
         size_t nb,
         int bbs,
@@ -172,39 +170,55 @@ void pq4_accumulate_loop(
 #undef DISPATCH
 }
 
-// explicit template instantiations
-
-#define INSTANTIATE_ACCUMULATE(TH, C, with_id_map, S)         \
-    template void pq4_accumulate_loop<TH<C, with_id_map>, S>( \
-            int,                                              \
-            size_t,                                           \
-            int,                                              \
-            int,                                              \
-            const uint8_t*,                                   \
-            const uint8_t*,                                   \
-            TH<C, with_id_map>&,                              \
-            const S&);
-
-using DS = DummyScaler;
-using NS = NormTableScaler;
-
-#define INSTANTIATE_3(C, with_id_map)                               \
-    INSTANTIATE_ACCUMULATE(SingleResultHandler, C, with_id_map, DS) \
-    INSTANTIATE_ACCUMULATE(HeapHandler, C, with_id_map, DS)         \
-    INSTANTIATE_ACCUMULATE(ReservoirHandler, C, with_id_map, DS)    \
-                                                                    \
-    INSTANTIATE_ACCUMULATE(SingleResultHandler, C, with_id_map, NS) \
-    INSTANTIATE_ACCUMULATE(HeapHandler, C, with_id_map, NS)         \
-    INSTANTIATE_ACCUMULATE(ReservoirHandler, C, with_id_map, NS)
-
-using Csi = CMax<uint16_t, int>;
-INSTANTIATE_3(Csi, false);
-using CsiMin = CMin<uint16_t, int>;
-INSTANTIATE_3(CsiMin, false);
-
-using Csl = CMax<uint16_t, int64_t>;
-INSTANTIATE_3(Csl, true);
-using CslMin = CMin<uint16_t, int64_t>;
-INSTANTIATE_3(CslMin, true);
+template <class ResultHandler> // handler type fixed here; scaler type picked below
+void pq4_accumulate_loop_fixed_handler(
+        int nq,
+        size_t nb,
+        int bbs,
+        int nsq,
+        const uint8_t* codes,
+        const uint8_t* LUT,
+        ResultHandler& res,
+        const NormTableScaler* scaler) { // scaler may be null
+    if (scaler) { // scaler supplied: run the loop with the NormTableScaler
+        pq4_accumulate_loop_fixed_scaler(
+                nq, nb, bbs, nsq, codes, LUT, res, *scaler);
+    } else { // no scaler: run the loop with a default-constructed DummyScaler
+        DummyScaler dscaler;
+        pq4_accumulate_loop_fixed_scaler(
+                nq, nb, bbs, nsq, codes, LUT, res, dscaler);
+    }
+}
+
+struct Run_pq4_accumulate_loop { // functor: f() is templated on the handler
+    template <class ResultHandler>
+    void f(ResultHandler& res, // handler first, then the loop arguments
+           int nq,
+           size_t nb,
+           int bbs,
+           int nsq,
+           const uint8_t* codes,
+           const uint8_t* LUT,
+           const NormTableScaler* scaler) {
+        pq4_accumulate_loop_fixed_handler( // same values, res placed before scaler
+                nq, nb, bbs, nsq, codes, LUT, res, scaler);
+    }
+};
+
+} // anonymous namespace
+
+void pq4_accumulate_loop(
+        int nq,
+        size_t nb,
+        int bbs,
+        int nsq,
+        const uint8_t* codes,
+        const uint8_t* LUT,
+        SIMDResultHandler& res, // result handler, handed to the dispatcher
+        const NormTableScaler* scaler) { // forwarded unchanged
+    Run_pq4_accumulate_loop consumer; // its f() runs the handler-typed loop
+    dispatch_SIMDResultHanlder( // presumably invokes consumer.f - confirm
+            res, consumer, nq, nb, bbs, nsq, codes, LUT, scaler);
+}
 
 } // namespace faiss
diff --git a/faiss/impl/pq4_fast_scan_search_qbs.cpp b/faiss/impl/pq4_fast_scan_search_qbs.cpp
index 50c0f6217b..bf2ccd1f76 100644
--- a/faiss/impl/pq4_fast_scan_search_qbs.cpp
+++ b/faiss/impl/pq4_fast_scan_search_qbs.cpp
@@ -14,6 +14,9 @@
 
 namespace faiss {
 
+// declared in simd_result_handlers.h
+bool simd_result_handlers_accept_virtual = true;
+
 using namespace simd_result_handlers;
 
 /************************************************************
@@ -28,6 +31,8 @@ namespace {
  * writes results in a ResultHandler
  */
 
+#ifndef __AVX512F__
+
 template <int NQ, class ResultHandler, class Scaler>
 void kernel_accumulate_block(
         int nsq,
@@ -108,6 +113,451 @@ void kernel_accumulate_block(
     }
 }
 
+#else
+
+// A special version for NQ=1.
+// Although the function is long in source form, it compiles to very
+// compact assembly code.
+FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
+template <class ResultHandler, class Scaler>
+void kernel_accumulate_block_avx512_nq1(
+        int nsq,
+        const uint8_t* codes,
+        const uint8_t* LUT,
+        ResultHandler& res,
+        const Scaler& scaler) {
+    // NQ is kept so that this code stays similar to the baseline function
+    constexpr int NQ = 1;
+    // distance accumulators. We can accept more for NQ=1
+    // layout: accu[q][b]: distance accumulator for vectors 32*b..32*b+15
+    simd32uint16 accu[NQ][4];
+    // layout: accu[q][b]: distance accumulator for vectors 32*b+16..32*b+31
+    simd32uint16 accu1[NQ][4];
+
+    for (int q = 0; q < NQ; q++) {
+        for (int b = 0; b < 4; b++) {
+            accu[q][b].clear();
+            accu1[q][b].clear();
+        }
+    }
+
+    // process "nsq - scaler.nscale" part
+    const int nsq_minus_nscale = nsq - scaler.nscale;
+    const int nsq_minus_nscale_8 = (nsq_minus_nscale / 8) * 8;
+    const int nsq_minus_nscale_4 = (nsq_minus_nscale / 4) * 4;
+
+    // process in chunks of 8
+    for (int sq = 0; sq < nsq_minus_nscale_8; sq += 8) {
+        // prefetch
+        simd64uint8 c(codes);
+        codes += 64;
+
+        simd64uint8 c1(codes);
+        codes += 64;
+
+        simd64uint8 mask(0xf);
+        // shift op does not exist for int8...
+        simd64uint8 chi = simd64uint8(simd32uint16(c) >> 4) & mask;
+        simd64uint8 clo = c & mask;
+
+        simd64uint8 c1hi = simd64uint8(simd32uint16(c1) >> 4) & mask;
+        simd64uint8 c1lo = c1 & mask;
+
+        for (int q = 0; q < NQ; q++) {
+            // load LUTs for 4 quantizers
+            simd64uint8 lut(LUT);
+            LUT += 64;
+
+            {
+                simd64uint8 res0 = lut.lookup_4_lanes(clo);
+                simd64uint8 res1 = lut.lookup_4_lanes(chi);
+
+                accu[q][0] += simd32uint16(res0);
+                accu[q][1] += simd32uint16(res0) >> 8;
+
+                accu[q][2] += simd32uint16(res1);
+                accu[q][3] += simd32uint16(res1) >> 8;
+            }
+        }
+
+        for (int q = 0; q < NQ; q++) {
+            // load LUTs for 4 quantizers
+            simd64uint8 lut(LUT);
+            LUT += 64;
+
+            {
+                simd64uint8 res0 = lut.lookup_4_lanes(c1lo);
+                simd64uint8 res1 = lut.lookup_4_lanes(c1hi);
+
+                accu1[q][0] += simd32uint16(res0);
+                accu1[q][1] += simd32uint16(res0) >> 8;
+
+                accu1[q][2] += simd32uint16(res1);
+                accu1[q][3] += simd32uint16(res1) >> 8;
+            }
+        }
+    }
+
+    // process leftovers: a single chunk of size 4
+    if (nsq_minus_nscale_8 != nsq_minus_nscale_4) {
+        // prefetch
+        simd64uint8 c(codes);
+        codes += 64;
+
+        simd64uint8 mask(0xf);
+        // shift op does not exist for int8...
+        simd64uint8 chi = simd64uint8(simd32uint16(c) >> 4) & mask;
+        simd64uint8 clo = c & mask;
+
+        for (int q = 0; q < NQ; q++) {
+            // load LUTs for 4 quantizers
+            simd64uint8 lut(LUT);
+            LUT += 64;
+
+            simd64uint8 res0 = lut.lookup_4_lanes(clo);
+            simd64uint8 res1 = lut.lookup_4_lanes(chi);
+
+            accu[q][0] += simd32uint16(res0);
+            accu[q][1] += simd32uint16(res0) >> 8;
+
+            accu[q][2] += simd32uint16(res1);
+            accu[q][3] += simd32uint16(res1) >> 8;
+        }
+    }
+
+    // process leftovers: a single chunk of size 2
+    if (nsq_minus_nscale_4 != nsq_minus_nscale) {
+        // prefetch
+        simd32uint8 c(codes);
+        codes += 32;
+
+        simd32uint8 mask(0xf);
+        // shift op does not exist for int8...
+        simd32uint8 chi = simd32uint8(simd16uint16(c) >> 4) & mask;
+        simd32uint8 clo = c & mask;
+
+        for (int q = 0; q < NQ; q++) {
+            // load LUTs for 2 quantizers
+            simd32uint8 lut(LUT);
+            LUT += 32;
+
+            simd32uint8 res0 = lut.lookup_2_lanes(clo);
+            simd32uint8 res1 = lut.lookup_2_lanes(chi);
+
+            accu[q][0] += simd32uint16(simd16uint16(res0));
+            accu[q][1] += simd32uint16(simd16uint16(res0) >> 8);
+
+            accu[q][2] += simd32uint16(simd16uint16(res1));
+            accu[q][3] += simd32uint16(simd16uint16(res1) >> 8);
+        }
+    }
+
+    // process "sq" part
+    const int nscale = scaler.nscale;
+    const int nscale_8 = (nscale / 8) * 8;
+    const int nscale_4 = (nscale / 4) * 4;
+
+    // process in chunks of 8
+    for (int sq = 0; sq < nscale_8; sq += 8) {
+        // prefetch
+        simd64uint8 c(codes);
+        codes += 64;
+
+        simd64uint8 c1(codes);
+        codes += 64;
+
+        simd64uint8 mask(0xf);
+        // shift op does not exist for int8...
+        simd64uint8 chi = simd64uint8(simd32uint16(c) >> 4) & mask;
+        simd64uint8 clo = c & mask;
+
+        simd64uint8 c1hi = simd64uint8(simd32uint16(c1) >> 4) & mask;
+        simd64uint8 c1lo = c1 & mask;
+
+        for (int q = 0; q < NQ; q++) {
+            // load LUTs for 4 quantizers
+            simd64uint8 lut(LUT);
+            LUT += 64;
+
+            {
+                simd64uint8 res0 = scaler.lookup(lut, clo);
+                accu[q][0] += scaler.scale_lo(res0); // handle vectors 0..15
+                accu[q][1] += scaler.scale_hi(res0); // handle vectors 16..31
+
+                simd64uint8 res1 = scaler.lookup(lut, chi);
+                accu[q][2] += scaler.scale_lo(res1); // handle vectors 32..47
+                accu[q][3] += scaler.scale_hi(res1); //  handle vectors 48..63
+            }
+        }
+
+        for (int q = 0; q < NQ; q++) {
+            // load LUTs for 4 quantizers
+            simd64uint8 lut(LUT);
+            LUT += 64;
+
+            {
+                simd64uint8 res0 = scaler.lookup(lut, c1lo);
+                accu1[q][0] += scaler.scale_lo(res0); // handle vectors 0..7
+                accu1[q][1] += scaler.scale_hi(res0); // handle vectors 8..15
+
+                simd64uint8 res1 = scaler.lookup(lut, c1hi);
+                accu1[q][2] += scaler.scale_lo(res1); // handle vectors 16..23
+                accu1[q][3] += scaler.scale_hi(res1); //  handle vectors 24..31
+            }
+        }
+    }
+
+    // process leftovers: a single chunk of size 4
+    if (nscale_8 != nscale_4) {
+        // prefetch
+        simd64uint8 c(codes);
+        codes += 64;
+
+        simd64uint8 mask(0xf);
+        // shift op does not exist for int8...
+        simd64uint8 chi = simd64uint8(simd32uint16(c) >> 4) & mask;
+        simd64uint8 clo = c & mask;
+
+        for (int q = 0; q < NQ; q++) {
+            // load LUTs for 4 quantizers
+            simd64uint8 lut(LUT);
+            LUT += 64;
+
+            simd64uint8 res0 = scaler.lookup(lut, clo);
+            accu[q][0] += scaler.scale_lo(res0); // handle vectors 0..15
+            accu[q][1] += scaler.scale_hi(res0); // handle vectors 16..31
+
+            simd64uint8 res1 = scaler.lookup(lut, chi);
+            accu[q][2] += scaler.scale_lo(res1); // handle vectors 32..47
+            accu[q][3] += scaler.scale_hi(res1); //  handle vectors 48..63
+        }
+    }
+
+    // process leftovers: a single chunk of size 2
+    if (nscale_4 != nscale) {
+        // prefetch
+        simd32uint8 c(codes);
+        codes += 32;
+
+        simd32uint8 mask(0xf);
+        // shift op does not exist for int8...
+        simd32uint8 chi = simd32uint8(simd16uint16(c) >> 4) & mask;
+        simd32uint8 clo = c & mask;
+
+        for (int q = 0; q < NQ; q++) {
+            // load LUTs for 2 quantizers
+            simd32uint8 lut(LUT);
+            LUT += 32;
+
+            simd32uint8 res0 = scaler.lookup(lut, clo);
+            accu[q][0] +=
+                    simd32uint16(scaler.scale_lo(res0)); // handle vectors 0..7
+            accu[q][1] +=
+                    simd32uint16(scaler.scale_hi(res0)); // handle vectors 8..15
+
+            simd32uint8 res1 = scaler.lookup(lut, chi);
+            accu[q][2] += simd32uint16(
+                    scaler.scale_lo(res1)); // handle vectors 16..23
+            accu[q][3] += simd32uint16(
+                    scaler.scale_hi(res1)); //  handle vectors 24..31
+        }
+    }
+
+    for (int q = 0; q < NQ; q++) {
+        for (int b = 0; b < 4; b++) {
+            accu[q][b] += accu1[q][b];
+        }
+    }
+
+    for (int q = 0; q < NQ; q++) {
+        accu[q][0] -= accu[q][1] << 8;
+        simd16uint16 dis0 = combine4x2(accu[q][0], accu[q][1]);
+        accu[q][2] -= accu[q][3] << 8;
+        simd16uint16 dis1 = combine4x2(accu[q][2], accu[q][3]);
+        res.handle(q, 0, dis0, dis1);
+    }
+}
+
+// general-purpose case: accumulate one block of distances for NQ queries
+FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
+template <int NQ, class ResultHandler, class Scaler>
+void kernel_accumulate_block_avx512_nqx(
+        int nsq,
+        const uint8_t* codes,
+        const uint8_t* LUT,
+        ResultHandler& res,
+        const Scaler& scaler) {
+    // NQA >= 1 so accu is never zero-sized (keeps the windows compiler happy)
+    constexpr int NQA = NQ > 0 ? NQ : 1;
+    // distance accumulators
+    // layout: accu[q][b]: distance accumulator for vectors 8*b..8*b+7
+    simd32uint16 accu[NQA][4];
+
+    for (int q = 0; q < NQ; q++) {
+        for (int b = 0; b < 4; b++) {
+            accu[q][b].clear();
+        }
+    }
+
+    // process the unscaled "nsq - scaler.nscale" part (plain LUT lookups)
+    const int nsq_minus_nscale = nsq - scaler.nscale;
+    const int nsq_minus_nscale_4 = (nsq_minus_nscale / 4) * 4;
+
+    // process in chunks of 4 quantizers (64 bytes of codes per chunk)
+    for (int sq = 0; sq < nsq_minus_nscale_4; sq += 4) {
+        // load the next 64 bytes of codes
+        simd64uint8 c(codes);
+        codes += 64;
+
+        simd64uint8 mask(0xf);
+        // shift op does not exist for int8: shift as uint16, then mask nibbles
+        simd64uint8 chi = simd64uint8(simd32uint16(c) >> 4) & mask;
+        simd64uint8 clo = c & mask;
+
+        for (int q = 0; q < NQ; q++) {
+            // load LUTs for 4 quantizers: two 32-byte halves, NQ * 32 apart
+            simd32uint8 lut_a(LUT);
+            simd32uint8 lut_b(LUT + NQ * 32);
+
+            simd64uint8 lut(lut_a, lut_b);
+            LUT += 32;
+
+            {
+                simd64uint8 res0 = lut.lookup_4_lanes(clo);
+                simd64uint8 res1 = lut.lookup_4_lanes(chi);
+
+                accu[q][0] += simd32uint16(res0); // full 16-bit words
+                accu[q][1] += simd32uint16(res0) >> 8; // high bytes only
+
+                accu[q][2] += simd32uint16(res1); // full 16-bit words
+                accu[q][3] += simd32uint16(res1) >> 8; // high bytes only
+            }
+        }
+
+        LUT += NQ * 32; // skip the second halves already read as lut_b
+    }
+
+    // process leftovers: one chunk of size 2 (NOTE: assumes remainder == 2)
+    if (nsq_minus_nscale_4 != nsq_minus_nscale) {
+        // load the next 32 bytes of codes
+        simd32uint8 c(codes);
+        codes += 32;
+
+        simd32uint8 mask(0xf);
+        // shift op does not exist for int8: shift as uint16, then mask nibbles
+        simd32uint8 chi = simd32uint8(simd16uint16(c) >> 4) & mask;
+        simd32uint8 clo = c & mask;
+
+        for (int q = 0; q < NQ; q++) {
+            // load LUTs for 2 quantizers
+            simd32uint8 lut(LUT);
+            LUT += 32;
+
+            simd32uint8 res0 = lut.lookup_2_lanes(clo);
+            simd32uint8 res1 = lut.lookup_2_lanes(chi);
+
+            accu[q][0] += simd32uint16(simd16uint16(res0));
+            accu[q][1] += simd32uint16(simd16uint16(res0) >> 8);
+
+            accu[q][2] += simd32uint16(simd16uint16(res1));
+            accu[q][3] += simd32uint16(simd16uint16(res1) >> 8);
+        }
+    }
+
+    // process the scaled "scaler.nscale" part (lookups go through the scaler)
+    const int nscale = scaler.nscale;
+    const int nscale_4 = (nscale / 4) * 4;
+
+    // process in chunks of 4 quantizers (64 bytes of codes per chunk)
+    for (int sq = 0; sq < nscale_4; sq += 4) {
+        // load the next 64 bytes of codes
+        simd64uint8 c(codes);
+        codes += 64;
+
+        simd64uint8 mask(0xf);
+        // shift op does not exist for int8: shift as uint16, then mask nibbles
+        simd64uint8 chi = simd64uint8(simd32uint16(c) >> 4) & mask;
+        simd64uint8 clo = c & mask;
+
+        for (int q = 0; q < NQ; q++) {
+            // load LUTs for 4 quantizers: two 32-byte halves, NQ * 32 apart
+            simd32uint8 lut_a(LUT);
+            simd32uint8 lut_b(LUT + NQ * 32);
+
+            simd64uint8 lut(lut_a, lut_b);
+            LUT += 32;
+
+            {
+                simd64uint8 res0 = scaler.lookup(lut, clo);
+                accu[q][0] += scaler.scale_lo(res0); // handle vectors 0..7
+                accu[q][1] += scaler.scale_hi(res0); // handle vectors 8..15
+
+                simd64uint8 res1 = scaler.lookup(lut, chi);
+                accu[q][2] += scaler.scale_lo(res1); // handle vectors 16..23
+                accu[q][3] += scaler.scale_hi(res1); //  handle vectors 24..31
+            }
+        }
+
+        LUT += NQ * 32; // skip the second halves already read as lut_b
+    }
+
+    // process leftovers: one chunk of size 2 (NOTE: assumes remainder == 2)
+    if (nscale_4 != nscale) {
+        // load the next 32 bytes of codes
+        simd32uint8 c(codes);
+        codes += 32;
+
+        simd32uint8 mask(0xf);
+        // shift op does not exist for int8: shift as uint16, then mask nibbles
+        simd32uint8 chi = simd32uint8(simd16uint16(c) >> 4) & mask;
+        simd32uint8 clo = c & mask;
+
+        for (int q = 0; q < NQ; q++) {
+            // load LUTs for 2 quantizers
+            simd32uint8 lut(LUT);
+            LUT += 32;
+
+            simd32uint8 res0 = scaler.lookup(lut, clo);
+            accu[q][0] +=
+                    simd32uint16(scaler.scale_lo(res0)); // handle vectors 0..7
+            accu[q][1] +=
+                    simd32uint16(scaler.scale_hi(res0)); // handle vectors 8..15
+
+            simd32uint8 res1 = scaler.lookup(lut, chi);
+            accu[q][2] += simd32uint16(
+                    scaler.scale_lo(res1)); // handle vectors 16..23
+            accu[q][3] += simd32uint16(
+                    scaler.scale_hi(res1)); //  handle vectors 24..31
+        }
+    }
+
+    for (int q = 0; q < NQ; q++) {
+        accu[q][0] -= accu[q][1] << 8; // subtract the high-byte sums (<< 8)
+        simd16uint16 dis0 = combine4x2(accu[q][0], accu[q][1]);
+        accu[q][2] -= accu[q][3] << 8; // subtract the high-byte sums (<< 8)
+        simd16uint16 dis1 = combine4x2(accu[q][2], accu[q][3]);
+        res.handle(q, 0, dis0, dis1); // hand both halves over as block b = 0
+    }
+}
+
+template <int NQ, class ResultHandler, class Scaler>
+void kernel_accumulate_block(
+        int nsq,
+        const uint8_t* codes,
+        const uint8_t* LUT,
+        ResultHandler& res,
+        const Scaler& scaler) { // compile-time dispatch on NQ, args forwarded
+    if constexpr (NQ == 1) { // dedicated single-query kernel
+        kernel_accumulate_block_avx512_nq1<ResultHandler, Scaler>(
+                nsq, codes, LUT, res, scaler);
+    } else { // general-purpose kernel for every other NQ
+        kernel_accumulate_block_avx512_nqx<NQ, ResultHandler, Scaler>(
+                nsq, codes, LUT, res, scaler);
+    }
+}
+
+#endif
+
 // handle at most 4 blocks of queries
 template <int QBS, class ResultHandler, class Scaler>
 void accumulate_q_4step(
@@ -194,10 +644,8 @@ void accumulate(
 #undef DISPATCH
 }
 
-} // namespace
-
 template <class ResultHandler, class Scaler>
-void pq4_accumulate_loop_qbs(
+void pq4_accumulate_loop_qbs_fixed_scaler(
         int qbs,
         size_t ntotal2,
         int nsq,
@@ -272,49 +720,39 @@ void pq4_accumulate_loop_qbs(
     }
 }
 
-// explicit template instantiations
-
-#define INSTANTIATE_ACCUMULATE_Q(RH)                            \
-    template void pq4_accumulate_loop_qbs<RH, DummyScaler>(     \
-            int,                                                \
-            size_t,                                             \
-            int,                                                \
-            const uint8_t*,                                     \
-            const uint8_t*,                                     \
-            RH&,                                                \
-            const DummyScaler&);                                \
-    template void pq4_accumulate_loop_qbs<RH, NormTableScaler>( \
-            int,                                                \
-            size_t,                                             \
-            int,                                                \
-            const uint8_t*,                                     \
-            const uint8_t*,                                     \
-            RH&,                                                \
-            const NormTableScaler&);
-
-using Csi = CMax<uint16_t, int>;
-INSTANTIATE_ACCUMULATE_Q(SingleResultHandler<Csi>)
-INSTANTIATE_ACCUMULATE_Q(HeapHandler<Csi>)
-INSTANTIATE_ACCUMULATE_Q(ReservoirHandler<Csi>)
-using Csi2 = CMin<uint16_t, int>;
-INSTANTIATE_ACCUMULATE_Q(SingleResultHandler<Csi2>)
-INSTANTIATE_ACCUMULATE_Q(HeapHandler<Csi2>)
-INSTANTIATE_ACCUMULATE_Q(ReservoirHandler<Csi2>)
-
-using Cfl = CMax<uint16_t, int64_t>;
-using HHCsl = HeapHandler<Cfl, true>;
-using RHCsl = ReservoirHandler<Cfl, true>;
-using SHCsl = SingleResultHandler<Cfl, true>;
-INSTANTIATE_ACCUMULATE_Q(HHCsl)
-INSTANTIATE_ACCUMULATE_Q(RHCsl)
-INSTANTIATE_ACCUMULATE_Q(SHCsl)
-using Cfl2 = CMin<uint16_t, int64_t>;
-using HHCsl2 = HeapHandler<Cfl2, true>;
-using RHCsl2 = ReservoirHandler<Cfl2, true>;
-using SHCsl2 = SingleResultHandler<Cfl2, true>;
-INSTANTIATE_ACCUMULATE_Q(HHCsl2)
-INSTANTIATE_ACCUMULATE_Q(RHCsl2)
-INSTANTIATE_ACCUMULATE_Q(SHCsl2)
+struct Run_pq4_accumulate_loop_qbs { // consumer handed to the dispatcher
+    template <class ResultHandler>
+    void f(ResultHandler& res,
+           int qbs,
+           size_t nb,
+           int nsq,
+           const uint8_t* codes,
+           const uint8_t* LUT,
+           const NormTableScaler* scaler) { // scaler may be null
+        if (scaler) { // use the caller's NormTableScaler
+            pq4_accumulate_loop_qbs_fixed_scaler(
+                    qbs, nb, nsq, codes, LUT, res, *scaler);
+        } else { // null scaler: run with a default-constructed DummyScaler
+            DummyScaler dummy;
+            pq4_accumulate_loop_qbs_fixed_scaler(
+                    qbs, nb, nsq, codes, LUT, res, dummy);
+        }
+    }
+};
+
+} // namespace
+
+void pq4_accumulate_loop_qbs(
+        int qbs,
+        size_t nb,
+        int nsq,
+        const uint8_t* codes,
+        const uint8_t* LUT,
+        SIMDResultHandler& res, // handler received through its base type
+        const NormTableScaler* scaler) { // may be null, see consumer.f()
+    Run_pq4_accumulate_loop_qbs consumer; // presumably called back with the concrete handler type; confirm in dispatcher
+    dispatch_SIMDResultHanlder(res, consumer, qbs, nb, nsq, codes, LUT, scaler);
+}
 
 /***************************************************************
  * Packing functions
diff --git a/faiss/impl/residual_quantizer_encode_steps.cpp b/faiss/impl/residual_quantizer_encode_steps.cpp
index d28537a577..8db6f9e5f7 100644
--- a/faiss/impl/residual_quantizer_encode_steps.cpp
+++ b/faiss/impl/residual_quantizer_encode_steps.cpp
@@ -292,8 +292,8 @@ void beam_search_encode_step(
                     cent_ids.data() + i * beam_size * new_beam_size;
 
             // here we could be a tad more efficient by merging sorted arrays
-            for (int i = 0; i < new_beam_size; i++) {
-                new_distances_i[i] = C::neutral();
+            for (int i_2 = 0; i_2 < new_beam_size; i_2++) {
+                new_distances_i[i_2] = C::neutral();
             }
             std::vector<int> perm(new_beam_size, -1);
             heap_addn<C>(
@@ -325,8 +325,8 @@ void beam_search_encode_step(
             const float* cent_distances_i =
                     cent_distances.data() + i * beam_size * K;
             // then we have to select the best results
-            for (int i = 0; i < new_beam_size; i++) {
-                new_distances_i[i] = C::neutral();
+            for (int i_2 = 0; i_2 < new_beam_size; i_2++) {
+                new_distances_i[i_2] = C::neutral();
             }
             std::vector<int> perm(new_beam_size, -1);
 
@@ -558,8 +558,8 @@ void beam_search_encode_step_tab(
         const float* cent_distances_i = cent_distances.data();
 
         // then we have to select the best results
-        for (int i = 0; i < new_beam_size; i++) {
-            new_distances_i[i] = C::neutral();
+        for (int i_2 = 0; i_2 < new_beam_size; i_2++) {
+            new_distances_i[i_2] = C::neutral();
         }
         std::vector<int> perm(new_beam_size, -1);
 
diff --git a/faiss/impl/simd_result_handlers.h b/faiss/impl/simd_result_handlers.h
index f2b302b3d3..2fa18fa340 100644
--- a/faiss/impl/simd_result_handlers.h
+++ b/faiss/impl/simd_result_handlers.h
@@ -14,40 +14,87 @@
 #include <faiss/utils/Heap.h>
 #include <faiss/utils/simdlib.h>
 
+#include <faiss/impl/FaissAssert.h>
+#include <faiss/impl/IDSelector.h>
+#include <faiss/impl/ResultHandler.h>
 #include <faiss/impl/platform_macros.h>
 #include <faiss/utils/AlignedTable.h>
 #include <faiss/utils/partitioning.h>
 
 /** This file contains callbacks for kernels that compute distances.
- *
- * The SIMDResultHandler object is intended to be templated and inlined.
- * Methods:
- * - handle(): called when 32 distances are computed and provided in two
- *   simd16uint16. (q, b) indicate which entry it is in the block.
- * - set_block_origin(): set the sub-matrix that is being computed
  */
 
 namespace faiss {
 
+struct SIMDResultHandler { // abstract callback interface for the kernels
+    // used to dispatch templates; set by the ResultHandlerCompare constructor
+    bool is_CMax = false;    // C::is_max
+    uint8_t sizeof_ids = 0;  // sizeof(typename C::TI)
+    bool with_fields = false; // the with_id_map template argument
+
+    /**  called when 32 distances are computed and provided in two
+     *   simd16uint16. (q, b) indicate which entry it is in the block. */
+    virtual void handle(
+            size_t q,
+            size_t b,
+            simd16uint16 d0,
+            simd16uint16 d1) = 0;
+
+    /// set the sub-matrix being computed: i0 = query origin, j0 = db origin
+    virtual void set_block_origin(size_t i0, size_t j0) = 0;
+
+    virtual ~SIMDResultHandler() {}
+};
+
+/* Result handler that will eventually return float results */
+struct SIMDResultHandlerToFloat : SIMDResultHandler {
+    size_t nq;     // number of queries
+    size_t ntotal; // ignore excess elements after ntotal
+
+    /// these fields are used mainly for the IVF variants (with_id_map=true)
+    const idx_t* id_map = nullptr; // map offset in invlist to vector id
+    const int* q_map = nullptr;    // map q to global query
+    const uint16_t* dbias =
+            nullptr; // table of biases to add to each query (for IVF L2 search)
+    const float* normalizers = nullptr; // size 2 * nq, to convert distances to float
+
+    SIMDResultHandlerToFloat(size_t nq, size_t ntotal)
+            : nq(nq), ntotal(ntotal) {}
+
+    virtual void begin(const float* norms) { // base version only stores norms
+        normalizers = norms;
+    }
+
+    // called at end of search to convert int16 distances to float, before
+    // normalizers are deallocated; the base version only drops the pointer
+    virtual void end() {
+        normalizers = nullptr;
+    }
+};
+
+FAISS_API extern bool simd_result_handlers_accept_virtual;
+
 namespace simd_result_handlers {
 
-/** Dummy structure that just computes a checksum on results
+/** Dummy structure that just computes a checksum on results
  * (to avoid the computation to be optimized away) */
-struct DummyResultHandler {
+struct DummyResultHandler : SIMDResultHandler {
     size_t cs = 0;
 
-    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) {
+    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) final {
         cs += q * 123 + b * 789 + d0.get_scalar_0() + d1.get_scalar_0();
     }
 
-    void set_block_origin(size_t, size_t) {}
+    void set_block_origin(size_t, size_t) final {}
+
+    ~DummyResultHandler() {}
 };
 
 /** memorize results in a nq-by-nb matrix.
  *
  * j0 is the current upper-left block of the matrix
  */
-struct StoreResultHandler {
+struct StoreResultHandler : SIMDResultHandler {
     uint16_t* data;
     size_t ld; // total number of columns
     size_t i0 = 0;
@@ -55,32 +102,32 @@ struct StoreResultHandler {
 
     StoreResultHandler(uint16_t* data, size_t ld) : data(data), ld(ld) {}
 
-    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) {
+    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) final {
         size_t ofs = (q + i0) * ld + j0 + b * 32;
         d0.store(data + ofs);
         d1.store(data + ofs + 16);
     }
 
-    void set_block_origin(size_t i0, size_t j0) {
-        this->i0 = i0;
-        this->j0 = j0;
+    void set_block_origin(size_t i0_in, size_t j0_in) final {
+        this->i0 = i0_in;
+        this->j0 = j0_in;
     }
 };
 
 /** stores results in fixed-size matrix. */
 template <int NQ, int BB>
-struct FixedStorageHandler {
+struct FixedStorageHandler : SIMDResultHandler {
     simd16uint16 dis[NQ][BB];
     int i0 = 0;
 
-    void handle(int q, int b, simd16uint16 d0, simd16uint16 d1) {
+    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) final {
         dis[q + i0][2 * b] = d0;
         dis[q + i0][2 * b + 1] = d1;
     }
 
-    void set_block_origin(size_t i0, size_t j0) {
-        this->i0 = i0;
-        assert(j0 == 0);
+    void set_block_origin(size_t i0_in, size_t j0_in) final {
+        this->i0 = i0_in;
+        assert(j0_in == 0);
     }
 
     template <class OtherResultHandler>
@@ -91,30 +138,32 @@ struct FixedStorageHandler {
             }
         }
     }
+
+    virtual ~FixedStorageHandler() {}
 };
 
-/** Record origin of current block  */
+/** Result handler that compares distances to check if they need to be kept */
 template <class C, bool with_id_map>
-struct SIMDResultHandler {
+struct ResultHandlerCompare : SIMDResultHandlerToFloat {
     using TI = typename C::TI;
 
     bool disable = false;
 
     int64_t i0 = 0; // query origin
     int64_t j0 = 0; // db origin
-    size_t ntotal;  // ignore excess elements after ntotal
 
-    /// these fields are used mainly for the IVF variants (with_id_map=true)
-    const TI* id_map;      // map offset in invlist to vector id
-    const int* q_map;      // map q to global query
-    const uint16_t* dbias; // table of biases to add to each query
+    const IDSelector* sel;
 
-    explicit SIMDResultHandler(size_t ntotal)
-            : ntotal(ntotal), id_map(nullptr), q_map(nullptr), dbias(nullptr) {}
+    ResultHandlerCompare(size_t nq, size_t ntotal, const IDSelector* sel_in)
+            : SIMDResultHandlerToFloat(nq, ntotal), sel{sel_in} {
+        this->is_CMax = C::is_max;
+        this->sizeof_ids = sizeof(typename C::TI);
+        this->with_fields = with_id_map;
+    }
 
-    void set_block_origin(size_t i0, size_t j0) {
-        this->i0 = i0;
-        this->j0 = j0;
+    void set_block_origin(size_t i0_in, size_t j0_in) final {
+        this->i0 = i0_in;
+        this->j0 = j0_in;
     }
 
     // adjust handler data for IVF.
@@ -172,43 +221,42 @@ struct SIMDResultHandler {
         return lt_mask;
     }
 
-    virtual void to_flat_arrays(
-            float* distances,
-            int64_t* labels,
-            const float* normalizers = nullptr) = 0;
-
-    virtual ~SIMDResultHandler() {}
+    virtual ~ResultHandlerCompare() {}
 };
 
 /** Special version for k=1 */
 template <class C, bool with_id_map = false>
-struct SingleResultHandler : SIMDResultHandler<C, with_id_map> {
+struct SingleResultHandler : ResultHandlerCompare<C, with_id_map> {
     using T = typename C::T;
     using TI = typename C::TI;
-
-    struct Result {
-        T val;
-        TI id;
-    };
-    std::vector<Result> results;
-
-    SingleResultHandler(size_t nq, size_t ntotal)
-            : SIMDResultHandler<C, with_id_map>(ntotal), results(nq) {
-        for (int i = 0; i < nq; i++) {
-            Result res = {C::neutral(), -1};
-            results[i] = res;
+    using RHC = ResultHandlerCompare<C, with_id_map>;
+    using RHC::normalizers;
+
+    std::vector<int16_t> idis;
+    float* dis;
+    int64_t* ids;
+
+    SingleResultHandler(
+            size_t nq,
+            size_t ntotal,
+            float* dis,
+            int64_t* ids,
+            const IDSelector* sel_in)
+            : RHC(nq, ntotal, sel_in), idis(nq), dis(dis), ids(ids) {
+        for (size_t i = 0; i < nq; i++) {
+            ids[i] = -1;
+            idis[i] = C::neutral();
         }
     }
 
-    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) {
+    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) final {
         if (this->disable) {
             return;
         }
 
         this->adjust_with_origin(q, d0, d1);
 
-        Result& res = results[q];
-        uint32_t lt_mask = this->get_lt_mask(res.val, b, d0, d1);
+        uint32_t lt_mask = this->get_lt_mask(idis[q], b, d0, d1);
         if (!lt_mask) {
             return;
         }
@@ -217,74 +265,87 @@ struct SingleResultHandler : SIMDResultHandler<C, with_id_map> {
         d0.store(d32tab);
         d1.store(d32tab + 16);
 
-        while (lt_mask) {
-            // find first non-zero
-            int j = __builtin_ctz(lt_mask);
-            lt_mask -= 1 << j;
-            T dis = d32tab[j];
-            if (C::cmp(res.val, dis)) {
-                res.val = dis;
-                res.id = this->adjust_id(b, j);
+        if (this->sel != nullptr) {
+            while (lt_mask) {
+                // find first non-zero
+                int j = __builtin_ctz(lt_mask);
+                auto real_idx = this->adjust_id(b, j);
+                lt_mask -= 1 << j;
+                if (this->sel->is_member(real_idx)) {
+                    T d = d32tab[j];
+                    if (C::cmp(idis[q], d)) {
+                        idis[q] = d;
+                        ids[q] = real_idx;
+                    }
+                }
+            }
+        } else {
+            while (lt_mask) {
+                // find first non-zero
+                int j = __builtin_ctz(lt_mask);
+                lt_mask -= 1 << j;
+                T d = d32tab[j];
+                if (C::cmp(idis[q], d)) {
+                    idis[q] = d;
+                    ids[q] = this->adjust_id(b, j);
+                }
             }
         }
     }
 
-    void to_flat_arrays(
-            float* distances,
-            int64_t* labels,
-            const float* normalizers = nullptr) override {
-        for (int q = 0; q < results.size(); q++) {
+    void end() {
+        for (size_t q = 0; q < this->nq; q++) {
             if (!normalizers) {
-                distances[q] = results[q].val;
+                dis[q] = idis[q];
             } else {
                 float one_a = 1 / normalizers[2 * q];
                 float b = normalizers[2 * q + 1];
-                distances[q] = b + results[q].val * one_a;
+                dis[q] = b + idis[q] * one_a;
             }
-            labels[q] = results[q].id;
         }
     }
 };
 
 /** Structure that collects results in a min- or max-heap */
 template <class C, bool with_id_map = false>
-struct HeapHandler : SIMDResultHandler<C, with_id_map> {
+struct HeapHandler : ResultHandlerCompare<C, with_id_map> {
     using T = typename C::T;
     using TI = typename C::TI;
+    using RHC = ResultHandlerCompare<C, with_id_map>;
+    using RHC::normalizers;
 
-    int nq;
-    T* heap_dis_tab;
-    TI* heap_ids_tab;
+    std::vector<uint16_t> idis;
+    std::vector<TI> iids;
+    float* dis;
+    int64_t* ids;
 
     int64_t k; // number of results to keep
 
     HeapHandler(
-            int nq,
-            T* heap_dis_tab,
-            TI* heap_ids_tab,
-            size_t k,
-            size_t ntotal)
-            : SIMDResultHandler<C, with_id_map>(ntotal),
-              nq(nq),
-              heap_dis_tab(heap_dis_tab),
-              heap_ids_tab(heap_ids_tab),
+            size_t nq,
+            size_t ntotal,
+            int64_t k,
+            float* dis,
+            int64_t* ids,
+            const IDSelector* sel_in)
+            : RHC(nq, ntotal, sel_in),
+              idis(nq * k),
+              iids(nq * k),
+              dis(dis),
+              ids(ids),
               k(k) {
-        for (int q = 0; q < nq; q++) {
-            T* heap_dis_in = heap_dis_tab + q * k;
-            TI* heap_ids_in = heap_ids_tab + q * k;
-            heap_heapify<C>(k, heap_dis_in, heap_ids_in);
-        }
+        heap_heapify<C>(k * nq, idis.data(), iids.data());
     }
 
-    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) {
+    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) final {
         if (this->disable) {
             return;
         }
 
         this->adjust_with_origin(q, d0, d1);
 
-        T* heap_dis = heap_dis_tab + q * k;
-        TI* heap_ids = heap_ids_tab + q * k;
+        T* heap_dis = idis.data() + q * k;
+        TI* heap_ids = iids.data() + q * k;
 
         uint16_t cur_thresh =
                 heap_dis[0] < 65536 ? (uint16_t)(heap_dis[0]) : 0xffff;
@@ -300,29 +361,41 @@ struct HeapHandler : SIMDResultHandler<C, with_id_map> {
         d0.store(d32tab);
         d1.store(d32tab + 16);
 
-        while (lt_mask) {
-            // find first non-zero
-            int j = __builtin_ctz(lt_mask);
-            lt_mask -= 1 << j;
-            T dis = d32tab[j];
-            if (C::cmp(heap_dis[0], dis)) {
-                int64_t idx = this->adjust_id(b, j);
-                heap_pop<C>(k, heap_dis, heap_ids);
-                heap_push<C>(k, heap_dis, heap_ids, dis, idx);
+        if (this->sel != nullptr) {
+            while (lt_mask) {
+                // find first non-zero
+                int j = __builtin_ctz(lt_mask);
+                auto real_idx = this->adjust_id(b, j);
+                lt_mask -= 1 << j;
+                if (this->sel->is_member(real_idx)) {
+                    T dis = d32tab[j];
+                    if (C::cmp(heap_dis[0], dis)) {
+                        heap_replace_top<C>(
+                                k, heap_dis, heap_ids, dis, real_idx);
+                    }
+                }
+            }
+        } else {
+            while (lt_mask) {
+                // find first non-zero
+                int j = __builtin_ctz(lt_mask);
+                lt_mask -= 1 << j;
+                T dis = d32tab[j];
+                if (C::cmp(heap_dis[0], dis)) {
+                    int64_t idx = this->adjust_id(b, j);
+                    heap_replace_top<C>(k, heap_dis, heap_ids, dis, idx);
+                }
             }
         }
     }
 
-    void to_flat_arrays(
-            float* distances,
-            int64_t* labels,
-            const float* normalizers = nullptr) override {
-        for (int q = 0; q < nq; q++) {
-            T* heap_dis_in = heap_dis_tab + q * k;
-            TI* heap_ids_in = heap_ids_tab + q * k;
+    void end() override {
+        for (size_t q = 0; q < this->nq; q++) {
+            T* heap_dis_in = idis.data() + q * k;
+            TI* heap_ids_in = iids.data() + q * k;
             heap_reorder<C>(k, heap_dis_in, heap_ids_in);
-            int64_t* heap_ids = labels + q * k;
-            float* heap_dis = distances + q * k;
+            float* heap_dis = dis + q * k;
+            int64_t* heap_ids = ids + q * k;
 
             float one_a = 1.0, b = 0.0;
             if (normalizers) {
@@ -330,8 +403,8 @@ struct HeapHandler : SIMDResultHandler<C, with_id_map> {
                 b = normalizers[2 * q + 1];
             }
             for (int j = 0; j < k; j++) {
-                heap_ids[j] = heap_ids_in[j];
                 heap_dis[j] = heap_dis_in[j] * one_a + b;
+                heap_ids[j] = heap_ids_in[j];
             }
         }
     }
@@ -342,114 +415,49 @@ struct HeapHandler : SIMDResultHandler<C, with_id_map> {
  * Results are stored when they are below the threshold until the capacity is
  * reached. Then a partition sort is used to update the threshold. */
 
-namespace {
-
-uint64_t get_cy() {
-#ifdef MICRO_BENCHMARK
-    uint32_t high, low;
-    asm volatile("rdtsc \n\t" : "=a"(low), "=d"(high));
-    return ((uint64_t)high << 32) | (low);
-#else
-    return 0;
-#endif
-}
-
-} // anonymous namespace
-
-template <class C>
-struct ReservoirTopN {
-    using T = typename C::T;
-    using TI = typename C::TI;
-
-    T* vals;
-    TI* ids;
-
-    size_t i;        // number of stored elements
-    size_t n;        // number of requested elements
-    size_t capacity; // size of storage
-    size_t cycles = 0;
-
-    T threshold; // current threshold
-
-    ReservoirTopN(size_t n, size_t capacity, T* vals, TI* ids)
-            : vals(vals), ids(ids), i(0), n(n), capacity(capacity) {
-        assert(n < capacity);
-        threshold = C::neutral();
-    }
-
-    void add(T val, TI id) {
-        if (C::cmp(threshold, val)) {
-            if (i == capacity) {
-                shrink_fuzzy();
-            }
-            vals[i] = val;
-            ids[i] = id;
-            i++;
-        }
-    }
-
-    /// shrink number of stored elements to n
-    void shrink_xx() {
-        uint64_t t0 = get_cy();
-        qselect(vals, ids, i, n);
-        i = n; // forget all elements above i = n
-        threshold = C::Crev::neutral();
-        for (size_t j = 0; j < n; j++) {
-            if (C::cmp(vals[j], threshold)) {
-                threshold = vals[j];
-            }
-        }
-        cycles += get_cy() - t0;
-    }
-
-    void shrink() {
-        uint64_t t0 = get_cy();
-        threshold = partition<C>(vals, ids, i, n);
-        i = n;
-        cycles += get_cy() - t0;
-    }
-
-    void shrink_fuzzy() {
-        uint64_t t0 = get_cy();
-        assert(i == capacity);
-        threshold = partition_fuzzy<C>(
-                vals, ids, capacity, n, (capacity + n) / 2, &i);
-        cycles += get_cy() - t0;
-    }
-};
-
 /** Handler built from several ReservoirTopN (one per query) */
 template <class C, bool with_id_map = false>
-struct ReservoirHandler : SIMDResultHandler<C, with_id_map> {
+struct ReservoirHandler : ResultHandlerCompare<C, with_id_map> {
     using T = typename C::T;
     using TI = typename C::TI;
+    using RHC = ResultHandlerCompare<C, with_id_map>;
+    using RHC::normalizers;
 
     size_t capacity; // rounded up to multiple of 16
+
+    // where the final results will be written
+    float* dis;
+    int64_t* ids;
+
     std::vector<TI> all_ids;
     AlignedTable<T> all_vals;
-
     std::vector<ReservoirTopN<C>> reservoirs;
 
-    uint64_t times[4];
-
-    ReservoirHandler(size_t nq, size_t ntotal, size_t n, size_t capacity_in)
-            : SIMDResultHandler<C, with_id_map>(ntotal),
-              capacity((capacity_in + 15) & ~15),
-              all_ids(nq * capacity),
-              all_vals(nq * capacity) {
+    ReservoirHandler(
+            size_t nq,
+            size_t ntotal,
+            size_t k,
+            size_t cap,
+            float* dis,
+            int64_t* ids,
+            const IDSelector* sel_in)
+            : RHC(nq, ntotal, sel_in),
+              capacity((cap + 15) & ~15),
+              dis(dis),
+              ids(ids) {
         assert(capacity % 16 == 0);
-        for (size_t i = 0; i < nq; i++) {
+        all_ids.resize(nq * capacity);
+        all_vals.resize(nq * capacity);
+        for (size_t q = 0; q < nq; q++) {
             reservoirs.emplace_back(
-                    n,
+                    k,
                     capacity,
-                    all_vals.get() + i * capacity,
-                    all_ids.data() + i * capacity);
+                    all_vals.get() + q * capacity,
+                    all_ids.data() + q * capacity);
         }
-        times[0] = times[1] = times[2] = times[3] = 0;
     }
 
-    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) {
-        uint64_t t0 = get_cy();
+    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) final {
         if (this->disable) {
             return;
         }
@@ -457,8 +465,6 @@ struct ReservoirHandler : SIMDResultHandler<C, with_id_map> {
 
         ReservoirTopN<C>& res = reservoirs[q];
         uint32_t lt_mask = this->get_lt_mask(res.threshold, b, d0, d1);
-        uint64_t t1 = get_cy();
-        times[0] += t1 - t0;
 
         if (!lt_mask) {
             return;
@@ -467,65 +473,315 @@ struct ReservoirHandler : SIMDResultHandler<C, with_id_map> {
         d0.store(d32tab);
         d1.store(d32tab + 16);
 
-        while (lt_mask) {
-            // find first non-zero
-            int j = __builtin_ctz(lt_mask);
-            lt_mask -= 1 << j;
-            T dis = d32tab[j];
-            res.add(dis, this->adjust_id(b, j));
+        if (this->sel != nullptr) {
+            while (lt_mask) {
+                // find first non-zero
+                int j = __builtin_ctz(lt_mask);
+                auto real_idx = this->adjust_id(b, j);
+                lt_mask -= 1 << j;
+                if (this->sel->is_member(real_idx)) {
+                    T dis = d32tab[j];
+                    res.add(dis, real_idx);
+                }
+            }
+        } else {
+            while (lt_mask) {
+                // find first non-zero
+                int j = __builtin_ctz(lt_mask);
+                lt_mask -= 1 << j;
+                T dis = d32tab[j];
+                res.add(dis, this->adjust_id(b, j));
+            }
         }
-        times[1] += get_cy() - t1;
     }
 
-    void to_flat_arrays(
-            float* distances,
-            int64_t* labels,
-            const float* normalizers = nullptr) override {
+    void end() override {
         using Cf = typename std::conditional<
                 C::is_max,
                 CMax<float, int64_t>,
                 CMin<float, int64_t>>::type;
 
-        uint64_t t0 = get_cy();
-        uint64_t t3 = 0;
         std::vector<int> perm(reservoirs[0].n);
-        for (int q = 0; q < reservoirs.size(); q++) {
+        for (size_t q = 0; q < reservoirs.size(); q++) {
             ReservoirTopN<C>& res = reservoirs[q];
             size_t n = res.n;
 
             if (res.i > res.n) {
                 res.shrink();
             }
-            int64_t* heap_ids = labels + q * n;
-            float* heap_dis = distances + q * n;
+            int64_t* heap_ids = ids + q * n;
+            float* heap_dis = dis + q * n;
 
             float one_a = 1.0, b = 0.0;
             if (normalizers) {
                 one_a = 1 / normalizers[2 * q];
                 b = normalizers[2 * q + 1];
             }
-            for (int i = 0; i < res.i; i++) {
+            for (size_t i = 0; i < res.i; i++) {
                 perm[i] = i;
             }
             // indirect sort of result arrays
             std::sort(perm.begin(), perm.begin() + res.i, [&res](int i, int j) {
                 return C::cmp(res.vals[j], res.vals[i]);
             });
-            for (int i = 0; i < res.i; i++) {
+            for (size_t i = 0; i < res.i; i++) {
                 heap_dis[i] = res.vals[perm[i]] * one_a + b;
                 heap_ids[i] = res.ids[perm[i]];
             }
 
             // possibly add empty results
             heap_heapify<Cf>(n - res.i, heap_dis + res.i, heap_ids + res.i);
+        }
+    }
+};
+
+/** Result handler for range search. The difficulty is that the range distances
+ * have to be scaled using the scaler.
+ */
+
+template <class C, bool with_id_map = false>
+struct RangeHandler : ResultHandlerCompare<C, with_id_map> {
+    using T = typename C::T;
+    using TI = typename C::TI;
+    using RHC = ResultHandlerCompare<C, with_id_map>;
+    using RHC::normalizers;
+    using RHC::nq;
+
+    RangeSearchResult& rres;
+    float radius;
+    std::vector<uint16_t> thresholds;
+    std::vector<size_t> n_per_query;
+    size_t q0 = 0;
+
+    // we cannot use the RangeSearchPartialResult interface because queries can
+    // be performed by batches
+    struct Triplet {
+        idx_t q;
+        idx_t b;
+        uint16_t dis;
+    };
+    std::vector<Triplet> triplets;
+
+    RangeHandler(
+            RangeSearchResult& rres,
+            float radius,
+            size_t ntotal,
+            const IDSelector* sel_in)
+            : RHC(rres.nq, ntotal, sel_in), rres(rres), radius(radius) {
+        thresholds.resize(nq);
+        n_per_query.resize(nq + 1);
+    }
+
+    virtual void begin(const float* norms) override {
+        normalizers = norms;
+        for (int q = 0; q < nq; ++q) {
+            thresholds[q] =
+                    normalizers[2 * q] * (radius - normalizers[2 * q + 1]);
+        }
+    }
+
+    void handle(size_t q, size_t b, simd16uint16 d0, simd16uint16 d1) final {
+        if (this->disable) {
+            return;
+        }
+        this->adjust_with_origin(q, d0, d1);
 
-            t3 += res.cycles;
+        uint32_t lt_mask = this->get_lt_mask(thresholds[q], b, d0, d1);
+
+        if (!lt_mask) {
+            return;
+        }
+        ALIGNED(32) uint16_t d32tab[32];
+        d0.store(d32tab);
+        d1.store(d32tab + 16);
+
+        if (this->sel != nullptr) {
+            while (lt_mask) {
+                // find first non-zero
+                int j = __builtin_ctz(lt_mask);
+                lt_mask -= 1 << j;
+
+                auto real_idx = this->adjust_id(b, j);
+                if (this->sel->is_member(real_idx)) {
+                    T dis = d32tab[j];
+                    n_per_query[q]++;
+                    triplets.push_back({idx_t(q + q0), real_idx, dis});
+                }
+            }
+        } else {
+            while (lt_mask) {
+                // find first non-zero
+                int j = __builtin_ctz(lt_mask);
+                lt_mask -= 1 << j;
+                T dis = d32tab[j];
+                n_per_query[q]++;
+                triplets.push_back({idx_t(q + q0), this->adjust_id(b, j), dis});
+            }
+        }
+    }
+
+    void end() override {
+        memcpy(rres.lims, n_per_query.data(), sizeof(n_per_query[0]) * nq);
+        rres.do_allocation();
+        for (auto it = triplets.begin(); it != triplets.end(); ++it) {
+            size_t& l = rres.lims[it->q];
+            rres.distances[l] = it->dis;
+            rres.labels[l] = it->b;
+            l++;
+        }
+        memmove(rres.lims + 1, rres.lims, sizeof(*rres.lims) * rres.nq);
+        rres.lims[0] = 0;
+
+        for (int q = 0; q < nq; q++) {
+            float one_a = 1 / normalizers[2 * q];
+            float b = normalizers[2 * q + 1];
+            for (size_t i = rres.lims[q]; i < rres.lims[q + 1]; i++) {
+                rres.distances[i] = rres.distances[i] * one_a + b;
+            }
         }
-        times[2] += get_cy() - t0;
-        times[3] += t3;
     }
 };
 
+#ifndef SWIG
+
+// handler for a subset of queries
+template <class C, bool with_id_map = false>
+struct PartialRangeHandler : RangeHandler<C, with_id_map> {
+    using T = typename C::T;
+    using TI = typename C::TI;
+    using RHC = RangeHandler<C, with_id_map>;
+    using RHC::normalizers;
+    using RHC::nq, RHC::q0, RHC::triplets, RHC::n_per_query;
+
+    RangeSearchPartialResult& pres;
+
+    PartialRangeHandler(
+            RangeSearchPartialResult& pres,
+            float radius,
+            size_t ntotal,
+            size_t q0,
+            size_t q1,
+            const IDSelector* sel_in)
+            : RangeHandler<C, with_id_map>(*pres.res, radius, ntotal, sel_in),
+              pres(pres) {
+        nq = q1 - q0;
+        this->q0 = q0;
+    }
+
+    // shift n_per_query right by one slot and set the first entry to 0
+    void shift_n_per_query() {
+        memmove(n_per_query.data() + 1,
+                n_per_query.data(),
+                nq * sizeof(n_per_query[0]));
+        n_per_query[0] = 0;
+    }
+
+    // commit to partial result instead of full RangeResult
+    void end() override {
+        std::vector<typename RHC::Triplet> sorted_triplets(triplets.size());
+        for (int q = 0; q < nq; q++) {
+            n_per_query[q + 1] += n_per_query[q];
+        }
+        shift_n_per_query();
+
+        for (size_t i = 0; i < triplets.size(); i++) {
+            sorted_triplets[n_per_query[triplets[i].q - q0]++] = triplets[i];
+        }
+        shift_n_per_query();
+
+        size_t* lims = n_per_query.data();
+
+        for (int q = 0; q < nq; q++) {
+            float one_a = 1 / normalizers[2 * q];
+            float b = normalizers[2 * q + 1];
+            RangeQueryResult& qres = pres.new_result(q + q0);
+            for (size_t i = lims[q]; i < lims[q + 1]; i++) {
+                qres.add(
+                        sorted_triplets[i].dis * one_a + b,
+                        sorted_triplets[i].b);
+            }
+        }
+    }
+};
+
+#endif
+
+/********************************************************************************
+ * Dynamic dispatching function. The consumer should have a templatized method f
+ * that will be replaced with the actual SIMDResultHandler that is determined
+ * dynamically.
+ */
+
+template <class C, bool W, class Consumer, class... Types>
+void dispatch_SIMDResultHanlder_fixedCW(
+        SIMDResultHandler& res,
+        Consumer& consumer,
+        Types... args) {
+    if (auto resh = dynamic_cast<SingleResultHandler<C, W>*>(&res)) {
+        consumer.template f<SingleResultHandler<C, W>>(*resh, args...);
+    } else if (auto resh = dynamic_cast<HeapHandler<C, W>*>(&res)) {
+        consumer.template f<HeapHandler<C, W>>(*resh, args...);
+    } else if (auto resh = dynamic_cast<ReservoirHandler<C, W>*>(&res)) {
+        consumer.template f<ReservoirHandler<C, W>>(*resh, args...);
+    } else { // generic handler -- will not be inlined
+        FAISS_THROW_IF_NOT_FMT(
+                simd_result_handlers_accept_virtual,
+                "Running vitrual handler for %s",
+                typeid(res).name());
+        consumer.template f<SIMDResultHandler>(res, args...);
+    }
+}
+
+template <class C, class Consumer, class... Types>
+void dispatch_SIMDResultHanlder_fixedC(
+        SIMDResultHandler& res,
+        Consumer& consumer,
+        Types... args) {
+    if (res.with_fields) {
+        dispatch_SIMDResultHanlder_fixedCW<C, true>(res, consumer, args...);
+    } else {
+        dispatch_SIMDResultHanlder_fixedCW<C, false>(res, consumer, args...);
+    }
+}
+
+template <class Consumer, class... Types>
+void dispatch_SIMDResultHanlder(
+        SIMDResultHandler& res,
+        Consumer& consumer,
+        Types... args) {
+    if (res.sizeof_ids == 0) {
+        if (auto resh = dynamic_cast<StoreResultHandler*>(&res)) {
+            consumer.template f<StoreResultHandler>(*resh, args...);
+        } else if (auto resh = dynamic_cast<DummyResultHandler*>(&res)) {
+            consumer.template f<DummyResultHandler>(*resh, args...);
+        } else { // generic path
+            FAISS_THROW_IF_NOT_FMT(
+                    simd_result_handlers_accept_virtual,
+                    "Running vitrual handler for %s",
+                    typeid(res).name());
+            consumer.template f<SIMDResultHandler>(res, args...);
+        }
+    } else if (res.sizeof_ids == sizeof(int)) {
+        if (res.is_CMax) {
+            dispatch_SIMDResultHanlder_fixedC<CMax<uint16_t, int>>(
+                    res, consumer, args...);
+        } else {
+            dispatch_SIMDResultHanlder_fixedC<CMin<uint16_t, int>>(
+                    res, consumer, args...);
+        }
+    } else if (res.sizeof_ids == sizeof(int64_t)) {
+        if (res.is_CMax) {
+            dispatch_SIMDResultHanlder_fixedC<CMax<uint16_t, int64_t>>(
+                    res, consumer, args...);
+        } else {
+            dispatch_SIMDResultHanlder_fixedC<CMin<uint16_t, int64_t>>(
+                    res, consumer, args...);
+        }
+    } else {
+        FAISS_THROW_FMT("Unknown id size %d", res.sizeof_ids);
+    }
+}
+
 } // namespace simd_result_handlers
 
 } // namespace faiss
diff --git a/faiss/index_factory.cpp b/faiss/index_factory.cpp
index 5d7a505e09..d88fe7b393 100644
--- a/faiss/index_factory.cpp
+++ b/faiss/index_factory.cpp
@@ -140,8 +140,9 @@ std::map<std::string, ScalarQuantizer::QuantizerType> sq_types = {
         {"SQ4", ScalarQuantizer::QT_4bit},
         {"SQ6", ScalarQuantizer::QT_6bit},
         {"SQfp16", ScalarQuantizer::QT_fp16},
+        {"SQbf16", ScalarQuantizer::QT_bf16},
 };
-const std::string sq_pattern = "(SQ4|SQ8|SQ6|SQfp16)";
+const std::string sq_pattern = "(SQ4|SQ8|SQ6|SQfp16|SQbf16)";
 
 std::map<std::string, AdditiveQuantizer::Search_type_t> aq_search_type = {
         {"_Nfloat", AdditiveQuantizer::ST_norm_float},
@@ -216,7 +217,7 @@ VectorTransform* parse_VectorTransform(const std::string& description, int d) {
         return new RemapDimensionsTransform(d, std::max(d_out, d), false);
     }
     return nullptr;
-};
+}
 
 /***************************************************************
  * Parse IndexIVF
diff --git a/faiss/index_io.h b/faiss/index_io.h
index 8d52ee1afd..3e77d0227c 100644
--- a/faiss/index_io.h
+++ b/faiss/index_io.h
@@ -5,8 +5,6 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-// -*- c++ -*-
-
 // I/O code for indexes
 
 #ifndef FAISS_INDEX_IO_H
@@ -35,9 +33,12 @@ struct IOReader;
 struct IOWriter;
 struct InvertedLists;
 
-void write_index(const Index* idx, const char* fname);
-void write_index(const Index* idx, FILE* f);
-void write_index(const Index* idx, IOWriter* writer);
+/// skip the storage for graph-based indexes
+const int IO_FLAG_SKIP_STORAGE = 1;
+
+void write_index(const Index* idx, const char* fname, int io_flags = 0);
+void write_index(const Index* idx, FILE* f, int io_flags = 0);
+void write_index(const Index* idx, IOWriter* writer, int io_flags = 0);
 
 void write_index_binary(const IndexBinary* idx, const char* fname);
 void write_index_binary(const IndexBinary* idx, FILE* f);
@@ -52,6 +53,12 @@ const int IO_FLAG_ONDISK_SAME_DIR = 4;
 const int IO_FLAG_SKIP_IVF_DATA = 8;
 // don't initialize precomputed table after loading
 const int IO_FLAG_SKIP_PRECOMPUTE_TABLE = 16;
+// don't compute the sdc table for PQ-based indices
+// this will prevent distances from being computed
+// between elements in the index. For indices like HNSWPQ,
+// this will prevent graph building because sdc
+// computations are required to construct the graph
+const int IO_FLAG_PQ_SKIP_SDC_TABLE = 32;
 // try to memmap data (useful to load an ArrayInvertedLists as an
 // OnDiskInvertedLists)
 const int IO_FLAG_MMAP = IO_FLAG_SKIP_IVF_DATA | 0x646f0000;
diff --git a/faiss/invlists/BlockInvertedLists.cpp b/faiss/invlists/BlockInvertedLists.cpp
index 6370d11871..dbdb0302dc 100644
--- a/faiss/invlists/BlockInvertedLists.cpp
+++ b/faiss/invlists/BlockInvertedLists.cpp
@@ -9,6 +9,7 @@
 
 #include <faiss/impl/CodePacker.h>
 #include <faiss/impl/FaissAssert.h>
+#include <faiss/impl/IDSelector.h>
 
 #include <faiss/impl/io.h>
 #include <faiss/impl/io_macros.h>
@@ -54,7 +55,9 @@ size_t BlockInvertedLists::add_entries(
     codes[list_no].resize(n_block * block_size);
     if (o % block_size == 0) {
         // copy whole blocks
-        memcpy(&codes[list_no][o * code_size], code, n_block * block_size);
+        memcpy(&codes[list_no][o * packer->code_size],
+               code,
+               n_block * block_size);
     } else {
         FAISS_THROW_IF_NOT_MSG(packer, "missing code packer");
         std::vector<uint8_t> buffer(packer->code_size);
@@ -76,6 +79,29 @@ const uint8_t* BlockInvertedLists::get_codes(size_t list_no) const {
     return codes[list_no].get();
 }
 
+size_t BlockInvertedLists::remove_ids(const IDSelector& sel) {
+    idx_t nremove = 0; // entries removed, summed over all lists
+#pragma omp parallel for reduction(+ : nremove)
+    for (idx_t i = 0; i < nlist; i++) {
+        std::vector<uint8_t> buffer(packer->code_size); // one unpacked code
+        idx_t l = ids[i].size(), j = 0; // l: live entries, j: scan position
+        while (j < l) {
+            if (sel.is_member(ids[i][j])) {
+                l--; // swap-remove: the last live entry replaces entry j
+                ids[i][j] = ids[i][l];
+                packer->unpack_1(codes[i].data(), l, buffer.data());
+                packer->pack_1(buffer.data(), j, codes[i].data());
+            } else {
+                j++;
+            }
+        }
+        nremove += ids[i].size() - l; // count before the list is resized to l
+        resize(i, l);
+    }
+
+    return nremove;
+}
+
 const idx_t* BlockInvertedLists::get_ids(size_t list_no) const {
     assert(list_no < nlist);
     return ids[list_no].data();
@@ -102,12 +128,6 @@ void BlockInvertedLists::update_entries(
         const idx_t*,
         const uint8_t*) {
     FAISS_THROW_MSG("not impemented");
-    /*
-    assert (list_no < nlist);
-    assert (n_entry + offset <= ids[list_no].size());
-    memcpy (&ids[list_no][offset], ids_in, sizeof(ids_in[0]) * n_entry);
-    memcpy (&codes[list_no][offset * code_size], codes_in, code_size * n_entry);
-    */
 }
 
 BlockInvertedLists::~BlockInvertedLists() {
diff --git a/faiss/invlists/BlockInvertedLists.h b/faiss/invlists/BlockInvertedLists.h
index 8d8df720bf..2b9cbba455 100644
--- a/faiss/invlists/BlockInvertedLists.h
+++ b/faiss/invlists/BlockInvertedLists.h
@@ -15,6 +15,7 @@
 namespace faiss {
 
 struct CodePacker;
+struct IDSelector;
 
 /** Inverted Lists that are organized by blocks.
  *
@@ -47,6 +48,8 @@ struct BlockInvertedLists : InvertedLists {
     size_t list_size(size_t list_no) const override;
     const uint8_t* get_codes(size_t list_no) const override;
     const idx_t* get_ids(size_t list_no) const override;
+    /// remove ids from the InvertedLists
+    size_t remove_ids(const IDSelector& sel);
 
     // works only on empty BlockInvertedLists
     // the codes should be of size ceil(n_entry / n_per_block) * block_size
diff --git a/faiss/invlists/DirectMap.cpp b/faiss/invlists/DirectMap.cpp
index 2b272922d5..dc2b92aa1c 100644
--- a/faiss/invlists/DirectMap.cpp
+++ b/faiss/invlists/DirectMap.cpp
@@ -15,6 +15,7 @@
 #include <faiss/impl/AuxIndexStructures.h>
 #include <faiss/impl/FaissAssert.h>
 #include <faiss/impl/IDSelector.h>
+#include <faiss/invlists/BlockInvertedLists.h>
 
 namespace faiss {
 
@@ -148,8 +149,12 @@ size_t DirectMap::remove_ids(const IDSelector& sel, InvertedLists* invlists) {
     std::vector<idx_t> toremove(nlist);
 
     size_t nremove = 0;
-
+    BlockInvertedLists* block_invlists =
+            dynamic_cast<BlockInvertedLists*>(invlists);
     if (type == NoMap) {
+        if (block_invlists != nullptr) {
+            return block_invlists->remove_ids(sel);
+        }
         // exhaustive scan of IVF
 #pragma omp parallel for
         for (idx_t i = 0; i < nlist; i++) {
@@ -178,6 +183,9 @@ size_t DirectMap::remove_ids(const IDSelector& sel, InvertedLists* invlists) {
             }
         }
     } else if (type == Hashtable) {
+        FAISS_THROW_IF_MSG(
+                block_invlists,
+                "remove with hashtable is not supported with BlockInvertedLists");
         const IDSelectorArray* sela =
                 dynamic_cast<const IDSelectorArray*>(&sel);
         FAISS_THROW_IF_NOT_MSG(
diff --git a/faiss/invlists/InvertedLists.cpp b/faiss/invlists/InvertedLists.cpp
index 46f31e6286..c2bfa2cabc 100644
--- a/faiss/invlists/InvertedLists.cpp
+++ b/faiss/invlists/InvertedLists.cpp
@@ -5,8 +5,6 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-// -*- c++ -*-
-
 #include <faiss/invlists/InvertedLists.h>
 
 #include <cstdio>
@@ -24,17 +22,10 @@ InvertedListsIterator::~InvertedListsIterator() {}
  ******************************************/
 
 InvertedLists::InvertedLists(size_t nlist, size_t code_size)
-        : nlist(nlist), code_size(code_size), use_iterator(false) {}
+        : nlist(nlist), code_size(code_size) {}
 
 InvertedLists::~InvertedLists() {}
 
-bool InvertedLists::is_empty(size_t list_no) const {
-    return use_iterator
-            ? !std::unique_ptr<InvertedListsIterator>(get_iterator(list_no))
-                       ->is_available()
-            : list_size(list_no) == 0;
-}
-
 idx_t InvertedLists::get_single_id(size_t list_no, size_t offset) const {
     assert(offset < list_size(list_no));
     const idx_t* ids = get_ids(list_no);
@@ -58,7 +49,8 @@ const uint8_t* InvertedLists::get_single_code(size_t list_no, size_t offset)
 size_t InvertedLists::add_entry(
         size_t list_no,
         idx_t theid,
-        const uint8_t* code) {
+        const uint8_t* code,
+        void* /*inverted_list_context*/) {
     return add_entries(list_no, 1, &theid, code);
 }
 
@@ -76,10 +68,6 @@ void InvertedLists::reset() {
     }
 }
 
-InvertedListsIterator* InvertedLists::get_iterator(size_t /*list_no*/) const {
-    FAISS_THROW_MSG("get_iterator is not supported");
-}
-
 void InvertedLists::merge_from(InvertedLists* oivf, size_t add_id) {
 #pragma omp parallel for
     for (idx_t i = 0; i < nlist; i++) {
@@ -229,6 +217,54 @@ size_t InvertedLists::compute_ntotal() const {
     return tot;
 }
 
+bool InvertedLists::is_empty(size_t list_no, void* inverted_list_context)
+        const {
+    if (use_iterator) {
+        return !std::unique_ptr<InvertedListsIterator>(
+                        get_iterator(list_no, inverted_list_context))
+                        ->is_available();
+    } else {
+        FAISS_THROW_IF_NOT(inverted_list_context == nullptr);
+        return list_size(list_no) == 0;
+    }
+}
+
+// implement iterator on top of get_codes / get_ids
+namespace {
+
+struct CodeArrayIterator : InvertedListsIterator {
+    size_t list_size;
+    size_t code_size;
+    InvertedLists::ScopedCodes codes;
+    InvertedLists::ScopedIds ids;
+    size_t idx = 0;
+
+    CodeArrayIterator(const InvertedLists* il, size_t list_no)
+            : list_size(il->list_size(list_no)),
+              code_size(il->code_size),
+              codes(il, list_no),
+              ids(il, list_no) {}
+
+    bool is_available() const override {
+        return idx < list_size;
+    }
+    void next() override {
+        idx++;
+    }
+    std::pair<idx_t, const uint8_t*> get_id_and_codes() override {
+        return {ids[idx], codes.get() + code_size * idx};
+    }
+};
+
+} // namespace
+
+InvertedListsIterator* InvertedLists::get_iterator(
+        size_t list_no,
+        void* inverted_list_context) const {
+    FAISS_THROW_IF_NOT(inverted_list_context == nullptr);
+    return new CodeArrayIterator(this, list_no);
+}
+
 /*****************************************
  * ArrayInvertedLists implementation
  ******************************************/
@@ -260,6 +296,12 @@ size_t ArrayInvertedLists::list_size(size_t list_no) const {
     return ids[list_no].size();
 }
 
+bool ArrayInvertedLists::is_empty(size_t list_no, void* inverted_list_context)
+        const {
+    FAISS_THROW_IF_NOT(inverted_list_context == nullptr);
+    return ids[list_no].size() == 0;
+}
+
 const uint8_t* ArrayInvertedLists::get_codes(size_t list_no) const {
     assert(list_no < nlist);
     return codes[list_no].data();
@@ -437,7 +479,7 @@ idx_t translate_list_no(const SliceInvertedLists* sil, idx_t list_no) {
     return list_no + sil->i0;
 }
 
-}; // namespace
+} // namespace
 
 SliceInvertedLists::SliceInvertedLists(
         const InvertedLists* il,
@@ -522,7 +564,7 @@ idx_t sum_il_sizes(int nil, const InvertedLists** ils_in) {
     return tot;
 }
 
-}; // namespace
+} // namespace
 
 VStackInvertedLists::VStackInvertedLists(int nil, const InvertedLists** ils_in)
         : ReadOnlyInvertedLists(
diff --git a/faiss/invlists/InvertedLists.h b/faiss/invlists/InvertedLists.h
index c4d681452b..b24700fad1 100644
--- a/faiss/invlists/InvertedLists.h
+++ b/faiss/invlists/InvertedLists.h
@@ -37,7 +37,9 @@ struct InvertedListsIterator {
 struct InvertedLists {
     size_t nlist;     ///< number of possible key values
     size_t code_size; ///< code size per vector in bytes
-    bool use_iterator;
+
+    /// request to use iterator rather than get_codes / get_ids
+    bool use_iterator = false;
 
     InvertedLists(size_t nlist, size_t code_size);
 
@@ -50,15 +52,9 @@ struct InvertedLists {
     /*************************
      *  Read only functions */
 
-    // check if the list is empty
-    bool is_empty(size_t list_no) const;
-
     /// get the size of a list
     virtual size_t list_size(size_t list_no) const = 0;
 
-    /// get iterable for lists that use_iterator
-    virtual InvertedListsIterator* get_iterator(size_t list_no) const;
-
     /** get the codes for an inverted list
      * must be released by release_codes
      *
@@ -90,11 +86,27 @@ struct InvertedLists {
     /// a list can be -1 hence the signed long
     virtual void prefetch_lists(const idx_t* list_nos, int nlist) const;
 
+    /*****************************************
+     * Iterator interface (with context)     */
+
+    /// check if the list is empty
+    virtual bool is_empty(size_t list_no, void* inverted_list_context = nullptr)
+            const;
+
+    /// get iterable for lists that use_iterator
+    virtual InvertedListsIterator* get_iterator(
+            size_t list_no,
+            void* inverted_list_context = nullptr) const;
+
     /*************************
      * writing functions     */
 
     /// add one entry to an inverted list
-    virtual size_t add_entry(size_t list_no, idx_t theid, const uint8_t* code);
+    virtual size_t add_entry(
+            size_t list_no,
+            idx_t theid,
+            const uint8_t* code,
+            void* inverted_list_context = nullptr);
 
     virtual size_t add_entries(
             size_t list_no,
@@ -256,6 +268,9 @@ struct ArrayInvertedLists : InvertedLists {
     /// permute the inverted lists, map maps new_id to old_id
     void permute_invlists(const idx_t* map);
 
+    bool is_empty(size_t list_no, void* inverted_list_context = nullptr)
+            const override;
+
     ~ArrayInvertedLists() override;
 };
 
diff --git a/faiss/invlists/OnDiskInvertedLists.cpp b/faiss/invlists/OnDiskInvertedLists.cpp
index 81f76dbf7f..8565572a9b 100644
--- a/faiss/invlists/OnDiskInvertedLists.cpp
+++ b/faiss/invlists/OnDiskInvertedLists.cpp
@@ -394,8 +394,8 @@ const idx_t* OnDiskInvertedLists::get_ids(size_t list_no) const {
         return nullptr;
     }
 
-    return (
-        const idx_t*)(ptr + lists[list_no].offset + code_size * lists[list_no].capacity);
+    return (const idx_t*)(ptr + lists[list_no].offset +
+                          code_size * lists[list_no].capacity);
 }
 
 void OnDiskInvertedLists::update_entries(
@@ -407,7 +407,7 @@ void OnDiskInvertedLists::update_entries(
     FAISS_THROW_IF_NOT(!read_only);
     if (n_entry == 0)
         return;
-    const List& l = lists[list_no];
+    [[maybe_unused]] const List& l = lists[list_no];
     assert(n_entry + offset <= l.size);
     idx_t* ids = const_cast<idx_t*>(get_ids(list_no));
     memcpy(ids + offset, ids_in, sizeof(ids_in[0]) * n_entry);
@@ -565,15 +565,16 @@ void OnDiskInvertedLists::free_slot(size_t offset, size_t capacity) {
 /*****************************************
  * Compact form
  *****************************************/
-
-size_t OnDiskInvertedLists::merge_from(
+size_t OnDiskInvertedLists::merge_from_multiple(
         const InvertedLists** ils,
         int n_il,
+        bool shift_ids,
         bool verbose) {
     FAISS_THROW_IF_NOT_MSG(
             totsize == 0, "works only on an empty InvertedLists");
 
     std::vector<size_t> sizes(nlist);
+    std::vector<size_t> shift_id_offsets(n_il);
     for (int i = 0; i < n_il; i++) {
         const InvertedLists* il = ils[i];
         FAISS_THROW_IF_NOT(il->nlist == nlist && il->code_size == code_size);
@@ -581,6 +582,10 @@ size_t OnDiskInvertedLists::merge_from(
         for (size_t j = 0; j < nlist; j++) {
             sizes[j] += il->list_size(j);
         }
+        // ids of ils[i] are shifted by the total size of ils[0..i-1]
+        shift_id_offsets[i] = (shift_ids && i > 0)
+                ? shift_id_offsets[i - 1] + ils[i - 1]->compute_ntotal()
+                : 0;
     }
 
     size_t cums = 0;
@@ -605,11 +610,21 @@ size_t OnDiskInvertedLists::merge_from(
             const InvertedLists* il = ils[i];
             size_t n_entry = il->list_size(j);
             l.size += n_entry;
+            ScopedIds scope_ids(il, j);
+            const idx_t* scope_ids_data = scope_ids.get();
+            std::vector<idx_t> new_ids;
+            if (shift_ids) {
+                new_ids.resize(n_entry);
+                for (size_t k = 0; k < n_entry; k++) {
+                    new_ids[k] = scope_ids[k] + shift_id_offsets[i];
+                }
+                scope_ids_data = new_ids.data();
+            }
             update_entries(
                     j,
                     l.size - n_entry,
                     n_entry,
-                    ScopedIds(il, j).get(),
+                    scope_ids_data,
                     ScopedCodes(il, j).get());
         }
         assert(l.size == l.capacity);
@@ -638,7 +653,7 @@ size_t OnDiskInvertedLists::merge_from(
 size_t OnDiskInvertedLists::merge_from_1(
         const InvertedLists* ils,
         bool verbose) {
-    return merge_from(&ils, 1, verbose);
+    return merge_from_multiple(&ils, 1, /*shift_ids=*/false, verbose);
 }
 
 void OnDiskInvertedLists::crop_invlists(size_t l0, size_t l1) {
diff --git a/faiss/invlists/OnDiskInvertedLists.h b/faiss/invlists/OnDiskInvertedLists.h
index 98cb653a7a..01c7f3481e 100644
--- a/faiss/invlists/OnDiskInvertedLists.h
+++ b/faiss/invlists/OnDiskInvertedLists.h
@@ -101,9 +101,10 @@ struct OnDiskInvertedLists : InvertedLists {
 
     // copy all inverted lists into *this, in compact form (without
     // allocating slots)
-    size_t merge_from(
+    size_t merge_from_multiple(
             const InvertedLists** ils,
             int n_il,
+            bool shift_ids = false,
             bool verbose = false);
 
     /// same as merge_from for a single invlist
diff --git a/faiss/python/CMakeLists.txt b/faiss/python/CMakeLists.txt
index 8bca710f5f..0073c20e04 100644
--- a/faiss/python/CMakeLists.txt
+++ b/faiss/python/CMakeLists.txt
@@ -38,6 +38,11 @@ macro(configure_swigfaiss source)
     set_source_files_properties(${source} PROPERTIES
       COMPILE_DEFINITIONS GPU_WRAPPER
     )
+    if (FAISS_ENABLE_RAFT)
+      set_property(SOURCE ${source} APPEND PROPERTY
+        COMPILE_DEFINITIONS FAISS_ENABLE_RAFT
+      )
+    endif()
   endif()
 endmacro()
 
@@ -67,11 +72,20 @@ else()
   find_package(faiss REQUIRED)
 endif()
 
+if (${CMAKE_SYSTEM_NAME} MATCHES "AIX")
+swig_add_library(swigfaiss
+  TYPE MODULE
+  LANGUAGE python
+  SOURCES swigfaiss.swig
+)
+else ()
 swig_add_library(swigfaiss
   TYPE SHARED
   LANGUAGE python
   SOURCES swigfaiss.swig
 )
+endif()
+
 set_property(TARGET swigfaiss PROPERTY SWIG_COMPILE_OPTIONS -doxygen)
 
 set_property(SOURCE swigfaiss_avx2.swig
@@ -160,6 +174,10 @@ set_property(TARGET faiss_python_callbacks
   PROPERTY POSITION_INDEPENDENT_CODE ON
 )
 
+if (${CMAKE_SYSTEM_NAME} MATCHES "AIX")
+target_link_libraries(faiss_python_callbacks PRIVATE faiss)
+endif()
+
 # Hack so that python_callbacks.h can be included as
 # `#include <faiss/python/python_callbacks.h>`.
 target_include_directories(faiss_python_callbacks PRIVATE ${PROJECT_SOURCE_DIR}/../..)
diff --git a/faiss/python/__init__.py b/faiss/python/__init__.py
index 95be4254dc..ce4b42c618 100644
--- a/faiss/python/__init__.py
+++ b/faiss/python/__init__.py
@@ -292,10 +292,10 @@ def range_search_with_parameters(index, x, radius, params=None, output_stats=Fal
 ###########################################
 
 
-def serialize_index(index):
+def serialize_index(index, io_flags=0):
     """ convert an index to a numpy uint8 array  """
     writer = VectorIOWriter()
-    write_index(index, writer)
+    write_index(index, writer, io_flags)
     return vector_to_array(writer.data)
 
 
@@ -316,3 +316,14 @@ def deserialize_index_binary(data):
     reader = VectorIOReader()
     copy_array_to_vector(data, reader.data)
     return read_index_binary(reader)
+
+
+class TimeoutGuard:
+    def __init__(self, timeout_in_seconds: float):
+        self.timeout = timeout_in_seconds
+
+    def __enter__(self):
+        TimeoutCallback.reset(self.timeout)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        PythonInterruptCallback.reset()
diff --git a/faiss/python/class_wrappers.py b/faiss/python/class_wrappers.py
index 4a6808d286..4af2345009 100644
--- a/faiss/python/class_wrappers.py
+++ b/faiss/python/class_wrappers.py
@@ -956,10 +956,44 @@ def replacement_remove_ids(self, x):
             sel = IDSelectorBatch(x.size, swig_ptr(x))
         return self.remove_ids_c(sel)
 
+    def replacement_assign(self, x, k, labels=None):
+        """Find the k nearest neighbors of the set of vectors x in the index.
+        This is the same as the `search` method, but discards the distances.
+
+        Parameters
+        ----------
+        x : array_like
+            Query vectors, shape (n, d) where d is appropriate for the index.
+            `dtype` must be uint8.
+        k : int
+            Number of nearest neighbors.
+        labels : array_like, optional
+            Labels array to store the results.
+
+        Returns
+        -------
+        labels: array_like
+            Labels of the nearest neighbors, shape (n, k).
+            When not enough results are found, the label is set to -1
+        """
+        n, d = x.shape
+        x = _check_dtype_uint8(x)
+        assert d == self.code_size
+        assert k > 0
+
+        if labels is None:
+            labels = np.empty((n, k), dtype=np.int64)
+        else:
+            assert labels.shape == (n, k)
+
+        self.assign_c(n, swig_ptr(x), swig_ptr(labels), k)
+        return labels
+
     replace_method(the_class, 'add', replacement_add)
     replace_method(the_class, 'add_with_ids', replacement_add_with_ids)
     replace_method(the_class, 'train', replacement_train)
     replace_method(the_class, 'search', replacement_search)
+    replace_method(the_class, 'assign', replacement_assign)
     replace_method(the_class, 'range_search', replacement_range_search)
     replace_method(the_class, 'reconstruct', replacement_reconstruct)
     replace_method(the_class, 'reconstruct_n', replacement_reconstruct_n)
diff --git a/faiss/python/extra_wrappers.py b/faiss/python/extra_wrappers.py
index d7fd05bc9f..a037b0280f 100644
--- a/faiss/python/extra_wrappers.py
+++ b/faiss/python/extra_wrappers.py
@@ -330,7 +330,7 @@ def lookup(self, keys):
 # KNN function
 ######################################################
 
-def knn(xq, xb, k, metric=METRIC_L2):
+def knn(xq, xb, k, metric=METRIC_L2, metric_arg=0.0):
     """
     Compute the k nearest neighbors of a vector without constructing an index
 
@@ -374,10 +374,16 @@ def knn(xq, xb, k, metric=METRIC_L2):
             swig_ptr(xq), swig_ptr(xb),
             d, nq, nb, k, swig_ptr(D), swig_ptr(I)
         )
-    else:
-        raise NotImplementedError("only L2 and INNER_PRODUCT are supported")
+    else: 
+        knn_extra_metrics(
+            swig_ptr(xq), swig_ptr(xb),
+            d, nq, nb, metric, metric_arg, k, 
+            swig_ptr(D), swig_ptr(I)
+        )
+
     return D, I
 
+
 def knn_hamming(xq, xb, k, variant="hc"):
     """
     Compute the k nearest neighbors of a set of vectors without constructing an index.
diff --git a/faiss/python/loader.py b/faiss/python/loader.py
index eb60bf6800..8cc97f2f44 100644
--- a/faiss/python/loader.py
+++ b/faiss/python/loader.py
@@ -3,7 +3,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-from distutils.version import LooseVersion
+from packaging.version import Version
 import platform
 import subprocess
 import logging
@@ -25,7 +25,7 @@ def supported_instruction_sets():
     {"NEON", "ASIMD", ...}
     """
     import numpy
-    if LooseVersion(numpy.__version__) >= "1.19":
+    if Version(numpy.__version__) >= Version("1.19"):
         # use private API as next-best thing until numpy/numpy#18058 is solved
         from numpy.core._multiarray_umath import __cpu_features__
         # __cpu_features__ is a dictionary with CPU features
diff --git a/faiss/python/python_callbacks.cpp b/faiss/python/python_callbacks.cpp
index c99c1a3f77..06b5c18cfc 100644
--- a/faiss/python/python_callbacks.cpp
+++ b/faiss/python/python_callbacks.cpp
@@ -22,7 +22,7 @@ struct PyThreadLock {
     }
 };
 
-}; // namespace
+} // namespace
 
 /***********************************************************
  * Callbacks for IO reader and writer
@@ -46,7 +46,7 @@ size_t PyCallbackIOWriter::operator()(
         size_t wi = ws > bs ? bs : ws;
         PyObject* result = PyObject_CallFunction(
                 callback, "(N)", PyBytes_FromStringAndSize(ptr, wi));
-        if (result == NULL) {
+        if (result == nullptr) {
             FAISS_THROW_MSG("py err");
         }
         // TODO check nb of bytes written
@@ -77,7 +77,7 @@ size_t PyCallbackIOReader::operator()(void* ptrv, size_t size, size_t nitems) {
     while (rs > 0) {
         size_t ri = rs > bs ? bs : rs;
         PyObject* result = PyObject_CallFunction(callback, "(n)", ri);
-        if (result == NULL) {
+        if (result == nullptr) {
             FAISS_THROW_MSG("propagate py error");
         }
         if (!PyBytes_Check(result)) {
@@ -122,7 +122,7 @@ bool PyCallbackIDSelector::is_member(faiss::idx_t id) const {
     FAISS_THROW_IF_NOT((id >> 32) == 0);
     PyThreadLock gil;
     PyObject* result = PyObject_CallFunction(callback, "(n)", int(id));
-    if (result == NULL) {
+    if (result == nullptr) {
         FAISS_THROW_MSG("propagate py error");
     }
     bool b = PyObject_IsTrue(result);
diff --git a/faiss/python/setup.py b/faiss/python/setup.py
index 1c9101290d..939aeeffbe 100644
--- a/faiss/python/setup.py
+++ b/faiss/python/setup.py
@@ -60,7 +60,7 @@
 """
 setup(
     name='faiss',
-    version='1.7.4',
+    version='1.8.0',
     description='A library for efficient similarity search and clustering of dense vectors',
     long_description=long_description,
     url='https://github.com/facebookresearch/faiss',
@@ -69,7 +69,7 @@
     license='MIT',
     keywords='search nearest neighbors',
 
-    install_requires=['numpy'],
+    install_requires=['numpy', 'packaging'],
     packages=['faiss', 'faiss.contrib'],
     package_data={
         'faiss': ['*.so', '*.pyd'],
diff --git a/faiss/python/swigfaiss.swig b/faiss/python/swigfaiss.swig
index 3d6f94604a..74a371f6cd 100644
--- a/faiss/python/swigfaiss.swig
+++ b/faiss/python/swigfaiss.swig
@@ -81,6 +81,9 @@ typedef uint64_t size_t;
 #include <faiss/IndexFastScan.h>
 #include <faiss/IndexAdditiveQuantizerFastScan.h>
 #include <faiss/IndexPQFastScan.h>
+#include <faiss/utils/simdlib.h>
+#include <faiss/impl/simd_result_handlers.h>
+
 #include <faiss/IndexIVFFastScan.h>
 #include <faiss/IndexIVFAdditiveQuantizerFastScan.h>
 #include <faiss/IndexIVFPQFastScan.h>
@@ -301,6 +304,7 @@ void gpu_sync_all_devices();
 #include <faiss/gpu/GpuIndicesOptions.h>
 #include <faiss/gpu/GpuClonerOptions.h>
 #include <faiss/gpu/GpuIndex.h>
+#include <faiss/gpu/GpuIndexCagra.h>
 #include <faiss/gpu/GpuIndexFlat.h>
 #include <faiss/gpu/GpuIndexIVF.h>
 #include <faiss/gpu/GpuIndexIVFPQ.h>
@@ -490,6 +494,11 @@ void gpu_sync_all_devices()
 %include  <faiss/IndexFastScan.h>
 %include  <faiss/IndexAdditiveQuantizerFastScan.h>
 %include  <faiss/IndexPQFastScan.h>
+
+// NOTE(matthijs) let's not go into wrapping simdlib
+struct faiss::simd16uint16 {};
+
+%include  <faiss/impl/simd_result_handlers.h>
 %include  <faiss/IndexIVFFastScan.h>
 %include  <faiss/IndexIVFAdditiveQuantizerFastScan.h>
 %include  <faiss/IndexIVFIndependentQuantizer.h>
@@ -549,6 +558,9 @@ void gpu_sync_all_devices()
 %include  <faiss/gpu/GpuIndicesOptions.h>
 %include  <faiss/gpu/GpuClonerOptions.h>
 %include  <faiss/gpu/GpuIndex.h>
+#ifdef FAISS_ENABLE_RAFT
+%include  <faiss/gpu/GpuIndexCagra.h>
+#endif
 %include  <faiss/gpu/GpuIndexFlat.h>
 %include  <faiss/gpu/GpuIndexIVF.h>
 %include  <faiss/gpu/GpuIndexIVFPQ.h>
@@ -665,6 +677,9 @@ void gpu_sync_all_devices()
     DOWNCAST ( IndexRowwiseMinMax )
     DOWNCAST ( IndexRowwiseMinMaxFP16 )
 #ifdef GPU_WRAPPER
+#ifdef FAISS_ENABLE_RAFT
+    DOWNCAST_GPU ( GpuIndexCagra )
+#endif
     DOWNCAST_GPU ( GpuIndexIVFPQ )
     DOWNCAST_GPU ( GpuIndexIVFFlat )
     DOWNCAST_GPU ( GpuIndexIVFScalarQuantizer )
@@ -1014,14 +1029,17 @@ PyObject *swig_ptr (PyObject *a)
         return SWIG_NewPointerObj(data, SWIGTYPE_p_bool, 0);
     }
     if(PyArray_TYPE(ao) == NPY_UINT64) {
-#ifdef SWIGWORDSIZE64
+    // NPY_(U)INT64 must map to the C type the compiler uses for (u)int64_t.
+    // On 64-bit machines that is typically (unsigned) long, but this does not
+    // hold on Apple macOS, where we map it to (unsigned) long long instead.
+#if __SIZEOF_LONG__ == 8 && !defined(__APPLE__)
         return SWIG_NewPointerObj(data, SWIGTYPE_p_unsigned_long, 0);
 #else
         return SWIG_NewPointerObj(data, SWIGTYPE_p_unsigned_long_long, 0);
 #endif
     }
     if(PyArray_TYPE(ao) == NPY_INT64) {
-#ifdef SWIGWORDSIZE64
+#if __SIZEOF_LONG__ == 8 && !defined(__APPLE__)
         return SWIG_NewPointerObj(data, SWIGTYPE_p_long, 0);
 #else
         return SWIG_NewPointerObj(data, SWIGTYPE_p_long_long, 0);
@@ -1030,7 +1048,9 @@ PyObject *swig_ptr (PyObject *a)
     PyErr_SetString(PyExc_ValueError, "did not recognize array type");
     return NULL;
 }
+%}
 
+%inline %{
 
 struct PythonInterruptCallback: faiss::InterruptCallback {
 
@@ -1045,18 +1065,18 @@ struct PythonInterruptCallback: faiss::InterruptCallback {
         return err == -1;
     }
 
+    static void reset() {
+        faiss::InterruptCallback::instance.reset(new PythonInterruptCallback());
+    }
 };
 
-
 %}
 
-
 %init %{
     /* needed, else crash at runtime */
     import_array();
 
-    faiss::InterruptCallback::instance.reset(new PythonInterruptCallback());
-
+    PythonInterruptCallback::reset();
 %}
 
 // return a pointer usable as input for functions that expect pointers
@@ -1113,15 +1133,8 @@ int * cast_integer_to_int_ptr (int64_t x) {
 void * cast_integer_to_void_ptr (int64_t x) {
     return (void*)x;
 }
-
 %}
 
-
-
-
-
-
-
 %inline %{
     void wait() {
         // in gdb, use return to get out of this function
diff --git a/faiss/utils/bf16.h b/faiss/utils/bf16.h
new file mode 100644
index 0000000000..ff0fbe898b
--- /dev/null
+++ b/faiss/utils/bf16.h
@@ -0,0 +1,39 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+
+namespace faiss {
+
+/** Encode a float32 as bfloat16 (the upper 16 bits of its bit pattern).
+ *
+ * Rounds by adding 0x8000 to the 32-bit pattern before dropping the low
+ * 16 bits (ties go away from zero).
+ *
+ * NOTE: the addition is unsigned and can carry out of the mantissa, so
+ * some NaN bit patterns (e.g. 0x7FFF8000) do not encode to a NaN.
+ */
+inline uint16_t encode_bf16(const float f) {
+    // memcpy is the well-defined way to reinterpret the bits of a float;
+    // reading the inactive member of a union is undefined behavior in C++.
+    uint32_t bits;
+    std::memcpy(&bits, &f, sizeof(bits));
+    return static_cast<uint16_t>((bits + 0x8000) >> 16);
+}
+
+/** Decode a bfloat16 value into a float32 by zero-filling the low 16 bits. */
+inline float decode_bf16(const uint16_t v) {
+    const uint32_t bits = uint32_t(v) << 16;
+    float f;
+    std::memcpy(&f, &bits, sizeof(f));
+    return f;
+}
+
+} // namespace faiss
diff --git a/faiss/utils/distances.cpp b/faiss/utils/distances.cpp
index 5b66158c09..74b56bcc87 100644
--- a/faiss/utils/distances.cpp
+++ b/faiss/utils/distances.cpp
@@ -5,13 +5,12 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-// -*- c++ -*-
-
 #include <faiss/utils/distances.h>
 
 #include <algorithm>
 #include <cassert>
 #include <cmath>
+#include <cstddef>
 #include <cstdio>
 #include <cstring>
 
@@ -131,17 +130,18 @@ void fvec_renorm_L2(size_t d, size_t nx, float* __restrict x) {
 namespace {
 
 /* Find the nearest neighbors for nx queries in a set of ny vectors */
-template <class ResultHandler, bool use_sel = false>
+template <class BlockResultHandler, bool use_sel = false>
 void exhaustive_inner_product_seq(
         const float* x,
         const float* y,
         size_t d,
         size_t nx,
         size_t ny,
-        ResultHandler& res,
+        BlockResultHandler& res,
         const IDSelector* sel = nullptr) {
-    using SingleResultHandler = typename ResultHandler::SingleResultHandler;
-    int nt = std::min(int(nx), omp_get_max_threads());
+    using SingleResultHandler =
+            typename BlockResultHandler::SingleResultHandler;
+    [[maybe_unused]] int nt = std::min(int(nx), omp_get_max_threads());
 
     FAISS_ASSERT(use_sel == (sel != nullptr));
 
@@ -167,17 +167,18 @@ void exhaustive_inner_product_seq(
     }
 }
 
-template <class ResultHandler, bool use_sel = false>
+template <class BlockResultHandler, bool use_sel = false>
 void exhaustive_L2sqr_seq(
         const float* x,
         const float* y,
         size_t d,
         size_t nx,
         size_t ny,
-        ResultHandler& res,
+        BlockResultHandler& res,
         const IDSelector* sel = nullptr) {
-    using SingleResultHandler = typename ResultHandler::SingleResultHandler;
-    int nt = std::min(int(nx), omp_get_max_threads());
+    using SingleResultHandler =
+            typename BlockResultHandler::SingleResultHandler;
+    [[maybe_unused]] int nt = std::min(int(nx), omp_get_max_threads());
 
     FAISS_ASSERT(use_sel == (sel != nullptr));
 
@@ -202,14 +203,14 @@ void exhaustive_L2sqr_seq(
 }
 
 /** Find the nearest neighbors for nx queries in a set of ny vectors */
-template <class ResultHandler>
+template <class BlockResultHandler>
 void exhaustive_inner_product_blas(
         const float* x,
         const float* y,
         size_t d,
         size_t nx,
         size_t ny,
-        ResultHandler& res) {
+        BlockResultHandler& res) {
     // BLAS does not like empty matrices
     if (nx == 0 || ny == 0)
         return;
@@ -258,14 +259,14 @@ void exhaustive_inner_product_blas(
 
 // distance correction is an operator that can be applied to transform
 // the distances
-template <class ResultHandler>
+template <class BlockResultHandler>
 void exhaustive_L2sqr_blas_default_impl(
         const float* x,
         const float* y,
         size_t d,
         size_t nx,
         size_t ny,
-        ResultHandler& res,
+        BlockResultHandler& res,
         const float* y_norms = nullptr) {
     // BLAS does not like empty matrices
     if (nx == 0 || ny == 0)
@@ -341,14 +342,16 @@ void exhaustive_L2sqr_blas_default_impl(
     }
 }
 
-template <class ResultHandler>
+// Generic entry point: forwards to the default BLAS implementation,
+// including any caller-supplied y_norms.
+template <class BlockResultHandler>
 void exhaustive_L2sqr_blas(
         const float* x,
         const float* y,
         size_t d,
         size_t nx,
         size_t ny,
-        ResultHandler& res,
+        BlockResultHandler& res,
         const float* y_norms = nullptr) {
-    exhaustive_L2sqr_blas_default_impl(x, y, d, nx, ny, res);
+    exhaustive_L2sqr_blas_default_impl(x, y, d, nx, ny, res, y_norms);
 }
@@ -360,7 +361,7 @@ void exhaustive_L2sqr_blas_cmax_avx2(
         size_t d,
         size_t nx,
         size_t ny,
-        SingleBestResultHandler<CMax<float, int64_t>>& res,
+        Top1BlockResultHandler<CMax<float, int64_t>>& res,
         const float* y_norms) {
     // BLAS does not like empty matrices
     if (nx == 0 || ny == 0)
@@ -416,8 +417,8 @@ void exhaustive_L2sqr_blas_cmax_avx2(
             for (int64_t i = i0; i < i1; i++) {
                 float* ip_line = ip_block.get() + (i - i0) * (j1 - j0);
 
-                _mm_prefetch(ip_line, _MM_HINT_NTA);
-                _mm_prefetch(ip_line + 16, _MM_HINT_NTA);
+                _mm_prefetch((const char*)ip_line, _MM_HINT_NTA);
+                _mm_prefetch((const char*)(ip_line + 16), _MM_HINT_NTA);
 
                 // constant
                 const __m256 mul_minus2 = _mm256_set1_ps(-2);
@@ -444,8 +445,8 @@ void exhaustive_L2sqr_blas_cmax_avx2(
 
                 // process 16 elements per loop
                 for (; idx_j < (count / 16) * 16; idx_j += 16, ip_line += 16) {
-                    _mm_prefetch(ip_line + 32, _MM_HINT_NTA);
-                    _mm_prefetch(ip_line + 48, _MM_HINT_NTA);
+                    _mm_prefetch((const char*)(ip_line + 32), _MM_HINT_NTA);
+                    _mm_prefetch((const char*)(ip_line + 48), _MM_HINT_NTA);
 
                     // load values for norms
                     const __m256 y_norm_0 =
@@ -563,13 +564,13 @@ void exhaustive_L2sqr_blas_cmax_avx2(
 
 // an override if only a single closest point is needed
 template <>
-void exhaustive_L2sqr_blas<SingleBestResultHandler<CMax<float, int64_t>>>(
+void exhaustive_L2sqr_blas<Top1BlockResultHandler<CMax<float, int64_t>>>(
         const float* x,
         const float* y,
         size_t d,
         size_t nx,
         size_t ny,
-        SingleBestResultHandler<CMax<float, int64_t>>& res,
+        Top1BlockResultHandler<CMax<float, int64_t>>& res,
         const float* y_norms) {
 #if defined(__AVX2__)
     // use a faster fused kernel if available
@@ -590,28 +591,29 @@ void exhaustive_L2sqr_blas<SingleBestResultHandler<CMax<float, int64_t>>>(
 
     // run the default implementation
     exhaustive_L2sqr_blas_default_impl<
-            SingleBestResultHandler<CMax<float, int64_t>>>(
+            Top1BlockResultHandler<CMax<float, int64_t>>>(
             x, y, d, nx, ny, res, y_norms);
 #else
     // run the default implementation
     exhaustive_L2sqr_blas_default_impl<
-            SingleBestResultHandler<CMax<float, int64_t>>>(
+            Top1BlockResultHandler<CMax<float, int64_t>>>(
             x, y, d, nx, ny, res, y_norms);
 #endif
 }
 
-template <class ResultHandler>
+template <class BlockResultHandler>
 void knn_L2sqr_select(
         const float* x,
         const float* y,
         size_t d,
         size_t nx,
         size_t ny,
-        ResultHandler& res,
+        BlockResultHandler& res,
         const float* y_norm2,
         const IDSelector* sel) {
     if (sel) {
-        exhaustive_L2sqr_seq<ResultHandler, true>(x, y, d, nx, ny, res, sel);
+        exhaustive_L2sqr_seq<BlockResultHandler, true>(
+                x, y, d, nx, ny, res, sel);
     } else if (nx < distance_compute_blas_threshold) {
         exhaustive_L2sqr_seq(x, y, d, nx, ny, res);
     } else {
@@ -619,6 +621,25 @@ void knn_L2sqr_select(
     }
 }
 
+template <class BlockResultHandler>
+void knn_inner_product_select(
+        const float* x,
+        const float* y,
+        size_t d,
+        size_t nx,
+        size_t ny,
+        BlockResultHandler& res,
+        const IDSelector* sel) {
+    if (sel) {
+        exhaustive_inner_product_seq<BlockResultHandler, true>(
+                x, y, d, nx, ny, res, sel);
+    } else if (nx < distance_compute_blas_threshold) {
+        exhaustive_inner_product_seq(x, y, d, nx, ny, res);
+    } else {
+        exhaustive_inner_product_blas(x, y, d, nx, ny, res);
+    }
+}
+
 } // anonymous namespace
 
 /*******************************************************
@@ -637,7 +658,7 @@ void knn_inner_product(
         size_t nx,
         size_t ny,
         size_t k,
-        float* val,
+        float* vals,
         int64_t* ids,
         const IDSelector* sel) {
     int64_t imin = 0;
@@ -650,30 +671,21 @@ void knn_inner_product(
     }
     if (auto sela = dynamic_cast<const IDSelectorArray*>(sel)) {
         knn_inner_products_by_idx(
-                x, y, sela->ids, d, nx, sela->n, k, val, ids, 0);
+                x, y, sela->ids, d, nx, ny, sela->n, k, vals, ids, 0);
         return;
     }
-    if (k < distance_compute_min_k_reservoir) {
-        using RH = HeapResultHandler<CMin<float, int64_t>>;
-        RH res(nx, val, ids, k);
-        if (sel) {
-            exhaustive_inner_product_seq<RH, true>(x, y, d, nx, ny, res, sel);
-        } else if (nx < distance_compute_blas_threshold) {
-            exhaustive_inner_product_seq(x, y, d, nx, ny, res);
-        } else {
-            exhaustive_inner_product_blas(x, y, d, nx, ny, res);
-        }
+
+    if (k == 1) {
+        Top1BlockResultHandler<CMin<float, int64_t>> res(nx, vals, ids);
+        knn_inner_product_select(x, y, d, nx, ny, res, sel);
+    } else if (k < distance_compute_min_k_reservoir) {
+        HeapBlockResultHandler<CMin<float, int64_t>> res(nx, vals, ids, k);
+        knn_inner_product_select(x, y, d, nx, ny, res, sel);
     } else {
-        using RH = ReservoirResultHandler<CMin<float, int64_t>>;
-        RH res(nx, val, ids, k);
-        if (sel) {
-            exhaustive_inner_product_seq<RH, true>(x, y, d, nx, ny, res, sel);
-        } else if (nx < distance_compute_blas_threshold) {
-            exhaustive_inner_product_seq(x, y, d, nx, ny, res, nullptr);
-        } else {
-            exhaustive_inner_product_blas(x, y, d, nx, ny, res);
-        }
+        ReservoirBlockResultHandler<CMin<float, int64_t>> res(nx, vals, ids, k);
+        knn_inner_product_select(x, y, d, nx, ny, res, sel);
     }
+
     if (imin != 0) {
         for (size_t i = 0; i < nx * k; i++) {
             if (ids[i] >= 0) {
@@ -715,17 +727,17 @@ void knn_L2sqr(
         sel = nullptr;
     }
     if (auto sela = dynamic_cast<const IDSelectorArray*>(sel)) {
-        knn_L2sqr_by_idx(x, y, sela->ids, d, nx, sela->n, k, vals, ids, 0);
+        knn_L2sqr_by_idx(x, y, sela->ids, d, nx, ny, sela->n, k, vals, ids, 0);
         return;
     }
     if (k == 1) {
-        SingleBestResultHandler<CMax<float, int64_t>> res(nx, vals, ids);
+        Top1BlockResultHandler<CMax<float, int64_t>> res(nx, vals, ids);
         knn_L2sqr_select(x, y, d, nx, ny, res, y_norm2, sel);
     } else if (k < distance_compute_min_k_reservoir) {
-        HeapResultHandler<CMax<float, int64_t>> res(nx, vals, ids, k);
+        HeapBlockResultHandler<CMax<float, int64_t>> res(nx, vals, ids, k);
         knn_L2sqr_select(x, y, d, nx, ny, res, y_norm2, sel);
     } else {
-        ReservoirResultHandler<CMax<float, int64_t>> res(nx, vals, ids, k);
+        ReservoirBlockResultHandler<CMax<float, int64_t>> res(nx, vals, ids, k);
         knn_L2sqr_select(x, y, d, nx, ny, res, y_norm2, sel);
     }
     if (imin != 0) {
@@ -763,7 +775,7 @@ void range_search_L2sqr(
         float radius,
         RangeSearchResult* res,
         const IDSelector* sel) {
-    using RH = RangeSearchResultHandler<CMax<float, int64_t>>;
+    using RH = RangeSearchBlockResultHandler<CMax<float, int64_t>>;
     RH resh(res, radius);
     if (sel) {
         exhaustive_L2sqr_seq<RH, true>(x, y, d, nx, ny, resh, sel);
@@ -783,7 +795,7 @@ void range_search_inner_product(
         float radius,
         RangeSearchResult* res,
         const IDSelector* sel) {
-    using RH = RangeSearchResultHandler<CMin<float, int64_t>>;
+    using RH = RangeSearchBlockResultHandler<CMin<float, int64_t>>;
     RH resh(res, radius);
     if (sel) {
         exhaustive_inner_product_seq<RH, true>(x, y, d, nx, ny, resh, sel);
@@ -893,6 +905,7 @@ void knn_inner_products_by_idx(
         size_t d,
         size_t nx,
         size_t ny,
+        size_t nsubset,
         size_t k,
         float* res_vals,
         int64_t* res_ids,
@@ -910,9 +923,10 @@ void knn_inner_products_by_idx(
         int64_t* __restrict idxi = res_ids + i * k;
         minheap_heapify(k, simi, idxi);
 
-        for (j = 0; j < ny; j++) {
-            if (idsi[j] < 0)
+        for (j = 0; j < nsubset; j++) {
+            if (idsi[j] < 0 || idsi[j] >= ny) {
                 break;
+            }
             float ip = fvec_inner_product(x_, y + d * idsi[j], d);
 
             if (ip > simi[0]) {
@@ -930,6 +944,7 @@ void knn_L2sqr_by_idx(
         size_t d,
         size_t nx,
         size_t ny,
+        size_t nsubset,
         size_t k,
         float* res_vals,
         int64_t* res_ids,
@@ -944,7 +959,10 @@ void knn_L2sqr_by_idx(
         float* __restrict simi = res_vals + i * k;
         int64_t* __restrict idxi = res_ids + i * k;
         maxheap_heapify(k, simi, idxi);
-        for (size_t j = 0; j < ny; j++) {
+        for (size_t j = 0; j < nsubset; j++) {
+            if (idsi[j] < 0 || idsi[j] >= ny) {
+                break;
+            }
             float disij = fvec_L2sqr(x_, y + d * idsi[j], d);
 
             if (disij < simi[0]) {
diff --git a/faiss/utils/distances.h b/faiss/utils/distances.h
index 898edeeb0e..c868581bb0 100644
--- a/faiss/utils/distances.h
+++ b/faiss/utils/distances.h
@@ -376,6 +376,7 @@ void knn_inner_products_by_idx(
         const int64_t* subset,
         size_t d,
         size_t nx,
+        size_t ny,
         size_t nsubset,
         size_t k,
         float* vals,
@@ -398,6 +399,7 @@ void knn_L2sqr_by_idx(
         const int64_t* subset,
         size_t d,
         size_t nx,
+        size_t ny,
         size_t nsubset,
         size_t k,
         float* vals,
diff --git a/faiss/utils/distances_fused/avx512.cpp b/faiss/utils/distances_fused/avx512.cpp
index b5ff70f9e4..d4c442c79b 100644
--- a/faiss/utils/distances_fused/avx512.cpp
+++ b/faiss/utils/distances_fused/avx512.cpp
@@ -68,7 +68,7 @@ void kernel(
         const float* const __restrict y,
         const float* const __restrict y_transposed,
         size_t ny,
-        SingleBestResultHandler<CMax<float, int64_t>>& res,
+        Top1BlockResultHandler<CMax<float, int64_t>>& res,
         const float* __restrict y_norms,
         size_t i) {
     const size_t ny_p =
@@ -231,7 +231,7 @@ void exhaustive_L2sqr_fused_cmax(
         const float* const __restrict y,
         size_t nx,
         size_t ny,
-        SingleBestResultHandler<CMax<float, int64_t>>& res,
+        Top1BlockResultHandler<CMax<float, int64_t>>& res,
         const float* __restrict y_norms) {
     // BLAS does not like empty matrices
     if (nx == 0 || ny == 0) {
@@ -275,7 +275,7 @@ void exhaustive_L2sqr_fused_cmax(
                 x, y, y_transposed.data(), ny, res, y_norms, i);
     }
 
-    // Does nothing for SingleBestResultHandler, but
+    // Does nothing for Top1BlockResultHandler, but
     // keeping the call for the consistency.
     res.end_multiple();
     InterruptCallback::check();
@@ -289,7 +289,7 @@ bool exhaustive_L2sqr_fused_cmax_AVX512(
         size_t d,
         size_t nx,
         size_t ny,
-        SingleBestResultHandler<CMax<float, int64_t>>& res,
+        Top1BlockResultHandler<CMax<float, int64_t>>& res,
         const float* y_norms) {
     // process only cases with certain dimensionalities
 
diff --git a/faiss/utils/distances_fused/avx512.h b/faiss/utils/distances_fused/avx512.h
index b6d5fc0556..4cb62771a2 100644
--- a/faiss/utils/distances_fused/avx512.h
+++ b/faiss/utils/distances_fused/avx512.h
@@ -28,7 +28,7 @@ bool exhaustive_L2sqr_fused_cmax_AVX512(
         size_t d,
         size_t nx,
         size_t ny,
-        SingleBestResultHandler<CMax<float, int64_t>>& res,
+        Top1BlockResultHandler<CMax<float, int64_t>>& res,
         const float* y_norms);
 
 } // namespace faiss
diff --git a/faiss/utils/distances_fused/distances_fused.cpp b/faiss/utils/distances_fused/distances_fused.cpp
index a0af971c5c..2ba7e29014 100644
--- a/faiss/utils/distances_fused/distances_fused.cpp
+++ b/faiss/utils/distances_fused/distances_fused.cpp
@@ -20,7 +20,7 @@ bool exhaustive_L2sqr_fused_cmax(
         size_t d,
         size_t nx,
         size_t ny,
-        SingleBestResultHandler<CMax<float, int64_t>>& res,
+        Top1BlockResultHandler<CMax<float, int64_t>>& res,
         const float* y_norms) {
     if (nx == 0 || ny == 0) {
         // nothing to do
diff --git a/faiss/utils/distances_fused/distances_fused.h b/faiss/utils/distances_fused/distances_fused.h
index e6e35c209e..54b58752b1 100644
--- a/faiss/utils/distances_fused/distances_fused.h
+++ b/faiss/utils/distances_fused/distances_fused.h
@@ -34,7 +34,7 @@ bool exhaustive_L2sqr_fused_cmax(
         size_t d,
         size_t nx,
         size_t ny,
-        SingleBestResultHandler<CMax<float, int64_t>>& res,
+        Top1BlockResultHandler<CMax<float, int64_t>>& res,
         const float* y_norms);
 
 } // namespace faiss
diff --git a/faiss/utils/distances_fused/simdlib_based.cpp b/faiss/utils/distances_fused/simdlib_based.cpp
index 97ededd2f0..309fb72118 100644
--- a/faiss/utils/distances_fused/simdlib_based.cpp
+++ b/faiss/utils/distances_fused/simdlib_based.cpp
@@ -62,7 +62,7 @@ void kernel(
         const float* const __restrict y,
         const float* const __restrict y_transposed,
         const size_t ny,
-        SingleBestResultHandler<CMax<float, int64_t>>& res,
+        Top1BlockResultHandler<CMax<float, int64_t>>& res,
         const float* __restrict y_norms,
         const size_t i) {
     const size_t ny_p =
@@ -73,7 +73,7 @@ void kernel(
 
     // prefetch the next point
 #if defined(__AVX2__)
-    _mm_prefetch(xd_0 + DIM * sizeof(float), _MM_HINT_NTA);
+    _mm_prefetch((const char*)(xd_0 + DIM * sizeof(float)), _MM_HINT_NTA);
 #endif
 
     // load a single point from x
@@ -226,7 +226,7 @@ void exhaustive_L2sqr_fused_cmax(
         const float* const __restrict y,
         size_t nx,
         size_t ny,
-        SingleBestResultHandler<CMax<float, int64_t>>& res,
+        Top1BlockResultHandler<CMax<float, int64_t>>& res,
         const float* __restrict y_norms) {
     // BLAS does not like empty matrices
     if (nx == 0 || ny == 0) {
@@ -270,7 +270,7 @@ void exhaustive_L2sqr_fused_cmax(
                 x, y, y_transposed.data(), ny, res, y_norms, i);
     }
 
-    // Does nothing for SingleBestResultHandler, but
+    // Does nothing for Top1BlockResultHandler, but
     // keeping the call for the consistency.
     res.end_multiple();
     InterruptCallback::check();
@@ -284,7 +284,7 @@ bool exhaustive_L2sqr_fused_cmax_simdlib(
         size_t d,
         size_t nx,
         size_t ny,
-        SingleBestResultHandler<CMax<float, int64_t>>& res,
+        Top1BlockResultHandler<CMax<float, int64_t>>& res,
         const float* y_norms) {
     // Process only cases with certain dimensionalities.
     // An acceptable dimensionality value is limited by the number of
diff --git a/faiss/utils/distances_fused/simdlib_based.h b/faiss/utils/distances_fused/simdlib_based.h
index b60da7b193..6240a8f110 100644
--- a/faiss/utils/distances_fused/simdlib_based.h
+++ b/faiss/utils/distances_fused/simdlib_based.h
@@ -24,7 +24,7 @@ bool exhaustive_L2sqr_fused_cmax_simdlib(
         size_t d,
         size_t nx,
         size_t ny,
-        SingleBestResultHandler<CMax<float, int64_t>>& res,
+        Top1BlockResultHandler<CMax<float, int64_t>>& res,
         const float* y_norms);
 
 } // namespace faiss
diff --git a/faiss/utils/distances_simd.cpp b/faiss/utils/distances_simd.cpp
index 953e5b7763..323859f43b 100644
--- a/faiss/utils/distances_simd.cpp
+++ b/faiss/utils/distances_simd.cpp
@@ -439,14 +439,14 @@ void fvec_op_ny_D2<ElementOpIP>(
 
     if (ny8 > 0) {
         // process 8 D2-vectors per loop.
-        _mm_prefetch(y, _MM_HINT_T0);
-        _mm_prefetch(y + 16, _MM_HINT_T0);
+        _mm_prefetch((const char*)y, _MM_HINT_T0);
+        _mm_prefetch((const char*)(y + 16), _MM_HINT_T0);
 
         const __m256 m0 = _mm256_set1_ps(x[0]);
         const __m256 m1 = _mm256_set1_ps(x[1]);
 
         for (i = 0; i < ny8 * 8; i += 8) {
-            _mm_prefetch(y + 32, _MM_HINT_T0);
+            _mm_prefetch((const char*)(y + 32), _MM_HINT_T0);
 
             // load 8x2 matrix and transpose it in registers.
             // the typical bottleneck is memory access, so
@@ -496,14 +496,14 @@ void fvec_op_ny_D2<ElementOpL2>(
 
     if (ny8 > 0) {
         // process 8 D2-vectors per loop.
-        _mm_prefetch(y, _MM_HINT_T0);
-        _mm_prefetch(y + 16, _MM_HINT_T0);
+        _mm_prefetch((const char*)y, _MM_HINT_T0);
+        _mm_prefetch((const char*)(y + 16), _MM_HINT_T0);
 
         const __m256 m0 = _mm256_set1_ps(x[0]);
         const __m256 m1 = _mm256_set1_ps(x[1]);
 
         for (i = 0; i < ny8 * 8; i += 8) {
-            _mm_prefetch(y + 32, _MM_HINT_T0);
+            _mm_prefetch((const char*)(y + 32), _MM_HINT_T0);
 
             // load 8x2 matrix and transpose it in registers.
             // the typical bottleneck is memory access, so
@@ -969,7 +969,6 @@ void fvec_L2sqr_ny_y_transposed_D(
 
     // squared length of x
     float x_sqlen = 0;
-    ;
     for (size_t j = 0; j < DIM; j++) {
         x_sqlen += x[j] * x[j];
     }
@@ -1085,8 +1084,8 @@ size_t fvec_L2sqr_ny_nearest_D2(
     // process 8 D2-vectors per loop.
     const size_t ny8 = ny / 8;
     if (ny8 > 0) {
-        _mm_prefetch(y, _MM_HINT_T0);
-        _mm_prefetch(y + 16, _MM_HINT_T0);
+        _mm_prefetch((const char*)y, _MM_HINT_T0);
+        _mm_prefetch((const char*)(y + 16), _MM_HINT_T0);
 
         // track min distance and the closest vector independently
         // for each of 8 AVX2 components.
@@ -1101,7 +1100,7 @@ size_t fvec_L2sqr_ny_nearest_D2(
         const __m256 m1 = _mm256_set1_ps(x[1]);
 
         for (; i < ny8 * 8; i += 8) {
-            _mm_prefetch(y + 32, _MM_HINT_T0);
+            _mm_prefetch((const char*)(y + 32), _MM_HINT_T0);
 
             __m256 v0;
             __m256 v1;
@@ -1631,21 +1630,6 @@ size_t fvec_L2sqr_ny_nearest_y_transposed(
 
 #ifdef USE_AVX
 
-// reads 0 <= d < 8 floats as __m256
-static inline __m256 masked_read_8(int d, const float* x) {
-    assert(0 <= d && d < 8);
-    if (d < 4) {
-        __m256 res = _mm256_setzero_ps();
-        res = _mm256_insertf128_ps(res, masked_read(d, x), 0);
-        return res;
-    } else {
-        __m256 res = _mm256_setzero_ps();
-        res = _mm256_insertf128_ps(res, _mm_loadu_ps(x), 0);
-        res = _mm256_insertf128_ps(res, masked_read(d - 4, x + 4), 1);
-        return res;
-    }
-}
-
 float fvec_L1(const float* x, const float* y, size_t d) {
     __m256 msum1 = _mm256_setzero_ps();
     __m256 signmask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffffUL));
@@ -1864,7 +1848,7 @@ void fvec_inner_products_ny(
  * heavily optimized table computations
  ***************************************************************************/
 
-static inline void fvec_madd_ref(
+[[maybe_unused]] static inline void fvec_madd_ref(
         size_t n,
         const float* a,
         float bf,
@@ -1931,7 +1915,7 @@ static inline void fvec_madd_avx2(
 
 #ifdef __SSE3__
 
-static inline void fvec_madd_sse(
+[[maybe_unused]] static inline void fvec_madd_sse(
         size_t n,
         const float* a,
         float bf,
diff --git a/faiss/utils/extra_distances-inl.h b/faiss/utils/extra_distances-inl.h
index d3768df668..3171580f8c 100644
--- a/faiss/utils/extra_distances-inl.h
+++ b/faiss/utils/extra_distances-inl.h
@@ -10,6 +10,7 @@
 
 #include <faiss/MetricType.h>
 #include <faiss/utils/distances.h>
+#include <cmath>
 #include <type_traits>
 
 namespace faiss {
@@ -130,4 +131,35 @@ inline float VectorDistance<METRIC_Jaccard>::operator()(
     return accu_num / accu_den;
 }
 
+template <>
+inline float VectorDistance<METRIC_NaNEuclidean>::operator()(
+        const float* x,
+        const float* y) const {
+    // https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.nan_euclidean_distances.html
+    float accu = 0;
+    size_t present = 0;
+    for (size_t i = 0; i < d; i++) {
+        if (!std::isnan(x[i]) && !std::isnan(y[i])) {
+            float diff = x[i] - y[i];
+            accu += diff * diff;
+            present++;
+        }
+    }
+    if (present == 0) {
+        return NAN;
+    }
+    return float(d) / float(present) * accu;
+}
+
+template <>
+inline float VectorDistance<METRIC_ABS_INNER_PRODUCT>::operator()(
+        const float* x,
+        const float* y) const {
+    float accu = 0;
+    for (size_t i = 0; i < d; i++) {
+        accu += fabs(x[i] * y[i]);
+    }
+    return accu;
+}
+
 } // namespace faiss
diff --git a/faiss/utils/extra_distances.cpp b/faiss/utils/extra_distances.cpp
index 8c0699880d..407057e58e 100644
--- a/faiss/utils/extra_distances.cpp
+++ b/faiss/utils/extra_distances.cpp
@@ -50,16 +50,18 @@ void pairwise_extra_distances_template(
     }
 }
 
-template <class VD, class C>
+template <class VD>
 void knn_extra_metrics_template(
         VD vd,
         const float* x,
         const float* y,
         size_t nx,
         size_t ny,
-        HeapArray<C>* res) {
-    size_t k = res->k;
+        size_t k,
+        float* distances,
+        int64_t* labels) {
     size_t d = vd.d;
+    using C = typename VD::C;
     size_t check_period = InterruptCallback::get_period_hint(ny * d);
     check_period *= omp_get_max_threads();
 
@@ -71,18 +73,15 @@ void knn_extra_metrics_template(
             const float* x_i = x + i * d;
             const float* y_j = y;
             size_t j;
-            float* simi = res->get_val(i);
-            int64_t* idxi = res->get_ids(i);
+            float* simi = distances + k * i;
+            int64_t* idxi = labels + k * i;
 
             // maxheap_heapify(k, simi, idxi);
             heap_heapify<C>(k, simi, idxi);
             for (j = 0; j < ny; j++) {
                 float disij = vd(x_i, y_j);
 
-                // if (disij < simi[0]) {
-                if ((!vd.is_similarity && (disij < simi[0])) ||
-                    (vd.is_similarity && (disij > simi[0]))) {
-                    // maxheap_replace_top(k, simi, idxi, disij, j);
+                if (C::cmp(simi[0], disij)) {
                     heap_replace_top<C>(k, simi, idxi, disij, j);
                 }
                 y_j += d;
@@ -164,13 +163,14 @@ void pairwise_extra_distances(
         HANDLE_VAR(JensenShannon);
         HANDLE_VAR(Lp);
         HANDLE_VAR(Jaccard);
+        HANDLE_VAR(NaNEuclidean);
+        HANDLE_VAR(ABS_INNER_PRODUCT);
 #undef HANDLE_VAR
         default:
             FAISS_THROW_MSG("metric type not implemented");
     }
 }
 
-template <class C>
 void knn_extra_metrics(
         const float* x,
         const float* y,
@@ -179,13 +179,15 @@ void knn_extra_metrics(
         size_t ny,
         MetricType mt,
         float metric_arg,
-        HeapArray<C>* res) {
+        size_t k,
+        float* distances,
+        int64_t* indexes) {
     switch (mt) {
-#define HANDLE_VAR(kw)                                            \
-    case METRIC_##kw: {                                           \
-        VectorDistance<METRIC_##kw> vd = {(size_t)d, metric_arg}; \
-        knn_extra_metrics_template(vd, x, y, nx, ny, res);        \
-        break;                                                    \
+#define HANDLE_VAR(kw)                                                       \
+    case METRIC_##kw: {                                                      \
+        VectorDistance<METRIC_##kw> vd = {(size_t)d, metric_arg};            \
+        knn_extra_metrics_template(vd, x, y, nx, ny, k, distances, indexes); \
+        break;                                                               \
     }
         HANDLE_VAR(L2);
         HANDLE_VAR(L1);
@@ -195,32 +197,14 @@ void knn_extra_metrics(
         HANDLE_VAR(JensenShannon);
         HANDLE_VAR(Lp);
         HANDLE_VAR(Jaccard);
+        HANDLE_VAR(NaNEuclidean);
+        HANDLE_VAR(ABS_INNER_PRODUCT);
 #undef HANDLE_VAR
         default:
             FAISS_THROW_MSG("metric type not implemented");
     }
 }
 
-template void knn_extra_metrics<CMax<float, int64_t>>(
-        const float* x,
-        const float* y,
-        size_t d,
-        size_t nx,
-        size_t ny,
-        MetricType mt,
-        float metric_arg,
-        HeapArray<CMax<float, int64_t>>* res);
-
-template void knn_extra_metrics<CMin<float, int64_t>>(
-        const float* x,
-        const float* y,
-        size_t d,
-        size_t nx,
-        size_t ny,
-        MetricType mt,
-        float metric_arg,
-        HeapArray<CMin<float, int64_t>>* res);
-
 FlatCodesDistanceComputer* get_extra_distance_computer(
         size_t d,
         MetricType mt,
@@ -242,6 +226,8 @@ FlatCodesDistanceComputer* get_extra_distance_computer(
         HANDLE_VAR(JensenShannon);
         HANDLE_VAR(Lp);
         HANDLE_VAR(Jaccard);
+        HANDLE_VAR(NaNEuclidean);
+        HANDLE_VAR(ABS_INNER_PRODUCT);
 #undef HANDLE_VAR
         default:
             FAISS_THROW_MSG("metric type not implemented");
diff --git a/faiss/utils/extra_distances.h b/faiss/utils/extra_distances.h
index 79b65bc1e9..f8b47cfba5 100644
--- a/faiss/utils/extra_distances.h
+++ b/faiss/utils/extra_distances.h
@@ -33,7 +33,6 @@ void pairwise_extra_distances(
         int64_t ldb = -1,
         int64_t ldd = -1);
 
-template <class C>
 void knn_extra_metrics(
         const float* x,
         const float* y,
@@ -42,7 +41,9 @@ void knn_extra_metrics(
         size_t ny,
         MetricType mt,
         float metric_arg,
-        HeapArray<C>* res);
+        size_t k,
+        float* distances,
+        int64_t* indexes);
 
 /** get a DistanceComputer that refers to this type of distance and
  *  indexes a flat array of size nb */
diff --git a/faiss/utils/fp16-arm.h b/faiss/utils/fp16-arm.h
new file mode 100644
index 0000000000..79c885b058
--- /dev/null
+++ b/faiss/utils/fp16-arm.h
@@ -0,0 +1,29 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <arm_neon.h>
+#include <cstdint>
+
+namespace faiss {
+
+inline uint16_t encode_fp16(float x) {
+    float32x4_t fx4 = vdupq_n_f32(x);
+    float16x4_t f16x4 = vcvt_f16_f32(fx4);
+    uint16x4_t ui16x4 = vreinterpret_u16_f16(f16x4);
+    return vduph_lane_u16(ui16x4, 3);
+}
+
+inline float decode_fp16(uint16_t x) {
+    uint16x4_t ui16x4 = vdup_n_u16(x);
+    float16x4_t f16x4 = vreinterpret_f16_u16(ui16x4);
+    float32x4_t fx4 = vcvt_f32_f16(f16x4);
+    return vdups_laneq_f32(fx4, 3);
+}
+
+} // namespace faiss
diff --git a/faiss/utils/fp16.h b/faiss/utils/fp16.h
index 90691d8ffe..43e05dc3d3 100644
--- a/faiss/utils/fp16.h
+++ b/faiss/utils/fp16.h
@@ -13,6 +13,8 @@
 
 #if defined(__F16C__)
 #include <faiss/utils/fp16-fp16c.h>
+#elif defined(__aarch64__)
+#include <faiss/utils/fp16-arm.h>
 #else
 #include <faiss/utils/fp16-inl.h>
 #endif
diff --git a/faiss/utils/hamming.cpp b/faiss/utils/hamming.cpp
index 14b84f7ab6..93acaaf5b4 100644
--- a/faiss/utils/hamming.cpp
+++ b/faiss/utils/hamming.cpp
@@ -35,8 +35,6 @@
 #include <faiss/utils/approx_topk_hamming/approx_topk_hamming.h>
 #include <faiss/utils/utils.h>
 
-static const size_t BLOCKSIZE_QUERY = 8192;
-
 namespace faiss {
 
 size_t hamming_batch_size = 65536;
@@ -271,10 +269,10 @@ void hammings_knn_mc(
         HCounterState<HammingComputer>& csi = cs[i];
 
         int nres = 0;
-        for (int b = 0; b < nBuckets && nres < k; b++) {
-            for (int l = 0; l < csi.counters[b] && nres < k; l++) {
-                labels[i * k + nres] = csi.ids_per_dis[b * k + l];
-                distances[i * k + nres] = b;
+        for (int b_2 = 0; b_2 < nBuckets && nres < k; b_2++) {
+            for (int l = 0; l < csi.counters[b_2] && nres < k; l++) {
+                labels[i * k + nres] = csi.ids_per_dis[b_2 * k + l];
+                distances[i * k + nres] = b_2;
                 nres++;
             }
         }
diff --git a/faiss/utils/hamming_distance/avx2-inl.h b/faiss/utils/hamming_distance/avx2-inl.h
index fdc746c019..5cdc6a2b46 100644
--- a/faiss/utils/hamming_distance/avx2-inl.h
+++ b/faiss/utils/hamming_distance/avx2-inl.h
@@ -259,8 +259,8 @@ struct HammingComputerDefault {
         set(a8, code_size);
     }
 
-    void set(const uint8_t* a8, int code_size) {
-        this->a8 = a8;
+    void set(const uint8_t* a8_2, int code_size) {
+        this->a8 = a8_2;
         quotient8 = code_size / 8;
         remainder8 = code_size % 8;
     }
diff --git a/faiss/utils/hamming_distance/generic-inl.h b/faiss/utils/hamming_distance/generic-inl.h
index 1607fb5d05..e0907a1586 100644
--- a/faiss/utils/hamming_distance/generic-inl.h
+++ b/faiss/utils/hamming_distance/generic-inl.h
@@ -275,24 +275,31 @@ struct HammingComputerDefault {
                     len -= 8;
                     accu += popcount64(a64[i] ^ b64[i]);
                     i++;
+                    [[fallthrough]];
                     case 7:
                         accu += popcount64(a64[i] ^ b64[i]);
                         i++;
+                        [[fallthrough]];
                     case 6:
                         accu += popcount64(a64[i] ^ b64[i]);
                         i++;
+                        [[fallthrough]];
                     case 5:
                         accu += popcount64(a64[i] ^ b64[i]);
                         i++;
+                        [[fallthrough]];
                     case 4:
                         accu += popcount64(a64[i] ^ b64[i]);
                         i++;
+                        [[fallthrough]];
                     case 3:
                         accu += popcount64(a64[i] ^ b64[i]);
                         i++;
+                        [[fallthrough]];
                     case 2:
                         accu += popcount64(a64[i] ^ b64[i]);
                         i++;
+                        [[fallthrough]];
                     case 1:
                         accu += popcount64(a64[i] ^ b64[i]);
                         i++;
@@ -302,20 +309,27 @@ struct HammingComputerDefault {
             const uint8_t* a = a8 + 8 * quotient8;
             const uint8_t* b = b8 + 8 * quotient8;
             switch (remainder8) {
                 case 7:
                     accu += hamdis_tab_ham_bytes[a[6] ^ b[6]];
+                    [[fallthrough]];
                 case 6:
                     accu += hamdis_tab_ham_bytes[a[5] ^ b[5]];
+                    [[fallthrough]];
                 case 5:
                     accu += hamdis_tab_ham_bytes[a[4] ^ b[4]];
+                    [[fallthrough]];
                 case 4:
                     accu += hamdis_tab_ham_bytes[a[3] ^ b[3]];
+                    [[fallthrough]];
                 case 3:
                     accu += hamdis_tab_ham_bytes[a[2] ^ b[2]];
+                    [[fallthrough]];
                 case 2:
                     accu += hamdis_tab_ham_bytes[a[1] ^ b[1]];
+                    [[fallthrough]];
                 case 1:
                     accu += hamdis_tab_ham_bytes[a[0] ^ b[0]];
+                    [[fallthrough]];
                 default:
                     break;
             }
diff --git a/faiss/utils/hamming_distance/neon-inl.h b/faiss/utils/hamming_distance/neon-inl.h
index d1a0fdee7a..d8a42f7218 100644
--- a/faiss/utils/hamming_distance/neon-inl.h
+++ b/faiss/utils/hamming_distance/neon-inl.h
@@ -260,7 +260,6 @@ struct HammingComputer32 {
     }
 
     inline int hamming(const uint8_t* b8) const {
-        const uint64_t* b = (uint64_t*)b8;
         uint8x16_t b0 = vld1q_u8(b8);
         uint8x16_t b1 = vld1q_u8(b8 + 16);
 
@@ -338,24 +337,31 @@ struct HammingComputerDefault {
                     len -= 8;
                     accu += popcount64(a64[i] ^ b64[i]);
                     i++;
+                    [[fallthrough]];
                     case 7:
                         accu += popcount64(a64[i] ^ b64[i]);
                         i++;
+                        [[fallthrough]];
                     case 6:
                         accu += popcount64(a64[i] ^ b64[i]);
                         i++;
+                        [[fallthrough]];
                     case 5:
                         accu += popcount64(a64[i] ^ b64[i]);
                         i++;
+                        [[fallthrough]];
                     case 4:
                         accu += popcount64(a64[i] ^ b64[i]);
                         i++;
+                        [[fallthrough]];
                     case 3:
                         accu += popcount64(a64[i] ^ b64[i]);
                         i++;
+                        [[fallthrough]];
                     case 2:
                         accu += popcount64(a64[i] ^ b64[i]);
                         i++;
+                        [[fallthrough]];
                     case 1:
                         accu += popcount64(a64[i] ^ b64[i]);
                         i++;
@@ -367,18 +373,25 @@ struct HammingComputerDefault {
             switch (remainder8) {
                 case 7:
                     accu += hamdis_tab_ham_bytes[a[6] ^ b[6]];
+                    [[fallthrough]];
                 case 6:
                     accu += hamdis_tab_ham_bytes[a[5] ^ b[5]];
+                    [[fallthrough]];
                 case 5:
                     accu += hamdis_tab_ham_bytes[a[4] ^ b[4]];
+                    [[fallthrough]];
                 case 4:
                     accu += hamdis_tab_ham_bytes[a[3] ^ b[3]];
+                    [[fallthrough]];
                 case 3:
                     accu += hamdis_tab_ham_bytes[a[2] ^ b[2]];
+                    [[fallthrough]];
                 case 2:
                     accu += hamdis_tab_ham_bytes[a[1] ^ b[1]];
+                    [[fallthrough]];
                 case 1:
                     accu += hamdis_tab_ham_bytes[a[0] ^ b[0]];
+                    [[fallthrough]];
                 default:
                     break;
             }
diff --git a/faiss/utils/partitioning.cpp b/faiss/utils/partitioning.cpp
index 8d4ee94fb4..4b44126cc7 100644
--- a/faiss/utils/partitioning.cpp
+++ b/faiss/utils/partitioning.cpp
@@ -206,7 +206,8 @@ typename C::T partition_fuzzy_median3(
         assert(n_eq_1 <= n_eq);
     }
 
-    int wp = compress_array<C>(vals, ids, n, thresh, n_eq_1);
+    [[maybe_unused]] const int wp =
+            compress_array<C>(vals, ids, n, thresh, n_eq_1);
 
     assert(wp == q);
     if (q_out) {
diff --git a/faiss/utils/quantize_lut.cpp b/faiss/utils/quantize_lut.cpp
index 642f601d79..ca917e582c 100644
--- a/faiss/utils/quantize_lut.cpp
+++ b/faiss/utils/quantize_lut.cpp
@@ -24,20 +24,6 @@ namespace quantize_lut {
 
 namespace {
 
-float round_uint8_and_mul(float* tab, size_t n) {
-    float max = 0;
-    for (int i = 0; i < n; i++) {
-        if (fabs(tab[i]) > max) {
-            max = fabs(tab[i]);
-        }
-    }
-    float multiplier = 127 / max;
-    for (int i = 0; i < n; i++) {
-        tab[i] = floorf(tab[i] * multiplier + 128);
-    }
-    return multiplier;
-}
-
 // there can be NaNs in tables, they should be ignored
 float tab_min(const float* tab, size_t n) {
     float min = HUGE_VAL;
diff --git a/faiss/utils/simdlib.h b/faiss/utils/simdlib.h
index 27e9cc59f5..ea5020d719 100644
--- a/faiss/utils/simdlib.h
+++ b/faiss/utils/simdlib.h
@@ -14,7 +14,12 @@
  * functions.
  */
 
-#ifdef __AVX2__
+#if defined(__AVX512F__)
+
+#include <faiss/utils/simdlib_avx2.h>
+#include <faiss/utils/simdlib_avx512.h>
+
+#elif defined(__AVX2__)
 
 #include <faiss/utils/simdlib_avx2.h>
 
@@ -22,6 +27,10 @@
 
 #include <faiss/utils/simdlib_neon.h>
 
+#elif defined(__PPC64__)
+
+#include <faiss/utils/simdlib_ppc64.h>
+
 #else
 
 // emulated = all operations are implemented as scalars
diff --git a/faiss/utils/simdlib_avx512.h b/faiss/utils/simdlib_avx512.h
new file mode 100644
index 0000000000..9ce0965895
--- /dev/null
+++ b/faiss/utils/simdlib_avx512.h
@@ -0,0 +1,296 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+
+#include <immintrin.h>
+
+#include <faiss/impl/platform_macros.h>
+
+#include <faiss/utils/simdlib_avx2.h>
+
+namespace faiss {
+
+/** Simple wrapper around the AVX 512-bit registers
+ *
+ * The objective is to separate the different interpretations of the same
+ * registers (as a vector of uint8, uint16 or uint32), to provide printing
+ * functions, and to give more readable names to the AVX intrinsics. It does not
+ * pretend to be exhaustive; functions are added as needed.
+ */
+
+/// 512-bit representation without interpretation as a vector
+struct simd512bit {
+    union {
+        __m512i i;
+        __m512 f;
+    };
+
+    simd512bit() {}
+
+    explicit simd512bit(__m512i i) : i(i) {}
+
+    explicit simd512bit(__m512 f) : f(f) {}
+
+    explicit simd512bit(const void* x)
+            : i(_mm512_loadu_si512((__m512i const*)x)) {}
+
+    // sets up a lower half of the register while keeping upper one as zero
+    explicit simd512bit(simd256bit lo)
+            : simd512bit(_mm512_inserti32x8(
+                      _mm512_castsi256_si512(lo.i),
+                      _mm256_setzero_si256(),
+                      1)) {}
+
+    // constructs from lower and upper halves
+    explicit simd512bit(simd256bit lo, simd256bit hi)
+            : simd512bit(_mm512_inserti32x8(
+                      _mm512_castsi256_si512(lo.i),
+                      hi.i,
+                      1)) {}
+
+    void clear() {
+        i = _mm512_setzero_si512();
+    }
+
+    void storeu(void* ptr) const {
+        _mm512_storeu_si512((__m512i*)ptr, i);
+    }
+
+    void loadu(const void* ptr) {
+        i = _mm512_loadu_si512((__m512i*)ptr);
+    }
+
+    void store(void* ptr) const {
+        _mm512_storeu_si512((__m512i*)ptr, i);
+    }
+
+    void bin(char bits[513]) const {
+        char bytes[64];
+        storeu((void*)bytes);
+        for (int i = 0; i < 512; i++) {
+            bits[i] = '0' + ((bytes[i / 8] >> (i % 8)) & 1);
+        }
+        bits[512] = 0;
+    }
+
+    std::string bin() const {
+        char bits[513]; // 512 binary digits + terminating NUL
+        bin(bits);
+        return std::string(bits);
+    }
+};
+
+/// vector of 32 elements in uint16
+struct simd32uint16 : simd512bit {
+    simd32uint16() {}
+
+    explicit simd32uint16(__m512i i) : simd512bit(i) {}
+
+    explicit simd32uint16(int x) : simd512bit(_mm512_set1_epi16(x)) {}
+
+    explicit simd32uint16(uint16_t x) : simd512bit(_mm512_set1_epi16(x)) {}
+
+    explicit simd32uint16(simd512bit x) : simd512bit(x) {}
+
+    explicit simd32uint16(const uint16_t* x) : simd512bit((const void*)x) {}
+
+    // sets up a lower half of the register
+    explicit simd32uint16(simd256bit lo) : simd512bit(lo) {}
+
+    // constructs from lower and upper halves
+    explicit simd32uint16(simd256bit lo, simd256bit hi) : simd512bit(lo, hi) {}
+
+    std::string elements_to_string(const char* fmt) const {
+        uint16_t bytes[32];
+        storeu((void*)bytes);
+        char res[2000];
+        char* ptr = res;
+        for (int i = 0; i < 32; i++) {
+            ptr += sprintf(ptr, fmt, bytes[i]);
+        }
+        // strip last ,
+        ptr[-1] = 0;
+        return std::string(res);
+    }
+
+    std::string hex() const {
+        return elements_to_string("%02x,");
+    }
+
+    std::string dec() const {
+        return elements_to_string("%3d,");
+    }
+
+    void set1(uint16_t x) {
+        i = _mm512_set1_epi16((short)x);
+    }
+
+    simd32uint16 operator*(const simd32uint16& other) const {
+        return simd32uint16(_mm512_mullo_epi16(i, other.i));
+    }
+
+    // shift must be known at compile time
+    simd32uint16 operator>>(const int shift) const {
+        return simd32uint16(_mm512_srli_epi16(i, shift));
+    }
+
+    // shift must be known at compile time
+    simd32uint16 operator<<(const int shift) const {
+        return simd32uint16(_mm512_slli_epi16(i, shift));
+    }
+
+    simd32uint16 operator+=(simd32uint16 other) {
+        i = _mm512_add_epi16(i, other.i);
+        return *this;
+    }
+
+    simd32uint16 operator-=(simd32uint16 other) {
+        i = _mm512_sub_epi16(i, other.i);
+        return *this;
+    }
+
+    simd32uint16 operator+(simd32uint16 other) const {
+        return simd32uint16(_mm512_add_epi16(i, other.i));
+    }
+
+    simd32uint16 operator-(simd32uint16 other) const {
+        return simd32uint16(_mm512_sub_epi16(i, other.i));
+    }
+
+    simd32uint16 operator&(simd512bit other) const {
+        return simd32uint16(_mm512_and_si512(i, other.i));
+    }
+
+    simd32uint16 operator|(simd512bit other) const {
+        return simd32uint16(_mm512_or_si512(i, other.i));
+    }
+
+    simd32uint16 operator^(simd512bit other) const {
+        return simd32uint16(_mm512_xor_si512(i, other.i));
+    }
+
+    simd32uint16 operator~() const {
+        return simd32uint16(_mm512_xor_si512(i, _mm512_set1_epi32(-1)));
+    }
+
+    simd16uint16 low() const {
+        return simd16uint16(_mm512_castsi512_si256(i));
+    }
+
+    simd16uint16 high() const {
+        return simd16uint16(_mm512_extracti32x8_epi32(i, 1));
+    }
+
+    // for debugging only
+    uint16_t operator[](int i) const {
+        ALIGNED(64) uint16_t tab[32];
+        store(tab);
+        return tab[i];
+    }
+
+    void accu_min(simd32uint16 incoming) {
+        i = _mm512_min_epu16(i, incoming.i);
+    }
+
+    void accu_max(simd32uint16 incoming) {
+        i = _mm512_max_epu16(i, incoming.i);
+    }
+};
+
+// decompose in 128-lanes: a = (a0, a1, a2, a3), b = (b0, b1, b2, b3)
+// return (a0 + a1 + a2 + a3, b0 + b1 + b2 + b3)
+inline simd16uint16 combine4x2(simd32uint16 a, simd32uint16 b) {
+    return combine2x2(a.low(), b.low()) + combine2x2(a.high(), b.high());
+}
+
+// vector of 64 unsigned 8-bit integers
+struct simd64uint8 : simd512bit {
+    simd64uint8() {}
+
+    explicit simd64uint8(__m512i i) : simd512bit(i) {}
+
+    explicit simd64uint8(int x) : simd512bit(_mm512_set1_epi8(x)) {}
+
+    explicit simd64uint8(uint8_t x) : simd512bit(_mm512_set1_epi8(x)) {}
+
+    // sets up a lower half of the register
+    explicit simd64uint8(simd256bit lo) : simd512bit(lo) {}
+
+    // constructs from lower and upper halves
+    explicit simd64uint8(simd256bit lo, simd256bit hi) : simd512bit(lo, hi) {}
+
+    explicit simd64uint8(simd512bit x) : simd512bit(x) {}
+
+    explicit simd64uint8(const uint8_t* x) : simd512bit((const void*)x) {}
+
+    std::string elements_to_string(const char* fmt) const {
+        uint8_t bytes[64];
+        storeu((void*)bytes);
+        char res[2000];
+        char* ptr = res;
+        for (int i = 0; i < 64; i++) {
+            ptr += sprintf(ptr, fmt, bytes[i]);
+        }
+        // strip last ,
+        ptr[-1] = 0;
+        return std::string(res);
+    }
+
+    std::string hex() const {
+        return elements_to_string("%02x,");
+    }
+
+    std::string dec() const {
+        return elements_to_string("%3d,");
+    }
+
+    void set1(uint8_t x) {
+        i = _mm512_set1_epi8((char)x);
+    }
+
+    simd64uint8 operator&(simd512bit other) const {
+        return simd64uint8(_mm512_and_si512(i, other.i));
+    }
+
+    simd64uint8 operator+(simd64uint8 other) const {
+        return simd64uint8(_mm512_add_epi8(i, other.i));
+    }
+
+    simd64uint8 lookup_4_lanes(simd64uint8 idx) const {
+        return simd64uint8(_mm512_shuffle_epi8(i, idx.i));
+    }
+
+    // extract + 0-extend lane
+    // this operation is slow (3 cycles)
+    simd32uint16 lane0_as_uint16() const {
+        __m256i x = _mm512_extracti32x8_epi32(i, 0);
+        return simd32uint16(_mm512_cvtepu8_epi16(x));
+    }
+
+    simd32uint16 lane1_as_uint16() const {
+        __m256i x = _mm512_extracti32x8_epi32(i, 1);
+        return simd32uint16(_mm512_cvtepu8_epi16(x));
+    }
+
+    simd64uint8 operator+=(simd64uint8 other) {
+        i = _mm512_add_epi8(i, other.i);
+        return *this;
+    }
+
+    // for debugging only
+    uint8_t operator[](int i) const {
+        ALIGNED(64) uint8_t tab[64];
+        store(tab);
+        return tab[i];
+    }
+};
+
+} // namespace faiss
diff --git a/faiss/utils/simdlib_neon.h b/faiss/utils/simdlib_neon.h
index 656a561217..1bdf0ed01e 100644
--- a/faiss/utils/simdlib_neon.h
+++ b/faiss/utils/simdlib_neon.h
@@ -168,9 +168,12 @@ static inline std::string elements_to_string(const char* fmt, const S& simd) {
     simd.store(bytes);
     char res[1000], *ptr = res;
     for (size_t i = 0; i < N; ++i) {
-        ptr += sprintf(ptr, fmt, bytes[i]);
+        int bytesWritten =
+                snprintf(ptr, sizeof(res) - (ptr - res), fmt, bytes[i]);
+        ptr += bytesWritten;
     }
-    // strip last ,
+    // The format usually contains a ',' separator so this is to remove the last
+    // separator.
     ptr[-1] = 0;
     return std::string(res);
 }
diff --git a/faiss/utils/simdlib_ppc64.h b/faiss/utils/simdlib_ppc64.h
new file mode 100644
index 0000000000..94b3e42dc7
--- /dev/null
+++ b/faiss/utils/simdlib_ppc64.h
@@ -0,0 +1,1084 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <string>
+
+namespace faiss {
+
+struct simd256bit {
+    // 32 bytes of storage, viewable as bytes, 16/32-bit words or floats.
+    union {
+        uint8_t u8[32];
+        uint16_t u16[16];
+        uint32_t u32[8];
+        float f32[8];
+    };
+
+    // Leaves the contents uninitialized.
+    simd256bit() {}
+
+    // Initializes from the 32 bytes starting at x.
+    explicit simd256bit(const void* x) {
+        loadu(x);
+    }
+
+    void clear() {
+        memset(u8, 0, sizeof(u8));
+    }
+
+    void storeu(void* ptr) const {
+        memcpy(ptr, u8, sizeof(u8));
+    }
+
+    void loadu(const void* ptr) {
+        memcpy(u8, ptr, sizeof(u8));
+    }
+
+    // Same as storeu(): both copy 32 bytes with memcpy.
+    void store(void* ptr) const {
+        memcpy(ptr, u8, sizeof(u8));
+    }
+
+    // Writes the 256 bits as '0'/'1' characters, byte 0 first and least
+    // significant bit first within each byte, then a terminating 0.
+    void bin(char bits[257]) const {
+        for (int b = 0; b < 256; b++) {
+            bits[b] = '0' + ((u8[b / 8] >> (b % 8)) & 1);
+        }
+        bits[256] = 0;
+    }
+
+    std::string bin() const {
+        char bits[257];
+        bin(bits);
+        return std::string(bits);
+    }
+
+    // Checks whether the other holds exactly the same bytes (all eight
+    // 32-bit words are equal exactly when all 32 bytes are equal).
+    bool is_same_as(simd256bit other) const {
+        return memcmp(u8, other.u8, sizeof(u8)) == 0;
+    }
+};
+
+/// vector of 16 elements in uint16
+struct simd16uint16 : simd256bit {
+    simd16uint16() {}
+
+    explicit simd16uint16(int x) {
+        set1(x);
+    }
+
+    explicit simd16uint16(uint16_t x) {
+        set1(x);
+    }
+
+    explicit simd16uint16(const simd256bit& x) : simd256bit(x) {}
+
+    explicit simd16uint16(const uint16_t* x) : simd256bit((const void*)x) {}
+
+    // Element-wise constructor: argument uN becomes element N.
+    explicit simd16uint16(
+            uint16_t u0,
+            uint16_t u1,
+            uint16_t u2,
+            uint16_t u3,
+            uint16_t u4,
+            uint16_t u5,
+            uint16_t u6,
+            uint16_t u7,
+            uint16_t u8,
+            uint16_t u9,
+            uint16_t u10,
+            uint16_t u11,
+            uint16_t u12,
+            uint16_t u13,
+            uint16_t u14,
+            uint16_t u15) {
+        // NB: inside this body the parameter u8 hides the inherited byte
+        // array of the same name; only the u16 view is written.
+        const uint16_t lanes[16] = {
+                u0, u1, u2, u3, u4, u5, u6, u7,
+                u8, u9, u10, u11, u12, u13, u14, u15};
+        memcpy(this->u16, lanes, sizeof(lanes));
+    }
+
+    // Formats every element with the printf-style fmt (one integer conversion
+    // per element) and concatenates the results; the last character, usually
+    // a ',' separator, is removed. Output is truncated to fit the buffer.
+    std::string elements_to_string(const char* fmt) const {
+        char res[1000];
+        size_t used = 0;
+        for (int j = 0; j < 16 && used + 1 < sizeof(res); j++) {
+            int n = snprintf(res + used, sizeof(res) - used, fmt, u16[j]);
+            if (n < 0) {
+                break;
+            }
+            // snprintf returns the untruncated length: clamp to what fits
+            used = std::min(used + (size_t)n, sizeof(res) - 1);
+        }
+        // strip the last separator (res[0] = 0 also covers empty output)
+        res[used > 0 ? used - 1 : 0] = 0;
+        return std::string(res);
+    }
+
+    std::string hex() const {
+        return elements_to_string("%02x,");
+    }
+
+    std::string dec() const {
+        return elements_to_string("%3d,");
+    }
+
+    template <typename F>
+    static simd16uint16 unary_func(const simd16uint16& a, F&& f) {
+        simd16uint16 c;
+        for (int j = 0; j < 16; j++) {
+            c.u16[j] = f(a.u16[j]);
+        }
+        return c;
+    }
+
+    template <typename F>
+    static simd16uint16 binary_func(
+            const simd16uint16& a,
+            const simd16uint16& b,
+            F&& f) {
+        simd16uint16 c;
+        for (int j = 0; j < 16; j++) {
+            c.u16[j] = f(a.u16[j], b.u16[j]);
+        }
+        return c;
+    }
+
+    void set1(uint16_t x) {
+        for (int i = 0; i < 16; i++) {
+            u16[i] = x;
+        }
+    }
+
+    simd16uint16 operator*(const simd16uint16& other) const {
+        return binary_func(
+                *this, other, [](uint16_t a, uint16_t b) { return a * b; });
+    }
+
+    // shift must be known at compile time
+    simd16uint16 operator>>(const int shift) const {
+        return unary_func(*this, [shift](uint16_t a) { return a >> shift; });
+    }
+
+    // shift must be known at compile time
+    simd16uint16 operator<<(const int shift) const {
+        return unary_func(*this, [shift](uint16_t a) { return a << shift; });
+    }
+
+    simd16uint16 operator+=(const simd16uint16& other) {
+        *this = *this + other;
+        return *this;
+    }
+
+    simd16uint16 operator-=(const simd16uint16& other) {
+        *this = *this - other;
+        return *this;
+    }
+
+    simd16uint16 operator+(const simd16uint16& other) const {
+        return binary_func(
+                *this, other, [](uint16_t a, uint16_t b) { return a + b; });
+    }
+
+    simd16uint16 operator-(const simd16uint16& other) const {
+        return binary_func(
+                *this, other, [](uint16_t a, uint16_t b) { return a - b; });
+    }
+
+    simd16uint16 operator&(const simd256bit& other) const {
+        return binary_func(
+                *this, simd16uint16(other), [](uint16_t a, uint16_t b) {
+                    return a & b;
+                });
+    }
+
+    simd16uint16 operator|(const simd256bit& other) const {
+        return binary_func(
+                *this, simd16uint16(other), [](uint16_t a, uint16_t b) {
+                    return a | b;
+                });
+    }
+
+    simd16uint16 operator^(const simd256bit& other) const {
+        return binary_func(
+                *this, simd16uint16(other), [](uint16_t a, uint16_t b) {
+                    return a ^ b;
+                });
+    }
+
+    // returns binary masks: 0xffff where the elements are equal, 0 elsewhere
+    simd16uint16 operator==(const simd16uint16& other) const {
+        return binary_func(*this, other, [](uint16_t a, uint16_t b) {
+            return a == b ? 0xffff : 0;
+        });
+    }
+
+    simd16uint16 operator~() const {
+        return unary_func(*this, [](uint16_t a) { return ~a; });
+    }
+
+    // get scalar at index 0
+    uint16_t get_scalar_0() const {
+        return u16[0];
+    }
+
+    // mask of elements where this >= thresh
+    // 2 bit per component: 16 * 2 = 32 bit (unsigned shift: 3 << 30 overflows)
+    uint32_t ge_mask(const simd16uint16& thresh) const {
+        uint32_t gem = 0;
+        for (int j = 0; j < 16; j++) {
+            if (u16[j] >= thresh.u16[j]) {
+                gem |= 3u << (j * 2);
+            }
+        }
+        return gem;
+    }
+
+    uint32_t le_mask(const simd16uint16& thresh) const {
+        return thresh.ge_mask(*this);
+    }
+
+    uint32_t gt_mask(const simd16uint16& thresh) const {
+        return ~le_mask(thresh);
+    }
+
+    bool all_gt(const simd16uint16& thresh) const {
+        return le_mask(thresh) == 0;
+    }
+
+    // for debugging only
+    uint16_t operator[](int i) const {
+        return u16[i];
+    }
+
+    // element-wise running minimum: each element becomes the smaller of its
+    // current value and the corresponding element of incoming
+    void accu_min(const simd16uint16& incoming) {
+        for (int j = 0; j < 16; j++) {
+            u16[j] = std::min(u16[j], incoming.u16[j]);
+        }
+    }
+
+    // element-wise running maximum: each element becomes the larger of its
+    // current value and the corresponding element of incoming
+    void accu_max(const simd16uint16& incoming) {
+        for (int j = 0; j < 16; j++) {
+            u16[j] = std::max(u16[j], incoming.u16[j]);
+        }
+    }
+};
+
+// Element-wise minimum of two vectors (unlike std::min, which picks one).
+inline simd16uint16 min(const simd16uint16& av, const simd16uint16& bv) {
+    auto smaller = [](uint16_t x, uint16_t y) { return y < x ? y : x; };
+    return simd16uint16::binary_func(av, bv, smaller);
+}
+
+inline simd16uint16 max(const simd16uint16& av, const simd16uint16& bv) {
+    auto larger = [](uint16_t x, uint16_t y) { return x < y ? y : x; };
+    return simd16uint16::binary_func(av, bv, larger);
+}
+
+// Split each input into two 128-bit halves: a = (a0, a1), b = (b0, b1).
+// Returns (a0 + a1, b0 + b1), the sums being element-wise and modulo 2^16.
+// TODO find a better name
+inline simd16uint16 combine2x2(const simd16uint16& a, const simd16uint16& b) {
+    simd16uint16 sums;
+    for (int lo = 0, hi = 8; lo < 8; lo++, hi++) {
+        sums.u16[lo] = a.u16[lo] + a.u16[hi];
+        sums.u16[hi] = b.u16[lo] + b.u16[hi];
+    }
+    return sums;
+}
+
+// compare d0 and d1 to thr, return 32 bits corresponding to the concatenation
+// of the comparison results: bit j is set when d0[j] >= thr[j] and bit
+// 16 + j is set when d1[j] >= thr[j]
+inline uint32_t cmp_ge32(
+        const simd16uint16& d0,
+        const simd16uint16& d1,
+        const simd16uint16& thr) {
+    uint32_t gem = 0;
+    for (int j = 0; j < 16; j++) {
+        // shift unsigned values: 1 << 31 is not representable in an int
+        const uint32_t ge0 = d0.u16[j] >= thr.u16[j];
+        const uint32_t ge1 = d1.u16[j] >= thr.u16[j];
+        gem |= ge0 << j;
+        gem |= ge1 << (j + 16);
+    }
+    return gem;
+}
+
+// bit j of the result is set when d0[j] <= thr[j], bit 16 + j when
+// d1[j] <= thr[j]; shifts use unsigned values (1 << 31 overflows an int)
+inline uint32_t cmp_le32(
+        const simd16uint16& d0,
+        const simd16uint16& d1,
+        const simd16uint16& thr) {
+    uint32_t gem = 0;
+    for (int j = 0; j < 16; j++) {
+        const uint32_t le0 = d0.u16[j] <= thr.u16[j];
+        const uint32_t le1 = d1.u16[j] <= thr.u16[j];
+        gem |= le0 << j;
+        gem |= le1 << (j + 16);
+    }
+    return gem;
+}
+
+// hadd does not cross lanes: with the 128-bit halves a = (a0, a1) and
+// b = (b0, b1), and pairs(x) denoting the 4 sums of adjacent elements of
+// the 8-element half x, the result is
+//   (pairs(a0), pairs(b0), pairs(a1), pairs(b1))
+// where every sum is taken modulo 2^16
+inline simd16uint16 hadd(const simd16uint16& a, const simd16uint16& b) {
+    // sum of the two adjacent elements of v starting at index k
+    auto pair_sum = [](const simd16uint16& v, int k) -> uint16_t {
+        return v.u16[k] + v.u16[k + 1];
+    };
+
+    simd16uint16 c;
+    for (int half = 0; half < 16; half += 8) {
+        for (int k = 0; k < 4; k++) {
+            // elements 0..3 of this half are pair sums of a,
+            // elements 4..7 are pair sums of b
+            c.u16[half + k] = pair_sum(a, half + 2 * k);
+            c.u16[half + 4 + k] = pair_sum(b, half + 2 * k);
+        }
+    }
+
+    return c;
+}
+
+// Vectorized version of the following code:
+//   for (size_t i = 0; i < n; i++) {
+//      bool flag = (candidateValues[i] < currentValues[i]);
+//      minValues[i] = flag ? candidateValues[i] : currentValues[i];
+//      minIndices[i] = flag ? candidateIndices[i] : currentIndices[i];
+//      maxValues[i] = !flag ? candidateValues[i] : currentValues[i];
+//      maxIndices[i] = !flag ? candidateIndices[i] : currentIndices[i];
+//   }
+// Max indices are inaccurate for equal values (the index of the last equal
+// value is saved instead of the first one), but this saves instructions.
+inline void cmplt_min_max_fast(
+        const simd16uint16 candidateValues,
+        const simd16uint16 candidateIndices,
+        const simd16uint16 currentValues,
+        const simd16uint16 currentIndices,
+        simd16uint16& minValues,
+        simd16uint16& minIndices,
+        simd16uint16& maxValues,
+        simd16uint16& maxIndices) {
+    for (size_t k = 0; k < 16; k++) {
+        const bool lt = candidateValues.u16[k] < currentValues.u16[k];
+        const simd16uint16& loV = lt ? candidateValues : currentValues;
+        const simd16uint16& loI = lt ? candidateIndices : currentIndices;
+        const simd16uint16& hiV = lt ? currentValues : candidateValues;
+        const simd16uint16& hiI = lt ? currentIndices : candidateIndices;
+        minValues.u16[k] = loV.u16[k];
+        minIndices.u16[k] = loI.u16[k];
+        maxValues.u16[k] = hiV.u16[k];
+        maxIndices.u16[k] = hiI.u16[k];
+    }
+}
+
+// vector of 32 unsigned 8-bit integers
+struct simd32uint8 : simd256bit {
+    // leaves the elements uninitialized
+    simd32uint8() {}
+
+    // broadcasts x, converted to uint8_t, to all 32 elements
+    explicit simd32uint8(int x) {
+        set1(x);
+    }
+
+    // broadcasts x to all 32 elements
+    explicit simd32uint8(uint8_t x) {
+        set1(x);
+    }
+
+    // builds a vector from 32 compile-time constants: template argument _N
+    // becomes element N
+    template <
+            uint8_t _0,
+            uint8_t _1,
+            uint8_t _2,
+            uint8_t _3,
+            uint8_t _4,
+            uint8_t _5,
+            uint8_t _6,
+            uint8_t _7,
+            uint8_t _8,
+            uint8_t _9,
+            uint8_t _10,
+            uint8_t _11,
+            uint8_t _12,
+            uint8_t _13,
+            uint8_t _14,
+            uint8_t _15,
+            uint8_t _16,
+            uint8_t _17,
+            uint8_t _18,
+            uint8_t _19,
+            uint8_t _20,
+            uint8_t _21,
+            uint8_t _22,
+            uint8_t _23,
+            uint8_t _24,
+            uint8_t _25,
+            uint8_t _26,
+            uint8_t _27,
+            uint8_t _28,
+            uint8_t _29,
+            uint8_t _30,
+            uint8_t _31>
+    static simd32uint8 create() {
+        const uint8_t bytes[32] = {
+                _0,  _1,  _2,  _3,  _4,  _5,  _6,  _7,
+                _8,  _9,  _10, _11, _12, _13, _14, _15,
+                _16, _17, _18, _19, _20, _21, _22, _23,
+                _24, _25, _26, _27, _28, _29, _30, _31};
+        return simd32uint8(bytes);
+    }
+
+    // reinterprets the 32 bytes of any 256-bit vector
+    explicit simd32uint8(const simd256bit& x) : simd256bit(x) {}
+
+    // loads 32 elements from x
+    explicit simd32uint8(const uint8_t* x) : simd256bit((const void*)x) {}
+
+    // Formats every element with the printf-style fmt (one integer conversion
+    // per element) and concatenates the results; the last character, usually
+    // a ',' separator, is removed. Output is truncated to fit the buffer.
+    std::string elements_to_string(const char* fmt) const {
+        char res[1000];
+        size_t used = 0;
+        for (int j = 0; j < 32 && used + 1 < sizeof(res); j++) {
+            int n = snprintf(res + used, sizeof(res) - used, fmt, u8[j]);
+            if (n < 0) {
+                break;
+            }
+            // snprintf returns the untruncated length: clamp to what fits
+            used = std::min(used + (size_t)n, sizeof(res) - 1);
+        }
+        // strip the last separator (res[0] = 0 also covers empty output)
+        res[used > 0 ? used - 1 : 0] = 0;
+        return std::string(res);
+    }
+
+    // elements as 2-digit hexadecimal numbers separated by ','
+    std::string hex() const {
+        return elements_to_string("%02x,");
+    }
+
+    // elements as decimals padded to 3 characters, separated by ','
+    std::string dec() const {
+        return elements_to_string("%3d,");
+    }
+
+    // sets all 32 elements to x
+    void set1(uint8_t x) {
+        for (int j = 0; j < 32; j++) {
+            u8[j] = x;
+        }
+    }
+
+    // returns the vector whose element j is f(a[j], b[j])
+    template <typename F>
+    static simd32uint8 binary_func(
+            const simd32uint8& a,
+            const simd32uint8& b,
+            F&& f) {
+        simd32uint8 c;
+        for (int j = 0; j < 32; j++) {
+            c.u8[j] = f(a.u8[j], b.u8[j]);
+        }
+        return c;
+    }
+
+    // bitwise AND with the bytes of any 256-bit vector
+    simd32uint8 operator&(const simd256bit& other) const {
+        return binary_func(*this, simd32uint8(other), [](uint8_t a, uint8_t b) {
+            return a & b;
+        });
+    }
+
+    // element-wise addition, modulo 256
+    simd32uint8 operator+(const simd32uint8& other) const {
+        return binary_func(
+                *this, other, [](uint8_t a, uint8_t b) { return a + b; });
+    }
+
+    // The very important operation that everything relies on.
+    // For each byte j of idx, the result byte is 0 when bit 0x80 of idx[j] is
+    // set, and otherwise the table entry selected by the low 4 bits of idx[j]:
+    // from bytes 0..15 of this vector if j < 16, from bytes 16..31 otherwise.
+    simd32uint8 lookup_2_lanes(const simd32uint8& idx) const {
+        simd32uint8 c;
+        // The original for loop:
+        // for (int j = 0; j < 32; j++) {
+        //     if (idx.u8[j] & 0x80) {
+        //         c.u8[j] = 0;
+        //     } else {
+        //         uint8_t i = idx.u8[j] & 15;
+        //         if (j < 16) {
+        //             c.u8[j] = u8[i];
+        //         } else {
+        //             c.u8[j] = u8[16 + i];
+        //         }
+        //     }
+        // }
+
+        // The following function was re-written for Power 10
+        // The loop was unrolled to remove the if (j < 16) statement by doing
+        // the j and j + 16 iterations in parallel.  The additional unrolling
+        // for j + 1 and j + 17, reduces the execution time on Power 10 by
+        // about 50% as the instruction scheduling allows on average 2X more
+        // instructions to be issued per cycle.
+
+        for (int j = 0; j < 16; j = j + 2) {
+            // j < 16, unrolled to depth of 2
+            if (idx.u8[j] & 0x80) {
+                c.u8[j] = 0;
+            } else {
+                uint8_t i = idx.u8[j] & 15;
+                c.u8[j] = u8[i];
+            }
+
+            if (idx.u8[j + 1] & 0x80) {
+                c.u8[j + 1] = 0;
+            } else {
+                uint8_t i = idx.u8[j + 1] & 15;
+                c.u8[j + 1] = u8[i];
+            }
+
+            // j >= 16, unrolled to depth of 2
+            if (idx.u8[j + 16] & 0x80) {
+                c.u8[j + 16] = 0;
+            } else {
+                uint8_t i = idx.u8[j + 16] & 15;
+                c.u8[j + 16] = u8[i + 16];
+            }
+
+            if (idx.u8[j + 17] & 0x80) {
+                c.u8[j + 17] = 0;
+            } else {
+                uint8_t i = idx.u8[j + 17] & 15;
+                c.u8[j + 17] = u8[i + 16];
+            }
+        }
+        return c;
+    }
+
+    // NOTE(review): a comment here announced an "extract + 0-extend lane"
+    // operation (slow, 3 cycles), but this struct defines none - confirm
+
+    // in-place element-wise addition, modulo 256; returns a copy of *this
+    simd32uint8 operator+=(const simd32uint8& other) {
+        *this = *this + other;
+        return *this;
+    }
+
+    // for debugging only
+    uint8_t operator[](int i) const {
+        return u8[i];
+    }
+};
+
+// convert with saturation: values above 255 become 255
+// careful: this does not cross lanes, so the order is weird: the result is
+// (a[0..7], b[0..7], a[8..15], b[8..15])
+inline simd32uint8 uint16_to_uint8_saturate(
+        const simd16uint16& a,
+        const simd16uint16& b) {
+    simd32uint8 c;
+    for (int half = 0; half < 2; half++) {
+        for (int k = 0; k < 8; k++) {
+            const uint16_t va = a.u16[8 * half + k];
+            const uint16_t vb = b.u16[8 * half + k];
+            c.u8[16 * half + k] = std::min<uint16_t>(va, 0xff);
+            c.u8[16 * half + 8 + k] = std::min<uint16_t>(vb, 0xff);
+        }
+    }
+    return c;
+}
+
+/// get most significant bit of each byte (bit i of the result is for byte i)
+inline uint32_t get_MSBs(const simd32uint8& a) {
+    uint32_t res = 0;
+    for (int i = 0; i < 32; i++) {
+        // move bit 7 of the byte to bit i; the shift is done on an unsigned
+        // value because 1 << 31 is not representable in an int
+        res |= uint32_t(a.u8[i] >> 7) << i;
+    }
+    return res;
+}
+
+/// use MSB of each byte of mask to select a byte between a and b:
+/// the result takes the byte of b where the MSB of the mask byte is set
+/// and the byte of a elsewhere
+inline simd32uint8 blendv(
+        const simd32uint8& a,
+        const simd32uint8& b,
+        const simd32uint8& mask) {
+    simd32uint8 picked;
+    for (int k = 0; k < 32; k++) {
+        const bool take_b = (mask.u8[k] & 0x80) != 0;
+        picked.u8[k] = take_b ? b.u8[k] : a.u8[k];
+    }
+
+    return picked;
+}
+
+/// vector of 8 unsigned 32-bit integers
+struct simd8uint32 : simd256bit {
+    simd8uint32() {}
+
+    explicit simd8uint32(uint32_t x) {
+        set1(x);
+    }
+
+    explicit simd8uint32(const simd256bit& x) : simd256bit(x) {}
+
+    explicit simd8uint32(const uint32_t* x) : simd256bit((const void*)x) {}
+
+    // element-wise constructor: argument uN becomes element N
+    explicit simd8uint32(
+            uint32_t u0,
+            uint32_t u1,
+            uint32_t u2,
+            uint32_t u3,
+            uint32_t u4,
+            uint32_t u5,
+            uint32_t u6,
+            uint32_t u7) {
+        const uint32_t lanes[8] = {u0, u1, u2, u3, u4, u5, u6, u7};
+        memcpy(u32, lanes, sizeof(lanes));
+    }
+
+    simd8uint32 operator+(simd8uint32 other) const {
+        simd8uint32 result;
+        for (int i = 0; i < 8; i++) {
+            result.u32[i] = u32[i] + other.u32[i];
+        }
+        return result;
+    }
+
+    simd8uint32 operator-(simd8uint32 other) const {
+        simd8uint32 result;
+        for (int i = 0; i < 8; i++) {
+            result.u32[i] = u32[i] - other.u32[i];
+        }
+        return result;
+    }
+
+    simd8uint32& operator+=(const simd8uint32& other) {
+        for (int i = 0; i < 8; i++) {
+            u32[i] += other.u32[i];
+        }
+        return *this;
+    }
+
+    // true when all 8 elements are equal (same test as is_same_as)
+    bool operator==(simd8uint32 other) const {
+        return is_same_as(other);
+    }
+
+    bool operator!=(simd8uint32 other) const {
+        return !(*this == other);
+    }
+
+    // Formats every element with the printf-style fmt (one integer conversion
+    // per element) and concatenates the results; the last character, usually
+    // a ',' separator, is removed. Output is truncated to fit the buffer.
+    std::string elements_to_string(const char* fmt) const {
+        char res[1000];
+        size_t used = 0;
+        for (int j = 0; j < 8 && used + 1 < sizeof(res); j++) {
+            int n = snprintf(res + used, sizeof(res) - used, fmt, u32[j]);
+            if (n < 0) {
+                break;
+            }
+            // snprintf returns the untruncated length: clamp to what fits
+            used = std::min(used + (size_t)n, sizeof(res) - 1);
+        }
+        // strip the last separator (res[0] = 0 also covers empty output)
+        res[used > 0 ? used - 1 : 0] = 0;
+        return std::string(res);
+    }
+
+    std::string hex() const {
+        return elements_to_string("%08x,");
+    }
+
+    std::string dec() const {
+        return elements_to_string("%10d,");
+    }
+
+    void set1(uint32_t x) {
+        for (int i = 0; i < 8; i++) {
+            u32[i] = x;
+        }
+    }
+
+    // even-indexed elements first (0, 2, 4, 6), then the odd-indexed ones
+    simd8uint32 unzip() const {
+        const uint32_t ret[] = {
+                u32[0], u32[2], u32[4], u32[6], u32[1], u32[3], u32[5], u32[7]};
+        return simd8uint32{ret};
+    }
+};
+
+// Vectorized version of the following code:
+//   for (size_t i = 0; i < n; i++) {
+//      bool flag = (candidateValues[i] < currentValues[i]);
+//      minValues[i] = flag ? candidateValues[i] : currentValues[i];
+//      minIndices[i] = flag ? candidateIndices[i] : currentIndices[i];
+//      maxValues[i] = !flag ? candidateValues[i] : currentValues[i];
+//      maxIndices[i] = !flag ? candidateIndices[i] : currentIndices[i];
+//   }
+// Max indices are inaccurate for equal values (the index of the last equal
+// value is saved instead of the first one), but this saves instructions.
+inline void cmplt_min_max_fast(
+        const simd8uint32 candidateValues,
+        const simd8uint32 candidateIndices,
+        const simd8uint32 currentValues,
+        const simd8uint32 currentIndices,
+        simd8uint32& minValues,
+        simd8uint32& minIndices,
+        simd8uint32& maxValues,
+        simd8uint32& maxIndices) {
+    for (size_t k = 0; k < 8; k++) {
+        const bool lt = candidateValues.u32[k] < currentValues.u32[k];
+        const simd8uint32& loV = lt ? candidateValues : currentValues;
+        const simd8uint32& loI = lt ? candidateIndices : currentIndices;
+        const simd8uint32& hiV = lt ? currentValues : candidateValues;
+        const simd8uint32& hiI = lt ? currentIndices : candidateIndices;
+        minValues.u32[k] = loV.u32[k];
+        minIndices.u32[k] = loI.u32[k];
+        maxValues.u32[k] = hiV.u32[k];
+        maxIndices.u32[k] = hiI.u32[k];
+    }
+}
+
+struct simd8float32 : simd256bit {
+    simd8float32() {}
+
+    explicit simd8float32(const simd256bit& x) : simd256bit(x) {}
+
+    explicit simd8float32(float x) {
+        set1(x);
+    }
+
+    explicit simd8float32(const float* x) {
+        loadu(x);
+    }
+
+    void set1(float x) {
+        for (int i = 0; i < 8; i++) {
+            f32[i] = x;
+        }
+    }
+
+    explicit simd8float32(
+            float f0,
+            float f1,
+            float f2,
+            float f3,
+            float f4,
+            float f5,
+            float f6,
+            float f7) {
+        const float lanes[8] = {f0, f1, f2, f3, f4, f5, f6, f7};
+        memcpy(f32, lanes, sizeof(lanes));
+    }
+
+    template <typename F>
+    static simd8float32 binary_func(
+            const simd8float32& a,
+            const simd8float32& b,
+            F&& f) {
+        simd8float32 c;
+        for (int j = 0; j < 8; j++) {
+            c.f32[j] = f(a.f32[j], b.f32[j]);
+        }
+        return c;
+    }
+
+    simd8float32 operator*(const simd8float32& other) const {
+        return binary_func(
+                *this, other, [](float a, float b) { return a * b; });
+    }
+
+    simd8float32 operator+(const simd8float32& other) const {
+        return binary_func(
+                *this, other, [](float a, float b) { return a + b; });
+    }
+
+    simd8float32 operator-(const simd8float32& other) const {
+        return binary_func(
+                *this, other, [](float a, float b) { return a - b; });
+    }
+
+    simd8float32& operator+=(const simd8float32& other) {
+        for (size_t i = 0; i < 8; i++) {
+            f32[i] += other.f32[i];
+        }
+        return *this;
+    }
+
+    // element-wise float comparison (a NaN element never compares equal)
+    bool operator==(simd8float32 other) const {
+        for (size_t i = 0; i < 8; i++) {
+            if (f32[i] != other.f32[i]) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    bool operator!=(simd8float32 other) const {
+        return !(*this == other);
+    }
+
+    // the 8 elements formatted with "%g", separated by ','; truncated to fit
+    std::string tostring() const {
+        char res[1000];
+        size_t used = 0;
+        for (int j = 0; j < 8 && used + 1 < sizeof(res); j++) {
+            int n = snprintf(res + used, sizeof(res) - used, "%g,", f32[j]);
+            if (n < 0) {
+                break;
+            }
+            // snprintf returns the untruncated length: clamp to what fits
+            used = std::min(used + (size_t)n, sizeof(res) - 1);
+        }
+        // strip the last separator (res[0] = 0 also covers empty output)
+        res[used > 0 ? used - 1 : 0] = 0;
+        return std::string(res);
+    }
+};
+
+// hadd does not cross lanes: each 128-bit half of the result holds the two
+// adjacent-pair sums of a followed by the two adjacent-pair sums of b
+inline simd8float32 hadd(const simd8float32& a, const simd8float32& b) {
+    simd8float32 c;
+    for (int half = 0; half < 8; half += 4) {
+        const float* pa = a.f32 + half;
+        const float* pb = b.f32 + half;
+        float* pc = c.f32 + half;
+        pc[0] = pa[0] + pa[1];
+        pc[1] = pa[2] + pa[3];
+        pc[2] = pb[0] + pb[1];
+        pc[3] = pb[2] + pb[3];
+    }
+    return c;
+}
+
+// interleaves the two lowest elements of each 128-bit half of a and b:
+// per half, the result is (a[0], b[0], a[1], b[1])
+inline simd8float32 unpacklo(const simd8float32& a, const simd8float32& b) {
+    simd8float32 c;
+
+    for (int half = 0; half < 8; half += 4) {
+        for (int k = 0; k < 2; k++) {
+            c.f32[half + 2 * k] = a.f32[half + k];
+            c.f32[half + 2 * k + 1] = b.f32[half + k];
+        }
+    }
+
+    return c;
+}
+
+// interleaves the two highest elements of each 128-bit half of a and b:
+// per half, the result is (a[2], b[2], a[3], b[3])
+inline simd8float32 unpackhi(const simd8float32& a, const simd8float32& b) {
+    simd8float32 c;
+
+    for (int half = 0; half < 8; half += 4) {
+        for (int k = 0; k < 2; k++) {
+            c.f32[half + 2 * k] = a.f32[half + 2 + k];
+            c.f32[half + 2 * k + 1] = b.f32[half + 2 + k];
+        }
+    }
+
+    return c;
+}
+
+// element-wise multiply-add: returns a * b + c
+inline simd8float32 fmadd(
+        const simd8float32& a,
+        const simd8float32& b,
+        const simd8float32& c) {
+    simd8float32 out;
+    for (size_t lane = 0; lane < 8; lane++) {
+        out.f32[lane] = a.f32[lane] * b.f32[lane] + c.f32[lane];
+    }
+    return out;
+}
+
+namespace {
+
+// get even float32's of a and b, interleaved: per 128-bit half, the result
+// is (a[0], a[2], b[0], b[2])
+simd8float32 geteven(const simd8float32& a, const simd8float32& b) {
+    simd8float32 c;
+
+    for (int half = 0; half < 8; half += 4) {
+        const float* pa = a.f32 + half;
+        const float* pb = b.f32 + half;
+        c.f32[half + 0] = pa[0];
+        c.f32[half + 1] = pa[2];
+        c.f32[half + 2] = pb[0];
+        c.f32[half + 3] = pb[2];
+    }
+
+    return c;
+}
+
+// get odd float32's of a and b, interleaved: per 128-bit half, the result
+// is (a[1], a[3], b[1], b[3])
+simd8float32 getodd(const simd8float32& a, const simd8float32& b) {
+    simd8float32 c;
+
+    for (int half = 0; half < 8; half += 4) {
+        const float* pa = a.f32 + half;
+        const float* pb = b.f32 + half;
+        c.f32[half + 0] = pa[1];
+        c.f32[half + 1] = pa[3];
+        c.f32[half + 2] = pb[1];
+        c.f32[half + 3] = pb[3];
+    }
+
+    return c;
+}
+
+// 3 cycles
+// with the 128-bit halves a = [a0 a1] and b = [b0 b1], returns [a0 b0]
+simd8float32 getlow128(const simd8float32& a, const simd8float32& b) {
+    simd8float32 c;
+
+    // low half of the result: low half of a
+    for (int k = 0; k < 4; k++) {
+        c.f32[k] = a.f32[k];
+    }
+
+    // high half of the result: low half of b
+    for (int k = 0; k < 4; k++) {
+        c.f32[4 + k] = b.f32[k];
+    }
+
+    return c;
+}
+
+simd8float32 gethigh128(const simd8float32& a, const simd8float32& b) {
+    simd8float32 c;
+
+    // low half of the result: high half of a
+    for (int k = 0; k < 4; k++) {
+        c.f32[k] = a.f32[4 + k];
+    }
+
+    // high half of the result: high half of b
+    for (int k = 0; k < 4; k++) {
+        c.f32[4 + k] = b.f32[4 + k];
+    }
+
+    return c;
+}
+
+// The following primitive is a vectorized version of the following code
+// snippet:
+//   float lowestValue = HUGE_VAL;
+//   uint lowestIndex = 0;
+//   for (size_t i = 0; i < n; i++) {
+//     if (values[i] < lowestValue) {
+//       lowestValue = values[i];
+//       lowestIndex = i;
+//     }
+//   }
+// Vectorized version can be implemented via two operations: cmp and blend
+// with something like this:
+//   lowestValues = [HUGE_VAL; 8];
+//   lowestIndices = {0, 1, 2, 3, 4, 5, 6, 7};
+//   for (size_t i = 0; i < n; i += 8) {
+//     auto comparison = cmp(values + i, lowestValues);
+//     lowestValues = blend(
+//         comparison,
+//         values + i,
+//         lowestValues);
+//     lowestIndices = blend(
+//         comparison,
+//         i + {0, 1, 2, 3, 4, 5, 6, 7},
+//         lowestIndices);
+//     lowestIndices += {8, 8, 8, 8, 8, 8, 8, 8};
+//   }
+// The problem is that the blend primitive needs a very different instruction
+// order on AVX and on ARM.
+// So, let's introduce a combination of these two in order to avoid
+// confusion for people who write low-level SIMD instructions. Additionally,
+// these two ops (cmp and blend) are very often used together.
+inline void cmplt_and_blend_inplace(
+        const simd8float32 candidateValues,
+        const simd8uint32 candidateIndices,
+        simd8float32& lowestValues,
+        simd8uint32& lowestIndices) {
+    for (size_t lane = 0; lane < 8; lane++) {
+        if (!(candidateValues.f32[lane] < lowestValues.f32[lane])) {
+            continue; // candidate is not strictly lower: keep the current one
+        }
+        lowestValues.f32[lane] = candidateValues.f32[lane];
+        lowestIndices.u32[lane] = candidateIndices.u32[lane];
+    }
+}
+
+// Vectorized version of the following code:
+//   for (size_t i = 0; i < n; i++) {
+//      bool flag = (candidateValues[i] < currentValues[i]);
+//      minValues[i] = flag ? candidateValues[i] : currentValues[i];
+//      minIndices[i] = flag ? candidateIndices[i] : currentIndices[i];
+//      maxValues[i] = !flag ? candidateValues[i] : currentValues[i];
+//      maxIndices[i] = !flag ? candidateIndices[i] : currentIndices[i];
+//   }
+// Max indices are inaccurate for equal values (the index of the last equal
+// value is saved instead of the first one), but this saves instructions.
+inline void cmplt_min_max_fast(
+        const simd8float32 candidateValues,
+        const simd8uint32 candidateIndices,
+        const simd8float32 currentValues,
+        const simd8uint32 currentIndices,
+        simd8float32& minValues,
+        simd8uint32& minIndices,
+        simd8float32& maxValues,
+        simd8uint32& maxIndices) {
+    for (size_t k = 0; k < 8; k++) {
+        const bool lt = candidateValues.f32[k] < currentValues.f32[k];
+        const simd8float32& loV = lt ? candidateValues : currentValues;
+        const simd8uint32& loI = lt ? candidateIndices : currentIndices;
+        const simd8float32& hiV = lt ? currentValues : candidateValues;
+        const simd8uint32& hiI = lt ? currentIndices : candidateIndices;
+        minValues.f32[k] = loV.f32[k];
+        minIndices.u32[k] = loI.u32[k];
+        maxValues.f32[k] = hiV.f32[k];
+        maxIndices.u32[k] = hiI.u32[k];
+    }
+}
+
+} // namespace
+
+} // namespace faiss
diff --git a/faiss/utils/sorting.cpp b/faiss/utils/sorting.cpp
index 76fa5e6201..f8ed250ddb 100644
--- a/faiss/utils/sorting.cpp
+++ b/faiss/utils/sorting.cpp
@@ -544,7 +544,6 @@ void bucket_sort_inplace_parallel(
 
         // in this loop, we write elements collected in the previous round
         // and collect the elements that are overwritten for the next round
-        size_t tot_written = 0;
         int round = 0;
         for (;;) {
 #pragma omp barrier
@@ -554,9 +553,6 @@ void bucket_sort_inplace_parallel(
                 n_to_write += to_write_2.lims.back();
             }
 
-            tot_written += n_to_write;
-            // assert(tot_written <= nval);
-
 #pragma omp master
             {
                 if (verbose >= 1) {
@@ -795,7 +791,6 @@ void hashtable_int64_to_int64_lookup(
     std::vector<int64_t> hk(n), bucket_no(n);
     int64_t mask = capacity - 1;
     int log2_nbucket = log2_capacity_to_log2_nbucket(log2_capacity);
-    size_t nbucket = (size_t)1 << log2_nbucket;
 
 #pragma omp parallel for
     for (int64_t i = 0; i < n; i++) {
diff --git a/faiss/utils/utils.cpp b/faiss/utils/utils.cpp
index efbff502b0..dc6faddaf5 100644
--- a/faiss/utils/utils.cpp
+++ b/faiss/utils/utils.cpp
@@ -117,7 +117,7 @@ std::string get_compile_options() {
 #ifdef __AVX2__
     options += "AVX2 ";
 #elif __AVX512F__
-    options += "AVX512";
+    options += "AVX512 ";
 #elif defined(__aarch64__)
     options += "NEON ";
 #else
@@ -582,9 +582,9 @@ int64_t count_gt(int64_t n, const T* row, T threshold) {
 } // namespace
 
 template <typename T>
-void CombinerRangeKNN<T>::compute_sizes(int64_t* L_res) {
-    this->L_res = L_res;
-    L_res[0] = 0;
+void CombinerRangeKNN<T>::compute_sizes(int64_t* L_res_2) {
+    this->L_res = L_res_2;
+    L_res_2[0] = 0;
     int64_t j = 0;
     for (int64_t i = 0; i < nq; i++) {
         int64_t n_in;
@@ -595,11 +595,11 @@ void CombinerRangeKNN<T>::compute_sizes(int64_t* L_res) {
             n_in = lim_remain[j + 1] - lim_remain[j];
             j++;
         }
-        L_res[i + 1] = n_in; // L_res[i] + n_in;
+        L_res_2[i + 1] = n_in; // L_res_2[i] + n_in;
     }
     // cumsum
     for (int64_t i = 0; i < nq; i++) {
-        L_res[i + 1] += L_res[i];
+        L_res_2[i + 1] += L_res_2[i];
     }
 }
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index cc0a4f4cfd..3980d7dd7c 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -10,6 +10,7 @@ set(FAISS_TEST_SRC
   test_ivfpq_codec.cpp
   test_ivfpq_indexing.cpp
   test_lowlevel_ivf.cpp
+  test_ivf_index.cpp
   test_merge.cpp
   test_omp_threads.cpp
   test_ondisk_ivf.cpp
@@ -30,6 +31,10 @@ set(FAISS_TEST_SRC
   test_code_distance.cpp
   test_hnsw.cpp
   test_partitioning.cpp
+  test_fastscan_perf.cpp
+  test_disable_pq_sdc_tables.cpp
+  test_common_ivf_empty_index.cpp
+  test_callback.cpp
 )
 
 add_executable(faiss_test ${FAISS_TEST_SRC})
@@ -57,17 +62,40 @@ if(FAISS_OPT_LEVEL STREQUAL "avx512")
 endif()
 
 include(FetchContent)
-FetchContent_Declare(googletest
-  URL "https://github.com/google/googletest/archive/release-1.12.1.tar.gz")
+FetchContent_Declare(
+  googletest
+  GIT_REPOSITORY https://github.com/google/googletest.git
+  GIT_TAG 58d77fa8070e8cec2dc1ed015d66b454c8d78850 # release-1.12.1
+  OVERRIDE_FIND_PACKAGE)
 set(BUILD_GMOCK CACHE BOOL OFF)
 set(INSTALL_GTEST CACHE BOOL OFF)
 FetchContent_MakeAvailable(googletest)
 
+if(NOT EXISTS ${CMAKE_FIND_PACKAGE_REDIRECTS_DIR}/gtest-config.cmake
+   AND NOT EXISTS ${CMAKE_FIND_PACKAGE_REDIRECTS_DIR}/GTestConfig.cmake)
+  file(
+    WRITE ${CMAKE_FIND_PACKAGE_REDIRECTS_DIR}/gtest-config.cmake
+    [=[
+include(CMakeFindDependencyMacro)
+find_dependency(googletest)
+if(NOT TARGET GTest::GTest)
+  add_library(GTest::GTest INTERFACE IMPORTED)
+  target_link_libraries(GTest::GTest INTERFACE GTest::gtest)
+endif()
+if(NOT TARGET GTest::Main)
+  add_library(GTest::Main INTERFACE IMPORTED)
+  target_link_libraries(GTest::Main INTERFACE GTest::gtest_main)
+endif()
+]=])
+endif()
+
 find_package(OpenMP REQUIRED)
+find_package(GTest CONFIG REQUIRED)
 
 target_link_libraries(faiss_test PRIVATE
   OpenMP::OpenMP_CXX
-  gtest_main
+  GTest::gtest_main
+  $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft>
 )
 
 # Defines `gtest_discover_tests()`.
diff --git a/tests/common_faiss_tests.py b/tests/common_faiss_tests.py
index 8dc25edec0..a8afe344e4 100644
--- a/tests/common_faiss_tests.py
+++ b/tests/common_faiss_tests.py
@@ -49,7 +49,6 @@ def evalres(self, DI):
         for rank in 1, 10, 100:
             e[rank] = ((I[:, :rank] == self.gt.reshape(-1, 1)).sum() /
                        float(self.nq))
-        # print("1-recalls: %s" % e)
         return e
 
 
diff --git a/tests/test_RCQ_cropping.cpp b/tests/test_RCQ_cropping.cpp
index 4dd3470885..4463c256ed 100644
--- a/tests/test_RCQ_cropping.cpp
+++ b/tests/test_RCQ_cropping.cpp
@@ -28,7 +28,6 @@ TEST(RCQCropping, test_cropping) {
     faiss::ResidualCoarseQuantizer rcq(d, nbits);
 
     rcq.train(nt, xt);
-    // fprintf(stderr, "nb centroids: %zd\n", rcq.ntotal);
 
     // the test below works only for beam size == nprobe
     rcq.set_beam_factor(1.0);
@@ -44,7 +43,6 @@ TEST(RCQCropping, test_cropping) {
     nbits.pop_back();
     faiss::ResidualCoarseQuantizer rcq_cropped(d, nbits);
     rcq_cropped.initialize_from(rcq);
-    // fprintf(stderr, "cropped nb centroids: %zd\n", rcq_cropped.ntotal);
 
     EXPECT_EQ(rcq_cropped.ntotal, rcq.ntotal >> last_nbits);
 
diff --git a/tests/test_binary_hashindex.py b/tests/test_binary_hashindex.py
index 2d33050571..e9a6eaca49 100644
--- a/tests/test_binary_hashindex.py
+++ b/tests/test_binary_hashindex.py
@@ -58,8 +58,6 @@ def test_hash(self):
 
         Lref, Dref, Iref = index_ref.range_search(xq, radius)
 
-        print("nb res: ", Lref[-1])
-
         index = faiss.IndexBinaryHash(d, 10)
         index.add(xb)
         # index.display()
@@ -80,8 +78,6 @@ def test_hash(self):
                 self.assertTrue(snew <= set(ref))
             nfound.append(Lnew[-1])
             ndis.append(stats.ndis)
-        print('nfound=', nfound)
-        print('ndis=', ndis)
         nfound = np.array(nfound)
         self.assertTrue(nfound[-1] == Lref[-1])
         self.assertTrue(np.all(nfound[1:] >= nfound[:-1]))
@@ -100,8 +96,6 @@ def test_multihash(self):
 
         Lref, Dref, Iref = index_ref.range_search(xq, radius)
 
-        print("nb res: ", Lref[-1])
-
         nfound = []
         ndis = []
 
@@ -123,8 +117,6 @@ def test_multihash(self):
                 self.assertTrue(snew <= set(ref))
             nfound.append(Lnew[-1])
             ndis.append(stats.ndis)
-        print('nfound=', nfound)
-        print('ndis=', ndis)
         nfound = np.array(nfound)
         # self.assertTrue(nfound[-1] == Lref[-1])
         self.assertTrue(np.all(nfound[1:] >= nfound[:-1]))
@@ -163,7 +155,6 @@ def test_hash_and_multihash(self):
                     # no duplicates
                     self.assertTrue(len(new) == len(snew))
                     nf += len(set(ref) & snew)
-                print('nfound', nh, nbit, nf)
                 nfound[(nh, nbit)] = nf
             self.assertGreater(nfound[(nh, 4)], nfound[(nh, 7)])
 
@@ -175,7 +166,6 @@ def test_hash_and_multihash(self):
             np.testing.assert_array_equal(Inew, I2)
             np.testing.assert_array_equal(Dnew, D2)
 
-        print('nfound=', nfound)
         self.assertGreater(3, abs(nfound[(0, 7)] - nfound[(1, 7)]))
         self.assertGreater(nfound[(3, 7)], nfound[(1, 7)])
         self.assertGreater(nfound[(5, 7)], nfound[(3, 7)])
diff --git a/tests/test_build_blocks.py b/tests/test_build_blocks.py
index 0a97e63185..fdf9ad8bd7 100644
--- a/tests/test_build_blocks.py
+++ b/tests/test_build_blocks.py
@@ -189,7 +189,6 @@ def test_l2(self):
         for d in 1, 2, 4, 8, 12, 16:
             x = rs.rand(d).astype('float32')
             for ny in 128, 129, 130:
-                print("d=%d ny=%d" % (d, ny))
                 y = rs.rand(ny, d).astype('float32')
                 ref = ((x - y) ** 2).sum(1)
                 new = np.zeros(ny, dtype='float32')
@@ -204,7 +203,6 @@ def test_IP(self):
         for d in 1, 2, 4, 8, 12, 16:
             x = rs.rand(d).astype('float32')
             for ny in 128, 129, 130:
-                print("d=%d ny=%d" % (d, ny))
                 y = rs.rand(ny, d).astype('float32')
                 ref = (x * y).sum(1)
                 new = np.zeros(ny, dtype='float32')
@@ -220,7 +218,6 @@ def test_0s(self):
         m = rs.rand(40, 20).astype('float32')
         m[5:10] = 0
         comments = faiss.MatrixStats(m).comments
-        print(comments)
         assert 'has 5 copies' in comments
         assert '5 null vectors' in comments
 
@@ -229,7 +226,6 @@ def test_copies(self):
         m = rs.rand(40, 20).astype('float32')
         m[::2] = m[1::2]
         comments = faiss.MatrixStats(m).comments
-        print(comments)
         assert '20 vectors are distinct' in comments
 
     def test_dead_dims(self):
@@ -237,7 +233,6 @@ def test_dead_dims(self):
         m = rs.rand(40, 20).astype('float32')
         m[:, 5:10] = 0
         comments = faiss.MatrixStats(m).comments
-        print(comments)
         assert '5 dimensions are constant' in comments
 
     def test_rogue_means(self):
@@ -245,7 +240,6 @@ def test_rogue_means(self):
         m = rs.rand(40, 20).astype('float32')
         m[:, 5:10] += 12345
         comments = faiss.MatrixStats(m).comments
-        print(comments)
         assert '5 dimensions are too large wrt. their variance' in comments
 
     def test_normalized(self):
@@ -253,7 +247,6 @@ def test_normalized(self):
         m = rs.rand(40, 20).astype('float32')
         faiss.normalize_L2(m)
         comments = faiss.MatrixStats(m).comments
-        print(comments)
         assert 'vectors are normalized' in comments
 
     def test_hash(self):
@@ -300,7 +293,6 @@ def test_8bit_equiv(self):
                 D, I = index.search(x[3:], 1)
 
                 # assert D[0, 0] == Dref[0, 0]
-                # print(D[0, 0], ((x[3] - x[2]) ** 2).sum())
                 assert D[0, 0] == ((x[3] - x[2]) ** 2).sum()
 
     def test_6bit_equiv(self):
@@ -314,8 +306,6 @@ def test_6bit_equiv(self):
                 d, faiss.ScalarQuantizer.QT_6bit)
             index.train(trainset)
 
-            print('cs=', index.code_size)
-
             x = rs.randint(64, size=(100, d)).astype('float32')
 
             # verify encoder / decoder
@@ -330,7 +320,6 @@ def test_6bit_equiv(self):
             for i in range(20):
                 for j in range(10):
                     dis = ((y[i] - x2[I[i, j]]) ** 2).sum()
-                    # print(dis, D[i, j])
                     assert abs(D[i, j] - dis) / dis < 1e-5
 
     def test_reconstruct(self):
@@ -371,7 +360,6 @@ def test_randint(self):
         x = faiss.randint(20000, vmax=100)
         assert np.all(x >= 0) and np.all(x < 100)
         c = np.bincount(x, minlength=100)
-        print(c)
         assert c.max() - c.min() < 50 * 2
 
     def test_rand_vector(self):
@@ -473,7 +461,6 @@ def do_test_array_type(self, dtype):
         """ tests swig_ptr and rev_swig_ptr for this type of array """
         a = np.arange(12).astype(dtype)
         ptr = faiss.swig_ptr(a)
-        print(ptr)
         a2 = faiss.rev_swig_ptr(ptr, 12)
         np.testing.assert_array_equal(a, a2)
 
@@ -547,7 +534,6 @@ def subtest(self, d, K, metric):
                         recalls += 1
                         break
         recall = 1.0 * recalls / (nb * K)
-        print('Metric: {}, knng accuracy: {}'.format(metric_names[metric], recall))
         assert recall > 0.99
 
     def test_small_nndescent(self):
@@ -656,7 +642,6 @@ def do_test_bucket_sort_inplace(
             rows, _ = np.where(tab == b)
             rows.sort()
             tab2[lims[b]:lims[b + 1]].sort()
-            # print(rows, tab2[lims[b] : lims[b + 1]])
             rows = set(rows)
             self.assertEqual(rows, set(tab2[lims[b]:lims[b + 1]]))
 
diff --git a/tests/test_callback.cpp b/tests/test_callback.cpp
new file mode 100644
index 0000000000..cdfadf1d39
--- /dev/null
+++ b/tests/test_callback.cpp
@@ -0,0 +1,37 @@
+/**
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <gtest/gtest.h>
+
+#include <faiss/Clustering.h>
+#include <faiss/IndexFlat.h>
+#include <faiss/impl/AuxIndexStructures.h>
+#include <faiss/impl/FaissException.h>
+#include <faiss/utils/random.h>
+
+// k-means with a huge iteration budget must be cut short by the timeout.
+TEST(TestCallback, timeout) {
+    int n = 1000;
+    int k = 100;
+    int d = 128;
+    int seed = 42;
+
+    std::vector<float> vecs(n * d);
+    faiss::float_rand(vecs.data(), vecs.size(), seed);
+
+    faiss::IndexFlat index(d); // automatic storage, no manual delete
+
+    faiss::ClusteringParameters cp;
+    cp.niter = 1000000000; // far more than can finish before the timeout
+    cp.verbose = false;
+
+    faiss::Clustering kmeans(d, k, cp);
+
+    faiss::TimeoutCallback::reset(0.010);
+    EXPECT_THROW(kmeans.train(n, vecs.data(), index), faiss::FaissException);
+    faiss::InterruptCallback::clear_instance(); // keep later tests unaffected
+}
diff --git a/tests/test_callback_py.py b/tests/test_callback_py.py
new file mode 100644
index 0000000000..0ec176dd86
--- /dev/null
+++ b/tests/test_callback_py.py
@@ -0,0 +1,32 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+import numpy as np
+import faiss
+
+
+class TestCallbackPy(unittest.TestCase):
+    """Checks that a timeout guard interrupts a long clustering run."""
+
+    def setUp(self) -> None:
+        super().setUp()
+
+    def test_timeout(self) -> None:
+        """k-means under TimeoutGuard(0.010) is expected to raise."""
+        dim, num_points, num_centroids = 128, 1000, 100
+
+        params = faiss.ClusteringParameters()
+        params.niter = 1_000_000_000
+        params.verbose = False
+        clustering = faiss.Clustering(dim, num_centroids, params)
+
+        points = np.random.rand(num_points, dim).astype('float32')
+        flat_index = faiss.IndexFlat(dim)
+
+        # single `with` statement, equivalent to nesting the two managers
+        expect_error = self.assertRaises(RuntimeError)
+        with expect_error, faiss.TimeoutGuard(0.010):
+            clustering.train(points, flat_index)
diff --git a/tests/test_clustering.py b/tests/test_clustering.py
index 2b81fc3e35..b1afc8523f 100644
--- a/tests/test_clustering.py
+++ b/tests/test_clustering.py
@@ -110,9 +110,6 @@ def test_weighted(self):
         cdis2_first = cdis2[:5].sum()
         cdis2_last = cdis2[5:].sum()
 
-        print(cdis1_first, cdis1_last)
-        print(cdis2_first, cdis2_last)
-
         # with the new clustering, the last should be much (*2) closer
         # to their centroids
         self.assertGreater(cdis1_last, cdis1_first * 2)
diff --git a/tests/test_common_ivf_empty_index.cpp b/tests/test_common_ivf_empty_index.cpp
new file mode 100644
index 0000000000..a3e33031bd
--- /dev/null
+++ b/tests/test_common_ivf_empty_index.cpp
@@ -0,0 +1,144 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#include <gtest/gtest.h>
+
+#include <omp.h>
+#include <cstddef>
+#include <memory>
+#include <vector>
+
+#include <faiss/IndexIVF.h>
+#include <faiss/clone_index.h>
+#include <faiss/impl/FaissAssert.h>
+#include <faiss/index_factory.h>
+#include <faiss/invlists/InvertedLists.h>
+#include <faiss/utils/random.h>
+
+/* This demonstrates how to query several independent IVF indexes that share a
+ * common trained index. This avoids duplicating the coarse quantizer and
+ * metadata in memory.
+ **/
+
+namespace {
+
+int d = 64;
+
+} // namespace
+
+// Returns n vectors of dimension d, generated from the given seed.
+std::vector<float> get_random_vectors(size_t n, int seed) {
+    std::vector<float> x(n * d);
+    faiss::rand_smooth_vectors(n, d, x.data(), seed);
+    return x;
+}
+
+/** InvertedLists implementation that dispatches the search to an InvertedLists
+ * object that is passed in at query time */
+
+struct DispatchingInvertedLists : faiss::ReadOnlyInvertedLists {
+    DispatchingInvertedLists(size_t nlist, size_t code_size)
+            : faiss::ReadOnlyInvertedLists(nlist, code_size) {
+        use_iterator = true;
+    }
+
+    faiss::InvertedListsIterator* get_iterator(
+            size_t list_no,
+            void* inverted_list_context = nullptr) const override {
+        assert(inverted_list_context);
+        auto il =
+                static_cast<const faiss::InvertedLists*>(inverted_list_context);
+        return il->get_iterator(list_no);
+    }
+
+    using idx_t = faiss::idx_t;
+
+    size_t list_size(size_t list_no) const override {
+        FAISS_THROW_MSG("use iterator interface");
+    }
+    const uint8_t* get_codes(size_t list_no) const override {
+        FAISS_THROW_MSG("use iterator interface");
+    }
+    const idx_t* get_ids(size_t list_no) const override {
+        FAISS_THROW_MSG("use iterator interface");
+    }
+};
+
+TEST(COMMON, test_common_trained_index) {
+    int N = 3;    // number of independent indexes
+    int nt = 500; // training vectors
+    int nb = 200; // nb database vectors per index
+    int nq = 10;  // nb queries performed on each index
+    int k = 4;    // results requested per query
+
+    // construct and build an "empty index": a trained index that does not
+    // itself hold any data
+    std::unique_ptr<faiss::IndexIVF> empty_index(dynamic_cast<faiss::IndexIVF*>(
+            faiss::index_factory(d, "IVF32,PQ8np")));
+    auto xt = get_random_vectors(nt, 123);
+    empty_index->train(nt, xt.data());
+    empty_index->nprobe = 4;
+
+    // reference run: build one index for each set of db / queries and record
+    // results
+    std::vector<std::vector<faiss::idx_t>> ref_I(N);
+
+    for (int i = 0; i < N; i++) {
+        // clone the empty index
+        std::unique_ptr<faiss::Index> index(
+                faiss::clone_index(empty_index.get()));
+        auto xb = get_random_vectors(nb, 1234 + i);
+        auto xq = get_random_vectors(nq, 12345 + i);
+        // add vectors and perform a search
+        index->add(nb, xb.data());
+        std::vector<float> D(k * nq);
+        std::vector<faiss::idx_t> I(k * nq);
+        index->search(nq, xq.data(), k, D.data(), I.data());
+        // record result as reference
+        ref_I[i] = I;
+    }
+
+    // build a set of inverted lists for each independent index
+    std::vector<faiss::ArrayInvertedLists> sub_invlists;
+
+    for (int i = 0; i < N; i++) {
+        // swap in other inverted lists
+        sub_invlists.emplace_back(empty_index->nlist, empty_index->code_size);
+        faiss::InvertedLists* invlists = &sub_invlists.back();
+
+        // replace_invlists swaps in a new InvertedLists for an existing index
+        empty_index->replace_invlists(invlists, false);
+        empty_index->reset(); // reset id counter to 0
+        // populate inverted lists
+        auto xb = get_random_vectors(nb, 1234 + i);
+        empty_index->add(nb, xb.data());
+    }
+
+    // perform search dispatching to the sub-invlists. At search time, we don't
+    // use replace_invlists because that would wreak havoc in a multithreaded
+    // context
+    DispatchingInvertedLists di(empty_index->nlist, empty_index->code_size);
+    empty_index->replace_invlists(&di, false);
+
+    std::vector<std::vector<faiss::idx_t>> new_I(N);
+
+    // run searches in the independent indexes but with a common empty_index
+#pragma omp parallel for
+    for (int i = 0; i < N; i++) {
+        auto xq = get_random_vectors(nq, 12345 + i);
+        std::vector<float> D(k * nq);
+        std::vector<faiss::idx_t> I(k * nq);
+
+        // here we set to what sub-index the queries should be directed
+        faiss::SearchParametersIVF params;
+        params.nprobe = empty_index->nprobe;
+        params.inverted_list_context = &sub_invlists[i];
+
+        empty_index->search(nq, xq.data(), k, D.data(), I.data(), &params);
+        new_I[i] = I;
+    }
+
+    // compare with reference results
+    for (int i = 0; i < N; i++) {
+        ASSERT_EQ(ref_I[i], new_I[i]);
+    }
+}
diff --git a/tests/test_contrib.py b/tests/test_contrib.py
index 36c17792ce..05a2c4ac8b 100644
--- a/tests/test_contrib.py
+++ b/tests/test_contrib.py
@@ -9,6 +9,7 @@
 import platform
 import os
 import random
+import shutil
 import tempfile
 
 from faiss.contrib import datasets
@@ -17,15 +18,13 @@
 from faiss.contrib import ivf_tools
 from faiss.contrib import clustering
 from faiss.contrib import big_batch_search
+from faiss.contrib.ondisk import merge_ondisk
 
 from common_faiss_tests import get_dataset_2
-try:
-    from faiss.contrib.exhaustive_search import \
-        knn_ground_truth, knn, range_ground_truth, \
-        range_search_max_results, exponential_query_iterator
-except:
-    pass  # Submodule import broken in python 2.
-
+from faiss.contrib.exhaustive_search import \
+    knn_ground_truth, knn, range_ground_truth, \
+    range_search_max_results, exponential_query_iterator
+from contextlib import contextmanager
 
 @unittest.skipIf(platform.python_version_tuple()[0] < '3',
                  'Submodule import broken in python 2.')
@@ -148,7 +147,6 @@ def test_query_iterator(self, metric=faiss.METRIC_L2):
         xb = ds.get_database()
         D, I = faiss.knn(xq, xb, 10, metric=metric)
         threshold = float(D[:, -1].mean())
-        print(threshold)
 
         index = faiss.IndexFlat(32, metric)
         index.add(xb)
@@ -252,7 +250,6 @@ def test_precision_recall(self):
         Inew = np.hstack(Inew)
 
         precision, recall = evaluation.range_PR(lims_ref, Iref, lims_new, Inew)
-        print(precision, recall)
 
         self.assertEqual(precision, 0.6)
         self.assertEqual(recall, 0.6)
@@ -306,6 +303,26 @@ def test_PR_multiple(self):
 
 class TestPreassigned(unittest.TestCase):
 
+    def test_index_pretransformed(self):
+        """search_preassigned agrees with search() on a PCA + IVF index."""
+        ds = datasets.SyntheticDataset(128, 2000, 2000, 200)
+        queries = ds.get_queries()
+        index = faiss.index_factory(128, 'PCA64,IVF64,PQ4np')
+        index.train(ds.get_train())
+        index.add(ds.get_database())
+        ivf = faiss.extract_index_ivf(index)
+        ivf.nprobe = 10
+
+        # reference results from the regular search path
+        D_ref, I_ref = index.search(queries, 4)
+
+        # coarse assignment computed on the queries after the first transform
+        queries_trans = index.chain.at(0).apply_py(queries)
+        Dq, Iq = ivf.quantizer.search(queries_trans, ivf.nprobe)
+        D, I = ivf_tools.search_preassigned(index, queries, 4, Iq, Dq)
+        np.testing.assert_almost_equal(D_ref, D, decimal=4)
+        np.testing.assert_array_equal(I_ref, I)
+
     def test_float(self):
         ds = datasets.SyntheticDataset(128, 2000, 2000, 200)
 
@@ -654,3 +671,63 @@ def test_code_set(self):
         np.testing.assert_equal(
             np.sort(np.unique(codes, axis=0), axis=None),
             np.sort(codes[inserted], axis=None))
+
+
+@unittest.skipIf(platform.system() == 'Windows',
+                'OnDiskInvertedLists is unsupported on Windows.')
+class TestMerge(unittest.TestCase):
+    """Tests merge_ondisk on index shards stored in a temporary directory."""
+    @contextmanager
+    def temp_directory(self):
+        """Yield a new temporary directory and delete it afterwards."""
+        temp_dir = tempfile.mkdtemp()
+        try:
+            yield temp_dir
+        finally:
+            shutil.rmtree(temp_dir)
+
+    def do_test_ondisk_merge(self, shift_ids=False):
+        """Write ns shards, merge them with merge_ondisk, check recall@1."""
+        with self.temp_directory() as tmpdir:
+            # only train and add index to disk without adding elements.
+            # this will create empty inverted lists.
+            ds = datasets.SyntheticDataset(32, 2000, 200, 20)
+            index = faiss.index_factory(ds.d, "IVF32,Flat")
+            index.train(ds.get_train())
+            faiss.write_index(index, tmpdir + "/trained.index")
+
+            # split the database into ns shards, one index file per shard
+            ns = 4  # number of shards
+            for bno in range(ns):
+                index = faiss.read_index(tmpdir + "/trained.index")
+                i0, i1 = int(bno * ds.nb / ns), int((bno + 1) * ds.nb / ns)
+                if shift_ids:
+                    index.add_with_ids(ds.xb[i0:i1], np.arange(i1 - i0))
+                else:
+                    index.add_with_ids(ds.xb[i0:i1], np.arange(i0, i1))
+                faiss.write_index(index, tmpdir + "/block_%d.index" % bno)
+
+            # construct the output index and merge them on disk
+            index = faiss.read_index(tmpdir + "/trained.index")
+            block_fnames = [tmpdir + "/block_%d.index" % bno for bno in range(ns)]
+            merge_ondisk(index, block_fnames,
+                         tmpdir + "/merged_index.ivfdata", shift_ids)
+            faiss.write_index(index, tmpdir + "/populated.index")
+
+            # perform a search from index on disk
+            index = faiss.read_index(tmpdir + "/populated.index")
+            index.nprobe = 5
+            D, I = index.search(ds.xq, 5)
+
+            # ground-truth
+            gtI = ds.get_groundtruth(5)
+
+            recall_at_1 = (I[:, :1] == gtI[:, :1]).sum() / float(ds.xq.shape[0])
+            self.assertGreaterEqual(recall_at_1, 0.5)
+
+    def test_ondisk_merge(self):
+        self.do_test_ondisk_merge()
+
+    def test_ondisk_merge_with_shift_ids(self):
+        # every shard reuses ids 0..n-1; same recall check as test_ondisk_merge
+        self.do_test_ondisk_merge(True)
diff --git a/tests/test_contrib_with_scipy.py b/tests/test_contrib_with_scipy.py
index cb81bb623c..4f89e2fc1b 100644
--- a/tests/test_contrib_with_scipy.py
+++ b/tests/test_contrib_with_scipy.py
@@ -44,7 +44,6 @@ def test_sparse_routines(self):
         faiss.normalize_L2(xt)
 
         mask = np.abs(xt) > 0.045
-        # print("fraction:", mask.sum() / mask.size) # around 10% non-zeros
         xt[np.logical_not(mask)] = 0
 
         centroids = ds.get_queries()
@@ -72,7 +71,6 @@ def test_sparse_kmeans(self):
         faiss.normalize_L2(xt)
 
         mask = np.abs(xt) > 0.045
-        # print("fraction:", mask.sum() / mask.size) # around 10% non-zeros
         xt[np.logical_not(mask)] = 0
 
         km = faiss.Kmeans(ds.d, 50)
diff --git a/tests/test_disable_pq_sdc_tables.cpp b/tests/test_disable_pq_sdc_tables.cpp
new file mode 100644
index 0000000000..b211a5c451
--- /dev/null
+++ b/tests/test_disable_pq_sdc_tables.cpp
@@ -0,0 +1,61 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <gtest/gtest.h>
+
+#include <random>
+
+#include "faiss/Index.h"
+#include "faiss/IndexHNSW.h"
+#include "faiss/index_factory.h"
+#include "faiss/index_io.h"
+#include "test_util.h"
+
+pthread_mutex_t temp_file_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+// Reading with IO_FLAG_PQ_SKIP_SDC_TABLE must leave the PQ SDC table empty.
+TEST(IO, TestReadHNSWPQ_whenSDCDisabledFlagPassed_thenDisableSDCTable) {
+    Tempfilename index_filename(&temp_file_mutex, "/tmp/faiss_TestReadHNSWPQ");
+    int d = 32, n = 256;
+    std::default_random_engine rng(123);
+    std::uniform_real_distribution<float> u(0, 100);
+    std::vector<float> vectors(n * d);
+    for (float& v : vectors) {
+        v = u(rng);
+    }
+
+    // Build the index and write it to the temp file
+    {
+        std::unique_ptr<faiss::Index> index_writer(
+                faiss::index_factory(d, "HNSW8,PQ4np", faiss::METRIC_L2));
+        index_writer->train(n, vectors.data());
+        index_writer->add(n, vectors.data());
+
+        faiss::write_index(index_writer.get(), index_filename.c_str());
+    }
+
+    // Load the index from disk twice. Confirm that the SDC table is empty
+    // only when the skip flag is passed
+    {
+        std::unique_ptr<faiss::IndexHNSWPQ> index_reader_read_write(
+                dynamic_cast<faiss::IndexHNSWPQ*>(
+                        faiss::read_index(index_filename.c_str())));
+        std::unique_ptr<faiss::IndexHNSWPQ> index_reader_sdc_disabled(
+                dynamic_cast<faiss::IndexHNSWPQ*>(faiss::read_index(
+                        index_filename.c_str(),
+                        faiss::IO_FLAG_PQ_SKIP_SDC_TABLE)));
+        ASSERT_NE(
+                dynamic_cast<faiss::IndexPQ*>(index_reader_read_write->storage)
+                        ->pq.sdc_table.size(),
+                0u);
+        ASSERT_EQ(
+                dynamic_cast<faiss::IndexPQ*>(
+                        index_reader_sdc_disabled->storage)
+                        ->pq.sdc_table.size(),
+                0u);
+    }
+}
diff --git a/tests/test_extra_distances.py b/tests/test_extra_distances.py
index a474dd6ba7..fcaf4d383d 100644
--- a/tests/test_extra_distances.py
+++ b/tests/test_extra_distances.py
@@ -94,6 +94,33 @@ def test_jaccard(self):
         new_dis = faiss.pairwise_distances(xq, yb, faiss.METRIC_Jaccard)
         self.assertTrue(np.allclose(ref_dis, new_dis))
 
+    def test_nan_euclidean(self):
+        """Checks METRIC_NaNEuclidean on NaN-free, partly NaN, all-NaN data."""
+        metric = faiss.METRIC_NaNEuclidean
+        xq, yb = self.make_example()
+        expected = np.array(
+            [[scipy.spatial.distance.sqeuclidean(a, b) for b in yb]
+             for a in xq])
+        actual = faiss.pairwise_distances(xq, yb, metric)
+        self.assertTrue(np.allclose(expected, actual))
+
+        partial_x = [[3, np.nan, np.nan, 6]]
+        partial_q = [[1, np.nan, np.nan, 5]]
+        expected = [(4 / 2 * ((3 - 1)**2 + (6 - 5)**2))]
+        actual = faiss.pairwise_distances(partial_x, partial_q, metric)
+        self.assertTrue(np.allclose(actual, expected))
+
+        all_nan = [[np.nan] * 4]
+        actual = faiss.pairwise_distances(all_nan, [[np.nan] * 4], metric)
+        self.assertTrue(np.isnan(actual[0]))
+
+    def test_abs_inner_product(self):
+        """METRIC_ABS_INNER_PRODUCT must match |xq @ yb.T| within 1e-5."""
+        xq, yb = self.make_example()
+        expected = np.abs(xq @ yb.T)
+        got = faiss.pairwise_distances(xq, yb, faiss.METRIC_ABS_INNER_PRODUCT)
+        np.testing.assert_allclose(got, expected, atol=1e-5)
+
 
 class TestKNN(unittest.TestCase):
     """ test that the knn search gives the same as distance matrix + argmin """
diff --git a/tests/test_fast_scan.py b/tests/test_fast_scan.py
index b061ee3af0..cfe9636fee 100644
--- a/tests/test_fast_scan.py
+++ b/tests/test_fast_scan.py
@@ -34,7 +34,6 @@ def test_PQ4_accuracy(self):
         nq = Iref.shape[0]
         recall_at_1 = (Iref[:, 0] == Ia[:, 0]).sum() / nq
         assert recall_at_1 > 0.6
-        # print(f'recall@1 = {recall_at_1:.3f}')
 
 
     # This is an experiment to see if we can catch performance
@@ -498,7 +497,6 @@ def subtest_accuracy(self, aq, st, implem, metric_type='L2'):
         recall_ref = (Iref == gt).sum() / nq
         recall = (Ia == gt).sum() / nq
 
-        print(aq, st, implem, metric_type, recall_ref, recall)
         assert abs(recall_ref - recall) < 0.05
 
     def xx_test_accuracy(self):
@@ -531,7 +529,6 @@ def subtest_from_idxaq(self, implem, metric):
         nq = Iref.shape[0]
         recall_ref = (Iref == gt).sum() / nq
         recall1 = (I1 == gt).sum() / nq
-        print(recall_ref, recall1)
         assert abs(recall_ref - recall1) < 0.05
 
     def xx_test_from_idxaq(self):
diff --git a/tests/test_fast_scan_ivf.py b/tests/test_fast_scan_ivf.py
index 5a57a39ca9..f48dd2e47a 100644
--- a/tests/test_fast_scan_ivf.py
+++ b/tests/test_fast_scan_ivf.py
@@ -84,9 +84,7 @@ def sp(x):
         b = btab[0]
         dis_new = self.compute_dis_quant(codes, LUTq, biasq, a, b)
 
-        #    print(a, b, dis_ref.sum())
         avg_realtive_error = np.abs(dis_new - dis_ref).sum() / dis_ref.sum()
-        # print('a=', a, 'avg_relative_error=', avg_realtive_error)
         self.assertLess(avg_realtive_error, 0.0005)
 
     def test_no_residual_ip(self):
@@ -133,8 +131,6 @@ def test_by_residual_L2_v2(self):
         self.do_test(LUT, bias, nprobe, alt_3d=True)
 
 
-
-
 ##########################################################
 # Tests for various IndexPQFastScan implementations
 ##########################################################
@@ -209,7 +205,6 @@ def test_by_residual_ip(self):
         self.do_test(True, faiss.METRIC_INNER_PRODUCT)
 
 
-
 class TestIVFImplem2(unittest.TestCase):
     """ Verify implem 2 (search with original invlists with uint8 LUTs)
     against IndexIVFPQ. Entails some loss in accuracy. """
@@ -231,8 +226,6 @@ def eval_quant_loss(self, by_residual, metric=faiss.METRIC_L2):
 
         m3 = three_metrics(Da, Ia, Db, Ib)
 
-
-        # print(by_residual, metric, recall_at_1, recall_at_10, intersection_at_10)
         ref_results = {
             (True, 1): [0.985, 1.0, 9.872],
             (True, 0): [ 0.987, 1.0, 9.914],
@@ -259,10 +252,12 @@ def test_qloss_no_residual_ip(self):
     def test_qloss_by_residual_ip(self):
         self.eval_quant_loss(True, faiss.METRIC_INNER_PRODUCT)
 
+
 class TestEquivPQ(unittest.TestCase):
 
     def test_equiv_pq(self):
         ds  = datasets.SyntheticDataset(32, 2000, 200, 4)
+        xq = ds.get_queries()
 
         index = faiss.index_factory(32, "IVF1,PQ16x4np")
         index.by_residual = False
@@ -270,7 +265,7 @@ def test_equiv_pq(self):
         index.quantizer.add(np.zeros((1, 32), dtype='float32'))
         index.train(ds.get_train())
         index.add(ds.get_database())
-        Dref, Iref = index.search(ds.get_queries(), 4)
+        Dref, Iref = index.search(xq, 4)
 
         index_pq = faiss.index_factory(32, "PQ16x4np")
         index_pq.pq = index.pq
@@ -278,21 +273,64 @@ def test_equiv_pq(self):
         index_pq.codes = faiss. downcast_InvertedLists(
             index.invlists).codes.at(0)
         index_pq.ntotal = index.ntotal
-        Dnew, Inew = index_pq.search(ds.get_queries(), 4)
+        Dnew, Inew = index_pq.search(xq, 4)
 
         np.testing.assert_array_equal(Iref, Inew)
         np.testing.assert_array_equal(Dref, Dnew)
 
         index_pq2 = faiss.IndexPQFastScan(index_pq)
         index_pq2.implem = 12
-        Dref, Iref = index_pq2.search(ds.get_queries(), 4)
+        Dref, Iref = index_pq2.search(xq, 4)
 
         index2 = faiss.IndexIVFPQFastScan(index)
         index2.implem = 12
-        Dnew, Inew = index2.search(ds.get_queries(), 4)
+        Dnew, Inew = index2.search(xq, 4)
         np.testing.assert_array_equal(Iref, Inew)
         np.testing.assert_array_equal(Dref, Dnew)
 
+        # test encode and decode
+
+        np.testing.assert_array_equal(
+            index_pq.sa_encode(xq),
+            index2.sa_encode(xq)
+        )
+
+        np.testing.assert_array_equal(
+            index_pq.sa_decode(index_pq.sa_encode(xq)),
+            index2.sa_decode(index2.sa_encode(xq))
+        )
+
+        np.testing.assert_array_equal(
+            ((index_pq.sa_decode(index_pq.sa_encode(xq)) - xq) ** 2).sum(1),
+            ((index2.sa_decode(index2.sa_encode(xq)) - xq) ** 2).sum(1)
+        )
+
+    def test_equiv_pq_encode_decode(self):
+        ds = datasets.SyntheticDataset(32, 1000, 200, 10)
+        xq = ds.get_queries()
+
+        index_ivfpq = faiss.index_factory(ds.d, "IVF10,PQ8x4np")
+        index_ivfpq.train(ds.get_train())
+
+        index_ivfpqfs = faiss.IndexIVFPQFastScan(index_ivfpq)
+
+        np.testing.assert_array_equal(
+            index_ivfpq.sa_encode(xq),
+            index_ivfpqfs.sa_encode(xq)
+        )
+
+        np.testing.assert_array_equal(
+            index_ivfpq.sa_decode(index_ivfpq.sa_encode(xq)),
+            index_ivfpqfs.sa_decode(index_ivfpqfs.sa_encode(xq))
+        )
+
+        np.testing.assert_array_equal(
+            ((index_ivfpq.sa_decode(index_ivfpq.sa_encode(xq)) - xq) ** 2)
+            .sum(1),
+            ((index_ivfpqfs.sa_decode(index_ivfpqfs.sa_encode(xq)) - xq) ** 2)
+            .sum(1)
+        )
+
 
 class TestIVFImplem12(unittest.TestCase):
 
@@ -309,6 +347,7 @@ def do_test(self, by_residual, metric=faiss.METRIC_L2, d=32, nq=200):
         index.add(ds.get_database())
         index.nprobe = 4
 
+        # compare against implem = 2, which includes quantized LUTs
         index2 = faiss.IndexIVFPQFastScan(index)
         index2.implem = 2
         Dref, Iref = index2.search(ds.get_queries(), 4)
@@ -370,7 +409,6 @@ def test_by_residual_odd_dim_single_query(self):
         self.do_test(True, d=30, nq=1)
 
 
-
 class TestIVFImplem10(TestIVFImplem12):
     IMPLEM = 10
 
@@ -432,7 +470,6 @@ def do_test(self, by_residual=False, metric=faiss.METRIC_L2, d=32, bbs=32):
                 new_code_i = new_code_per_id[the_id]
                 np.testing.assert_array_equal(ref_code_i, new_code_i)
 
-
     def test_add(self):
         self.do_test()
 
@@ -466,7 +503,6 @@ def do_test(self, by_residual=False, metric=faiss.METRIC_L2, d=32, bbs=32):
         Dnew, Inew = index2.search(ds.get_queries(), 10)
 
         m3 = three_metrics(Dref, Iref, Dnew, Inew)
-        #   print((by_residual, metric, d), ":", m3)
         ref_m3_tab = {
             (True, 1, 32): (0.995, 1.0, 9.91),
             (True, 0, 32): (0.99, 1.0, 9.91),
@@ -557,7 +593,6 @@ def subtest_accuracy(self, aq, st, by_residual, implem, metric_type='L2'):
         recall_ref = (Iref == gt).sum() / nq
         recall1 = (I1 == gt).sum() / nq
 
-        print(aq, st, by_residual, implem, metric_type, recall_ref, recall1)
         assert abs(recall_ref - recall1) < 0.051
 
     def xx_test_accuracy(self):
@@ -602,7 +637,6 @@ def subtest_rescale_accuracy(self, aq, st, by_residual, implem):
         recall_ref = (Iref == gt).sum() / nq
         recall1 = (I1 == gt).sum() / nq
 
-        print(aq, st, by_residual, implem, recall_ref, recall1)
         assert abs(recall_ref - recall1) < 0.05
 
     def xx_test_rescale_accuracy(self):
@@ -627,7 +661,6 @@ def subtest_from_ivfaq(self, implem):
         nq = Iref.shape[0]
         recall_ref = (Iref == gt).sum() / nq
         recall1 = (I1 == gt).sum() / nq
-        print(recall_ref, recall1)
         assert abs(recall_ref - recall1) < 0.02
 
     def test_from_ivfaq(self):
@@ -766,7 +799,6 @@ def subtest_accuracy(self, paq):
         recall_ref = (Iref == gt).sum() / nq
         recall1 = (I1 == gt).sum() / nq
 
-        print(paq, recall_ref, recall1)
         assert abs(recall_ref - recall1) < 0.05
 
     def test_accuracy_PLSQ(self):
@@ -812,3 +844,73 @@ def subtest_io(self, factory_str):
     def test_io(self):
         self.subtest_io('IVF16,PLSQ2x3x4fsr_Nlsq2x4')
         self.subtest_io('IVF16,PRQ2x3x4fs_Nrq2x4')
+
+
+class TestSearchParams(unittest.TestCase):
+
+    def test_search_params(self):
+        ds = datasets.SyntheticDataset(32, 500, 100, 10)
+
+        index = faiss.index_factory(ds.d, "IVF32,PQ16x4fs")
+        index.train(ds.get_train())
+        index.add(ds.get_database())
+
+        index.nprobe
+        index.nprobe = 4
+        Dref4, Iref4 = index.search(ds.get_queries(), 10)
+        # index.nprobe = 16
+        # Dref16, Iref16 = index.search(ds.get_queries(), 10)
+
+        index.nprobe = 1
+        Dnew4, Inew4 = index.search(
+            ds.get_queries(), 10, params=faiss.IVFSearchParameters(nprobe=4))
+        np.testing.assert_array_equal(Dref4, Dnew4)
+        np.testing.assert_array_equal(Iref4, Inew4)
+
+
+class TestRangeSearchImplem12(unittest.TestCase):
+    IMPLEM = 12
+
+    def do_test(self, metric=faiss.METRIC_L2):
+        ds = datasets.SyntheticDataset(32, 750, 200, 100)
+
+        index = faiss.index_factory(ds.d, "IVF32,PQ16x4np", metric)
+        index.train(ds.get_train())
+        index.add(ds.get_database())
+        index.nprobe = 4
+
+        # find a reasonable radius
+        D, I = index.search(ds.get_queries(), 10)
+        radius = np.median(D[:, -1])
+        lims1, D1, I1 = index.range_search(ds.get_queries(), radius)
+
+        index2 = faiss.IndexIVFPQFastScan(index)
+        index2.implem = self.IMPLEM
+        lims2, D2, I2 = index2.range_search(ds.get_queries(), radius)
+
+        nmiss = 0
+        nextra = 0
+
+        for i in range(ds.nq):
+            ref = set(I1[lims1[i]: lims1[i + 1]])
+            new = set(I2[lims2[i]: lims2[i + 1]])
+            nmiss += len(ref - new)
+            nextra += len(new - ref)
+
+        # need some tolerance because the look-up tables are quantized
+        self.assertLess(nmiss, 10)
+        self.assertLess(nextra, 10)
+
+    def test_L2(self):
+        self.do_test()
+
+    def test_IP(self):
+        self.do_test(metric=faiss.METRIC_INNER_PRODUCT)
+
+
+class TestRangeSearchImplem10(TestRangeSearchImplem12):
+    IMPLEM = 10  # re-run the inherited range-search tests with implem 10
+
+
+class TestRangeSearchImplem110(TestRangeSearchImplem12):
+    IMPLEM = 110  # re-run the inherited range-search tests with implem 110
diff --git a/tests/test_fastscan_perf.cpp b/tests/test_fastscan_perf.cpp
new file mode 100644
index 0000000000..f7d114d738
--- /dev/null
+++ b/tests/test_fastscan_perf.cpp
@@ -0,0 +1,67 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <gtest/gtest.h>
+
+#include <chrono>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <random>
+#include <vector>
+
+#include <omp.h>
+
+#include <faiss/IndexFlat.h>
+#include <faiss/IndexIVFPQFastScan.h>
+#include <faiss/impl/AuxIndexStructures.h>
+
+TEST(TestFastScan, knnVSrange) {
+    using Clock = std::chrono::high_resolution_clock;
+
+    // small vectors, small database and few IVF centroids
+    const int d = 64;
+    const size_t nb = 4000;
+    const size_t nlist = 4;
+
+    // more than 2 threads to surface problems related to multi-threading
+    omp_set_num_threads(8);
+
+    // random database, also used as queries
+    std::mt19937 rng;
+    std::uniform_real_distribution<> distrib;
+    std::vector<float> database(nb * d);
+    for (float& value : database) {
+        value = distrib(rng);
+    }
+
+    // build the index; nprobe == nlist, so every inverted list is probed
+    faiss::IndexFlatL2 coarse_quantizer(d);
+    faiss::IndexIVFPQFastScan index(
+            &coarse_quantizer, d, nlist, d / 2, 4, faiss::METRIC_L2, 32);
+    index.pq.cp.niter = 10; // speed up train
+    index.nprobe = nlist;
+    index.train(nb, database.data());
+    index.add(nb, database.data());
+
+    // time a 1-nearest-neighbor search over all database vectors
+    std::vector<float> distances(nb);
+    std::vector<faiss::idx_t> labels(nb);
+    const auto knn_start = Clock::now();
+    index.search(nb, database.data(), 1, distances.data(), labels.data());
+    const auto knn_time = Clock::now() - knn_start;
+
+    // time a range search with radius 1.0 over the same queries
+    faiss::RangeSearchResult rsr(nb);
+    const auto range_start = Clock::now();
+    index.range_search(nb, database.data(), 1.0, &rsr);
+    const auto range_time = Clock::now() - range_start;
+
+    // knn and range search should take similar time (within a factor of 4)
+    ASSERT_LE(range_time, knn_time * 4);
+    ASSERT_LE(knn_time, range_time * 4);
+}
diff --git a/tests/test_graph_based.py b/tests/test_graph_based.py
new file mode 100644
index 0000000000..d5797186da
--- /dev/null
+++ b/tests/test_graph_based.py
@@ -0,0 +1,480 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+""" a few tests for graph-based indices (HNSW, NSG and NNDescent)"""
+
+import numpy as np
+import unittest
+import faiss
+import tempfile
+import os
+
+from common_faiss_tests import get_dataset_2
+
+
+class TestHNSW(unittest.TestCase):
+
+    def __init__(self, *args, **kwargs):
+        unittest.TestCase.__init__(self, *args, **kwargs)
+        d = 32
+        nt = 0
+        nb = 1500
+        nq = 500
+
+        (_, self.xb, self.xq) = get_dataset_2(d, nt, nb, nq)
+        index = faiss.IndexFlatL2(d)
+        index.add(self.xb)
+        Dref, Iref = index.search(self.xq, 1)
+        self.Iref = Iref
+
+    def test_hnsw(self):
+        d = self.xq.shape[1]
+
+        index = faiss.IndexHNSWFlat(d, 16)
+        index.add(self.xb)
+        Dhnsw, Ihnsw = index.search(self.xq, 1)
+
+        self.assertGreaterEqual((self.Iref == Ihnsw).sum(), 460)
+
+        self.io_and_retest(index, Dhnsw, Ihnsw)
+
+    def test_range_search(self):
+        index_flat = faiss.IndexFlat(self.xb.shape[1])
+        index_flat.add(self.xb)
+        D, _ = index_flat.search(self.xq, 10)
+        radius = np.median(D[:, -1])
+        lims_ref, Dref, Iref = index_flat.range_search(self.xq, radius)
+
+        index = faiss.IndexHNSWFlat(self.xb.shape[1], 16)
+        index.add(self.xb)
+        lims, D, I = index.range_search(self.xq, radius)
+
+        nmiss = 0
+        # check if returned resutls are a subset of the reference results
+        for i in range(len(self.xq)):
+            ref = Iref[lims_ref[i]: lims_ref[i + 1]]
+            new = I[lims[i]: lims[i + 1]]
+            self.assertLessEqual(set(new), set(ref))
+            nmiss += len(ref) - len(new)
+        # currenly we miss 405 / 6019 neighbors
+        self.assertLessEqual(nmiss, lims_ref[-1] * 0.1)
+
+    def test_hnsw_unbounded_queue(self):
+        d = self.xq.shape[1]
+
+        index = faiss.IndexHNSWFlat(d, 16)
+        index.add(self.xb)
+        index.search_bounded_queue = False
+        Dhnsw, Ihnsw = index.search(self.xq, 1)
+
+        self.assertGreaterEqual((self.Iref == Ihnsw).sum(), 460)
+
+        self.io_and_retest(index, Dhnsw, Ihnsw)
+
+    def io_and_retest(self, index, Dhnsw, Ihnsw):
+        index2 = faiss.deserialize_index(faiss.serialize_index(index))
+        Dhnsw2, Ihnsw2 = index2.search(self.xq, 1)
+
+        self.assertTrue(np.all(Dhnsw2 == Dhnsw))
+        self.assertTrue(np.all(Ihnsw2 == Ihnsw))
+
+        # also test clone
+        index3 = faiss.clone_index(index)
+        Dhnsw3, Ihnsw3 = index3.search(self.xq, 1)
+
+        self.assertTrue(np.all(Dhnsw3 == Dhnsw))
+        self.assertTrue(np.all(Ihnsw3 == Ihnsw))
+
+    def test_hnsw_2level(self):
+        d = self.xq.shape[1]
+
+        quant = faiss.IndexFlatL2(d)
+
+        index = faiss.IndexHNSW2Level(quant, 256, 8, 8)
+        index.train(self.xb)
+        index.add(self.xb)
+        Dhnsw, Ihnsw = index.search(self.xq, 1)
+
+        self.assertGreaterEqual((self.Iref == Ihnsw).sum(), 307)
+
+        self.io_and_retest(index, Dhnsw, Ihnsw)
+
+    def test_add_0_vecs(self):
+        index = faiss.IndexHNSWFlat(10, 16)
+        zero_vecs = np.zeros((0, 10), dtype='float32')
+        # infinite loop
+        index.add(zero_vecs)
+
+    def test_hnsw_IP(self):
+        d = self.xq.shape[1]
+
+        index_IP = faiss.IndexFlatIP(d)
+        index_IP.add(self.xb)
+        Dref, Iref = index_IP.search(self.xq, 1)
+
+        index = faiss.IndexHNSWFlat(d, 16, faiss.METRIC_INNER_PRODUCT)
+        index.add(self.xb)
+        Dhnsw, Ihnsw = index.search(self.xq, 1)
+
+        self.assertGreaterEqual((Iref == Ihnsw).sum(), 470)
+
+        mask = Iref[:, 0] == Ihnsw[:, 0]
+        assert np.allclose(Dref[mask, 0], Dhnsw[mask, 0])
+
+    def test_ndis_stats(self):
+        d = self.xq.shape[1]
+
+        index = faiss.IndexHNSWFlat(d, 16)
+        index.add(self.xb)
+        stats = faiss.cvar.hnsw_stats
+        stats.reset()
+        Dhnsw, Ihnsw = index.search(self.xq, 1)
+        self.assertGreater(stats.ndis, len(self.xq) * index.hnsw.efSearch)
+
+    def test_io_no_storage(self):
+        d = self.xq.shape[1]
+        index = faiss.IndexHNSWFlat(d, 16)
+        index.add(self.xb)
+
+        Dref, Iref = index.search(self.xq, 5)
+
+        # test writing without storage
+        index2 = faiss.deserialize_index(
+            faiss.serialize_index(index, faiss.IO_FLAG_SKIP_STORAGE)
+        )
+        self.assertEqual(index2.storage, None)
+        self.assertRaises(
+            RuntimeError,
+            index2.search, self.xb, 1)
+
+        # make sure we can store an index with empty storage
+        index4 = faiss.deserialize_index(
+            faiss.serialize_index(index2))
+
+        # add storage afterwards
+        index.storage = faiss.clone_index(index.storage)
+        index.own_fields = True
+
+        Dnew, Inew = index.search(self.xq, 5)
+        np.testing.assert_array_equal(Dnew, Dref)
+        np.testing.assert_array_equal(Inew, Iref)
+
+        if False:
+            # test reading without storage
+            # not implemented because it is hard to skip over an index
+            index3 = faiss.deserialize_index(
+                faiss.serialize_index(index), faiss.IO_FLAG_SKIP_STORAGE
+            )
+            self.assertEquals(index3.storage, None)
+
+    def test_abs_inner_product(self):
+        """Test HNSW with abs inner product (not a real distance, so dubious that triangular inequality works)"""
+        d = self.xq.shape[1]
+        xb = self.xb - self.xb.mean(axis=0)  # need to be centered to give interesting directions
+        xq = self.xq - self.xq.mean(axis=0)
+        Dref, Iref = faiss.knn(xq, xb, 10, faiss.METRIC_ABS_INNER_PRODUCT)
+        
+        index = faiss.IndexHNSWFlat(d, 32, faiss.METRIC_ABS_INNER_PRODUCT)
+        index.add(xb)
+        Dnew, Inew = index.search(xq, 10)
+
+        inter = faiss.eval_intersection(Iref, Inew)
+        # 4769 vs. 500*10
+        self.assertGreater(inter, Iref.size * 0.9)
+ 
+
+class TestNSG(unittest.TestCase):
+
+    def __init__(self, *args, **kwargs):
+        unittest.TestCase.__init__(self, *args, **kwargs)
+        d = 32
+        nt = 0
+        nb = 1500
+        nq = 500
+        self.GK = 32
+
+        _, self.xb, self.xq = get_dataset_2(d, nt, nb, nq)
+
+    def make_knn_graph(self, metric):
+        n = self.xb.shape[0]
+        d = self.xb.shape[1]
+        index = faiss.IndexFlat(d, metric)
+        index.add(self.xb)
+        _, I = index.search(self.xb, self.GK + 1)
+        knn_graph = np.zeros((n, self.GK), dtype=np.int64)
+
+        # For the inner product distance, the distance between a vector and
+        # itself may not be the smallest, so it is not guaranteed that I[:, 0]
+        # is the query itself.
+        for i in range(n):
+            cnt = 0
+            for j in range(self.GK + 1):
+                if I[i, j] != i:
+                    knn_graph[i, cnt] = I[i, j]
+                    cnt += 1
+                if cnt == self.GK:
+                    break
+        return knn_graph
+
+    def subtest_io_and_clone(self, index, Dnsg, Insg):
+        fd, tmpfile = tempfile.mkstemp()
+        os.close(fd)
+        try:
+            faiss.write_index(index, tmpfile)
+            index2 = faiss.read_index(tmpfile)
+        finally:
+            if os.path.exists(tmpfile):
+                os.unlink(tmpfile)
+
+        Dnsg2, Insg2 = index2.search(self.xq, 1)
+        np.testing.assert_array_equal(Dnsg2, Dnsg)
+        np.testing.assert_array_equal(Insg2, Insg)
+
+        # also test clone
+        index3 = faiss.clone_index(index)
+        Dnsg3, Insg3 = index3.search(self.xq, 1)
+        np.testing.assert_array_equal(Dnsg3, Dnsg)
+        np.testing.assert_array_equal(Insg3, Insg)
+
+    def subtest_connectivity(self, index, nb):
+        vt = faiss.VisitedTable(nb)
+        count = index.nsg.dfs(vt, index.nsg.enterpoint, 0)
+        self.assertEqual(count, nb)
+
+    def subtest_add(self, build_type, thresh, metric=faiss.METRIC_L2):
+        d = self.xq.shape[1]
+        metrics = {faiss.METRIC_L2: 'L2',
+                   faiss.METRIC_INNER_PRODUCT: 'IP'}
+
+        flat_index = faiss.IndexFlat(d, metric)
+        flat_index.add(self.xb)
+        Dref, Iref = flat_index.search(self.xq, 1)
+
+        index = faiss.IndexNSGFlat(d, 16, metric)
+        index.verbose = True
+        index.build_type = build_type
+        index.GK = self.GK
+        index.add(self.xb)
+        Dnsg, Insg = index.search(self.xq, 1)
+
+        recalls = (Iref == Insg).sum()
+        self.assertGreaterEqual(recalls, thresh)
+        self.subtest_connectivity(index, self.xb.shape[0])
+        self.subtest_io_and_clone(index, Dnsg, Insg)
+
+    def subtest_build(self, knn_graph, thresh, metric=faiss.METRIC_L2):
+        d = self.xq.shape[1]
+        metrics = {faiss.METRIC_L2: 'L2',
+                   faiss.METRIC_INNER_PRODUCT: 'IP'}
+
+        flat_index = faiss.IndexFlat(d, metric)
+        flat_index.add(self.xb)
+        Dref, Iref = flat_index.search(self.xq, 1)
+
+        index = faiss.IndexNSGFlat(d, 16, metric)
+        index.verbose = True
+
+        index.build(self.xb, knn_graph)
+        Dnsg, Insg = index.search(self.xq, 1)
+
+        recalls = (Iref == Insg).sum()
+        self.assertGreaterEqual(recalls, thresh)
+        self.subtest_connectivity(index, self.xb.shape[0])
+
+    def test_add_bruteforce_L2(self):
+        self.subtest_add(0, 475, faiss.METRIC_L2)
+
+    def test_add_nndescent_L2(self):
+        self.subtest_add(1, 475, faiss.METRIC_L2)
+
+    def test_add_bruteforce_IP(self):
+        self.subtest_add(0, 480, faiss.METRIC_INNER_PRODUCT)
+
+    def test_add_nndescent_IP(self):
+        self.subtest_add(1, 480, faiss.METRIC_INNER_PRODUCT)
+
+    def test_build_L2(self):
+        knn_graph = self.make_knn_graph(faiss.METRIC_L2)
+        self.subtest_build(knn_graph, 475, faiss.METRIC_L2)
+
+    def test_build_IP(self):
+        knn_graph = self.make_knn_graph(faiss.METRIC_INNER_PRODUCT)
+        self.subtest_build(knn_graph, 480, faiss.METRIC_INNER_PRODUCT)
+
+    def test_build_invalid_knng(self):
+        """Make some invalid entries in the input knn graph.
+
+        It would cause a warning but IndexNSG should be able
+        to handel this.
+        """
+        knn_graph = self.make_knn_graph(faiss.METRIC_L2)
+        knn_graph[:100, 5] = -111
+        self.subtest_build(knn_graph, 475, faiss.METRIC_L2)
+
+        knn_graph = self.make_knn_graph(faiss.METRIC_INNER_PRODUCT)
+        knn_graph[:100, 5] = -111
+        self.subtest_build(knn_graph, 480, faiss.METRIC_INNER_PRODUCT)
+
+    def test_reset(self):
+        """test IndexNSG.reset()"""
+        d = self.xq.shape[1]
+        metrics = {faiss.METRIC_L2: 'L2',
+                   faiss.METRIC_INNER_PRODUCT: 'IP'}
+
+        metric = faiss.METRIC_L2
+        flat_index = faiss.IndexFlat(d, metric)
+        flat_index.add(self.xb)
+        Dref, Iref = flat_index.search(self.xq, 1)
+
+        index = faiss.IndexNSGFlat(d, 16)
+        index.verbose = True
+        index.GK = 32
+
+        index.add(self.xb)
+        Dnsg, Insg = index.search(self.xq, 1)
+        recalls = (Iref == Insg).sum()
+        self.assertGreaterEqual(recalls, 475)
+        self.subtest_connectivity(index, self.xb.shape[0])
+
+        index.reset()
+        index.add(self.xb)
+        Dnsg, Insg = index.search(self.xq, 1)
+        recalls = (Iref == Insg).sum()
+        self.assertGreaterEqual(recalls, 475)
+        self.subtest_connectivity(index, self.xb.shape[0])
+
+    def test_order(self):
+        """make sure that output results are sorted"""
+        d = self.xq.shape[1]
+        index = faiss.IndexNSGFlat(d, 32)
+
+        index.train(self.xb)
+        index.add(self.xb)
+
+        k = 10
+        nq = self.xq.shape[0]
+        D, _ = index.search(self.xq, k)
+
+        indices = np.argsort(D, axis=1)
+        gt = np.arange(0, k)[np.newaxis, :]  # [1, k]
+        gt = np.repeat(gt, nq, axis=0)  # [nq, k]
+        np.testing.assert_array_equal(indices, gt)
+
+    def test_nsg_pq(self):
+        """Test IndexNSGPQ"""
+        d = self.xq.shape[1]
+        R, pq_M = 32, 4
+        index = faiss.index_factory(d, f"NSG{R}_PQ{pq_M}np")
+        assert isinstance(index, faiss.IndexNSGPQ)
+        idxpq = faiss.downcast_index(index.storage)
+        assert index.nsg.R == R and idxpq.pq.M == pq_M
+
+        flat_index = faiss.IndexFlat(d)
+        flat_index.add(self.xb)
+        Dref, Iref = flat_index.search(self.xq, k=1)
+
+        index.GK = 32
+        index.train(self.xb)
+        index.add(self.xb)
+        D, I = index.search(self.xq, k=1)
+
+        # test accuracy
+        recalls = (Iref == I).sum()
+        self.assertGreaterEqual(recalls, 190)  # 193
+
+        # test I/O
+        self.subtest_io_and_clone(index, D, I)
+
+    def test_nsg_sq(self):
+        """Test IndexNSGSQ"""
+        d = self.xq.shape[1]
+        R = 32
+        index = faiss.index_factory(d, f"NSG{R}_SQ8")
+        assert isinstance(index, faiss.IndexNSGSQ)
+        idxsq = faiss.downcast_index(index.storage)
+        assert index.nsg.R == R
+        assert idxsq.sq.qtype == faiss.ScalarQuantizer.QT_8bit
+
+        flat_index = faiss.IndexFlat(d)
+        flat_index.add(self.xb)
+        Dref, Iref = flat_index.search(self.xq, k=1)
+
+        index.train(self.xb)
+        index.add(self.xb)
+        D, I = index.search(self.xq, k=1)
+
+        # test accuracy
+        recalls = (Iref == I).sum()
+        self.assertGreaterEqual(recalls, 405)  # 411
+
+        # test I/O
+        self.subtest_io_and_clone(index, D, I)
+
+
+class TestNNDescent(unittest.TestCase):
+
+    def __init__(self, *args, **kwargs):
+        unittest.TestCase.__init__(self, *args, **kwargs)
+        d = 32
+        nt = 0
+        nb = 1500
+        nq = 500
+        self.GK = 32
+
+        _, self.xb, self.xq = get_dataset_2(d, nt, nb, nq)
+
+    def test_nndescentflat(self):
+        d = self.xq.shape[1]
+        index = faiss.IndexNNDescentFlat(d, 32)
+        index.nndescent.search_L = 8
+
+        flat_index = faiss.IndexFlat(d)
+        flat_index.add(self.xb)
+        Dref, Iref = flat_index.search(self.xq, k=1)
+
+        index.train(self.xb)
+        index.add(self.xb)
+        D, I = index.search(self.xq, k=1)
+
+        # test accuracy
+        recalls = (Iref == I).sum()
+        self.assertGreaterEqual(recalls, 450)  # 462
+
+        # do some IO tests
+        fd, tmpfile = tempfile.mkstemp()
+        os.close(fd)
+        try:
+            faiss.write_index(index, tmpfile)
+            index2 = faiss.read_index(tmpfile)
+        finally:
+            if os.path.exists(tmpfile):
+                os.unlink(tmpfile)
+
+        D2, I2 = index2.search(self.xq, 1)
+        np.testing.assert_array_equal(D2, D)
+        np.testing.assert_array_equal(I2, I)
+
+        # also test clone
+        index3 = faiss.clone_index(index)
+        D3, I3 = index3.search(self.xq, 1)
+        np.testing.assert_array_equal(D3, D)
+        np.testing.assert_array_equal(I3, I)
+
+    def test_order(self):
+        """make sure that output results are sorted"""
+        d = self.xq.shape[1]
+        index = faiss.IndexNNDescentFlat(d, 32)
+
+        index.train(self.xb)
+        index.add(self.xb)
+
+        k = 10
+        nq = self.xq.shape[0]
+        D, _ = index.search(self.xq, k)
+
+        indices = np.argsort(D, axis=1)
+        gt = np.arange(0, k)[np.newaxis, :]  # [1, k]
+        gt = np.repeat(gt, nq, axis=0)  # [nq, k]
+        np.testing.assert_array_equal(indices, gt)
diff --git a/tests/test_index.py b/tests/test_index.py
index 0e828e08c1..43db906e47 100644
--- a/tests/test_index.py
+++ b/tests/test_index.py
@@ -327,7 +327,7 @@ def test_4variants_ivf(self):
         D, I = index.search(xq, 10)
         nok['flat'] = (I[:, 0] == I_ref[:, 0]).sum()
 
-        for qname in "QT_4bit QT_4bit_uniform QT_8bit QT_8bit_uniform QT_fp16".split():
+        for qname in "QT_4bit QT_4bit_uniform QT_8bit QT_8bit_uniform QT_fp16 QT_bf16".split():
             qtype = getattr(faiss.ScalarQuantizer, qname)
             index = faiss.IndexIVFScalarQuantizer(quantizer, d, ncent,
                                                   qtype, faiss.METRIC_L2)
@@ -338,7 +338,6 @@ def test_4variants_ivf(self):
             D, I = index.search(xq, 10)
 
             nok[qname] = (I[:, 0] == I_ref[:, 0]).sum()
-        print(nok, nq)
 
         self.assertGreaterEqual(nok['flat'], nq * 0.6)
         # The tests below are a bit fragile, it happens that the
@@ -350,6 +349,7 @@ def test_4variants_ivf(self):
         self.assertGreaterEqual(nok['QT_8bit'], nok['QT_8bit_uniform'])
         self.assertGreaterEqual(nok['QT_4bit'], nok['QT_4bit_uniform'])
         self.assertGreaterEqual(nok['QT_fp16'], nok['QT_8bit'])
+        self.assertGreaterEqual(nok['QT_bf16'], nok['QT_8bit'])
 
     def test_4variants(self):
         d = 32
@@ -365,7 +365,7 @@ def test_4variants(self):
 
         nok = {}
 
-        for qname in "QT_4bit QT_4bit_uniform QT_8bit QT_8bit_uniform QT_fp16".split():
+        for qname in "QT_4bit QT_4bit_uniform QT_8bit QT_8bit_uniform QT_fp16 QT_bf16".split():
             qtype = getattr(faiss.ScalarQuantizer, qname)
             index = faiss.IndexScalarQuantizer(d, qtype, faiss.METRIC_L2)
             index.train(xt)
@@ -373,13 +373,12 @@ def test_4variants(self):
             D, I = index.search(xq, 10)
             nok[qname] = (I[:, 0] == I_ref[:, 0]).sum()
 
-        print(nok, nq)
-
         self.assertGreaterEqual(nok['QT_8bit'], nq * 0.9)
         self.assertGreaterEqual(nok['QT_8bit'], nok['QT_4bit'])
         self.assertGreaterEqual(nok['QT_8bit'], nok['QT_8bit_uniform'])
         self.assertGreaterEqual(nok['QT_4bit'], nok['QT_4bit_uniform'])
         self.assertGreaterEqual(nok['QT_fp16'], nok['QT_8bit'])
+        self.assertGreaterEqual(nok['QT_bf16'], nq * 0.9)
 
 
 class TestRangeSearch(unittest.TestCase):
@@ -442,7 +441,6 @@ def norm1(x):
 
         recons_err = np.mean(norm1(R_flat - xb[I_flat]))
 
-        print('Reconstruction error = %.3f' % recons_err)
         if eps is not None:
             self.assertLessEqual(recons_err, eps)
 
@@ -526,406 +524,6 @@ def test_IndexTransform(self):
         self.run_search_and_reconstruct(index, xb, xq)
 
 
-class TestHNSW(unittest.TestCase):
-
-    def __init__(self, *args, **kwargs):
-        unittest.TestCase.__init__(self, *args, **kwargs)
-        d = 32
-        nt = 0
-        nb = 1500
-        nq = 500
-
-        (_, self.xb, self.xq) = get_dataset_2(d, nt, nb, nq)
-        index = faiss.IndexFlatL2(d)
-        index.add(self.xb)
-        Dref, Iref = index.search(self.xq, 1)
-        self.Iref = Iref
-
-    def test_hnsw(self):
-        d = self.xq.shape[1]
-
-        index = faiss.IndexHNSWFlat(d, 16)
-        index.add(self.xb)
-        Dhnsw, Ihnsw = index.search(self.xq, 1)
-
-        self.assertGreaterEqual((self.Iref == Ihnsw).sum(), 460)
-
-        self.io_and_retest(index, Dhnsw, Ihnsw)
-
-    def test_hnsw_unbounded_queue(self):
-        d = self.xq.shape[1]
-
-        index = faiss.IndexHNSWFlat(d, 16)
-        index.add(self.xb)
-        index.search_bounded_queue = False
-        Dhnsw, Ihnsw = index.search(self.xq, 1)
-
-        self.assertGreaterEqual((self.Iref == Ihnsw).sum(), 460)
-
-        self.io_and_retest(index, Dhnsw, Ihnsw)
-
-    def io_and_retest(self, index, Dhnsw, Ihnsw):
-        fd, tmpfile = tempfile.mkstemp()
-        os.close(fd)
-        try:
-            faiss.write_index(index, tmpfile)
-            index2 = faiss.read_index(tmpfile)
-        finally:
-            if os.path.exists(tmpfile):
-                os.unlink(tmpfile)
-
-        Dhnsw2, Ihnsw2 = index2.search(self.xq, 1)
-
-        self.assertTrue(np.all(Dhnsw2 == Dhnsw))
-        self.assertTrue(np.all(Ihnsw2 == Ihnsw))
-
-        # also test clone
-        index3 = faiss.clone_index(index)
-        Dhnsw3, Ihnsw3 = index3.search(self.xq, 1)
-
-        self.assertTrue(np.all(Dhnsw3 == Dhnsw))
-        self.assertTrue(np.all(Ihnsw3 == Ihnsw))
-
-
-    def test_hnsw_2level(self):
-        d = self.xq.shape[1]
-
-        quant = faiss.IndexFlatL2(d)
-
-        index = faiss.IndexHNSW2Level(quant, 256, 8, 8)
-        index.train(self.xb)
-        index.add(self.xb)
-        Dhnsw, Ihnsw = index.search(self.xq, 1)
-
-        self.assertGreaterEqual((self.Iref == Ihnsw).sum(), 307)
-
-        self.io_and_retest(index, Dhnsw, Ihnsw)
-
-    def test_add_0_vecs(self):
-        index = faiss.IndexHNSWFlat(10, 16)
-        zero_vecs = np.zeros((0, 10), dtype='float32')
-        # infinite loop
-        index.add(zero_vecs)
-
-    def test_hnsw_IP(self):
-        d = self.xq.shape[1]
-
-        index_IP = faiss.IndexFlatIP(d)
-        index_IP.add(self.xb)
-        Dref, Iref = index_IP.search(self.xq, 1)
-
-        index = faiss.IndexHNSWFlat(d, 16, faiss.METRIC_INNER_PRODUCT)
-        index.add(self.xb)
-        Dhnsw, Ihnsw = index.search(self.xq, 1)
-
-        print('nb equal: ', (Iref == Ihnsw).sum())
-
-        self.assertGreaterEqual((Iref == Ihnsw).sum(), 470)
-
-        mask = Iref[:, 0] == Ihnsw[:, 0]
-        assert np.allclose(Dref[mask, 0], Dhnsw[mask, 0])
-
-
-class TestNSG(unittest.TestCase):
-
-    def __init__(self, *args, **kwargs):
-        unittest.TestCase.__init__(self, *args, **kwargs)
-        d = 32
-        nt = 0
-        nb = 1500
-        nq = 500
-        self.GK = 32
-
-        _, self.xb, self.xq = get_dataset_2(d, nt, nb, nq)
-
-    def make_knn_graph(self, metric):
-        n = self.xb.shape[0]
-        d = self.xb.shape[1]
-        index = faiss.IndexFlat(d, metric)
-        index.add(self.xb)
-        _, I = index.search(self.xb, self.GK + 1)
-        knn_graph = np.zeros((n, self.GK), dtype=np.int64)
-
-        # For the inner product distance, the distance between a vector and itself
-        # may not be the smallest, so it is not guaranteed that I[:, 0] is the query itself.
-        for i in range(n):
-            cnt = 0
-            for j in range(self.GK + 1):
-                if I[i, j] != i:
-                    knn_graph[i, cnt] = I[i, j]
-                    cnt += 1
-                if cnt == self.GK:
-                    break
-        return knn_graph
-
-    def subtest_io_and_clone(self, index, Dnsg, Insg):
-        fd, tmpfile = tempfile.mkstemp()
-        os.close(fd)
-        try:
-            faiss.write_index(index, tmpfile)
-            index2 = faiss.read_index(tmpfile)
-        finally:
-            if os.path.exists(tmpfile):
-                os.unlink(tmpfile)
-
-        Dnsg2, Insg2 = index2.search(self.xq, 1)
-        np.testing.assert_array_equal(Dnsg2, Dnsg)
-        np.testing.assert_array_equal(Insg2, Insg)
-
-        # also test clone
-        index3 = faiss.clone_index(index)
-        Dnsg3, Insg3 = index3.search(self.xq, 1)
-        np.testing.assert_array_equal(Dnsg3, Dnsg)
-        np.testing.assert_array_equal(Insg3, Insg)
-
-    def subtest_connectivity(self, index, nb):
-        vt = faiss.VisitedTable(nb)
-        count = index.nsg.dfs(vt, index.nsg.enterpoint, 0)
-        self.assertEqual(count, nb)
-
-    def subtest_add(self, build_type, thresh, metric=faiss.METRIC_L2):
-        d = self.xq.shape[1]
-        metrics = {faiss.METRIC_L2: 'L2',
-                   faiss.METRIC_INNER_PRODUCT: 'IP'}
-
-        flat_index = faiss.IndexFlat(d, metric)
-        flat_index.add(self.xb)
-        Dref, Iref = flat_index.search(self.xq, 1)
-
-        index = faiss.IndexNSGFlat(d, 16, metric)
-        index.verbose = True
-        index.build_type = build_type
-        index.GK = self.GK
-        index.add(self.xb)
-        Dnsg, Insg = index.search(self.xq, 1)
-
-        recalls = (Iref == Insg).sum()
-        print('metric: {}, nb equal: {}'.format(metrics[metric], recalls))
-        self.assertGreaterEqual(recalls, thresh)
-        self.subtest_connectivity(index, self.xb.shape[0])
-        self.subtest_io_and_clone(index, Dnsg, Insg)
-
-    def subtest_build(self, knn_graph, thresh, metric=faiss.METRIC_L2):
-        d = self.xq.shape[1]
-        metrics = {faiss.METRIC_L2: 'L2',
-                   faiss.METRIC_INNER_PRODUCT: 'IP'}
-
-        flat_index = faiss.IndexFlat(d, metric)
-        flat_index.add(self.xb)
-        Dref, Iref = flat_index.search(self.xq, 1)
-
-        index = faiss.IndexNSGFlat(d, 16, metric)
-        index.verbose = True
-
-        index.build(self.xb, knn_graph)
-        Dnsg, Insg = index.search(self.xq, 1)
-
-        recalls = (Iref == Insg).sum()
-        print('metric: {}, nb equal: {}'.format(metrics[metric], recalls))
-        self.assertGreaterEqual(recalls, thresh)
-        self.subtest_connectivity(index, self.xb.shape[0])
-
-    def test_add_bruteforce_L2(self):
-        self.subtest_add(0, 475, faiss.METRIC_L2)
-
-    def test_add_nndescent_L2(self):
-        self.subtest_add(1, 475, faiss.METRIC_L2)
-
-    def test_add_bruteforce_IP(self):
-        self.subtest_add(0, 480, faiss.METRIC_INNER_PRODUCT)
-
-    def test_add_nndescent_IP(self):
-        self.subtest_add(1, 480, faiss.METRIC_INNER_PRODUCT)
-
-    def test_build_L2(self):
-        knn_graph = self.make_knn_graph(faiss.METRIC_L2)
-        self.subtest_build(knn_graph, 475, faiss.METRIC_L2)
-
-    def test_build_IP(self):
-        knn_graph = self.make_knn_graph(faiss.METRIC_INNER_PRODUCT)
-        self.subtest_build(knn_graph, 480, faiss.METRIC_INNER_PRODUCT)
-
-    def test_build_invalid_knng(self):
-        """Make some invalid entries in the input knn graph.
-
-        It would cause a warning but IndexNSG should be able
-        to handel this.
-        """
-        knn_graph = self.make_knn_graph(faiss.METRIC_L2)
-        knn_graph[:100, 5] = -111
-        self.subtest_build(knn_graph, 475, faiss.METRIC_L2)
-
-        knn_graph = self.make_knn_graph(faiss.METRIC_INNER_PRODUCT)
-        knn_graph[:100, 5] = -111
-        self.subtest_build(knn_graph, 480, faiss.METRIC_INNER_PRODUCT)
-
-    def test_reset(self):
-        """test IndexNSG.reset()"""
-        d = self.xq.shape[1]
-        metrics = {faiss.METRIC_L2: 'L2',
-                   faiss.METRIC_INNER_PRODUCT: 'IP'}
-
-        metric = faiss.METRIC_L2
-        flat_index = faiss.IndexFlat(d, metric)
-        flat_index.add(self.xb)
-        Dref, Iref = flat_index.search(self.xq, 1)
-
-        index = faiss.IndexNSGFlat(d, 16)
-        index.verbose = True
-        index.GK = 32
-
-        index.add(self.xb)
-        Dnsg, Insg = index.search(self.xq, 1)
-        recalls = (Iref == Insg).sum()
-        print('metric: {}, nb equal: {}'.format(metrics[metric], recalls))
-        self.assertGreaterEqual(recalls, 475)
-        self.subtest_connectivity(index, self.xb.shape[0])
-
-        index.reset()
-        index.add(self.xb)
-        Dnsg, Insg = index.search(self.xq, 1)
-        recalls = (Iref == Insg).sum()
-        print('metric: {}, nb equal: {}'.format(metrics[metric], recalls))
-        self.assertGreaterEqual(recalls, 475)
-        self.subtest_connectivity(index, self.xb.shape[0])
-
-    def test_order(self):
-        """make sure that output results are sorted"""
-        d = self.xq.shape[1]
-        index = faiss.IndexNSGFlat(d, 32)
-
-        index.train(self.xb)
-        index.add(self.xb)
-
-        k = 10
-        nq = self.xq.shape[0]
-        D, _ = index.search(self.xq, k)
-
-        indices = np.argsort(D, axis=1)
-        gt = np.arange(0, k)[np.newaxis, :]  # [1, k]
-        gt = np.repeat(gt, nq, axis=0)  # [nq, k]
-        np.testing.assert_array_equal(indices, gt)
-
-    def test_nsg_pq(self):
-        """Test IndexNSGPQ"""
-        d = self.xq.shape[1]
-        R, pq_M = 32, 4
-        index = faiss.index_factory(d, f"NSG{R}_PQ{pq_M}np")
-        assert isinstance(index, faiss.IndexNSGPQ)
-        idxpq = faiss.downcast_index(index.storage)
-        assert index.nsg.R == R and idxpq.pq.M == pq_M
-
-        flat_index = faiss.IndexFlat(d)
-        flat_index.add(self.xb)
-        Dref, Iref = flat_index.search(self.xq, k=1)
-
-        index.GK = 32
-        index.train(self.xb)
-        index.add(self.xb)
-        D, I = index.search(self.xq, k=1)
-
-        # test accuracy
-        recalls = (Iref == I).sum()
-        print("IndexNSGPQ", recalls)
-        self.assertGreaterEqual(recalls, 190)  # 193
-
-        # test I/O
-        self.subtest_io_and_clone(index, D, I)
-
-    def test_nsg_sq(self):
-        """Test IndexNSGSQ"""
-        d = self.xq.shape[1]
-        R = 32
-        index = faiss.index_factory(d, f"NSG{R}_SQ8")
-        assert isinstance(index, faiss.IndexNSGSQ)
-        idxsq = faiss.downcast_index(index.storage)
-        assert index.nsg.R == R
-        assert idxsq.sq.qtype == faiss.ScalarQuantizer.QT_8bit
-
-        flat_index = faiss.IndexFlat(d)
-        flat_index.add(self.xb)
-        Dref, Iref = flat_index.search(self.xq, k=1)
-
-        index.train(self.xb)
-        index.add(self.xb)
-        D, I = index.search(self.xq, k=1)
-
-        # test accuracy
-        recalls = (Iref == I).sum()
-        print("IndexNSGSQ", recalls)
-        self.assertGreaterEqual(recalls, 405)  # 411
-
-        # test I/O
-        self.subtest_io_and_clone(index, D, I)
-
-
-class TestNNDescent(unittest.TestCase):
-
-    def __init__(self, *args, **kwargs):
-        unittest.TestCase.__init__(self, *args, **kwargs)
-        d = 32
-        nt = 0
-        nb = 1500
-        nq = 500
-        self.GK = 32
-
-        _, self.xb, self.xq = get_dataset_2(d, nt, nb, nq)
-
-    def test_nndescentflat(self):
-        d = self.xq.shape[1]
-        index = faiss.IndexNNDescentFlat(d, 32)
-        index.nndescent.search_L = 8
-
-        flat_index = faiss.IndexFlat(d)
-        flat_index.add(self.xb)
-        Dref, Iref = flat_index.search(self.xq, k=1)
-
-        index.train(self.xb)
-        index.add(self.xb)
-        D, I = index.search(self.xq, k=1)
-
-        # test accuracy
-        recalls = (Iref == I).sum()
-        print("IndexNNDescentFlat", recalls)
-        self.assertGreaterEqual(recalls, 450)  # 462
-
-        # do some IO tests
-        fd, tmpfile = tempfile.mkstemp()
-        os.close(fd)
-        try:
-            faiss.write_index(index, tmpfile)
-            index2 = faiss.read_index(tmpfile)
-        finally:
-            if os.path.exists(tmpfile):
-                os.unlink(tmpfile)
-
-        D2, I2 = index2.search(self.xq, 1)
-        np.testing.assert_array_equal(D2, D)
-        np.testing.assert_array_equal(I2, I)
-
-        # also test clone
-        index3 = faiss.clone_index(index)
-        D3, I3 = index3.search(self.xq, 1)
-        np.testing.assert_array_equal(D3, D)
-        np.testing.assert_array_equal(I3, I)
-
-    def test_order(self):
-        """make sure that output results are sorted"""
-        d = self.xq.shape[1]
-        index = faiss.IndexNNDescentFlat(d, 32)
-
-        index.train(self.xb)
-        index.add(self.xb)
-
-        k = 10
-        nq = self.xq.shape[0]
-        D, _ = index.search(self.xq, k)
-
-        indices = np.argsort(D, axis=1)
-        gt = np.arange(0, k)[np.newaxis, :]  # [1, k]
-        gt = np.repeat(gt, nq, axis=0)  # [nq, k]
-        np.testing.assert_array_equal(indices, gt)
-
 
 class TestDistancesPositive(unittest.TestCase):
 
@@ -1038,7 +636,6 @@ def test_reconstuct_after_add(self):
 
         # should not raise an exception
         index.reconstruct(5)
-        print(index.ntotal)
         index.reconstruct(150)
 
 
diff --git a/tests/test_index_accuracy.py b/tests/test_index_accuracy.py
index 44b4ca365f..8d8b4a28f6 100644
--- a/tests/test_index_accuracy.py
+++ b/tests/test_index_accuracy.py
@@ -56,7 +56,6 @@ def test_ivf_kmeans(self):
         Dref, Iref = ivfk.search(ev.xq, 100)
         ivfk.parallel_mode = 1
         Dnew, Inew = ivfk.search(ev.xq, 100)
-        print((Iref != Inew).sum(), Iref.size)
         assert (Iref != Inew).sum() < Iref.size / 5000.0
         assert np.all(Dref == Dnew)
 
@@ -136,8 +135,6 @@ def test_polysemous(self):
 
         res = ev.launch("Polysemous ht=%d" % index.polysemous_ht, index)
         e_polysemous = ev.evalres(res)
-        print(e_baseline, e_polysemous, index.polysemous_ht)
-        print(stats.n_hamming_pass, stats.ncode)
         # The randu dataset is difficult, so we are not too picky on
         # the results. Here we assert that we have < 10 % loss when
         # computing full PQ on fewer than 20% of the data.
@@ -248,7 +245,6 @@ def subtest(self, mt):
             index.nprobe = 4  # hopefully more robust than 1
             D, I = index.search(xq, 10)
             ninter = faiss.eval_intersection(I, gt_I)
-            print("(%d, %s): %d, " % (mt, repr(qname), ninter))
             assert abs(ninter - self.ref_results[(mt, qname)]) <= 10
 
             if qname == "6bit":
@@ -264,7 +260,6 @@ def subtest(self, mt):
                 radius = float(D[:, -1].max())
             else:
                 radius = float(D[:, -1].min())
-            # print("radius", radius)
 
             lims, D3, I3 = index.range_search(xq, radius)
             ntot = ndiff = 0
@@ -278,14 +273,11 @@ def subtest(self, mt):
                 Iref = set(I2[i, mask])
                 ndiff += len(Inew ^ Iref)
                 ntot += len(Iref)
-            # print("ndiff %d / %d" % (ndiff, ntot))
             assert ndiff < ntot * 0.01
 
             for pm in 1, 2:
-                # print("parallel_mode=%d" % pm)
                 index.parallel_mode = pm
                 lims4, D4, I4 = index.range_search(xq, radius)
-                # print("sizes", lims4[1:] - lims4[:-1])
                 for qno in range(len(lims) - 1):
                     Iref = I3[lims[qno]: lims[qno + 1]]
                     Inew = I4[lims4[qno]: lims4[qno + 1]]
@@ -485,7 +477,6 @@ def subtest(self, mt):
             D, I = index.search(xq, 10)
 
             ninter = faiss.eval_intersection(I, gt_I)
-            print("(%d, %s): %d, " % (mt, by_residual, ninter))
 
             assert abs(ninter - self.ref_results[mt, by_residual]) <= 3
 
@@ -499,10 +490,6 @@ def subtest(self, mt):
                 index.polysemous_ht = 20
                 D, I = index.search(xq, 10)
                 ninter = faiss.eval_intersection(I, gt_I)
-                print(
-                    "(%d, %s, %d): %d, "
-                    % (mt, by_residual, index.polysemous_ht, ninter)
-                )
 
                 # polysemous behaves bizarrely on ARM
                 assert (
@@ -516,7 +503,6 @@ def subtest(self, mt):
                 radius = float(D[:, -1].max())
             else:
                 radius = float(D[:, -1].min())
-            print("radius", radius)
 
             lims, D3, I3 = index.range_search(xq, radius)
             ntot = ndiff = 0
@@ -530,7 +516,6 @@ def subtest(self, mt):
                 Iref = set(I2[i, mask])
                 ndiff += len(Inew ^ Iref)
                 ntot += len(Iref)
-            print("ndiff %d / %d" % (ndiff, ntot))
             assert ndiff < ntot * 0.02
 
     def test_IVFPQ_non8bit(self):
@@ -555,7 +540,6 @@ def test_IVFPQ_non8bit(self):
 
             D, I = index.search(xq, 10)
             ninter[v] = faiss.eval_intersection(I, gt_I)
-        print("ninter=", ninter)
         # this should be the case but we don't observe
         # that... Probavly too few test points
         #  assert ninter['2x8'] > ninter['8x2']
@@ -623,9 +607,6 @@ def test_OPQ(self):
         res = ev.launch("OPQ", index)
         e_opq = ev.evalres(res)
 
-        print("e_pq=%s" % e_pq)
-        print("e_opq=%s" % e_opq)
-
         # verify that OPQ better than PQ
         for r in 1, 10, 100:
             assert e_opq[r] > e_pq[r]
@@ -639,14 +620,14 @@ def test_OIVFPQ(self):
         d = ev.d
         quantizer = faiss.IndexFlatL2(d)
         index = faiss.IndexIVFPQ(quantizer, d, ncentroids, M, 8)
-        index.nprobe = 5
+        index.nprobe = 12
 
         res = ev.launch("IVFPQ", index)
         e_ivfpq = ev.evalres(res)
 
         quantizer = faiss.IndexFlatL2(d)
         index_ivfpq = faiss.IndexIVFPQ(quantizer, d, ncentroids, M, 8)
-        index_ivfpq.nprobe = 5
+        index_ivfpq.nprobe = 12
         opq_matrix = faiss.OPQMatrix(d, M)
         opq_matrix.niter = 10
         index = faiss.IndexPreTransform(opq_matrix, index_ivfpq)
@@ -656,7 +637,6 @@ def test_OIVFPQ(self):
 
         # verify same on OIVFPQ
         for r in 1, 10, 100:
-            print(e_oivfpq[r], e_ivfpq[r])
             assert e_oivfpq[r] >= e_ivfpq[r]
 
 
@@ -758,9 +738,6 @@ def test_sh(self):
                     ninter = faiss.eval_intersection(I, gt_I)
                     key = (nbit, tt, period)
 
-                    print("(%d, %s, %g): %d, " % (nbit, repr(tt), period,
-                                                  ninter))
-                    print(abs(ninter - self.ref_results[key]))
                     assert abs(ninter - self.ref_results[key]) <= 14
 
 
@@ -799,7 +776,6 @@ def do_test(self, metric):
         # check that with refinement, the recall@10 is the same as
         # the original recall@100
         recall2 = (I2 == Iref[:, :1]).sum()
-        # print("recalls", recall1, recall2)
         self.assertEqual(recall1, recall2)
 
     def test_IP(self):
diff --git a/tests/test_index_binary.py b/tests/test_index_binary.py
index 312530ad46..7820cb6627 100644
--- a/tests/test_index_binary.py
+++ b/tests/test_index_binary.py
@@ -100,6 +100,9 @@ def test_flat(self):
         index.add(self.xb)
         D, I = index.search(self.xq, 3)
 
+        I2 = index.assign(x=self.xq, k=3, labels=None)
+        assert np.all(I == I2)
+
         for i in range(nq):
             for j, dj in zip(I[i], D[i]):
                 ref_dis = binary_dis(self.xq[i], self.xb[j])
@@ -139,10 +142,18 @@ def test_range_search(self):
                 self.assertTrue(set(range_res) <= set(I[i]))
                 nt2 += 1
             # in case of equality we have a problem with ties
-        print('nb tests', nt1, nt2)
         # nb tests is actually low...
         self.assertTrue(nt1 > 19 and nt2 > 19)
 
+    def test_reconstruct(self):
+        index = faiss.IndexBinaryFlat(64)
+        input_vector = np.random.randint(0, 255, size=(10, index.code_size)).astype("uint8")
+        index.add(input_vector)
+
+        reconstructed_vector = index.reconstruct_n(0, 4)
+        assert reconstructed_vector.shape == (4, index.code_size)
+        assert np.all(input_vector[:4] == reconstructed_vector)
+
 
 class TestBinaryIVF(unittest.TestCase):
 
@@ -275,8 +286,6 @@ def test_ivf_nprobe(self):
         ref_index.add(xb)
         ref_D, ref_I = ref_index.search(xq, k)
 
-        print(D[0], ref_D[0])
-        print(I[0], ref_I[0])
         assert np.all(D == ref_D)
         # assert np.all(I == ref_I)  # id may be different
 
diff --git a/tests/test_index_composite.py b/tests/test_index_composite.py
index a760c0cf09..8d9b441adc 100644
--- a/tests/test_index_composite.py
+++ b/tests/test_index_composite.py
@@ -168,8 +168,6 @@ def test_remove_id_map_2(self):
         index.remove_ids(remove_set)
         index.add_with_ids(X[5:, :], idx[5:])
 
-        print (index.search(X, 1))
-
         for i in range(10):
             _, searchres = index.search(X[i:i + 1, :], 1)
             if idx[i] in remove_set:
@@ -954,7 +952,6 @@ def do_test(self, factory_string):
         index.nprobe = 10
         Dref, Iref = index.search(ds.get_queries(), 10)
 
-        #print(index.search_and_return_codes)
         D, I, codes = index.search_and_return_codes(
             ds.get_queries(), 10, include_listnos=True)
 
diff --git a/tests/test_io.py b/tests/test_io.py
index dc8ac3dcfb..99dfe60847 100644
--- a/tests/test_io.py
+++ b/tests/test_io.py
@@ -102,7 +102,6 @@ def test_buf_read(self):
                 reader = faiss.BufferedIOReader(reader, bsz)
 
                 y = np.zeros_like(x)
-                print('nbytes=', y.nbytes)
                 reader(faiss.swig_ptr(y), y.nbytes, 1)
 
             np.testing.assert_array_equal(x, y)
diff --git a/tests/test_ivf_index.cpp b/tests/test_ivf_index.cpp
new file mode 100644
index 0000000000..54cb7945f9
--- /dev/null
+++ b/tests/test_ivf_index.cpp
@@ -0,0 +1,242 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <omp.h>
+#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <map>
+#include <random>
+
+#include <gtest/gtest.h>
+
+#include <faiss/IndexFlat.h>
+#include <faiss/IndexIVFFlat.h>
+#include <faiss/impl/FaissAssert.h>
+#include <faiss/index_io.h>
+
+namespace {
+
+// stores all ivf lists, used to verify the context
+// object is passed to the iterator
+class TestContext {
+   public:
+    TestContext() {}
+
+    void save_code(size_t list_no, const uint8_t* code, size_t code_size) {
+        list_nos.emplace(id, list_no);
+        codes.emplace(id, std::vector<uint8_t>(code_size));
+        for (size_t i = 0; i < code_size; i++) {
+            codes[id][i] = code[i];
+        }
+        id++;
+    }
+
+    // id to codes map
+    std::unordered_map<faiss::idx_t, std::vector<uint8_t>> codes;
+    // id to list_no map
+    std::unordered_map<faiss::idx_t, size_t> list_nos;
+    faiss::idx_t id = 0;
+    std::set<size_t> lists_probed;
+};
+
+// the iterator that iterates over the codes stored in context object
+class TestInvertedListIterator : public faiss::InvertedListsIterator {
+   public:
+    TestInvertedListIterator(size_t list_no, TestContext* context)
+            : list_no{list_no}, context{context} {
+        it = context->codes.cbegin();
+        seek_next();
+    }
+    ~TestInvertedListIterator() override {}
+
+    // move the cursor to the first valid entry
+    void seek_next() {
+        while (it != context->codes.cend() &&
+               context->list_nos[it->first] != list_no) {
+            it++;
+        }
+    }
+
+    virtual bool is_available() const override {
+        return it != context->codes.cend();
+    }
+
+    virtual void next() override {
+        it++;
+        seek_next();
+    }
+
+    virtual std::pair<faiss::idx_t, const uint8_t*> get_id_and_codes()
+            override {
+        if (it == context->codes.cend()) {
+            FAISS_THROW_MSG("invalid state");
+        }
+        return std::make_pair(it->first, it->second.data());
+    }
+
+   private:
+    size_t list_no;
+    TestContext* context;
+    decltype(context->codes.cbegin()) it;
+};
+
+class TestInvertedLists : public faiss::InvertedLists {
+   public:
+    TestInvertedLists(size_t nlist, size_t code_size)
+            : faiss::InvertedLists(nlist, code_size) {
+        use_iterator = true;
+    }
+
+    ~TestInvertedLists() override {}
+    size_t list_size(size_t /*list_no*/) const override {
+        FAISS_THROW_MSG("unexpected call");
+    }
+
+    faiss::InvertedListsIterator* get_iterator(size_t list_no, void* context)
+            const override {
+        auto testContext = (TestContext*)context;
+        testContext->lists_probed.insert(list_no);
+        return new TestInvertedListIterator(list_no, testContext);
+    }
+
+    const uint8_t* get_codes(size_t /* list_no */) const override {
+        FAISS_THROW_MSG("unexpected call");
+    }
+
+    const faiss::idx_t* get_ids(size_t /* list_no */) const override {
+        FAISS_THROW_MSG("unexpected call");
+    }
+
+    // store the codes in context object
+    size_t add_entry(
+            size_t list_no,
+            faiss::idx_t /*theid*/,
+            const uint8_t* code,
+            void* context) override {
+        auto testContext = (TestContext*)context;
+        testContext->save_code(list_no, code, code_size);
+        return 0;
+    }
+
+    size_t add_entries(
+            size_t /*list_no*/,
+            size_t /*n_entry*/,
+            const faiss::idx_t* /*ids*/,
+            const uint8_t* /*code*/) override {
+        FAISS_THROW_MSG("unexpected call");
+    }
+
+    void update_entries(
+            size_t /*list_no*/,
+            size_t /*offset*/,
+            size_t /*n_entry*/,
+            const faiss::idx_t* /*ids*/,
+            const uint8_t* /*code*/) override {
+        FAISS_THROW_MSG("unexpected call");
+    }
+
+    void resize(size_t /*list_no*/, size_t /*new_size*/) override {
+        FAISS_THROW_MSG("unexpected call");
+    }
+};
+} // namespace
+
+TEST(IVF, list_context) {
+    // this test verifies that the context object is passed
+    // to the InvertedListsIterator and InvertedLists::add_entry.
+    // the test InvertedLists and InvertedListsIterator reads/writes
+    // to the test context object.
+    // the test verifies the context object is modified as expected.
+
+    constexpr int d = 32;      // dimension
+    constexpr int nb = 100000; // database size
+    constexpr int nlist = 100;
+
+    std::mt19937 rng;
+    std::uniform_real_distribution<> distrib;
+
+    // disable parallelism, otherwise the TestContext object would need
+    // to be thread-safe
+    omp_set_num_threads(1);
+
+    faiss::IndexFlatL2 quantizer(d); // the other index
+    faiss::IndexIVFFlat index(&quantizer, d, nlist);
+    TestInvertedLists inverted_lists(nlist, index.code_size);
+    index.replace_invlists(&inverted_lists);
+    {
+        // training
+        constexpr size_t nt = 1500; // nb of training vectors
+        std::vector<float> trainvecs(nt * d);
+        for (size_t i = 0; i < nt * d; i++) {
+            trainvecs[i] = distrib(rng);
+        }
+        index.verbose = true;
+        index.train(nt, trainvecs.data());
+    }
+    TestContext context;
+    std::vector<float> query_vector;
+    constexpr faiss::idx_t query_vector_id = 100;
+    {
+        // populating the database
+        std::vector<float> database(nb * d);
+        for (size_t i = 0; i < nb * d; i++) {
+            database[i] = distrib(rng);
+            // populate the query vector
+            if (i >= query_vector_id * d && i < query_vector_id * d + d) {
+                query_vector.push_back(database[i]);
+            }
+        }
+        std::vector<faiss::idx_t> coarse_idx(nb);
+        index.quantizer->assign(nb, database.data(), coarse_idx.data());
+        // pass dummy ids, the actual ids are assigned in the TestContext object
+        std::vector<faiss::idx_t> xids(nb, 42);
+        index.add_core(
+                nb, database.data(), xids.data(), coarse_idx.data(), &context);
+
+        // check that the context object was updated
+        EXPECT_EQ(nb, context.id) << "should have added all ids";
+        EXPECT_EQ(nb, context.codes.size())
+                << "should have correct number of codes";
+        EXPECT_EQ(nb, context.list_nos.size())
+                << "should have correct number of list numbers";
+    }
+    {
+        constexpr faiss::idx_t k = 100;
+        constexpr size_t nprobe = 10;
+        std::vector<float> distances(k);
+        std::vector<faiss::idx_t> labels(k);
+        faiss::SearchParametersIVF params;
+        params.inverted_list_context = &context;
+        params.nprobe = nprobe;
+        index.search(
+                1,
+                query_vector.data(),
+                k,
+                distances.data(),
+                labels.data(),
+                &params);
+        EXPECT_EQ(nprobe, context.lists_probed.size())
+                << "should probe nprobe lists";
+
+        // check that the result contains the query vector; the probability
+        // of this failing should be low
+        auto query_vector_listno = context.list_nos[query_vector_id];
+        auto& lists_probed = context.lists_probed;
+        EXPECT_TRUE(
+                std::find(
+                        lists_probed.cbegin(),
+                        lists_probed.cend(),
+                        query_vector_listno) != lists_probed.cend())
+                << "should probe the list of the query vector";
+        EXPECT_TRUE(
+                std::find(labels.cbegin(), labels.cend(), query_vector_id) !=
+                labels.cend())
+                << "should return the query vector";
+    }
+}
diff --git a/tests/test_ivflib.py b/tests/test_ivflib.py
index f19c3da45b..0a3fb8c87e 100644
--- a/tests/test_ivflib.py
+++ b/tests/test_ivflib.py
@@ -125,7 +125,6 @@ def test_range_search_with_parameters(self):
 
         Dpre, _ = index.search(xq, 15)
         radius = float(np.median(Dpre[:, -1]))
-        print("Radius=", radius)
         stats = faiss.cvar.indexIVF_stats
         stats.reset()
         Lref, Dref, Iref = index.range_search(xq, radius)
diff --git a/tests/test_local_search_quantizer.py b/tests/test_local_search_quantizer.py
index 01fec70ccf..7975929811 100644
--- a/tests/test_local_search_quantizer.py
+++ b/tests/test_local_search_quantizer.py
@@ -196,7 +196,6 @@ def test_update_codebooks_with_double(self):
         err_float = eval_codec(lsq, xb)
 
         # 6533.377 vs 25457.99
-        print(err_double, err_float)
         self.assertLess(err_double, err_float)
 
     def test_compute_binary_terms(self):
@@ -348,7 +347,6 @@ def test_training(self):
         pq.train(xt)
         err_pq = eval_codec(pq, xb)
 
-        print(err_lsq, err_pq)
         self.assertLess(err_lsq, err_pq)
 
 
@@ -463,7 +461,6 @@ def eval_index_accuracy(self, factory_key):
             index.nprobe = nprobe
             D, I = index.search(ds.get_queries(), 10)
             inter = faiss.eval_intersection(I, ds.get_groundtruth(10))
-            # print("nprobe=", nprobe, "inter=", inter)
             inters.append(inter)
 
         inters = np.array(inters)
@@ -528,7 +525,6 @@ def test_codec(self):
         pq.train(xt)
         err_pq = eval_codec(pq, xb)
 
-        print(err_plsq, err_pq)
         self.assertLess(err_plsq, err_pq)
 
     def test_with_lsq(self):
@@ -549,7 +545,6 @@ def test_with_lsq(self):
         lsq.train(xt)
         err_lsq = eval_codec(lsq, xb)
 
-        print(err_plsq, err_lsq)
         self.assertEqual(err_plsq, err_lsq)
 
     def test_lut(self):
@@ -664,7 +659,6 @@ def test_index_accuracy2(self):
         """check that the error is in the same ballpark as LSQ."""
         inter1 = self.eval_index_accuracy("IVF32,PLSQ2x2x5_Nqint8")
         inter2 = self.eval_index_accuracy("IVF32,LSQ4x5_Nqint8")
-        # print(inter1, inter2)  # 381 vs 374
         self.assertGreaterEqual(inter1 * 1.1, inter2)
 
     def test_factory(self):
diff --git a/tests/test_lowlevel_ivf.cpp b/tests/test_lowlevel_ivf.cpp
index e28e2a946f..7ce90a1d2d 100644
--- a/tests/test_lowlevel_ivf.cpp
+++ b/tests/test_lowlevel_ivf.cpp
@@ -364,22 +364,9 @@ void test_lowlevel_access_binary(const char* index_key) {
             }
         }
 
-        printf("new before reroder: [");
-        for (int j = 0; j < k; j++)
-            printf("%" PRId64 ",%d ", I[j], D[j]);
-        printf("]\n");
-
         // re-order heap
         heap_reorder<CMax<int32_t, idx_t>>(k, D.data(), I.data());
 
-        printf("ref: [");
-        for (int j = 0; j < k; j++)
-            printf("%" PRId64 ",%d ", I_ref[j], D_ref[j]);
-        printf("]\nnew: [");
-        for (int j = 0; j < k; j++)
-            printf("%" PRId64 ",%d ", I[j], D[j]);
-        printf("]\n");
-
         // check that we have the same results as the reference search
         for (int j = 0; j < k; j++) {
             // here the order is not guaranteed to be the same
diff --git a/tests/test_merge.cpp b/tests/test_merge.cpp
index 7e23f15f72..edbe2a03a6 100644
--- a/tests/test_merge.cpp
+++ b/tests/test_merge.cpp
@@ -6,47 +6,22 @@
  */
 
 #include <cstdio>
-#include <cstdlib>
 #include <random>
 
-#include <unistd.h>
-
 #include <gtest/gtest.h>
 
 #include <faiss/IVFlib.h>
 #include <faiss/IndexFlat.h>
 #include <faiss/IndexIVFFlat.h>
-#include <faiss/IndexIVFPQ.h>
 #include <faiss/IndexPreTransform.h>
 #include <faiss/MetaIndexes.h>
 #include <faiss/invlists/OnDiskInvertedLists.h>
 
-namespace {
-
-struct Tempfilename {
-    static pthread_mutex_t mutex;
-
-    std::string filename = "/tmp/faiss_tmp_XXXXXX";
+#include "test_util.h"
 
-    Tempfilename() {
-        pthread_mutex_lock(&mutex);
-        int fd = mkstemp(&filename[0]);
-        close(fd);
-        pthread_mutex_unlock(&mutex);
-    }
-
-    ~Tempfilename() {
-        if (access(filename.c_str(), F_OK)) {
-            unlink(filename.c_str());
-        }
-    }
-
-    const char* c_str() {
-        return filename.c_str();
-    }
-};
+namespace {
 
-pthread_mutex_t Tempfilename::mutex = PTHREAD_MUTEX_INITIALIZER;
+pthread_mutex_t temp_file_mutex = PTHREAD_MUTEX_INITIALIZER;
 
 typedef faiss::idx_t idx_t;
 
@@ -57,6 +32,7 @@ size_t nq = 100;
 int nindex = 4;
 int k = 10;
 int nlist = 40;
+int shard_size = nb / nindex;
 
 struct CommonData {
     std::vector<float> database;
@@ -95,7 +71,7 @@ int compare_merged(
     std::vector<float> refD(k * nq);
 
     index_shards->search(nq, cd.queries.data(), k, refD.data(), refI.data());
-    Tempfilename filename;
+    Tempfilename filename(&temp_file_mutex, "/tmp/faiss_tmp_XXXXXX");
 
     std::vector<idx_t> newI(k * nq);
     std::vector<float> newD(k * nq);
@@ -125,7 +101,7 @@ int compare_merged(
         auto il = new faiss::OnDiskInvertedLists(
                 index0->nlist, index0->code_size, filename.c_str());
 
-        il->merge_from(lists.data(), lists.size());
+        il->merge_from_multiple(lists.data(), lists.size(), shift_ids);
 
         index0->replace_invlists(il, true);
         index0->ntotal = ntotal;
@@ -135,11 +111,14 @@ int compare_merged(
             nq, cd.queries.data(), k, newD.data(), newI.data());
 
     size_t ndiff = 0;
+    bool adjust_ids = shift_ids && !standard_merge;
     for (size_t i = 0; i < k * nq; i++) {
-        if (refI[i] != newI[i]) {
+        idx_t new_id = adjust_ids ? refI[i] % shard_size : refI[i];
+        if (refI[i] != new_id) {
             ndiff++;
         }
     }
+
     return ndiff;
 }
 
@@ -212,7 +191,7 @@ TEST(MERGE, merge_flat_vt) {
 TEST(MERGE, merge_flat_ondisk) {
     faiss::IndexShards index_shards(d, false, false);
     index_shards.own_indices = true;
-    Tempfilename filename;
+    Tempfilename filename(&temp_file_mutex, "/tmp/faiss_tmp_XXXXXX");
 
     for (int i = 0; i < nindex; i++) {
         auto ivf = new faiss::IndexIVFFlat(&cd.quantizer, d, nlist);
@@ -245,3 +224,23 @@ TEST(MERGE, merge_flat_ondisk_2) {
     int ndiff = compare_merged(&index_shards, false, false);
     EXPECT_GE(0, ndiff);
 }
+
+// now use ondisk specific merge and use shift ids
+TEST(MERGE, merge_flat_ondisk_3) {
+    faiss::IndexShards index_shards(d, false, false);
+    index_shards.own_indices = true;
+    // explicit ids that wrap around: vector i gets id i % shard_size
+    std::vector<idx_t> ids;
+    for (int i = 0; i < nb; ++i) {
+        int id = i % shard_size;
+        ids.push_back(id);
+    }
+    for (int i = 0; i < nindex; i++) {
+        index_shards.add_shard(
+                new faiss::IndexIVFFlat(&cd.quantizer, d, nlist));
+    }
+    EXPECT_TRUE(index_shards.is_trained);
+    index_shards.add_with_ids(nb, cd.database.data(), ids.data());
+    // NOTE(review): presumably (shift_ids=true, standard_merge=false) - confirm
+    int ndiff = compare_merged(&index_shards, true, false);
+    EXPECT_GE(0, ndiff); // gtest checks 0 >= ndiff
+}
diff --git a/tests/test_merge_index.py b/tests/test_merge_index.py
index 8c4c1f0912..bdcc813f1c 100644
--- a/tests/test_merge_index.py
+++ b/tests/test_merge_index.py
@@ -72,7 +72,6 @@ def do_test_merge(self, index_type):
             index.merge_from(indexes[i], index.ntotal)
 
         _D, I = index.search(xq, k)
-        print(I[:5, :6])
 
         ndiff = (I != Iref).sum()
         print('%d / %d differences' % (ndiff, nq * k))
@@ -246,19 +245,45 @@ def test_merge_IDMap2(self):
 
 class TestRemoveFastScan(unittest.TestCase):
 
-    def do_fast_scan_test(self, factory_key, size1):
+    def do_fast_scan_test(self,
+                          factory_key,
+                          with_ids=False,
+                          direct_map_type=faiss.DirectMap.NoMap):
         ds = SyntheticDataset(110, 1000, 1000, 100)
-        index1 = faiss.index_factory(ds.d, factory_key)
-        index1.train(ds.get_train())
-        index1.reset()
+        index = faiss.index_factory(ds.d, factory_key)
+        index.train(ds.get_train())
+
+        index.reset()
         tokeep = [i % 3 == 0 for i in range(ds.nb)]
-        index1.add(ds.get_database()[tokeep])
-        _, Iref = index1.search(ds.get_queries(), 5)
-        index1.reset()
-        index1.add(ds.get_database())
-        index1.remove_ids(np.where(np.logical_not(tokeep))[0])
-        _, Inew = index1.search(ds.get_queries(), 5)
+        if with_ids:
+            index.add_with_ids(ds.get_database()[tokeep], np.arange(ds.nb)[tokeep])
+            faiss.extract_index_ivf(index).nprobe = 5
+        else:
+            index.add(ds.get_database()[tokeep])
+        _, Iref = index.search(ds.get_queries(), 5)
+
+        index.reset()
+        if with_ids:
+            index.add_with_ids(ds.get_database(), np.arange(ds.nb))
+            index.set_direct_map_type(direct_map_type)
+            faiss.extract_index_ivf(index).nprobe = 5
+        else:
+            index.add(ds.get_database())
+        index.remove_ids(np.where(np.logical_not(tokeep))[0])
+        _, Inew = index.search(ds.get_queries(), 5)
         np.testing.assert_array_equal(Inew, Iref)
 
-    def test_remove(self):
-        self.do_fast_scan_test("PQ5x4fs", 320)
+    def test_remove_PQFastScan(self):
+        # with_ids is not supported for this type of index
+        self.do_fast_scan_test("PQ5x4fs", False)
+
+    def test_remove_IVFPQFastScan(self):
+        self.do_fast_scan_test("IVF20,PQ5x4fs", True)
+
+    def test_remove_IVFPQFastScan_2(self):
+        self.assertRaisesRegex(Exception,
+                               ".*not supported.*",
+                               self.do_fast_scan_test,
+                               "IVF20,PQ5x4fs",
+                               True,
+                               faiss.DirectMap.Hashtable)
diff --git a/tests/test_meta_index.py b/tests/test_meta_index.py
index d53cad48f7..d0896e8ba2 100644
--- a/tests/test_meta_index.py
+++ b/tests/test_meta_index.py
@@ -82,10 +82,8 @@ def test_shards(self):
         k = 32
         ref_index = faiss.IndexFlatL2(d)
 
-        print('ref search')
         ref_index.add(xb)
         _Dref, Iref = ref_index.search(xq, k)
-        print(Iref[:5, :6])
 
         shard_index = faiss.IndexShards(d)
         shard_index_2 = faiss.IndexShards(d, True, False)
@@ -109,7 +107,6 @@ def test_shards(self):
         for test_no in range(3):
             with_threads = test_no == 1
 
-            print('shard search test_no = %d' % test_no)
             if with_threads:
                 remember_nt = faiss.omp_get_max_threads()
                 faiss.omp_set_num_threads(1)
@@ -122,14 +119,10 @@ def test_shards(self):
             else:
                 _D, I = shard_index_2.search(xq, k)
 
-            print(I[:5, :6])
-
             if with_threads:
                 faiss.omp_set_num_threads(remember_nt)
 
             ndiff = (I != Iref).sum()
-
-            print('%d / %d differences' % (ndiff, nq * k))
             assert (ndiff < nq * k / 1000.)
 
     def test_shards_ivf(self):
diff --git a/tests/test_ondisk_ivf.cpp b/tests/test_ondisk_ivf.cpp
index 94c23381eb..7c41e082f8 100644
--- a/tests/test_ondisk_ivf.cpp
+++ b/tests/test_ondisk_ivf.cpp
@@ -92,7 +92,7 @@ TEST(ONDISK, make_invlists) {
         }
     }
     EXPECT_EQ(ntot, nadd);
-};
+}
 
 TEST(ONDISK, test_add) {
     int d = 8;
@@ -155,7 +155,7 @@ TEST(ONDISK, test_add) {
 
         delete index3;
     }
-};
+}
 
 // WARN this thest will run multithreaded only in opt mode
 TEST(ONDISK, make_invlists_threaded) {
@@ -204,4 +204,4 @@ TEST(ONDISK, make_invlists_threaded) {
         }
     }
     EXPECT_EQ(ntot, nadd);
-};
+}
diff --git a/tests/test_partition.py b/tests/test_partition.py
index 02de7e8c2c..fd41eabe1f 100644
--- a/tests/test_partition.py
+++ b/tests/test_partition.py
@@ -49,7 +49,6 @@ def do_partition(self, n, q, maxval=None, seed=None):
         if seed is None:
             for i in range(50):
                 self.do_partition(n, q, maxval, i + 1234)
-        # print("seed=", seed)
         rs = np.random.RandomState(seed)
         if maxval is None:
             vals = rs.rand(n).astype('float32')
@@ -95,7 +94,6 @@ def do_partition(self, n, q, maxval=None, seed=None):
         if seed is None:
             for i in range(50):
                 self.do_partition(n, q, maxval, i + 1234)
-        # print("seed=", seed)
         rs = np.random.RandomState(seed)
         if maxval is None:
             vals = rs.rand(n).astype('float32')
@@ -148,7 +146,6 @@ def do_partition(self, n, q, maxval=65536, seed=None):
             for i in range(50):
                 self.do_partition(n, q, maxval, i + 1234)
 
-        # print("seed=", seed)
         rs = np.random.RandomState(seed)
         vals = rs.randint(maxval, size=n).astype('uint16')
         ids = (rs.permutation(n) + 12345).astype('int64')
@@ -160,7 +157,6 @@ def do_partition(self, n, q, maxval=65536, seed=None):
         tab_a = faiss.AlignedTableUint16()
         faiss.copy_array_to_AlignedTable(vals, tab_a)
 
-        # print("tab a type", tab_a.get())
         if type(q) == int:
             faiss.CMax_uint16_partition_fuzzy(
                 tab_a.get(), sp(ids), n, q, q, None)
@@ -196,7 +192,6 @@ def do_partition(self, n, q, maxval=65536, seed=None):
         if seed is None:
             for i in range(50):
                 self.do_partition(n, q, maxval, i + 1234)
-        # print("seed=", seed)
         rs = np.random.RandomState(seed)
         vals = rs.randint(maxval, size=n).astype('uint16')
         ids = (rs.permutation(n) + 12345).astype('int64')
@@ -209,7 +204,6 @@ def do_partition(self, n, q, maxval=65536, seed=None):
         vals_inv = (65535 - vals).astype('uint16')
         faiss.copy_array_to_AlignedTable(vals_inv, tab_a)
 
-        # print("tab a type", tab_a.get())
         if type(q) == int:
             faiss.CMin_uint16_partition_fuzzy(
                 tab_a.get(), sp(ids), n, q, q, None)
diff --git a/tests/test_product_quantizer.py b/tests/test_product_quantizer.py
index 1cdee7f144..f531cab2a1 100644
--- a/tests/test_product_quantizer.py
+++ b/tests/test_product_quantizer.py
@@ -26,7 +26,6 @@ def test_pq(self):
         x2 = pq.decode(codes)
         diff = ((x - x2)**2).sum()
 
-        # print("diff=", diff)
         # diff= 4418.0562
         self.assertGreater(5000, diff)
 
@@ -71,7 +70,6 @@ def do_test_codec(self, nbit):
 
     def test_codec(self):
         for i in range(16):
-            print("Testing nbits=%d" % (i + 1))
             self.do_test_codec(i + 1)
 
 
diff --git a/tests/test_residual_quantizer.py b/tests/test_residual_quantizer.py
index e37ee3efe2..f4381607e1 100644
--- a/tests/test_residual_quantizer.py
+++ b/tests/test_residual_quantizer.py
@@ -211,7 +211,6 @@ def test_training(self):
 
         # in practice RQ is often better than PQ but it does not the case here, so just check
         # that we are within some factor.
-        # print(err_pq, err_rq)
         self.assertLess(err_rq, err_pq * 1.2)
 
     def test_beam_size(self):
@@ -321,10 +320,8 @@ def retrain_AQ_codebook(index, xt):
 
     x_decoded = index.sa_decode(codes_packed)
     MSE = ((xt - x_decoded) ** 2).sum() / n
-    # print(f"Initial MSE on training set: {MSE:g}")
 
     codes = unpack_codes(index.rq, codes_packed)
-    # print("ref codes", codes[0])
     codebook_offsets = faiss.vector_to_array(rq.codebook_offsets)
 
     # build sparse code matrix (represented as a dense matrix)
@@ -343,7 +340,6 @@ def retrain_AQ_codebook(index, xt):
         B, residuals, rank, singvals = scipy.linalg.lstsq(C, xt, )
 
     MSE = ((C @ B - xt) ** 2).sum() / n
-    # print(f"MSE after retrainining: {MSE:g}")
 
     # replace codebook
     # faiss.copy_array_to_vector(B.astype('float32').ravel(), index.rq.codebooks)
@@ -503,7 +499,6 @@ def test_reestimate_codebook_2(self):
         xt_decoded = ir.sa_decode(ir.sa_encode(xt))
         err_after_refined = ((xt - xt_decoded) ** 2).sum()
 
-        # print(err_before, err_after_refined)
         # ref run 7474.98 / 7006.1777
         self.assertGreater(err_before, err_after_refined * 1.06)
 
@@ -781,7 +776,6 @@ def test_search_L2(self):
             else:
                 inter_2 = faiss.eval_intersection(I2, gt)
                 self.assertGreaterEqual(inter_ref, inter_2)
-                # print(st, inter_ref, inter_2)
 
 
 ###########################################################
@@ -814,7 +808,6 @@ def do_test_accuracy(self, by_residual, st):
             index.nprobe = nprobe
             D, I = index.search(ds.get_queries(), 10)
             inter = faiss.eval_intersection(I, ds.get_groundtruth(10))
-            # print(st, "nprobe=", nprobe, "inter=", inter)
             inters.append(inter)
 
         # do a little I/O test
@@ -909,18 +902,13 @@ def do_test_accuracy_IP(self, by_residual):
             D, I = index.search(ds.get_queries(), 10)
             index.rq.search_type = faiss.AdditiveQuantizer.ST_LUT_nonorm
             D2, I2 = index.search(ds.get_queries(), 10)
-            # print(D[:5] - D2[:5])
-            # print(I[:5])
             np.testing.assert_array_almost_equal(D, D2, decimal=5)
             # there are many ties because the codes are so short
             self.assertLess((I != I2).sum(), I.size * 0.1)
 
             # D2, I2 = index2.search(ds.get_queries(), 10)
-            # print(D[:5])
-            # print(D2[:5])
 
             inter = faiss.eval_intersection(I, ds.get_groundtruth(10))
-            # print("nprobe=", nprobe, "inter=", inter)
             inters.append(inter)
         self.assertTrue(np.all(inters[1:4] >= inters[:3]))
 
@@ -979,8 +967,6 @@ def beam_search_encode_step_tab(codes, L, distances, codebook_cross_prods_i,
             for b in range(beam_size):
                 dotprods[i, b, :] += cb[codes[i, b, j]]
 
-    # print("dps", dotprods[:3, :2, :4])
-
     new_distances += 2 * dotprods
     cent_distances = new_distances
 
@@ -1166,7 +1152,6 @@ def test_codec(self):
         pq.train(xt)
         err_pq = eval_codec(pq, xb)
 
-        # print(err_prq, err_pq)
         self.assertLess(err_prq, err_pq)
 
     def test_with_rq(self):
@@ -1187,7 +1172,6 @@ def test_with_rq(self):
         rq.train(xt)
         err_rq = eval_codec(rq, xb)
 
-        # print(err_prq, err_rq)
         self.assertEqual(err_prq, err_rq)
 
 
@@ -1271,7 +1255,6 @@ def test_index_accuracy2(self):
         """check that the error is in the same ballpark as RQ."""
         inter1 = self.eval_index_accuracy("IVF100,PRQ2x2x5_Nqint8")
         inter2 = self.eval_index_accuracy("IVF100,RQ4x5_Nqint8")
-        # print(inter1, inter2)  # 392 vs 374
         self.assertGreaterEqual(inter1 * 1.1, inter2)
 
     def test_factory(self):
diff --git a/tests/test_rowwise_minmax.py b/tests/test_rowwise_minmax.py
index dbd14de388..53e6c00b15 100644
--- a/tests/test_rowwise_minmax.py
+++ b/tests/test_rowwise_minmax.py
@@ -45,7 +45,6 @@ def compare_train_vs_train_inplace(self, factory_key):
 
         # make sure that the reconstruction error is not crazy
         reconstruction_err = ((x - decoded) ** 2).sum()
-        print(reconstruction_err)
 
         self.assertLess(reconstruction_err, 0.6)
 
diff --git a/tests/test_search_params.py b/tests/test_search_params.py
index 8d3e42a49d..886ffc0c62 100644
--- a/tests/test_search_params.py
+++ b/tests/test_search_params.py
@@ -22,7 +22,7 @@ class TestSelector(unittest.TestCase):
     combinations as possible.
     """
 
-    def do_test_id_selector(self, index_key, id_selector_type="batch", mt=faiss.METRIC_L2):
+    def do_test_id_selector(self, index_key, id_selector_type="batch", mt=faiss.METRIC_L2, k=10):
         """ Verify that the id selector returns the subset of results that are
         members according to the IDSelector.
         Supports id_selector_type="batch", "bitmap", "range", "range_sorted", "and", "or", "xor"
@@ -30,7 +30,6 @@ def do_test_id_selector(self, index_key, id_selector_type="batch", mt=faiss.METR
         ds = datasets.SyntheticDataset(32, 1000, 100, 20)
         index = faiss.index_factory(ds.d, index_key, mt)
         index.train(ds.get_train())
-        k = 10
 
         # reference result
         if "range" in id_selector_type:
@@ -145,6 +144,16 @@ def test_IVFFlat_range_sorted(self):
     def test_IVFPQ(self):
         self.do_test_id_selector("IVF32,PQ4x4np")
 
+    def test_IVFPQfs(self):
+        self.do_test_id_selector("IVF32,PQ4x4fs")
+
+    def test_IVFPQfs_k1(self):
+        self.do_test_id_selector("IVF32,PQ4x4fs", k=1)
+
+    def test_IVFPQfs_k40(self):
+        # test reservoir codepath
+        self.do_test_id_selector("IVF32,PQ4x4fs", k=40)
+
     def test_IVFSQ(self):
         self.do_test_id_selector("IVF32,SQ8")
 
@@ -257,6 +266,24 @@ def test_idmap(self):
         np.testing.assert_array_equal(Iref, Inew)
         np.testing.assert_array_almost_equal(Dref, Dnew, decimal=5)
 
+    def test_bounds(self):
+        """Out-of-bounds selector ids must not crash (faiss issue #3156)."""
+        d = 64  # dimension
+        nb = 100000  # database size
+        xb = np.random.random((nb, d))
+        index_ip = faiss.IndexFlatIP(d)
+        index_ip.add(xb)
+        index_l2 = faiss.IndexFlatL2(d)  # L2 metric, so both are covered
+        index_l2.add(xb)
+
+        out_of_bounds_id = nb + 15  # + 14 or lower will work fine
+        id_selector = faiss.IDSelectorArray([out_of_bounds_id])
+        search_params = faiss.SearchParameters(sel=id_selector)
+
+        # the id matches no vector: it must be ignored, without crashing
+        distances, indices = index_ip.search(xb[:2], k=3, params=search_params)
+        distances, indices = index_l2.search(xb[:2], k=3, params=search_params)
+
 
 class TestSearchParams(unittest.TestCase):
 
@@ -438,7 +465,6 @@ def test_12_92(self):
         sp = faiss.swig_ptr
         selr.find_sorted_ids_bounds(
             len(ids), sp(ids), sp(j01[:1]), sp(j01[1:]))
-        print(j01)
         assert j01[0] >= j01[1]
 
 
diff --git a/tests/test_sliding_ivf.cpp b/tests/test_sliding_ivf.cpp
index ea9e53d6b5..0214dd72e8 100644
--- a/tests/test_sliding_ivf.cpp
+++ b/tests/test_sliding_ivf.cpp
@@ -74,8 +74,6 @@ void make_index_slices(
     for (int i = 0; i < total_size; i++) {
         sub_indexes.emplace_back(clone_index(trained_index));
 
-        printf("preparing sub-index # %d\n", i);
-
         Index* index = sub_indexes.back().get();
 
         auto xb = make_data(nb * d);
@@ -122,13 +120,10 @@ int test_sliding_window(const char* index_key) {
     auto xq = make_data(nq * d);
 
     for (int i = 0; i < total_size + window_size; i++) {
-        printf("doing step %d / %d\n", i, total_size + window_size);
-
         // update the index
         window.step(
                 i < total_size ? sub_indexes[i].get() : nullptr,
                 i >= window_size);
-        printf("   current n_slice = %d\n", window.n_slice);
 
         auto new_res = search_index(index.get(), xq.data());
 
@@ -159,8 +154,6 @@ int test_sliding_invlists(const char* index_key) {
     auto xq = make_data(nq * d);
 
     for (int i = 0; i < total_size + window_size; i++) {
-        printf("doing step %d / %d\n", i, total_size + window_size);
-
         // update the index
         std::vector<const InvertedLists*> ils;
         for (int j = i - window_size + 1; j <= i; j++) {
@@ -178,8 +171,6 @@ int test_sliding_invlists(const char* index_key) {
         // will be deleted by the index
         index_ivf->replace_invlists(ci, true);
 
-        printf("   nb invlists = %zd\n", ils.size());
-
         auto new_res = search_index(index.get(), xq.data());
 
         std::unique_ptr<Index> merged_index(
@@ -188,13 +179,6 @@ int test_sliding_invlists(const char* index_key) {
         auto ref_res = search_index(merged_index.get(), xq.data());
 
         EXPECT_EQ(ref_res.size(), new_res.size());
-
-        size_t ndiff = 0;
-        for (size_t j = 0; j < ref_res.size(); j++) {
-            if (ref_res[j] != new_res[j])
-                ndiff++;
-        }
-        printf("  nb differences: %zd / %zd\n", ndiff, ref_res.size());
         EXPECT_EQ(ref_res, new_res);
     }
     return 0;
diff --git a/tests/test_standalone_codec.py b/tests/test_standalone_codec.py
index 7fdcf6849f..391b88b9dd 100644
--- a/tests/test_standalone_codec.py
+++ b/tests/test_standalone_codec.py
@@ -151,7 +151,6 @@ def compare_accuracy(self, lowac, highac, max_errs=(1e10, 1e10)):
             err = ((x - x2) ** 2).sum()
             errs.append(err)
 
-        print(errs)
         self.assertGreater(errs[0], errs[1])
 
         self.assertGreater(max_errs[0], errs[0])
@@ -174,6 +173,9 @@ def test_SQ2(self):
     def test_SQ3(self):
         self.compare_accuracy('SQ8', 'SQfp16')
 
+    def test_SQ4(self):
+        self.compare_accuracy('SQ8', 'SQbf16')
+
     def test_PQ(self):
         self.compare_accuracy('PQ6x8np', 'PQ8x8np')
 
@@ -214,7 +216,6 @@ def test_repeats(self):
             code = repeats.encode(swig_ptr(vec))
             vec2 = np.zeros(dim, dtype='float32')
             repeats.decode(code, swig_ptr(vec2))
-            # print(vec2)
             assert np.all(vec == vec2)
 
     def test_ZnSphereCodec_encode_centroid(self):
@@ -222,7 +223,6 @@ def test_ZnSphereCodec_encode_centroid(self):
         r2 = 5
         ref_codec = faiss.ZnSphereCodec(dim, r2)
         codec = faiss.ZnSphereCodecRec(dim, r2)
-        # print(ref_codec.nv, codec.nv)
         assert ref_codec.nv == codec.nv
         s = set()
         for i in range(ref_codec.nv):
@@ -237,7 +237,6 @@ def test_ZnSphereCodecRec(self):
         dim = 16
         r2 = 6
         codec = faiss.ZnSphereCodecRec(dim, r2)
-        # print("nv=", codec.nv)
         for i in range(codec.nv):
             c = np.zeros(dim, dtype='float32')
             codec.decode(i, swig_ptr(c))
@@ -300,15 +299,10 @@ def test_rw(self):
         for i in range(nbyte):
             self.assertTrue(((bignum >> (i * 8)) & 255) == bs[i])
 
-        #for i in range(nbyte):
-        #    print(bin(bs[i] + 256)[3:], end=' ')
-        # print()
-
         br = faiss.BitstringReader(swig_ptr(bs), nbyte)
 
         for nbit, xref in ctrl:
             xnew = br.read(nbit)
-            # print('nbit %d xref %x xnew %x' % (nbit, xref, xnew))
             self.assertTrue(xnew == xref)
 
     def test_arrays(self):
diff --git a/tests/test_threaded_index.cpp b/tests/test_threaded_index.cpp
index b10d3806e0..3dc2660d9e 100644
--- a/tests/test_threaded_index.cpp
+++ b/tests/test_threaded_index.cpp
@@ -169,7 +169,7 @@ TEST(ThreadedIndex, TestReplica) {
     int k = 6;
 
     // Try with threading and without
-    for (bool threaded : {true, false}) {
+    for ([[maybe_unused]] const bool threaded : {true, false}) {
         std::vector<std::unique_ptr<MockIndex>> idxs;
         faiss::IndexReplicas replica(d);
 
@@ -247,8 +247,6 @@ TEST(ThreadedIndex, TestShards) {
         shards.search(n, x.data(), k, distances.data(), labels.data());
 
         for (int i = 0; i < idxs.size(); ++i) {
-            auto perShard = n / idxs.size();
-
             EXPECT_EQ(idxs[i]->nCalled, n);
             EXPECT_EQ(idxs[i]->xCalled, x.data());
             EXPECT_EQ(idxs[i]->kCalled, k);
diff --git a/tests/test_util.h b/tests/test_util.h
new file mode 100644
index 0000000000..3be0e35cff
--- /dev/null
+++ b/tests/test_util.h
@@ -0,0 +1,39 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#ifndef FAISS_TEST_UTIL_H
+#define FAISS_TEST_UTIL_H
+
+#include <faiss/IndexIVFPQ.h>
+#include <unistd.h>
+#include <cstdlib>
+
+struct Tempfilename { // temporary file for tests, removed on destruction
+    pthread_mutex_t* mutex; // not owned; held around the mkstemp call
+    std::string filename;   // mkstemp template, replaced by the real path
+    // Create the file from the mkstemp template `filename`, then close it.
+    Tempfilename(pthread_mutex_t* mutex, std::string filename) {
+        this->mutex = mutex;
+        this->filename = filename;
+        pthread_mutex_lock(mutex);
+        int fd = mkstemp(&this->filename[0]); // member, not the parameter
+        close(fd);
+        pthread_mutex_unlock(mutex);
+    }
+    // Remove the file if it still exists (access() returns 0 on success).
+    ~Tempfilename() {
+        if (access(filename.c_str(), F_OK) == 0) {
+            unlink(filename.c_str());
+        }
+    }
+    // Path of the temporary file.
+    const char* c_str() {
+        return filename.c_str();
+    }
+};
+
+#endif // FAISS_TEST_UTIL_H
diff --git a/tutorial/cpp/1-Flat.cpp b/tutorial/cpp/1-Flat.cpp
index 819e419573..147fa89bc0 100644
--- a/tutorial/cpp/1-Flat.cpp
+++ b/tutorial/cpp/1-Flat.cpp
@@ -83,10 +83,10 @@ int main() {
             printf("\n");
         }
 
-        printf("I (5 last results)=\n");
+        printf("D (5 last results)=\n");
         for (int i = nq - 5; i < nq; i++) {
             for (int j = 0; j < k; j++)
-                printf("%5zd ", I[i * k + j]);
+                printf("%5f ", D[i * k + j]);
             printf("\n");
         }
 
diff --git a/tutorial/cpp/2-IVFFlat.cpp b/tutorial/cpp/2-IVFFlat.cpp
index febd5be049..86530ae985 100644
--- a/tutorial/cpp/2-IVFFlat.cpp
+++ b/tutorial/cpp/2-IVFFlat.cpp
@@ -61,13 +61,10 @@ int main() {
             printf("\n");
         }
 
-        index.nprobe = 10;
-        index.search(nq, xq, k, D, I);
-
-        printf("I=\n");
+        printf("D=\n");
         for (int i = nq - 5; i < nq; i++) {
             for (int j = 0; j < k; j++)
-                printf("%5zd ", I[i * k + j]);
+                printf("%5f ", D[i * k + j]);
             printf("\n");
         }
 
diff --git a/tutorial/cpp/6-HNSW.cpp b/tutorial/cpp/6-HNSW.cpp
new file mode 100644
index 0000000000..9bd8cd3faa
--- /dev/null
+++ b/tutorial/cpp/6-HNSW.cpp
@@ -0,0 +1,73 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <random>
+
+#include <faiss/IndexHNSW.h>
+
+using idx_t = faiss::idx_t;
+
+int main() { // tutorial: fill a faiss::IndexHNSWFlat and query it
+    int d = 64;      // dimension
+    int nb = 100000; // database size
+    int nq = 10000;  // nb of queries
+
+    std::mt19937 rng; // default seed: the generated data is reproducible
+    std::uniform_real_distribution<> distrib;
+
+    float* xb = new float[d * nb];
+    float* xq = new float[d * nq];
+    // database vectors: random values, plus i / 1000 on the first component
+    for (int i = 0; i < nb; i++) {
+        for (int j = 0; j < d; j++)
+            xb[d * i + j] = distrib(rng);
+        xb[d * i] += i / 1000.;
+    }
+    // query vectors are generated the same way
+    for (int i = 0; i < nq; i++) {
+        for (int j = 0; j < d; j++)
+            xq[d * i + j] = distrib(rng);
+        xq[d * i] += i / 1000.;
+    }
+
+    int k = 4; // number of neighbors returned per query
+
+    faiss::IndexHNSWFlat index(d, 32); // NOTE(review): 32 is presumably M
+    index.add(nb, xb);                 // vectors are added without train()
+
+    { // search xq
+        idx_t* I = new idx_t[k * nq];
+        float* D = new float[k * nq];
+
+        index.search(nq, xq, k, D, I);
+        // neighbor ids of the last 5 queries
+        printf("I=\n");
+        for (int i = nq - 5; i < nq; i++) {
+            for (int j = 0; j < k; j++)
+                printf("%5zd ", I[i * k + j]);
+            printf("\n");
+        }
+        // matching distances for the same 5 queries
+        printf("D=\n");
+        for (int i = nq - 5; i < nq; i++) {
+            for (int j = 0; j < k; j++)
+                printf("%5f ", D[i * k + j]);
+            printf("\n");
+        }
+
+        delete[] I;
+        delete[] D;
+    }
+
+    delete[] xb;
+    delete[] xq;
+
+    return 0;
+}
diff --git a/tutorial/cpp/7-PQFastScan.cpp b/tutorial/cpp/7-PQFastScan.cpp
new file mode 100644
index 0000000000..4cdfea052e
--- /dev/null
+++ b/tutorial/cpp/7-PQFastScan.cpp
@@ -0,0 +1,75 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <random>
+
+#include <faiss/IndexPQFastScan.h>
+
+using idx_t = faiss::idx_t;
+
+int main() { // tutorial: train, fill and query a faiss::IndexPQFastScan
+    int d = 64;      // dimension
+    int nb = 100000; // database size
+    int nq = 10000;  // nb of queries
+
+    std::mt19937 rng; // default seed: the generated data is reproducible
+    std::uniform_real_distribution<> distrib;
+
+    float* xb = new float[d * nb];
+    float* xq = new float[d * nq];
+    // database vectors: random values, plus i / 1000 on the first component
+    for (int i = 0; i < nb; i++) {
+        for (int j = 0; j < d; j++) {
+            xb[d * i + j] = distrib(rng);
+        }
+        xb[d * i] += i / 1000.;
+    }
+    // query vectors are generated the same way
+    for (int i = 0; i < nq; i++) {
+        for (int j = 0; j < d; j++) {
+            xq[d * i + j] = distrib(rng);
+        }
+        xq[d * i] += i / 1000.;
+    }
+
+    int m = 8;     // number of PQ sub-quantizers
+    int n_bit = 4; // bits per sub-quantizer code
+
+    faiss::IndexPQFastScan index(d, m, n_bit);
+    printf("Index is trained? %s\n", index.is_trained ? "true" : "false");
+    index.train(nb, xb);
+    printf("Index is trained? %s\n", index.is_trained ? "true" : "false");
+    index.add(nb, xb);
+
+    int k = 4; // number of neighbors returned per query
+
+    { // search xq
+        idx_t* I = new idx_t[k * nq];
+        float* D = new float[k * nq];
+
+        index.search(nq, xq, k, D, I);
+        // neighbor ids of the last 5 queries
+        printf("I=\n");
+        for (int i = nq - 5; i < nq; i++) {
+            for (int j = 0; j < k; j++) {
+                printf("%5zd ", I[i * k + j]);
+            }
+            printf("\n");
+        }
+
+        delete[] I;
+        delete[] D;
+    }
+
+    delete[] xb;
+    delete[] xq;
+
+    return 0;
+}
diff --git a/tutorial/cpp/8-PQFastScanRefine.cpp b/tutorial/cpp/8-PQFastScanRefine.cpp
new file mode 100644
index 0000000000..2435d94d2c
--- /dev/null
+++ b/tutorial/cpp/8-PQFastScanRefine.cpp
@@ -0,0 +1,84 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <random>
+
+#include <faiss/IndexPQFastScan.h>
+#include <faiss/IndexRefine.h>
+
+using idx_t = faiss::idx_t;
+
+int main() { // tutorial: IndexPQFastScan wrapped in an IndexRefineFlat
+    int d = 64;      // dimension
+    int nb = 100000; // database size
+    int nq = 10000;  // nb of queries
+
+    std::mt19937 rng; // default-constructed: same data on every run
+    std::uniform_real_distribution<> distrib;
+
+    float* xb = new float[(int)(d * nb)]; // nb database vectors, row-major
+    float* xq = new float[(int)(d * nq)]; // nq query vectors, row-major
+
+    for (int i = 0; i < nb; i++) {
+        for (int j = 0; j < d; j++) {
+            xb[d * i + j] = distrib(rng);
+        }
+        xb[d * i] += i / 1000.; // first component grows with the row id
+    }
+
+    for (int i = 0; i < nq; i++) {
+        for (int j = 0; j < d; j++) {
+            xq[d * i + j] = distrib(rng);
+        }
+        xq[d * i] += i / 1000.;
+    }
+
+    int m = 8;     // number of sub-vectors per vector
+    int n_bit = 4; // bits used to encode each sub-vector
+
+    // refine index layered on top of the PQFastScan index
+    faiss::IndexPQFastScan index(d, m, n_bit);
+    faiss::IndexRefineFlat index_refine(&index);
+
+    printf("Index is trained? %s\n",
+           index_refine.is_trained ? "true" : "false");
+    index_refine.train(nb, xb);
+    printf("Index is trained? %s\n",
+           index_refine.is_trained ? "true" : "false");
+    index_refine.add(nb, xb);
+
+    int k = 4; // number of neighbors requested per query
+    { // search xq
+        idx_t* I = new idx_t[(int)(k * nq)];
+        float* D = new float[(int)(k * nq)];
+        float k_factor = 3;
+        // The search parameters live on the stack: there is nothing to
+        // delete afterwards, and nothing leaks if search() throws.
+        faiss::IndexRefineSearchParameters params;
+        params.k_factor = k_factor;
+        index_refine.search(nq, xq, k, D, I, &params);
+
+        printf("I=\n"); // ids found for the last 5 queries
+        for (int i = nq - 5; i < nq; i++) {
+            for (int j = 0; j < k; j++) {
+                printf("%5lld ", (long long)I[i * k + j]); // idx_t is int64_t
+            }
+            printf("\n");
+        }
+
+        delete[] I;
+        delete[] D;
+    }
+
+    delete[] xb;
+    delete[] xq;
+
+    return 0;
+}
diff --git a/tutorial/cpp/9-RefineComparison.cpp b/tutorial/cpp/9-RefineComparison.cpp
new file mode 100644
index 0000000000..d7fbc90aec
--- /dev/null
+++ b/tutorial/cpp/9-RefineComparison.cpp
@@ -0,0 +1,104 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <random>
+
+#include <faiss/IndexPQFastScan.h>
+#include <faiss/IndexRefine.h>
+#include <faiss/index_factory.h>
+using idx_t = faiss::idx_t;
+
+int main() { // tutorial: compare SQfp16 and SQ8 refinement of a PQ index
+    int d = 64;      // dimension
+    int nb = 100000; // database size
+    int nq = 10000;  // nb of queries
+
+    std::mt19937 rng; // default-constructed: same data on every run
+    std::uniform_real_distribution<> distrib;
+
+    float* xb = new float[(int)(d * nb)]; // nb database vectors, row-major
+    float* xq = new float[(int)(d * nq)]; // nq query vectors, row-major
+
+    for (int i = 0; i < nb; i++) {
+        for (int j = 0; j < d; j++) {
+            xb[d * i + j] = distrib(rng);
+        }
+        xb[d * i] += i / 1000.; // first component grows with the row id
+    }
+
+    for (int i = 0; i < nq; i++) {
+        for (int j = 0; j < d; j++) {
+            xq[d * i + j] = distrib(rng);
+        }
+        xq[d * i] += i / 1000.;
+    }
+
+    // Constructing the refine PQ index with SQfp16 with index factory
+    faiss::Index* index_fp16;
+    index_fp16 = faiss::index_factory(
+            d, "PQ32x4fs,Refine(SQfp16)", faiss::METRIC_L2);
+    index_fp16->train(nb, xb);
+    index_fp16->add(nb, xb);
+
+    // Constructing the refine PQ index with SQ8
+    faiss::Index* index_sq8;
+    index_sq8 =
+            faiss::index_factory(d, "PQ32x4fs,Refine(SQ8)", faiss::METRIC_L2);
+    index_sq8->train(nb, xb);
+    index_sq8->add(nb, xb);
+
+    int k = 10; // number of neighbors requested per query
+    { // search xq
+        idx_t* I_fp16 = new idx_t[(int)(k * nq)];
+        float* D_fp16 = new float[(int)(k * nq)];
+        idx_t* I_sq8 = new idx_t[(int)(k * nq)];
+        float* D_sq8 = new float[(int)(k * nq)];
+
+        // Parameterization on k factor while doing search for index refinement
+        float k_factor = 3;
+        // stack object: nothing to delete, nothing leaks if search() throws
+        faiss::IndexRefineSearchParameters params;
+        params.k_factor = k_factor;
+
+        // Perform index search using different index refinement
+        index_fp16->search(nq, xq, k, D_fp16, I_fp16, &params);
+        index_sq8->search(nq, xq, k, D_sq8, I_sq8, &params);
+
+        printf("I_fp16=\n"); // idx_t (int64_t) ids, printed as long long
+        for (int i = nq - 5; i < nq; i++) {
+            for (int j = 0; j < k; j++) {
+                printf("%5lld ", (long long)I_fp16[i * k + j]);
+            }
+            printf("\n");
+        }
+
+        printf("I_sq8=\n");
+        for (int i = nq - 5; i < nq; i++) {
+            for (int j = 0; j < k; j++) {
+                printf("%5lld ", (long long)I_sq8[i * k + j]);
+            }
+            printf("\n");
+        }
+
+        delete[] I_fp16;
+        delete[] D_fp16;
+        delete[] I_sq8;
+        delete[] D_sq8;
+
+        // the indexes obtained from index_factory are released by the caller
+        delete index_fp16;
+        delete index_sq8;
+    }
+
+    delete[] xb;
+    delete[] xq;
+
+    return 0;
+}
diff --git a/tutorial/cpp/CMakeLists.txt b/tutorial/cpp/CMakeLists.txt
index 7361b33a03..f964b3dda9 100644
--- a/tutorial/cpp/CMakeLists.txt
+++ b/tutorial/cpp/CMakeLists.txt
@@ -18,3 +18,15 @@ target_link_libraries(4-GPU PRIVATE faiss)
 
 add_executable(5-Multiple-GPUs EXCLUDE_FROM_ALL 5-Multiple-GPUs.cpp)
 target_link_libraries(5-Multiple-GPUs PRIVATE faiss)
+
+add_executable(6-HNSW EXCLUDE_FROM_ALL 6-HNSW.cpp) # kept out of default build
+target_link_libraries(6-HNSW PRIVATE faiss)
+
+add_executable(7-PQFastScan EXCLUDE_FROM_ALL 7-PQFastScan.cpp)
+target_link_libraries(7-PQFastScan PRIVATE faiss)
+
+add_executable(8-PQFastScanRefine EXCLUDE_FROM_ALL 8-PQFastScanRefine.cpp)
+target_link_libraries(8-PQFastScanRefine PRIVATE faiss)
+
+add_executable(9-RefineComparison EXCLUDE_FROM_ALL 9-RefineComparison.cpp)
+target_link_libraries(9-RefineComparison PRIVATE faiss)
diff --git a/tutorial/python/7-PQFastScan.py b/tutorial/python/7-PQFastScan.py
new file mode 100644
index 0000000000..34d7a34ac1
--- /dev/null
+++ b/tutorial/python/7-PQFastScan.py
@@ -0,0 +1,35 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import faiss
+import numpy as np
+
+d = 64                           # dimension
+nb = 100000                      # database size
+nq = 10000                       # nb of queries
+np.random.seed(1234)             # make reproducible
+xb = np.random.random((nb, d)).astype('float32')    # nb database vectors
+xb[:, 0] += np.arange(nb) / 1000.
+xq = np.random.random((nq, d)).astype('float32')    # nq query vectors
+xq[:, 0] += np.arange(nq) / 1000.
+
+m = 8       # number of sub-vectors each vector is split into
+k = 4       # number of nearest neighbors requested per query
+n_bit = 4   # each sub-vector is encoded as 4 bits
+bbs = 32    # build block size ( bbs % 32 == 0 ) for PQ
+# construct FastScan Index
+index = faiss.IndexPQFastScan(d, m, n_bit, faiss.METRIC_L2, bbs)
+
+assert not index.is_trained
+index.train(xb)     # train the index on the mock database vectors
+assert index.is_trained
+
+index.add(xb)
+D, I = index.search(xb[:5], k)  # sanity check: query with database vectors
+print(I)
+print(D)
+# no nprobe to set here: IndexPQFastScan is not an IVF index
+D, I = index.search(xq, k)     # search
+print(I[-5:])               # neighbors of the 5 last queries
diff --git a/tutorial/python/8-PQFastScanRefine.py b/tutorial/python/8-PQFastScanRefine.py
new file mode 100644
index 0000000000..115a036fa7
--- /dev/null
+++ b/tutorial/python/8-PQFastScanRefine.py
@@ -0,0 +1,38 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import faiss
+import numpy as np
+
+d = 64                           # dimension
+nb = 100000                      # database size
+nq = 10000                       # nb of queries
+np.random.seed(1234)             # make reproducible
+xb = np.random.random((nb, d)).astype('float32')    # nb database vectors
+xb[:, 0] += np.arange(nb) / 1000.
+xq = np.random.random((nq, d)).astype('float32')    # nq query vectors
+xq[:, 0] += np.arange(nq) / 1000.
+
+m = 8  # number of sub-vectors each vector is split into
+k = 4  # number of nearest neighbors requested from the base index
+n_bit = 4  # each sub-vector is encoded as 4 bits
+bbs = 32  # build block size ( bbs % 32 == 0 ) for PQ
+
+# construct FastScan and run index refinement
+index = faiss.IndexPQFastScan(d, m, n_bit, faiss.METRIC_L2, bbs)
+index_refine = faiss.IndexRefineFlat(index)
+
+assert not index_refine.is_trained
+index_refine.train(xb)  # train the index on the mock database vectors
+assert index_refine.is_trained
+
+index_refine.add(xb)
+params = faiss.IndexRefineSearchParameters(k_factor=3)
+D, I = index_refine.search(xq[:5], 10, params=params)
+print(I)
+print(D)
+# same queries against the wrapped FastScan index alone (no nprobe: not IVF)
+D, I = index.search(xq[:5], k)  # search
+print(I[-5:])
diff --git a/tutorial/python/9-RefineComparison.py b/tutorial/python/9-RefineComparison.py
new file mode 100644
index 0000000000..6fa69f33d9
--- /dev/null
+++ b/tutorial/python/9-RefineComparison.py
@@ -0,0 +1,42 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import faiss
+
+from faiss.contrib.evaluation import knn_intersection_measure
+from faiss.contrib import datasets
+
+d = 64                           # dimension
+# d-dim float32 vectors: 50000 for training, 100000 database, 10000 queries
+ds = datasets.SyntheticDataset(d, 50000, 100000, 10000)
+xt, xb, xq = ds.get_train(), ds.get_database(), ds.get_queries()
+
+# Constructing the refine PQ index with SQfp16 with index factory
+index_fp16 = faiss.index_factory(d, 'PQ32x4fs,Refine(SQfp16)')
+index_fp16.train(xt)
+index_fp16.add(xb)
+
+# Constructing the refine PQ index with SQ8
+index_sq8 = faiss.index_factory(d, 'PQ32x4fs,Refine(SQ8)')
+index_sq8.train(xt)
+index_sq8.add(xb)
+
+# Parameterization on k factor while doing search for index refinement
+k_factor = 3.0
+params = faiss.IndexRefineSearchParameters(k_factor=k_factor)
+
+# Perform index search using different index refinement
+D_fp16, I_fp16 = index_fp16.search(xq, 100, params=params)
+D_sq8, I_sq8 = index_sq8.search(xq, 100, params=params)
+
+# Calculating knn intersection measure for different index types on refinement
+gt = ds.get_groundtruth()  # fetched once and shared by both measures
+KIM_fp16 = knn_intersection_measure(I_fp16, gt)
+KIM_sq8 = knn_intersection_measure(I_sq8, gt)
+
+# KNN intersection measure accuracy shows that choosing SQ8 impacts accuracy
+assert (KIM_fp16 > KIM_sq8)
+print(I_sq8[:5])
+print(I_fp16[:5])