From a413963e56c344450d8933bd1797181ca8dcfe71 Mon Sep 17 00:00:00 2001 From: Alexandr Guzhva Date: Mon, 24 Jun 2024 17:34:38 -0400 Subject: [PATCH] propagate faiss changes from (24 Jun 2024) Signed-off-by: Alexandr Guzhva --- tests/faiss/CMakeLists.txt | 2 + thirdparty/faiss/.circleci/config.yml | 428 +------ .../.github/actions/build_cmake/action.yml | 105 ++ .../.github/actions/build_conda/action.yml | 96 ++ thirdparty/faiss/.github/workflows/build.yml | 244 ++++ .../faiss/.github/workflows/nightly.yml | 139 +++ .../benchs/bench_cppcontrib_sa_decode.cpp | 175 +-- thirdparty/faiss/benchs/bench_fw/benchmark.py | 764 +++++++++--- .../faiss/benchs/bench_fw/benchmark_io.py | 6 +- .../faiss/benchs/bench_fw/descriptors.py | 216 +++- thirdparty/faiss/benchs/bench_fw/index.py | 123 +- thirdparty/faiss/benchs/bench_fw/optimize.py | 18 +- thirdparty/faiss/benchs/bench_fw_codecs.py | 10 +- thirdparty/faiss/benchs/bench_fw_ivf.py | 31 +- .../faiss/benchs/bench_fw_notebook.ipynb | 1059 +++++++++-------- thirdparty/faiss/benchs/bench_fw_optimize.py | 6 +- thirdparty/faiss/benchs/bench_fw_range.py | 20 +- .../faiss/c_api/IndexScalarQuantizer_c.h | 3 + .../faiss/conda/faiss-gpu-raft/meta.yaml | 17 +- thirdparty/faiss/conda/faiss-gpu/build-lib.sh | 6 + thirdparty/faiss/conda/faiss-gpu/meta.yaml | 6 +- thirdparty/faiss/conda/faiss/meta.yaml | 4 +- thirdparty/faiss/contrib/datasets.py | 6 +- thirdparty/faiss/contrib/factory_tools.py | 3 + thirdparty/faiss/contrib/vecs_io.py | 8 +- thirdparty/faiss/faiss/IndexFlat.cpp | 14 +- thirdparty/faiss/faiss/IndexHNSW.cpp | 174 ++- thirdparty/faiss/faiss/IndexHNSW.h | 44 +- thirdparty/faiss/faiss/IndexIVFFastScan.cpp | 6 - thirdparty/faiss/faiss/IndexNNDescent.cpp | 29 - .../faiss/faiss/IndexScalarQuantizer.cpp | 4 +- thirdparty/faiss/faiss/MetricType.h | 4 + thirdparty/faiss/faiss/gpu/GpuIcmEncoder.cu | 12 +- .../faiss/faiss/impl/AuxIndexStructures.cpp | 25 + .../faiss/faiss/impl/AuxIndexStructures.h | 8 + 
.../faiss/faiss/impl/DistanceComputer.h | 46 + thirdparty/faiss/faiss/impl/HNSW.cpp | 72 +- thirdparty/faiss/faiss/impl/HNSW.h | 12 +- thirdparty/faiss/faiss/impl/NNDescent.cpp | 21 +- thirdparty/faiss/faiss/impl/NSG.cpp | 29 - .../faiss/faiss/impl/ScalarQuantizer.cpp | 7 + thirdparty/faiss/faiss/impl/ScalarQuantizer.h | 5 +- .../faiss/faiss/impl/ScalarQuantizerCodec.h | 88 ++ .../faiss/impl/ScalarQuantizerCodec_avx.h | 81 ++ .../faiss/impl/ScalarQuantizerCodec_avx512.h | 93 ++ .../faiss/impl/ScalarQuantizerCodec_neon.h | 102 +- .../impl/code_distance/code_distance-avx2.h | 5 + thirdparty/faiss/faiss/impl/index_read.cpp | 20 +- thirdparty/faiss/faiss/impl/index_write.cpp | 33 +- thirdparty/faiss/faiss/index_factory.cpp | 6 +- thirdparty/faiss/faiss/index_io.h | 11 +- .../faiss/faiss/invlists/InvertedLists.cpp | 72 +- .../faiss/faiss/invlists/InvertedLists.h | 27 +- thirdparty/faiss/faiss/utils/bf16.h | 36 + .../faiss/faiss/utils/extra_distances-inl.h | 32 + .../faiss/faiss/utils/extra_distances.cpp | 60 +- .../faiss/faiss/utils/extra_distances.h | 5 +- thirdparty/faiss/faiss/utils/simdlib_neon.h | 10 +- thirdparty/faiss/tests/CMakeLists.txt | 2 + thirdparty/faiss/tests/common_faiss_tests.py | 1 - .../faiss/tests/test_binary_hashindex.py | 10 - thirdparty/faiss/tests/test_build_blocks.py | 15 - thirdparty/faiss/tests/test_callback.cpp | 37 + thirdparty/faiss/tests/test_callback_py.py | 32 + thirdparty/faiss/tests/test_clustering.py | 3 - .../tests/test_common_ivf_empty_index.cpp | 144 +++ thirdparty/faiss/tests/test_contrib.py | 2 - .../faiss/tests/test_contrib_with_scipy.py | 2 - .../faiss/tests/test_extra_distances.py | 27 + thirdparty/faiss/tests/test_fast_scan.py | 3 - thirdparty/faiss/tests/test_graph_based.py | 43 +- thirdparty/faiss/tests/test_index.py | 11 +- thirdparty/faiss/tests/test_index_accuracy.py | 55 +- thirdparty/faiss/tests/test_index_binary.py | 6 +- .../faiss/tests/test_index_composite.py | 3 - thirdparty/faiss/tests/test_io.py | 1 - 
thirdparty/faiss/tests/test_ivf_index.cpp | 2 + thirdparty/faiss/tests/test_ivflib.py | 1 - .../tests/test_local_search_quantizer.py | 8 +- thirdparty/faiss/tests/test_merge_index.py | 1 - thirdparty/faiss/tests/test_meta_index.py | 7 - thirdparty/faiss/tests/test_partition.py | 6 - .../faiss/tests/test_product_quantizer.py | 2 - .../faiss/tests/test_residual_quantizer.py | 17 - thirdparty/faiss/tests/test_rowwise_minmax.py | 1 - thirdparty/faiss/tests/test_search_params.py | 1 - .../faiss/tests/test_standalone_codec.py | 12 +- thirdparty/faiss/tutorial/cpp/1-Flat.cpp | 4 +- thirdparty/faiss/tutorial/cpp/2-IVFFlat.cpp | 7 +- thirdparty/faiss/tutorial/cpp/6-HNSW.cpp | 73 ++ .../faiss/tutorial/cpp/7-PQFastScan.cpp | 75 ++ .../faiss/tutorial/cpp/8-PQFastScanRefine.cpp | 84 ++ .../faiss/tutorial/cpp/9-RefineComparison.cpp | 104 ++ thirdparty/faiss/tutorial/cpp/CMakeLists.txt | 12 + .../faiss/tutorial/python/7-PQFastScan.py | 35 + .../tutorial/python/8-PQFastScanRefine.py | 38 + .../tutorial/python/9-RefineComparison.py | 42 + 97 files changed, 3900 insertions(+), 1717 deletions(-) create mode 100644 thirdparty/faiss/.github/actions/build_cmake/action.yml create mode 100644 thirdparty/faiss/.github/actions/build_conda/action.yml create mode 100644 thirdparty/faiss/.github/workflows/build.yml create mode 100644 thirdparty/faiss/.github/workflows/nightly.yml create mode 100644 thirdparty/faiss/faiss/utils/bf16.h create mode 100644 thirdparty/faiss/tests/test_callback.cpp create mode 100644 thirdparty/faiss/tests/test_callback_py.py create mode 100644 thirdparty/faiss/tests/test_common_ivf_empty_index.cpp create mode 100644 thirdparty/faiss/tutorial/cpp/6-HNSW.cpp create mode 100644 thirdparty/faiss/tutorial/cpp/7-PQFastScan.cpp create mode 100644 thirdparty/faiss/tutorial/cpp/8-PQFastScanRefine.cpp create mode 100644 thirdparty/faiss/tutorial/cpp/9-RefineComparison.cpp create mode 100644 thirdparty/faiss/tutorial/python/7-PQFastScan.py create mode 100644 
thirdparty/faiss/tutorial/python/8-PQFastScanRefine.py create mode 100644 thirdparty/faiss/tutorial/python/9-RefineComparison.py diff --git a/tests/faiss/CMakeLists.txt b/tests/faiss/CMakeLists.txt index dd15ddc2c..e4e0c6c2e 100644 --- a/tests/faiss/CMakeLists.txt +++ b/tests/faiss/CMakeLists.txt @@ -26,6 +26,8 @@ set(FAISS_TEST_SRCS ../../thirdparty/faiss/tests/test_fastscan_perf.cpp ../../thirdparty/faiss/tests/test_ivf_index.cpp ../../thirdparty/faiss/tests/test_disable_pq_sdc_tables.cpp + ../../thirdparty/faiss/tests/test_common_ivf_empty_index.cpp + ../../thirdparty/faiss/tests/test_callback.cpp ) find_package(GTest REQUIRED) diff --git a/thirdparty/faiss/.circleci/config.yml b/thirdparty/faiss/.circleci/config.yml index 549e4a279..033093915 100644 --- a/thirdparty/faiss/.circleci/config.yml +++ b/thirdparty/faiss/.circleci/config.yml @@ -5,185 +5,8 @@ executors: docker: - image: continuumio/miniconda3 resource_class: large - linux-x86_64-gpu: - environment: - CONDA_ARCH: Linux-x86_64 - machine: - image: linux-cuda-12:default - resource_class: gpu.nvidia.medium - linux-arm64-cpu: - environment: - CONDA_ARCH: Linux-aarch64 - machine: - image: ubuntu-2204:current - resource_class: arm.medium - macosx-arm64-cpu: - environment: - CONDA_ARCH: MacOSX-arm64 - macos: - xcode: 14.2.0 # minimum supported for M1 - resource_class: macos.m1.large.gen1 - windows-x86_64-cpu: - machine: - image: windows-server-2019-vs2019:2023.04.1 - shell: bash.exe - resource_class: windows.medium jobs: - format: - docker: - - image: ubuntu:22.04 - steps: - - checkout - - run: - name: Install clang-format - command: | - apt-get update - apt-get install -y git-core clang-format-11 - - run: - name: Verify clang-format - command: | - git ls-files | grep -E '\.(cpp|h|cu|cuh)$' | xargs clang-format-11 -i - if git diff --quiet; then - echo "Formatting OK!" - else - echo "Formatting not OK!" 
- echo "------------------" - git --no-pager diff --color - exit 1 - fi - - build_conda: - parameters: - label: - type: string - default: "" - cuda: - type: string - default: "" - raft: - type: string - default: "" - cuda_archs: - type: string - default: "" - compiler_version: - type: string - default: "" - exec: - type: executor - executor: << parameters.exec >> - environment: - OMP_NUM_THREADS: 10 - PACKAGE_TYPE: <> - CUDA_ARCHS: <> - steps: - - checkout - - run: - name: Install conda - command: | - if [ -n "${CONDA_ARCH}" ] - then - curl https://repo.anaconda.com/miniconda/Miniconda3-latest-${CONDA_ARCH}.sh --output miniconda.sh - bash miniconda.sh -b -p $HOME/miniconda - ~/miniconda/bin/conda init - fi - - run: - name: Install conda build tools - command: | - # conda config --set solver libmamba - # conda config --set verbosity 3 - conda update -y -q conda - conda install -y -q conda-build - - when: - condition: << parameters.label >> - steps: - - run: - name: Enable anaconda uploads - command: | - conda install -y -q anaconda-client - conda config --set anaconda_upload yes - - when: - condition: - and: - - not: << parameters.label >> - - not: << parameters.cuda >> - steps: - - run: - name: Conda build (CPU) - no_output_timeout: 30m - command: | - cd conda - conda build faiss --python 3.11 -c pytorch - - when: - condition: - and: - - << parameters.label >> - - not: << parameters.cuda >> - steps: - - run: - name: Conda build (CPU) w/ anaconda upload - no_output_timeout: 30m - command: | - cd conda - conda build faiss --user pytorch --label <> -c pytorch - - when: - condition: - and: - - not: << parameters.label >> - - << parameters.cuda >> - - not: << parameters.raft >> - steps: - - run: - name: Conda build (GPU) - no_output_timeout: 60m - command: | - cd conda - conda build faiss-gpu --variants '{ "cudatoolkit": "<>", "c_compiler_version": "<>", "cxx_compiler_version": "<>" }' \ - -c pytorch -c nvidia/label/cuda-<> -c nvidia - - when: - condition: - and: - - << 
parameters.label >> - - << parameters.cuda >> - - not: << parameters.raft >> - steps: - - run: - name: Conda build (GPU) w/ anaconda upload - no_output_timeout: 60m - command: | - cd conda - conda build faiss-gpu --variants '{ "cudatoolkit": "<>", "c_compiler_version": "<>", "cxx_compiler_version": "<>" }' \ - --user pytorch --label <> -c pytorch -c nvidia/label/cuda-<> -c nvidia - - when: - condition: - and: - - not: << parameters.label >> - - << parameters.cuda >> - - << parameters.raft >> - steps: - - run: - name: Conda build (GPU w/ RAFT) - no_output_timeout: 60m - command: | - cd conda - conda build faiss-gpu-raft --variants '{ "cudatoolkit": "<>", "c_compiler_version": "<>", "cxx_compiler_version": "<>" }' \ - -c pytorch -c nvidia/label/cuda-<> -c nvidia -c rapidsai -c rapidsai-nightly -c conda-forge - - when: - condition: - and: - - << parameters.label >> - - << parameters.cuda >> - - << parameters.raft >> - steps: - - run: - name: Conda build (GPU w/ RAFT) w/ anaconda upload - no_output_timeout: 60m - command: | - cd conda - conda build faiss-gpu-raft --variants '{ "cudatoolkit": "<>", "c_compiler_version": "<>", "cxx_compiler_version": "<>" }' \ - --user pytorch --label <> -c pytorch -c nvidia/label/cuda-<> -c nvidia -c rapidsai -c rapidsai-nightly -c conda-forge - build_cmake: parameters: exec: @@ -191,12 +14,6 @@ jobs: opt_level: type: string default: generic - gpu: - type: string - default: "OFF" - raft: - type: string - default: "OFF" executor: << parameters.exec >> environment: OMP_NUM_THREADS: 10 @@ -217,32 +34,10 @@ jobs: command: | conda config --set solver libmamba conda update -y -q conda - - when: - condition: - equal: [ "OFF", << parameters.raft >> ] - steps: - - run: - name: Install env using main channel - command: | - conda install -y -q python=3.11 cmake make swig mkl=2023 mkl-devel=2023 numpy scipy pytest gxx_linux-64=11.2 sysroot_linux-64 - - when: - condition: - equal: [ "ON", << parameters.raft >> ] - steps: - - run: - name: Install env 
using conda-forge channel - command: | - conda install -y -q python=3.11 cmake make swig mkl=2023 mkl-devel=2023 numpy scipy pytest gxx_linux-64=11.2 sysroot_linux-64=2.28 libraft cuda-version=11.8 cuda-toolkit -c rapidsai-nightly -c "nvidia/label/cuda-11.8.0" -c conda-forge - - when: - condition: - and: - - equal: [ "ON", << parameters.gpu >> ] - - equal: [ "OFF", << parameters.raft >> ] - steps: - - run: - name: Install CUDA - command: | - conda install -y -q cuda-toolkit -c "nvidia/label/cuda-11.8.0" + - run: + name: Install env using main channel + command: | + conda install -y -q python=3.11 cmake make swig mkl=2023 mkl-devel=2023 numpy scipy pytest gxx_linux-64=11.2 sysroot_linux-64 - run: name: Build all targets no_output_timeout: 30m @@ -252,8 +47,8 @@ jobs: cmake -B build \ -DBUILD_TESTING=ON \ -DBUILD_SHARED_LIBS=ON \ - -DFAISS_ENABLE_GPU=<< parameters.gpu >> \ - -DFAISS_ENABLE_RAFT=<< parameters.raft >> \ + -DFAISS_ENABLE_GPU=OFF \ + -DFAISS_ENABLE_RAFT=OFF \ -DFAISS_OPT_LEVEL=<< parameters.opt_level >> \ -DFAISS_ENABLE_C_API=ON \ -DPYTHON_EXECUTABLE=$(which python) \ @@ -272,38 +67,12 @@ jobs: command: | cd build/faiss/python python setup.py install - - when: - condition: - equal: [ "OFF", << parameters.gpu >> ] - steps: - - run: - name: Python tests (CPU only) - command: | - conda install -y -q pytorch -c pytorch - pytest --junitxml=test-results/pytest/results.xml tests/test_*.py - pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py - - when: - condition: - equal: [ "ON", << parameters.gpu >> ] - steps: - - run: - name: Python tests (CPU + GPU) - command: | - conda install -y -q pytorch pytorch-cuda=11.8 -c pytorch -c nvidia/label/cuda-11.8.0 - pytest --junitxml=test-results/pytest/results.xml tests/test_*.py - pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py - cp tests/common_faiss_tests.py faiss/gpu/test - pytest --junitxml=test-results/pytest/results-gpu.xml faiss/gpu/test/test_*.py - pytest 
--junitxml=test-results/pytest/results-gpu-torch.xml faiss/gpu/test/torch_*.py - - when: - condition: - equal: [ "avx2", << parameters.opt_level >> ] - steps: - - run: - name: Test avx2 loading - command: | - FAISS_DISABLE_CPU_FEATURES=AVX2 LD_DEBUG=libs python -c "import faiss" 2>&1 | grep faiss.so - LD_DEBUG=libs python -c "import faiss" 2>&1 | grep faiss_avx2.so + - run: + name: Python tests (CPU only) + command: | + conda install -y -q pytorch -c pytorch + pytest --junitxml=test-results/pytest/results.xml tests/test_*.py + pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py - store_test_results: path: test-results @@ -311,180 +80,7 @@ workflows: version: 2 build: jobs: - - format: - name: Format - - build_cmake: - name: Linux x86_64 (cmake) - exec: linux-x86_64-cpu - - build_cmake: - name: Linux x86_64 AVX2 (cmake) - exec: linux-x86_64-cpu - opt_level: "avx2" - build_cmake: name: Linux x86_64 AVX512 (cmake) exec: linux-x86_64-cpu opt_level: "avx512" - - build_cmake: - name: Linux x86_64 GPU (cmake) - exec: linux-x86_64-gpu - gpu: "ON" - requires: - - Linux x86_64 AVX2 (cmake) - - build_cmake: - name: Linux x86_64 GPU w/ RAFT (cmake) - exec: linux-x86_64-gpu - gpu: "ON" - raft: "ON" - requires: - - Linux x86_64 GPU (cmake) - - build_conda: - name: Linux x86_64 (conda) - exec: linux-x86_64-cpu - - build_conda: - name: Windows x86_64 (conda) - exec: windows-x86_64-cpu - - build_conda: - name: Linux arm64 (conda) - exec: linux-arm64-cpu - - build_conda: - name: Linux x86_64 packages - exec: linux-x86_64-cpu - label: main - filters: - tags: - only: /^v.*/ - branches: - ignore: /.*/ - - build_conda: - name: Linux x86_64 GPU packages (CUDA 11.4.4) - exec: linux-x86_64-gpu - label: main - cuda: "11.4.4" - cuda_archs: "60-real;61-real;62-real;70-real;72-real;75-real;80;86-real" - compiler_version: "11.2" - filters: - tags: - only: /^v.*/ - branches: - ignore: /.*/ - - build_conda: - name: Linux x86_64 GPU w/ RAFT packages (CUDA 11.8.0) - exec: 
linux-x86_64-gpu - label: main - raft: "ON" - cuda: "11.8.0" - cuda_archs: "70-real;72-real;75-real;80;86-real" - compiler_version: "11.2" - filters: - tags: - only: /^v.*/ - branches: - ignore: /.*/ - - build_conda: - name: Linux x86_64 GPU packages (CUDA 12.1.1) - exec: linux-x86_64-gpu - label: main - cuda: "12.1.1" - cuda_archs: "70-real;72-real;75-real;80;86-real" - compiler_version: "11.2" - filters: - tags: - only: /^v.*/ - branches: - ignore: /.*/ - - build_conda: - name: Linux x86_64 GPU w/ RAFT packages (CUDA 12.1.1) - exec: linux-x86_64-gpu - label: main - raft: "ON" - cuda: "12.1.1" - cuda_archs: "70-real;72-real;75-real;80;86-real" - compiler_version: "11.2" - filters: - tags: - only: /^v.*/ - branches: - ignore: /.*/ - - build_conda: - name: Windows x86_64 packages - exec: windows-x86_64-cpu - label: main - filters: - tags: - only: /^v.*/ - branches: - ignore: /.*/ - - build_conda: - name: OSX arm64 packages - exec: macosx-arm64-cpu - label: main - filters: - tags: - only: /^v.*/ - branches: - ignore: /.*/ - - build_conda: - name: Linux arm64 packages - exec: linux-arm64-cpu - label: main - filters: - tags: - only: /^v.*/ - branches: - ignore: /.*/ - - nightly: - triggers: - - schedule: - cron: "0 0 * * *" - filters: - branches: - only: - - main - jobs: - - build_conda: - name: Linux x86_64 nightlies - exec: linux-x86_64-cpu - label: nightly - - build_conda: - name: Linux x86_64 GPU nightlies (CUDA 11.4.4) - exec: linux-x86_64-gpu - label: nightly - cuda: "11.4.4" - cuda_archs: "60-real;61-real;62-real;70-real;72-real;75-real;80;86-real" - compiler_version: "11.2" - - build_conda: - name: Linux x86_64 GPU w/ RAFT nightlies (CUDA 11.8.0) - exec: linux-x86_64-gpu - label: nightly - raft: "ON" - cuda: "11.8.0" - cuda_archs: "70-real;72-real;75-real;80;86-real" - compiler_version: "11.2" - - build_conda: - name: Linux x86_64 GPU nightlies (CUDA 12.1.1) - exec: linux-x86_64-gpu - label: nightly - cuda: "12.1.1" - cuda_archs: 
"70-real;72-real;75-real;80;86-real" - compiler_version: "11.2" - - build_conda: - name: Linux x86_64 GPU w/ RAFT nightlies (CUDA 12.1.1) - exec: linux-x86_64-gpu - label: nightly - raft: "ON" - cuda: "12.1.1" - cuda_archs: "70-real;72-real;75-real;80;86-real" - compiler_version: "11.2" - - build_conda: - name: Windows x86_64 nightlies - exec: windows-x86_64-cpu - label: nightly - - build_conda: - name: OSX arm64 nightlies - exec: macosx-arm64-cpu - label: nightly - - build_conda: - name: Linux arm64 nightlies - exec: linux-arm64-cpu - label: nightly diff --git a/thirdparty/faiss/.github/actions/build_cmake/action.yml b/thirdparty/faiss/.github/actions/build_cmake/action.yml new file mode 100644 index 000000000..2bc476add --- /dev/null +++ b/thirdparty/faiss/.github/actions/build_cmake/action.yml @@ -0,0 +1,105 @@ +name: Build cmake +inputs: + opt_level: + description: 'Compile options / optimization level.' + required: false + default: generic + gpu: + description: 'Enable GPU support.' + required: false + default: OFF + raft: + description: 'Enable RAFT support.' 
+ required: false + default: OFF +runs: + using: composite + steps: + - name: Setup miniconda + uses: conda-incubator/setup-miniconda@v3 + with: + python-version: '3.11' + miniconda-version: latest + - name: Configure build environment + shell: bash + run: | + # initialize Conda + conda config --set solver libmamba + conda update -y -q conda + echo "$CONDA/bin" >> $GITHUB_PATH + + # install base packages + conda install -y -q -c conda-forge gxx_linux-64=11.2 sysroot_linux-64=2.28 + conda install -y -q python=3.11 cmake make swig mkl=2023 mkl-devel=2023 numpy scipy pytest + + # install CUDA packages + if [ "${{ inputs.gpu }}" = "ON" ] && [ "${{ inputs.raft }}" = "OFF" ]; then + conda install -y -q cuda-toolkit -c "nvidia/label/cuda-11.8.0" + fi + + # install RAFT packages + if [ "${{ inputs.raft }}" = "ON" ]; then + conda install -y -q libraft cuda-version=11.8 cuda-toolkit -c rapidsai-nightly -c "nvidia/label/cuda-11.8.0" -c conda-forge + fi + + # install test packages + conda install -y pytest + if [ "${{ inputs.gpu }}" = "ON" ]; then + conda install -y -q pytorch pytorch-cuda=11.8 -c pytorch -c nvidia/label/cuda-11.8.0 + else + conda install -y -q pytorch -c pytorch + fi + - name: Build all targets + shell: bash + run: | + eval "$(conda shell.bash hook)" + conda activate + cmake -B build \ + -DBUILD_TESTING=ON \ + -DBUILD_SHARED_LIBS=ON \ + -DFAISS_ENABLE_GPU=${{ inputs.gpu }} \ + -DFAISS_ENABLE_RAFT=${{ inputs.raft }} \ + -DFAISS_OPT_LEVEL=${{ inputs.opt_level }} \ + -DFAISS_ENABLE_C_API=ON \ + -DPYTHON_EXECUTABLE=$CONDA/bin/python \ + -DCMAKE_BUILD_TYPE=Release \ + -DBLA_VENDOR=Intel10_64_dyn \ + -DCMAKE_CUDA_FLAGS="-gencode arch=compute_75,code=sm_75" \ + . 
+ make -k -C build -j$(nproc) + - name: C++ tests + shell: bash + run: | + export GTEST_OUTPUT="xml:$(realpath .)/test-results/googletest/" + make -C build test + - name: Install Python extension + shell: bash + working-directory: build/faiss/python + run: | + $CONDA/bin/python setup.py install + - name: Python tests (CPU only) + if: inputs.gpu == 'OFF' + shell: bash + run: | + pytest --junitxml=test-results/pytest/results.xml tests/test_*.py + pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py + - name: Python tests (CPU + GPU) + if: inputs.gpu == 'ON' + shell: bash + run: | + pytest --junitxml=test-results/pytest/results.xml tests/test_*.py + pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py + cp tests/common_faiss_tests.py faiss/gpu/test + pytest --junitxml=test-results/pytest/results-gpu.xml faiss/gpu/test/test_*.py + pytest --junitxml=test-results/pytest/results-gpu-torch.xml faiss/gpu/test/torch_*.py + - name: Test avx2 loading + if: inputs.opt_level == 'avx2' + shell: bash + run: | + FAISS_DISABLE_CPU_FEATURES=AVX2 LD_DEBUG=libs $CONDA/bin/python -c "import faiss" 2>&1 | grep faiss.so + LD_DEBUG=libs $CONDA/bin/python -c "import faiss" 2>&1 | grep faiss_avx2.so + - name: Upload test results + uses: actions/upload-artifact@v4 + with: + name: test-results-${{ inputs.opt_level }}-${{ inputs.gpu }}-${{ inputs.raft }} + path: test-results diff --git a/thirdparty/faiss/.github/actions/build_conda/action.yml b/thirdparty/faiss/.github/actions/build_conda/action.yml new file mode 100644 index 000000000..982430c35 --- /dev/null +++ b/thirdparty/faiss/.github/actions/build_conda/action.yml @@ -0,0 +1,96 @@ +name: Conda build +description: Builds FAISS inside a Conda environment and uploads to repository when label is provided. +inputs: + label: + description: "The label to be used for uploads to Conda." + default: "" + required: false + cuda: + description: "CUDA toolkit version to use." 
+ default: "" + required: false + raft: + description: "Enable RAFT support." + default: "" + required: false + compiler_version: + description: "compiler_version" + default: "Compiler version for C/C++/CUDA." + required: false +runs: + using: composite + steps: + - name: Choose shell + shell: bash + id: choose_shell + run: | + # Use pwsh on Windows; bash everywhere else + if [ "${{ runner.os }}" != "Windows" ]; then + echo "shell=bash" >> "$GITHUB_OUTPUT" + else + echo "shell=pwsh" >> "$GITHUB_OUTPUT" + fi + - name: Setup miniconda + uses: conda-incubator/setup-miniconda@v3 + with: + python-version: '3.11' + miniconda-version: latest + - name: Install conda build tools + shell: ${{ steps.choose_shell.outputs.shell }} + run: | + conda update -y -q conda + conda install -y -q conda-build + - name: Enable anaconda uploads + if: inputs.label != '' + shell: ${{ steps.choose_shell.outputs.shell }} + env: + PACKAGE_TYPE: ${{ inputs.label }} + run: | + conda install -y -q anaconda-client + conda config --set anaconda_upload yes + - name: Conda build (CPU) + if: inputs.label == '' && inputs.cuda == '' + shell: ${{ steps.choose_shell.outputs.shell }} + working-directory: conda + run: | + conda build faiss --python 3.11 -c pytorch + - name: Conda build (CPU) w/ anaconda upload + if: inputs.label != '' && inputs.cuda == '' + shell: ${{ steps.choose_shell.outputs.shell }} + working-directory: conda + env: + PACKAGE_TYPE: ${{ inputs.label }} + run: | + conda build faiss --user pytorch --label ${{ inputs.label }} -c pytorch + - name: Conda build (GPU) + if: inputs.label == '' && inputs.cuda != '' && inputs.raft == '' + shell: ${{ steps.choose_shell.outputs.shell }} + working-directory: conda + run: | + conda build faiss-gpu --variants '{ "cudatoolkit": "${{ inputs.cuda }}", "c_compiler_version": "${{ inputs.compiler_version }}", "cxx_compiler_version": "${{ inputs.compiler_version }}" }' \ + -c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia + - name: Conda build (GPU) 
w/ anaconda upload + if: inputs.label != '' && inputs.cuda != '' && inputs.raft == '' + shell: ${{ steps.choose_shell.outputs.shell }} + working-directory: conda + env: + PACKAGE_TYPE: ${{ inputs.label }} + run: | + conda build faiss-gpu --variants '{ "cudatoolkit": "${{ inputs.cuda }}", "c_compiler_version": "${{ inputs.compiler_version }}", "cxx_compiler_version": "${{ inputs.compiler_version }}" }' \ + --user pytorch --label ${{ inputs.label }} -c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia + - name: Conda build (GPU w/ RAFT) + if: inputs.label == '' && inputs.cuda != '' && inputs.raft != '' + shell: ${{ steps.choose_shell.outputs.shell }} + working-directory: conda + run: | + conda build faiss-gpu-raft --variants '{ "cudatoolkit": "${{ inputs.cuda }}", "c_compiler_version": "${{ inputs.compiler_version }}", "cxx_compiler_version": "${{ inputs.compiler_version }}" }' \ + -c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia -c rapidsai -c rapidsai-nightly -c conda-forge + - name: Conda build (GPU w/ RAFT) w/ anaconda upload + if: inputs.label != '' && inputs.cuda != '' && inputs.raft != '' + shell: ${{ steps.choose_shell.outputs.shell }} + working-directory: conda + env: + PACKAGE_TYPE: ${{ inputs.label }} + run: | + conda build faiss-gpu-raft --variants '{ "cudatoolkit": "${{ inputs.cuda }}", "c_compiler_version": "${{ inputs.compiler_version }}", "cxx_compiler_version": "${{ inputs.compiler_version }}" }' \ + --user pytorch --label ${{ inputs.label }} -c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia -c rapidsai -c rapidsai-nightly -c conda-forge diff --git a/thirdparty/faiss/.github/workflows/build.yml b/thirdparty/faiss/.github/workflows/build.yml new file mode 100644 index 000000000..bd415dfce --- /dev/null +++ b/thirdparty/faiss/.github/workflows/build.yml @@ -0,0 +1,244 @@ +name: Build +on: + workflow_dispatch: + pull_request: + branches: + - main + push: + tags: + - 'v*' +env: + OMP_NUM_THREADS: '10' + MKL_THREADING_LAYER: 
GNU +jobs: + format: + name: Format + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Install clang-format + run: | + sudo apt-get update -y + sudo apt-get install -y wget + sudo apt install -y lsb-release wget software-properties-common gnupg + wget https://apt.llvm.org/llvm.sh + chmod u+x llvm.sh + sudo ./llvm.sh 18 + sudo apt-get install -y git-core clang-format-18 + - name: Verify clang-format + run: | + git ls-files | grep -E '\.(cpp|h|cu|cuh)$' | xargs clang-format-18 -i + if git diff --quiet; then + echo "Formatting OK!" + else + echo "Formatting not OK!" + echo "------------------" + git --no-pager diff --color + exit 1 + fi + linux-x86_64-cmake: + name: Linux x86_64 (cmake) + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - uses: ./.github/actions/build_cmake + linux-x86_64-AVX2-cmake: + name: Linux x86_64 AVX2 (cmake) + needs: linux-x86_64-cmake + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - uses: ./.github/actions/build_cmake + with: + opt_level: avx2 + linux-x86_64-AVX512-cmake: + name: Linux x86_64 AVX512 (cmake) + if: false # TODO: enable when GitHub Actions adds AVX-512 hosts + needs: linux-x86_64-cmake + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - uses: ./.github/actions/build_cmake + with: + opt_level: avx512 + linux-x86_64-GPU-cmake: + name: Linux x86_64 GPU (cmake) + needs: linux-x86_64-cmake + runs-on: 4-core-ubuntu-gpu-t4 + steps: + - name: Checkout + uses: actions/checkout@v4 + - uses: ./.github/actions/build_cmake + with: + gpu: ON + linux-x86_64-GPU-w-RAFT-cmake: + name: Linux x86_64 GPU w/ RAFT (cmake) + needs: linux-x86_64-cmake + runs-on: 4-core-ubuntu-gpu-t4 + steps: + - name: Checkout + uses: actions/checkout@v4 + - uses: ./.github/actions/build_cmake + with: + gpu: ON + raft: ON + linux-x86_64-conda: + name: Linux x86_64 (conda) + needs: linux-x86_64-cmake + runs-on: ubuntu-latest + 
steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + windows-x86_64-conda: + name: Windows x86_64 (conda) + needs: linux-x86_64-cmake + runs-on: windows-2019 + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + linux-arm64-conda: + name: Linux arm64 (conda) + needs: linux-x86_64-cmake + runs-on: 2-core-ubuntu-arm + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + linux-x86_64-packages: + name: Linux x86_64 packages + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + with: + label: main + linux-x86_64-GPU-packages-CUDA-11-4-4: + name: Linux x86_64 GPU packages (CUDA 11.4.4) + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + runs-on: 4-core-ubuntu-gpu-t4 + env: + CUDA_ARCHS: "60-real;61-real;62-real;70-real;72-real;75-real;80;86-real" + FAISS_FLATTEN_CONDA_INCLUDES: "1" + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + with: + label: main + cuda: "11.4.4" + compiler_version: "11.2" + linux-x86_64-GPU-RAFT-packages-CUDA11-8-0: + name: Linux x86_64 GPU w/ RAFT packages (CUDA 11.8.0) + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + runs-on: 4-core-ubuntu-gpu-t4 + env: + CUDA_ARCHS: "70-real;72-real;75-real;80;86-real" + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + with: + label: main + raft: "ON" + cuda: "11.8.0" + compiler_version: "11.2" + linux-x86_64-GPU-packages-CUDA-12-1-1: + 
name: Linux x86_64 GPU packages (CUDA 12.1.1) + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + runs-on: 4-core-ubuntu-gpu-t4 + env: + CUDA_ARCHS: "70-real;72-real;75-real;80;86-real" + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + with: + label: main + cuda: "12.1.1" + compiler_version: "11.2" + linux-x86_64-GPU-RAFT-packages-CUDA12-1-1: + name: Linux x86_64 GPU w/ RAFT packages (CUDA 12.1.1) + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + runs-on: 4-core-ubuntu-gpu-t4 + env: + CUDA_ARCHS: "70-real;72-real;75-real;80;86-real" + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + with: + label: main + raft: "ON" + cuda: "12.1.1" + compiler_version: "11.2" + windows-x86_64-packages: + name: Windows x86_64 packages + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + runs-on: windows-2019 + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + with: + label: main + osx-arm64-packages: + name: OSX arm64 packages + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + runs-on: macos-14 + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + with: + label: main + linux-arm64-packages: + name: Linux arm64 packages + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + runs-on: 2-core-ubuntu-arm + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + with: + label: main diff --git a/thirdparty/faiss/.github/workflows/nightly.yml b/thirdparty/faiss/.github/workflows/nightly.yml new file mode 100644 index 
000000000..eabee0774 --- /dev/null +++ b/thirdparty/faiss/.github/workflows/nightly.yml @@ -0,0 +1,139 @@ +name: Nightly +on: + schedule: + - cron: '10 1 * * *' +env: + OMP_NUM_THREADS: '10' + MKL_THREADING_LAYER: GNU +jobs: + linux-x86_64-nightly: + name: Linux x86_64 nightlies + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + env: + ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} + with: + label: nightly + linux-x86_64-GPU-CUDA-11-4-4-nightly: + name: Linux x86_64 GPU nightlies (CUDA 11.4.4) + runs-on: 4-core-ubuntu-gpu-t4 + env: + CUDA_ARCHS: "60-real;61-real;62-real;70-real;72-real;75-real;80;86-real" + FAISS_FLATTEN_CONDA_INCLUDES: "1" + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + env: + ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} + with: + label: nightly + cuda: "11.4.4" + compiler_version: "11.2" + linux-x86_64-GPU-RAFT-CUDA11-8-0-nightly: + name: Linux x86_64 GPU w/ RAFT nightlies (CUDA 11.8.0) + runs-on: 4-core-ubuntu-gpu-t4 + env: + CUDA_ARCHS: "70-real;72-real;75-real;80;86-real" + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + env: + ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} + with: + label: nightly + raft: "ON" + cuda: "11.8.0" + compiler_version: "11.2" + linux-x86_64-GPU-CUDA-12-1-1-nightly: + name: Linux x86_64 GPU nightlies (CUDA 12.1.1) + runs-on: 4-core-ubuntu-gpu-t4 + env: + CUDA_ARCHS: "70-real;72-real;75-real;80;86-real" + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + env: + ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} + with: + label: nightly + cuda: "12.1.1" + compiler_version: "11.2" + 
linux-x86_64-GPU-RAFT-CUDA12-1-1-nightly: + name: Linux x86_64 GPU w/ RAFT nightlies (CUDA 12.1.1) + runs-on: 4-core-ubuntu-gpu-t4 + env: + CUDA_ARCHS: "70-real;72-real;75-real;80;86-real" + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + env: + ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} + with: + label: nightly + raft: "ON" + cuda: "12.1.1" + compiler_version: "11.2" + windows-x86_64-nightly: + name: Windows x86_64 nightlies + runs-on: windows-2019 + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + env: + ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} + with: + label: nightly + osx-arm64-nightly: + name: OSX arm64 nightlies + runs-on: macos-14 + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + env: + ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} + with: + label: nightly + linux-arm64-nightly: + name: Linux arm64 nightlies + runs-on: 2-core-ubuntu-arm + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + env: + ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} + with: + label: nightly diff --git a/thirdparty/faiss/benchs/bench_cppcontrib_sa_decode.cpp b/thirdparty/faiss/benchs/bench_cppcontrib_sa_decode.cpp index f0266172a..b960fb7c6 100644 --- a/thirdparty/faiss/benchs/bench_cppcontrib_sa_decode.cpp +++ b/thirdparty/faiss/benchs/bench_cppcontrib_sa_decode.cpp @@ -213,9 +213,9 @@ static void verifyIndex2LevelDecoder( // evaluate the error double error = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "store_seq" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel << "\t" << error << std::endl; + 
std::cout << description << "\t" << n << "\t" << d << "\tstore_seq\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel + << "\t" << error << std::endl; } ////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -261,10 +261,9 @@ static void verifyIndex2LevelDecoder( // evaluate the error const double error = getError(n, d, outputFaiss, outputKernel1); - - std::cout << description << "\t" << n << "\t" << d << "\t" - << "store_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel << "\t" << error << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\tstore_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel + << "\t" << error << std::endl; } ////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -324,9 +323,9 @@ static void verifyIndex2LevelDecoder( // evaluate the error const double error1 = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel1 << "\t" << error1 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel1 + << "\t" << error1 << std::endl; // kernels: accum 2 points, shared centroids StopWatch swKernel2; @@ -353,9 +352,9 @@ static void verifyIndex2LevelDecoder( // evaluate the error const double error2 = getError(n, d, outputFaiss, outputKernel2); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum2_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel2 << "\t" << error2 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum2_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel2 + << "\t" << error2 << std::endl; // kernels: accum 2 points, unique centroids StopWatch swKernel2u; @@ -384,9 
+383,9 @@ static void verifyIndex2LevelDecoder( // evaluate the error const double error2u = getError(n, d, outputFaiss, outputKernel2u); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum2u_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel2u << "\t" << error2u << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum2u_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel2u + << "\t" << error2u << std::endl; // kernels: accum 3 points, shared centroids StopWatch swKernel3; @@ -418,9 +417,9 @@ static void verifyIndex2LevelDecoder( // evaluate the error const double error3 = getError(n, d, outputFaiss, outputKernel3); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum3_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel3 << "\t" << error3 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum3_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel3 + << "\t" << error3 << std::endl; // kernels: accum 3 points, unique centroids StopWatch swKernel3u; @@ -456,9 +455,9 @@ static void verifyIndex2LevelDecoder( // evaluate the error const double error3u = getError(n, d, outputFaiss, outputKernel3u); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum3u_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel3u << "\t" << error3u << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum3u_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel3u + << "\t" << error3u << std::endl; } } @@ -524,9 +523,9 @@ static void verifyMinMaxIndex2LevelDecoder( // evaluate the error double error = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "store_seq" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel << "\t" << error << std::endl; + std::cout << description << "\t" << n << "\t" << d 
<< "\tstore_seq\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel + << "\t" << error << std::endl; } ////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -573,9 +572,9 @@ static void verifyMinMaxIndex2LevelDecoder( // evaluate the error const double error = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "store_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel << "\t" << error << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\tstore_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel + << "\t" << error << std::endl; } ////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -641,9 +640,9 @@ static void verifyMinMaxIndex2LevelDecoder( // evaluate the error const double error1 = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel1 << "\t" << error1 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel1 + << "\t" << error1 << std::endl; // kernels: accum 2 points, shared centroids StopWatch swKernel2; @@ -675,9 +674,9 @@ static void verifyMinMaxIndex2LevelDecoder( // evaluate the error const double error2 = getError(n, d, outputFaiss, outputKernel2); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum2_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel2 << "\t" << error2 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum2_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel2 + << "\t" << error2 << std::endl; // kernels: accum 2 points, unique centroids StopWatch swKernel2u; @@ -711,9 +710,9 @@ static void 
verifyMinMaxIndex2LevelDecoder( // evaluate the error const double error2u = getError(n, d, outputFaiss, outputKernel2u); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum2u_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel2u << "\t" << error2u << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum2u_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel2u + << "\t" << error2u << std::endl; // kernels: accum 3 points, shared centroids StopWatch swKernel3; @@ -750,9 +749,9 @@ static void verifyMinMaxIndex2LevelDecoder( // evaluate the error const double error3 = getError(n, d, outputFaiss, outputKernel3); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum3_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel3 << "\t" << error3 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum3_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel3 + << "\t" << error3 << std::endl; // kernels: accum 3 points, unique centroids StopWatch swKernel3u; @@ -793,9 +792,9 @@ static void verifyMinMaxIndex2LevelDecoder( // evaluate the error const double error3u = getError(n, d, outputFaiss, outputKernel3u); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum3u_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel3u << "\t" << error3u << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum3u_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel3u + << "\t" << error3u << std::endl; } } @@ -851,9 +850,9 @@ static void verifyIndexPQDecoder( // evaluate the error double error = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "store_seq" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel << "\t" << error << std::endl; + std::cout << description << "\t" << n << "\t" << d << 
"\tstore_seq\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel + << "\t" << error << std::endl; } ////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -899,9 +898,9 @@ static void verifyIndexPQDecoder( // evaluate the error const double error = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "store_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel << "\t" << error << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\tstore_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel + << "\t" << error << std::endl; } ////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -961,9 +960,9 @@ static void verifyIndexPQDecoder( // evaluate the error const double error1 = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel1 << "\t" << error1 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel1 + << "\t" << error1 << std::endl; // kernels: accum 2 points, shared centroids StopWatch swKernel2; @@ -989,9 +988,9 @@ static void verifyIndexPQDecoder( // evaluate the error const double error2 = getError(n, d, outputFaiss, outputKernel2); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum2_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel2 << "\t" << error2 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum2_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel2 + << "\t" << error2 << std::endl; // kernels: accum 2 points, unique centroids StopWatch swKernel2u; @@ -1018,9 +1017,9 @@ static void verifyIndexPQDecoder( // evaluate the error 
const double error2u = getError(n, d, outputFaiss, outputKernel2u); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum2u_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel2u << "\t" << error2u << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum2u_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel2u + << "\t" << error2u << std::endl; // kernels: accum 3 points, shared centroids StopWatch swKernel3; @@ -1051,9 +1050,9 @@ static void verifyIndexPQDecoder( // evaluate the error const double error3 = getError(n, d, outputFaiss, outputKernel3); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum3_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel3 << "\t" << error3 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum3_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel3 + << "\t" << error3 << std::endl; // kernels: accum 3 points, unique centroids StopWatch swKernel3u; @@ -1086,9 +1085,9 @@ static void verifyIndexPQDecoder( // evaluate the error const double error3u = getError(n, d, outputFaiss, outputKernel3u); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum3u_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel3u << "\t" << error3u << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum3u_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel3u + << "\t" << error3u << std::endl; } } @@ -1149,9 +1148,9 @@ static void verifyMinMaxIndexPQDecoder( // evaluate the error double error = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "store_seq" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel << "\t" << error << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\tstore_seq\t" + << nIterations << "\t" << timeFaiss << "\t" << 
timeKernel + << "\t" << error << std::endl; } ////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1197,9 +1196,9 @@ static void verifyMinMaxIndexPQDecoder( // evaluate the error const double error = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "store_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel << "\t" << error << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\tstore_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel + << "\t" << error << std::endl; } ////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1264,9 +1263,9 @@ static void verifyMinMaxIndexPQDecoder( // evaluate the error const double error1 = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel1 << "\t" << error1 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel1 + << "\t" << error1 << std::endl; // kernels: accum 2 points, shared centroids StopWatch swKernel2; @@ -1297,9 +1296,9 @@ static void verifyMinMaxIndexPQDecoder( // evaluate the error const double error2 = getError(n, d, outputFaiss, outputKernel2); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum2_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel2 << "\t" << error2 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum2_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel2 + << "\t" << error2 << std::endl; // kernels: accum 2 points, unique centroids StopWatch swKernel2u; @@ -1331,9 +1330,9 @@ static void verifyMinMaxIndexPQDecoder( // evaluate the error const double error2u = getError(n, 
d, outputFaiss, outputKernel2u); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum2u_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel2u << "\t" << error2u << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum2u_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel2u + << "\t" << error2u << std::endl; // kernels: accum 3 points, shared centroids StopWatch swKernel3; @@ -1369,9 +1368,9 @@ static void verifyMinMaxIndexPQDecoder( // evaluate the error const double error3 = getError(n, d, outputFaiss, outputKernel3); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum3_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel3 << "\t" << error3 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum3_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel3 + << "\t" << error3 << std::endl; // kernels: accum 3 points, unique centroids StopWatch swKernel3u; @@ -1409,9 +1408,9 @@ static void verifyMinMaxIndexPQDecoder( // evaluate the error const double error3u = getError(n, d, outputFaiss, outputKernel3u); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum3u_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel3u << "\t" << error3u << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum3u_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel3u + << "\t" << error3u << std::endl; } } @@ -1484,8 +1483,10 @@ int main(int argc, char** argv) { (N_ITERATIONS % 6) == 0, "Number of iterations should be 6*x"); // print the header - std::cout << "Codec\t" << "n\t" << "d\t" << "Experiment\t" << "Iterations\t" - << "Faiss time\t" << "SADecodeKernel time\t" << "Error" + auto delim = "\t"; + std::cout << "Codec" << delim << "n" << delim << "d" << delim + << "Experiment" << delim << "Iterations" << delim << "Faiss time" + << delim << "SADecodeKernel time" << 
delim << "Error" << std::endl; // The following experiment types are available: diff --git a/thirdparty/faiss/benchs/bench_fw/benchmark.py b/thirdparty/faiss/benchs/bench_fw/benchmark.py index 1053f9938..237d08bd9 100644 --- a/thirdparty/faiss/benchs/bench_fw/benchmark.py +++ b/thirdparty/faiss/benchs/bench_fw/benchmark.py @@ -4,8 +4,7 @@ # LICENSE file in the root directory of this source tree. import logging -from copy import copy -from dataclasses import dataclass +from dataclasses import dataclass, field from operator import itemgetter from statistics import mean, median from typing import Any, Dict, List, Optional @@ -16,7 +15,16 @@ from scipy.optimize import curve_fit -from .descriptors import DatasetDescriptor, IndexDescriptor +from .benchmark_io import BenchmarkIO + +from .descriptors import ( + CodecDescriptor, + DatasetDescriptor, + IndexDescriptor, + IndexDescriptorClassic, + KnnDescriptor, +) + from .index import Index, IndexFromCodec, IndexFromFactory from .utils import dict_merge @@ -185,15 +193,9 @@ def sigmoid(x, a, b, c): @dataclass -class Benchmark: +class IndexOperator: num_threads: int - training_vectors: Optional[DatasetDescriptor] = None - database_vectors: Optional[DatasetDescriptor] = None - query_vectors: Optional[DatasetDescriptor] = None - index_descs: Optional[List[IndexDescriptor]] = None - range_ref_index_desc: Optional[str] = None - k: Optional[int] = None - distance_metric: str = "L2" + distance_metric: str def __post_init__(self): if self.distance_metric == "IP": @@ -203,18 +205,167 @@ def __post_init__(self): else: raise ValueError - def set_io(self, benchmark_io): + def set_io(self, benchmark_io: BenchmarkIO): self.io = benchmark_io self.io.distance_metric = self.distance_metric self.io.distance_metric_type = self.distance_metric_type - def get_index_desc(self, factory: str) -> Optional[IndexDescriptor]: + +@dataclass +class TrainOperator(IndexOperator): + codec_descs: List[CodecDescriptor] = field(default_factory=lambda: []) + + 
def get_desc(self, name: str) -> Optional[CodecDescriptor]: + for desc in self.codec_descs: + if desc.get_name() == name: + return desc + elif desc.factory == name: + return desc + return None + + def get_flat_desc(self, name=None) -> Optional[CodecDescriptor]: + for desc in self.codec_descs: + desc_name = desc.get_name() + if desc_name == name: + return desc + if desc_name.startswith("Flat"): + return desc + return None + + def build_index_wrapper(self, codec_desc: CodecDescriptor): + if hasattr(codec_desc, "index"): + return + + if codec_desc.factory is not None: + assert ( + codec_desc.factory == "Flat" or codec_desc.training_vectors is not None + ) + index = IndexFromFactory( + num_threads=self.num_threads, + d=codec_desc.d, + metric=self.distance_metric, + construction_params=codec_desc.construction_params, + factory=codec_desc.factory, + training_vectors=codec_desc.training_vectors, + codec_name=codec_desc.get_name(), + ) + index.set_io(self.io) + codec_desc.index = index + else: + assert codec_desc.is_trained() + + def train( + self, codec_desc: CodecDescriptor, results: Dict[str, Any], dry_run=False + ): + self.build_index_wrapper(codec_desc) + if codec_desc.is_trained(): + return results, None + + if dry_run: + meta, requires = codec_desc.index.fetch_meta(dry_run=dry_run) + else: + codec_desc.index.get_codec() + meta, requires = codec_desc.index.fetch_meta(dry_run=dry_run) + assert requires is None + + if requires is None: + results["indices"][codec_desc.get_name()] = meta + return results, requires + + +@dataclass +class BuildOperator(IndexOperator): + index_descs: List[IndexDescriptor] = field(default_factory=lambda: []) + + def get_desc(self, name: str) -> Optional[IndexDescriptor]: + for desc in self.index_descs: + if desc.get_name() == name: + return desc + return None + + def get_flat_desc(self, name=None) -> Optional[IndexDescriptor]: for desc in self.index_descs: - if desc.factory == factory: + desc_name = desc.get_name() + if desc_name == name: + 
return desc + if desc_name.startswith("Flat"): + return desc + return None + + def build_index_wrapper(self, index_desc: IndexDescriptor): + if hasattr(index_desc, "index"): + return + + if hasattr(index_desc.codec_desc, "index"): + index_desc.index = index_desc.codec_desc.index + index_desc.index.database_vectors = index_desc.database_desc + index_desc.index.index_name = index_desc.get_name() + return + + if index_desc.codec_desc is not None: + index = IndexFromCodec( + num_threads=self.num_threads, + d=index_desc.d, + metric=self.distance_metric, + database_vectors=index_desc.database_desc, + bucket=index_desc.codec_desc.bucket, + path=index_desc.codec_desc.path, + index_name=index_desc.get_name(), + codec_name=index_desc.codec_desc.get_name(), + ) + index.set_io(self.io) + index_desc.index = index + else: + assert index_desc.is_built() + + def build(self, index_desc: IndexDescriptor, results: Dict[str, Any]): + self.build_index_wrapper(index_desc) + if index_desc.is_built(): + return + index_desc.index.get_index() + + +@dataclass +class SearchOperator(IndexOperator): + knn_descs: List[KnnDescriptor] = field(default_factory=lambda: []) + range: bool = False + + def get_desc(self, name: str) -> Optional[KnnDescriptor]: + for desc in self.knn_descs: + if desc.get_name() == name: + return desc + return None + + def get_flat_desc(self, name=None) -> Optional[KnnDescriptor]: + for desc in self.knn_descs: + if desc.get_name().startswith("Flat"): return desc return None - def range_search_reference(self, index, parameters, range_metric): + def build_index_wrapper(self, knn_desc: KnnDescriptor): + if hasattr(knn_desc, "index"): + return + + if knn_desc.index_desc.index is not None: + knn_desc.index = knn_desc.index_desc.index + knn_desc.index.knn_name = knn_desc.get_name() + knn_desc.index.search_params = knn_desc.search_params + else: + index = Index( + num_threads=self.num_threads, + d=knn_desc.d, + metric=self.distance_metric, + bucket=knn_desc.index_desc.bucket, + 
index_path=knn_desc.index_desc.path, + # knn_name=knn_desc.get_name(), + search_params=knn_desc.search_params, + ) + index.set_io(self.io) + knn_desc.index = index + + knn_desc.index.get_index() + + def range_search_reference(self, index, parameters, range_metric, query_dataset): logger.info("range_search_reference: begin") if isinstance(range_metric, list): assert len(range_metric) > 0 @@ -231,8 +382,9 @@ def range_search_reference(self, index, parameters, range_metric): index, parameters, radius=m_radius, + query_dataset=query_dataset, ) - flat = index.factory == "Flat" + flat = index.is_flat_index() ( gt_radius, range_search_metric_function, @@ -251,11 +403,11 @@ def range_search_reference(self, index, parameters, range_metric): coefficients_training_data, ) - def estimate_range(self, index, parameters, range_scoring_radius): + def estimate_range(self, index, parameters, range_scoring_radius, query_dataset): D, I, R, P, _ = index.knn_search( False, parameters, - self.query_vectors, + query_dataset, self.k, ) samples = [] @@ -273,6 +425,7 @@ def range_search( dry_run, index: Index, search_parameters: Optional[Dict[str, int]], + query_dataset: DatasetDescriptor, radius: Optional[float] = None, gt_radius: Optional[float] = None, range_search_metric_function=None, @@ -285,25 +438,21 @@ def range_search( gt_radius if index.is_flat() else self.estimate_range( - index, - search_parameters, - gt_radius, + index, search_parameters, gt_radius, query_dataset ) ) logger.info(f"Radius={radius}") lims, D, I, R, P, requires = index.range_search( dry_run=dry_run, search_parameters=search_parameters, - query_vectors=self.query_vectors, + query_vectors=query_dataset, radius=radius, ) if requires is not None: return None, None, None, None, None, requires if range_search_metric_function is not None: range_search_metric = range_search_metric_function(R) - range_search_pr = range_search_pr_curve( - D, range_search_metric, gt_rsm - ) + range_search_pr = range_search_pr_curve(D, 
range_search_metric, gt_rsm) range_score_sum = np.sum(range_search_metric).item() P |= { "range_score_sum": range_score_sum, @@ -312,23 +461,29 @@ def range_search( } return lims, D, I, R, P, requires - def range_ground_truth(self, gt_radius, range_search_metric_function): + def range_ground_truth( + self, gt_radius, range_search_metric_function, flat_desc=None + ): logger.info("range_ground_truth: begin") - flat_desc = self.get_index_desc("Flat") + if flat_desc is None: + flat_desc = self.get_flat_desc() lims, D, I, R, P, _ = self.range_search( False, flat_desc.index, search_parameters=None, radius=gt_radius, + query_dataset=flat_desc.query_dataset, ) gt_rsm = np.sum(range_search_metric_function(R)).item() logger.info("range_ground_truth: end") return gt_rsm - def knn_ground_truth(self): + def knn_ground_truth(self, flat_desc=None): logger.info("knn_ground_truth: begin") - flat_desc = self.get_index_desc("Flat") + if flat_desc is None: + flat_desc = self.get_flat_desc() self.build_index_wrapper(flat_desc) + # TODO(kuarora): Consider moving gt results(gt_knn_D, gt_knn_I) to the index as there can be multiple ground truths. 
( self.gt_knn_D, self.gt_knn_I, @@ -338,8 +493,8 @@ def knn_ground_truth(self): ) = flat_desc.index.knn_search( dry_run=False, search_parameters=None, - query_vectors=self.query_vectors, - k=self.k, + query_vectors=flat_desc.query_dataset, + k=flat_desc.k, ) assert requires is None logger.info("knn_ground_truth: end") @@ -369,6 +524,7 @@ def experiment(parameters, cost_metric, perf_metric): results["experiments"][key] = metrics return metrics[cost_metric], metrics[perf_metric], None + requires = None for cost_metric in cost_metrics: for perf_metric in perf_metrics: op = index.get_operating_points() @@ -384,52 +540,52 @@ def experiment(parameters, cost_metric, perf_metric): return results, requires def knn_search_benchmark( - self, dry_run, results: Dict[str, Any], index: Index + self, dry_run, results: Dict[str, Any], knn_desc: KnnDescriptor ): return self.search_benchmark( name="knn_search", - search_func=lambda parameters: index.knn_search( + search_func=lambda parameters: knn_desc.index.knn_search( dry_run, parameters, - self.query_vectors, - self.k, + knn_desc.query_dataset, + knn_desc.k, self.gt_knn_I, self.gt_knn_D, )[3:], - key_func=lambda parameters: index.get_knn_search_name( + key_func=lambda parameters: knn_desc.index.get_knn_search_name( search_parameters=parameters, - query_vectors=self.query_vectors, - k=self.k, + query_vectors=knn_desc.query_dataset, + k=knn_desc.k, reconstruct=False, ), cost_metrics=["time"], perf_metrics=["knn_intersection", "distance_ratio"], results=results, - index=index, + index=knn_desc.index, ) def reconstruct_benchmark( - self, dry_run, results: Dict[str, Any], index: Index + self, dry_run, results: Dict[str, Any], knn_desc: KnnDescriptor ): return self.search_benchmark( name="reconstruct", - search_func=lambda parameters: index.reconstruct( + search_func=lambda parameters: knn_desc.index.reconstruct( dry_run, parameters, - self.query_vectors, - self.k, + knn_desc.query_dataset, + knn_desc.k, self.gt_knn_I, ), - 
key_func=lambda parameters: index.get_knn_search_name( + key_func=lambda parameters: knn_desc.index.get_knn_search_name( search_parameters=parameters, - query_vectors=self.query_vectors, - k=self.k, + query_vectors=knn_desc.query_dataset, + k=knn_desc.k, reconstruct=True, ), cost_metrics=["encode_time"], perf_metrics=["sym_recall"], results=results, - index=index, + index=knn_desc.index, ) def range_search_benchmark( @@ -442,6 +598,7 @@ def range_search_benchmark( gt_radius: float, range_search_metric_function, gt_rsm: float, + query_dataset: DatasetDescriptor, ): return self.search_benchmark( name="range_search", @@ -453,10 +610,11 @@ def range_search_benchmark( gt_radius=gt_radius, range_search_metric_function=range_search_metric_function, gt_rsm=gt_rsm, + query_dataset=query_dataset, )[4:], key_func=lambda parameters: index.get_range_search_name( search_parameters=parameters, - query_vectors=self.query_vectors, + query_vectors=query_dataset, radius=radius, ) + metric_key, @@ -466,69 +624,88 @@ def range_search_benchmark( index=index, ) - def build_index_wrapper(self, index_desc: IndexDescriptor): - if hasattr(index_desc, "index"): - return - if index_desc.factory is not None: - training_vectors = copy(self.training_vectors) - if index_desc.training_size is not None: - training_vectors.num_vectors = index_desc.training_size - index = IndexFromFactory( - num_threads=self.num_threads, - d=self.d, - metric=self.distance_metric, - database_vectors=self.database_vectors, - search_params=index_desc.search_params, - construction_params=index_desc.construction_params, - factory=index_desc.factory, - training_vectors=training_vectors, - ) + +@dataclass +class ExecutionOperator: + distance_metric: str = "L2" + num_threads: int = 1 + train_op: Optional[TrainOperator] = None + build_op: Optional[BuildOperator] = None + search_op: Optional[SearchOperator] = None + + def __post_init__(self): + if self.distance_metric == "IP": + self.distance_metric_type = 
faiss.METRIC_INNER_PRODUCT + elif self.distance_metric == "L2": + self.distance_metric_type = faiss.METRIC_L2 else: - index = IndexFromCodec( - num_threads=self.num_threads, - d=self.d, - metric=self.distance_metric, - database_vectors=self.database_vectors, - search_params=index_desc.search_params, - construction_params=index_desc.construction_params, - path=index_desc.path, - bucket=index_desc.bucket, - ) - index.set_io(self.io) - index_desc.index = index + raise ValueError - def clone_one(self, index_desc): - benchmark = Benchmark( - num_threads=self.num_threads, - training_vectors=self.training_vectors, - database_vectors=self.database_vectors, - query_vectors=self.query_vectors, - index_descs=[self.get_index_desc("Flat"), index_desc], - range_ref_index_desc=self.range_ref_index_desc, - k=self.k, - distance_metric=self.distance_metric, - ) - benchmark.set_io(self.io.clone()) - return benchmark + def set_io(self, io: BenchmarkIO): + self.io = io + self.io.distance_metric = self.distance_metric + self.io.distance_metric_type = self.distance_metric_type + if self.train_op: + self.train_op.set_io(io) + if self.build_op: + self.build_op.set_io(io) + if self.search_op: + self.search_op.set_io(io) + + def train_one(self, codec_desc: CodecDescriptor, results: Dict[str, Any], dry_run): + faiss.omp_set_num_threads(self.num_threads) + assert self.train_op is not None + self.train_op.train(codec_desc, results, dry_run) + + def train(self, results, dry_run=False): + faiss.omp_set_num_threads(self.num_threads) + if self.train_op is None: + return + + for codec_desc in self.train_op.codec_descs: + self.train_one(codec_desc, results, dry_run) + + def build_one(self, results: Dict[str, Any], index_desc: IndexDescriptor): + faiss.omp_set_num_threads(self.num_threads) + assert self.build_op is not None + self.build_op.build(index_desc, results) + + def build(self, results: Dict[str, Any]): + faiss.omp_set_num_threads(self.num_threads) + if self.build_op is None: + return + + for 
index_desc in self.build_op.index_descs: + self.build_one(index_desc, results) + + def search(self): + faiss.omp_set_num_threads(self.num_threads) + if self.search_op is None: + return - def benchmark_one( + for index_desc in self.search_op.knn_descs: + self.search_one(index_desc) + + def search_one( self, - dry_run, + knn_desc: KnnDescriptor, results: Dict[str, Any], - index_desc: IndexDescriptor, - train, - reconstruct, - knn, - range, + dry_run=False, + range=False, ): faiss.omp_set_num_threads(self.num_threads) + assert self.search_op is not None + if not dry_run: - self.knn_ground_truth() - self.build_index_wrapper(index_desc) - meta, requires = index_desc.index.fetch_meta(dry_run=dry_run) + self.create_gt_knn(knn_desc) + self.create_range_ref_knn(knn_desc) + + self.search_op.build_index_wrapper(knn_desc) + meta, requires = knn_desc.index.fetch_meta(dry_run=dry_run) if requires is not None: - return results, (requires if train else None) - results["indices"][index_desc.index.get_codec_name()] = meta + # return results, (requires if train else None) + return results, requires + results["indices"][knn_desc.index.get_codec_name()] = meta # results, requires = self.reconstruct_benchmark( # dry_run=True, @@ -545,33 +722,32 @@ def benchmark_one( # index=index_desc.index, # ) # assert requires is None - - results, requires = self.knn_search_benchmark( + results, requires = self.search_op.knn_search_benchmark( dry_run=True, results=results, - index=index_desc.index, + knn_desc=knn_desc, ) - if knn and requires is not None: + if requires is not None: if dry_run: return results, requires else: - results, requires = self.knn_search_benchmark( + results, requires = self.search_op.knn_search_benchmark( dry_run=False, results=results, - index=index_desc.index, + knn_desc=knn_desc, ) assert requires is None if ( - self.range_ref_index_desc is None - or not index_desc.index.supports_range_search() + knn_desc.range_ref_index_desc is None or + not 
knn_desc.index.supports_range_search() ): return results, None - ref_index_desc = self.get_index_desc(self.range_ref_index_desc) + ref_index_desc = self.search_op.get_desc(knn_desc.range_ref_index_desc) if ref_index_desc is None: raise ValueError( - f"Unknown range index {self.range_ref_index_desc}" + f"{knn_desc.get_name()}: Unknown range index {knn_desc.range_ref_index_desc}" ) if ref_index_desc.range_metrics is None: raise ValueError( @@ -583,91 +759,360 @@ def benchmark_one( range_search_metric_function, coefficients, coefficients_training_data, - ) = self.range_search_reference( + ) = self.search_op.range_search_reference( ref_index_desc.index, ref_index_desc.search_params, range_metric, ) - gt_rsm = self.range_ground_truth( + gt_rsm = self.search_op.range_ground_truth( gt_radius, range_search_metric_function ) - results, requires = self.range_search_benchmark( + results, requires = self.search_op.range_search_benchmark( dry_run=True, results=results, - index=index_desc.index, + index=knn_desc.index, metric_key=metric_key, - radius=index_desc.radius, + radius=knn_desc.radius, gt_radius=gt_radius, range_search_metric_function=range_search_metric_function, gt_rsm=gt_rsm, + query_vectors=knn_desc.query_dataset, ) if range and requires is not None: if dry_run: return results, requires else: - results, requires = self.range_search_benchmark( + results, requires = self.search_op.range_search_benchmark( dry_run=False, results=results, - index=index_desc.index, + index=knn_desc.index, metric_key=metric_key, - radius=index_desc.radius, + radius=knn_desc.radius, gt_radius=gt_radius, range_search_metric_function=range_search_metric_function, gt_rsm=gt_rsm, + query_vectors=knn_desc.query_dataset, ) assert requires is None return results, None - def benchmark( - self, - result_file=None, - local=False, - train=False, - reconstruct=False, - knn=False, - range=False, - ): - logger.info("begin evaluate") + def create_gt_codec( + self, codec_desc, results, train=True + ) -> 
Optional[CodecDescriptor]: + gt_codec_desc = None + if self.train_op: + gt_codec_desc = self.train_op.get_flat_desc(codec_desc.flat_name()) + if gt_codec_desc is None: + gt_codec_desc = CodecDescriptor( + factory="Flat", + d=codec_desc.d, + metric=codec_desc.metric, + num_threads=self.num_threads, + ) + self.train_op.codec_descs.insert(0, gt_codec_desc) + if train: + self.train_op.train(gt_codec_desc, results, dry_run=False) - faiss.omp_set_num_threads(self.num_threads) - results = {"indices": {}, "experiments": {}} - xq = self.io.get_dataset(self.query_vectors) - self.d = xq.shape[1] - if self.get_index_desc("Flat") is None: - self.index_descs.append(IndexDescriptor(factory="Flat")) + return gt_codec_desc - self.knn_ground_truth() + def create_gt_index( + self, index_desc: IndexDescriptor, results: Dict[str, Any], build=True + ) -> Optional[IndexDescriptor]: + gt_index_desc = None + if self.build_op: + gt_index_desc = self.build_op.get_flat_desc(index_desc.flat_name()) + if gt_index_desc is None: + gt_codec_desc = self.train_op.get_flat_desc( + index_desc.codec_desc.flat_name() + ) + assert gt_codec_desc is not None + gt_index_desc = IndexDescriptor( + d=index_desc.d, + metric=index_desc.metric, + num_threads=self.num_threads, + codec_desc=gt_codec_desc, + database_desc=index_desc.database_desc, + ) + self.build_op.index_descs.insert(0, gt_index_desc) + if build: + self.build_op.build(gt_index_desc, results) - if self.range_ref_index_desc is not None: - index_desc = self.get_index_desc(self.range_ref_index_desc) - if index_desc is None: - raise ValueError( - f"Unknown range index {self.range_ref_index_desc}" + return gt_index_desc + + def create_gt_knn(self, knn_desc, search=True) -> Optional[KnnDescriptor]: + gt_knn_desc = None + if self.search_op: + gt_knn_desc = self.search_op.get_flat_desc(knn_desc.flat_name()) + if gt_knn_desc is None: + gt_index_desc = self.build_op.get_flat_desc( + knn_desc.index_desc.flat_name() + ) + assert gt_index_desc is not None + 
gt_knn_desc = KnnDescriptor( + d=knn_desc.d, + metric=knn_desc.metric, + num_threads=self.num_threads, + index_desc=gt_index_desc, + query_dataset=knn_desc.query_dataset, + k=knn_desc.k, ) - if index_desc.range_metrics is None: + self.search_op.knn_descs.insert(0, gt_knn_desc) + if search: + self.search_op.build_index_wrapper(gt_knn_desc) + self.search_op.knn_ground_truth(gt_knn_desc) + + return gt_knn_desc + + def create_range_ref_knn(self, knn_desc): + if ( + knn_desc.range_ref_index_desc is None or + not knn_desc.index.supports_range_search() + ): + return + + if knn_desc.range_ref_index_desc is not None: + ref_index_desc = self.get_desc(knn_desc.range_ref_index_desc) + if ref_index_desc is None: + raise ValueError(f"Unknown range index {knn_desc.range_ref_index_desc}") + if ref_index_desc.range_metrics is None: raise ValueError( - f"Range index {index_desc.factory} has no radius_score" + f"Range index {knn_desc.get_name()} has no radius_score" ) results["metrics"] = {} - for metric_key, range_metric in index_desc.range_metrics.items(): + self.build_index_wrapper(ref_index_desc) + for metric_key, range_metric in ref_index_desc.range_metrics.items(): ( - gt_radius, + knn_desc.gt_radius, range_search_metric_function, coefficients, coefficients_training_data, ) = self.range_search_reference( - index_desc.index, index_desc.search_params, range_metric + knn_desc.index, knn_desc.search_params, range_metric ) results["metrics"][metric_key] = { "coefficients": coefficients, "training_data": coefficients_training_data, } - gt_rsm = self.range_ground_truth( - gt_radius, range_search_metric_function + knn_desc.gt_rsm = self.range_ground_truth( + knn_desc.gt_radius, range_search_metric_function + ) + + def create_ground_truths(self, results: Dict[str, Any]): + # TODO: Create all ground truth descriptors and put them in index descriptor as reference + if self.train_op is not None: + for codec_desc in self.train_op.codec_descs: + self.create_gt_codec(codec_desc, results) + + 
if self.build_op is not None: + for index_desc in self.build_op.index_descs: + self.create_gt_index( + index_desc, results + ) # may need to pass results in future + + if self.search_op is not None: + for knn_desc in self.search_op.knn_descs: + self.create_gt_knn(knn_desc, results) + self.create_range_ref_knn(knn_desc) + + def execute(self, results: Dict[str, Any], dry_run: False): + if self.train_op is not None: + for desc in self.train_op.codec_descs: + results, requires = self.train_op.train(desc, results, dry_run=dry_run) + if dry_run: + if requires is None: + continue + return results, requires + assert requires is None + + if self.build_op is not None: + for desc in self.build_op.index_descs: + self.build_op.build(desc, results) + if self.search_op is not None: + for desc in self.search_op.knn_descs: + results, requires = self.search_one( + knn_desc=desc, results=results, dry_run=dry_run, range=self.search_op.range + ) + if dry_run: + if requires is None: + continue + return results, requires + + assert requires is None + return results, None + + def execute_2(self, result_file=None): + results = {"indices": {}, "experiments": {}} + results, requires = self.execute(results=results) + assert requires is None + if result_file is not None: + self.io.write_json(results, result_file, overwrite=True) + + def add_index_descs(self, codec_desc, index_desc, knn_desc): + if codec_desc is not None: + self.train_op.codec_descs.append(codec_desc) + if index_desc is not None: + self.build_op.index_descs.append(index_desc) + if knn_desc is not None: + self.search_op.knn_descs.append(knn_desc) + + +@dataclass +class Benchmark: + num_threads: int + training_vectors: Optional[DatasetDescriptor] = None + database_vectors: Optional[DatasetDescriptor] = None + query_vectors: Optional[DatasetDescriptor] = None + index_descs: Optional[List[IndexDescriptorClassic]] = None + range_ref_index_desc: Optional[str] = None + k: int = 1 + distance_metric: str = "L2" + + def set_io(self, 
benchmark_io): + self.io = benchmark_io + + def get_embedding_dimension(self): + if self.training_vectors is not None: + xt = self.io.get_dataset(self.training_vectors) + return xt.shape[1] + if self.database_vectors is not None: + xb = self.io.get_dataset(self.database_vectors) + return xb.shape[1] + if self.query_vectors is not None: + xq = self.io.get_dataset(self.query_vectors) + return xq.shape[1] + raise ValueError("Failed to determine dimension of dataset") + + def create_descriptors( + self, ci_desc: IndexDescriptorClassic, train, build, knn, reconstruct, range + ): + codec_desc = None + index_desc = None + knn_desc = None + dim = self.get_embedding_dimension() + if train and ci_desc.factory is not None: + codec_desc = CodecDescriptor( + d=dim, + metric=self.distance_metric, + num_threads=self.num_threads, + factory=ci_desc.factory, + construction_params=ci_desc.construction_params, + training_vectors=self.training_vectors, + ) + if build: + if codec_desc is None: + assert ci_desc.path is not None + codec_desc = CodecDescriptor( + d=dim, + metric=self.distance_metric, + num_threads=self.num_threads, + bucket=ci_desc.bucket, + path=ci_desc.path, + ) + index_desc = IndexDescriptor( + d=codec_desc.d, + metric=self.distance_metric, + num_threads=self.num_threads, + codec_desc=codec_desc, + database_desc=self.database_vectors, + ) + if knn or range: + if index_desc is None: + assert ci_desc.path is not None + index_desc = IndexDescriptor( + d=dim, + metric=self.distance_metric, + num_threads=self.num_threads, + bucket=ci_desc.bucket, + path=ci_desc.path, ) + knn_desc = KnnDescriptor( + d=dim, + metric=self.distance_metric, + num_threads=self.num_threads, + index_desc=index_desc, + query_dataset=self.query_vectors, + search_params=ci_desc.search_params, + range_metrics=ci_desc.range_metrics, + radius=ci_desc.radius, + k=self.k, + ) - self.index_descs = list(dict.fromkeys(self.index_descs)) + return codec_desc, index_desc, knn_desc + + def 
create_execution_operator( + self, + train, + build, + knn, + reconstruct, + range, + ) -> ExecutionOperator: + # all operators are created, as ground truth are always created in benchmarking + train_op = TrainOperator( + num_threads=self.num_threads, distance_metric=self.distance_metric + ) + build_op = BuildOperator( + num_threads=self.num_threads, distance_metric=self.distance_metric + ) + search_op = SearchOperator( + num_threads=self.num_threads, distance_metric=self.distance_metric + ) + search_op.range = range + + exec_op = ExecutionOperator( + train_op=train_op, + build_op=build_op, + search_op=search_op, + num_threads=self.num_threads, + ) + assert hasattr(self, "io") + exec_op.set_io(self.io) + + # iterate over classic descriptors + for ci_desc in self.index_descs: + codec_desc, index_desc, knn_desc = self.create_descriptors( + ci_desc, train, build, knn, reconstruct, range + ) + exec_op.add_index_descs(codec_desc, index_desc, knn_desc) + + return exec_op + + def clone_one(self, index_desc): + benchmark = Benchmark( + num_threads=self.num_threads, + training_vectors=self.training_vectors, + database_vectors=self.database_vectors, + query_vectors=self.query_vectors, + # index_descs=[self.get_flat_desc("Flat"), index_desc], + index_descs=[index_desc], # Should automatically find flat descriptors + range_ref_index_desc=self.range_ref_index_desc, + k=self.k, + distance_metric=self.distance_metric, + ) + benchmark.set_io(self.io.clone()) + return benchmark + + def benchmark( + self, + result_file=None, + local=False, + train=False, + reconstruct=False, + knn=False, + range=False, + ): + logger.info("begin evaluate") + results = {"indices": {}, "experiments": {}} + faiss.omp_set_num_threads(self.num_threads) + exec_op = self.create_execution_operator( + train=train, + build=knn or range, + knn=knn, + reconstruct=reconstruct, + range=range, + ) + exec_op.create_ground_truths(results) todo = self.index_descs for index_desc in self.index_descs: @@ -678,15 +1123,7 
@@ def benchmark( current_todo = [] next_todo = [] for index_desc in todo: - results, requires = self.benchmark_one( - dry_run=True, - results=results, - index_desc=index_desc, - train=train, - reconstruct=reconstruct, - knn=knn, - range=range, - ) + results, requires = exec_op.execute(results, dry_run=False) if requires is None: continue if requires in queued: @@ -728,15 +1165,14 @@ def benchmark( def run_benchmark_one(params): logger.info(params) index_desc, benchmark, results, train, reconstruct, knn, range = params - results, requires = benchmark.benchmark_one( - dry_run=False, - results=results, - index_desc=index_desc, + exec_op = benchmark.create_execution_operator( train=train, - reconstruct=reconstruct, + build=knn, knn=knn, + reconstruct=reconstruct, range=range, ) + results, requires = exec_op.execute(results=results, dry_run=False) assert requires is None assert results is not None return results diff --git a/thirdparty/faiss/benchs/bench_fw/benchmark_io.py b/thirdparty/faiss/benchs/bench_fw/benchmark_io.py index b39bb6029..5ee3eb3a6 100644 --- a/thirdparty/faiss/benchs/bench_fw/benchmark_io.py +++ b/thirdparty/faiss/benchs/bench_fw/benchmark_io.py @@ -53,6 +53,7 @@ def clone(self): def __post_init__(self): self.cached_ds = {} + # TODO(kuarora): rename it as get_local_file def get_local_filename(self, filename): if len(filename) > 184: fn, ext = os.path.splitext(filename) @@ -61,6 +62,9 @@ def get_local_filename(self, filename): ) return os.path.join(self.path, filename) + def get_remote_filepath(self, filename) -> Optional[str]: + return None + def download_file_from_blobstore( self, filename: str, @@ -219,7 +223,7 @@ def read_index( fn = self.download_file_from_blobstore(filename, bucket, path) logger.info(f"Loading index {fn}") ext = os.path.splitext(fn)[1] - if ext in [".faiss", ".codec"]: + if ext in [".faiss", ".codec", ".index"]: index = faiss.read_index(fn) elif ext == ".pkl": with open(fn, "rb") as model_file: diff --git 
a/thirdparty/faiss/benchs/bench_fw/descriptors.py b/thirdparty/faiss/benchs/bench_fw/descriptors.py index f1dd7354c..e76278ced 100644 --- a/thirdparty/faiss/benchs/bench_fw/descriptors.py +++ b/thirdparty/faiss/benchs/bench_fw/descriptors.py @@ -3,23 +3,27 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from dataclasses import dataclass import logging +import os +from dataclasses import dataclass from typing import Any, Dict, List, Optional import faiss # @manual=//faiss/python:pyfaiss_gpu + +from .benchmark_io import BenchmarkIO from .utils import timer logger = logging.getLogger(__name__) @dataclass -class IndexDescriptor: +class IndexDescriptorClassic: bucket: Optional[str] = None # either path or factory should be set, # but not both at the same time. path: Optional[str] = None factory: Optional[str] = None + codec_alias: Optional[str] = None construction_params: Optional[List[Dict[str, int]]] = None search_params: Optional[Dict[str, int]] = None # range metric definitions @@ -44,7 +48,6 @@ class IndexDescriptor: def __hash__(self): return hash(str(self)) - @dataclass class DatasetDescriptor: # namespace possible values: @@ -80,7 +83,7 @@ def __hash__(self): def get_filename( self, - prefix: str = None, + prefix: Optional[str] = None, ) -> str: filename = "" if prefix is not None: @@ -115,3 +118,208 @@ def k_means(self, io, k, dry_run): else: t = io.read_json(meta_filename)["k_means_time"] return kmeans_vectors, t, None + +@dataclass +class IndexBaseDescriptor: + d: int + metric: str + desc_name: Optional[str] = None + flat_desc_name: Optional[str] = None + bucket: Optional[str] = None + path: Optional[str] = None + num_threads: int = 1 + + def get_name(self) -> str: + raise NotImplementedError() + + def get_path(self, benchmark_io: BenchmarkIO) -> Optional[str]: + if self.path is not None: + return self.path + self.path = benchmark_io.get_remote_filepath(self.desc_name) + return 
self.path + + @staticmethod + def param_dict_list_to_name(param_dict_list): + if not param_dict_list: + return "" + l = 0 + n = "" + for param_dict in param_dict_list: + n += IndexBaseDescriptor.param_dict_to_name(param_dict, f"cp{l}") + l += 1 + return n + + @staticmethod + def param_dict_to_name(param_dict, prefix="sp"): + if not param_dict: + return "" + n = prefix + for name, val in param_dict.items(): + if name == "snap": + continue + if name == "lsq_gpu" and val == 0: + continue + if name == "use_beam_LUT" and val == 0: + continue + n += f"_{name}_{val}" + if n == prefix: + return "" + n += "." + return n + + +@dataclass +class CodecDescriptor(IndexBaseDescriptor): + # either path or factory should be set, + # but not both at the same time. + factory: Optional[str] = None + construction_params: Optional[List[Dict[str, int]]] = None + training_vectors: Optional[DatasetDescriptor] = None + + def __post_init__(self): + self.get_name() + + def is_trained(self): + return self.factory is None and self.path is not None + + def is_valid(self): + return self.factory is not None or self.path is not None + + def get_name(self) -> str: + if self.desc_name is not None: + return self.desc_name + if self.factory is not None: + self.desc_name = self.name_from_factory() + return self.desc_name + if self.path is not None: + self.desc_name = self.name_from_path() + return self.desc_name + raise ValueError("name, factory or path must be set") + + def flat_name(self) -> str: + if self.flat_desc_name is not None: + return self.flat_desc_name + self.flat_desc_name = f"Flat.d_{self.d}.{self.metric.upper()}." + return self.flat_desc_name + + def path(self, benchmark_io) -> str: + if self.path is not None: + return self.path + return benchmark_io.get_remote_filepath(self.get_name()) + + def name_from_factory(self) -> str: + assert self.factory is not None + name = f"{self.factory.replace(',', '_')}." 
+ assert self.d is not None + assert self.metric is not None + name += f"d_{self.d}.{self.metric.upper()}." + if self.factory != "Flat": + assert self.training_vectors is not None + name += self.training_vectors.get_filename("xt") + name += IndexBaseDescriptor.param_dict_list_to_name(self.construction_params) + return name + + def name_from_path(self): + assert self.path is not None + filename = os.path.basename(self.path) + ext = filename.split(".")[-1] + if filename.endswith(ext): + name = filename[:-len(ext)] + else: # should never hit this rather raise value error + name = filename + return name + + def alias(self, benchmark_io : BenchmarkIO): + if hasattr(benchmark_io, "bucket"): + return CodecDescriptor(desc_name=self.get_name(), bucket=benchmark_io.bucket, path=self.get_path(benchmark_io), d=self.d, metric=self.metric) + return CodecDescriptor(desc_name=self.get_name(), d=self.d, metric=self.metric) + + +@dataclass +class IndexDescriptor(IndexBaseDescriptor): + codec_desc: Optional[CodecDescriptor] = None + database_desc: Optional[DatasetDescriptor] = None + + def __hash__(self): + return hash(str(self)) + + def __post_init__(self): + self.get_name() + + def is_built(self): + return self.codec_desc is None and self.database_desc is None + + def get_name(self) -> str: + if self.desc_name is None: + self.desc_name = self.codec_desc.get_name() + self.database_desc.get_filename(prefix="xb") + + return self.desc_name + + def flat_name(self): + if self.flat_desc_name is not None: + return self.flat_desc_name + self.flat_desc_name = self.codec_desc.flat_name() + self.database_desc.get_filename(prefix="xb") + return self.flat_desc_name + + # alias is used to refer when index is uploaded to blobstore and refered again + def alias(self, benchmark_io: BenchmarkIO): + if hasattr(benchmark_io, "bucket"): + return IndexDescriptor(desc_name=self.get_name(), bucket=benchmark_io.bucket, path=self.get_path(benchmark_io), d=self.d, metric=self.metric) + return 
IndexDescriptor(desc_name=self.get_name(), d=self.d, metric=self.metric) + +@dataclass +class KnnDescriptor(IndexBaseDescriptor): + index_desc: Optional[IndexDescriptor] = None + gt_index_desc: Optional[IndexDescriptor] = None + query_dataset: Optional[DatasetDescriptor] = None + search_params: Optional[Dict[str, int]] = None + reconstruct: bool = False + # range metric definitions + # key: name + # value: one of the following: + # + # radius + # [0..radius) -> 1 + # [radius..inf) -> 0 + # + # [[radius1, score1], ...] + # [0..radius1) -> score1 + # [radius1..radius2) -> score2 + # + # [[radius1_from, radius1_to, score1], ...] + # [radius1_from, radius1_to) -> score1, + # [radius2_from, radius2_to) -> score2 + range_metrics: Optional[Dict[str, Any]] = None + radius: Optional[float] = None + k: int = 1 + + range_ref_index_desc: Optional[str] = None + + def __hash__(self): + return hash(str(self)) + + def get_name(self): + name = self.index_desc.get_name() + name += IndexBaseDescriptor.param_dict_to_name(self.search_params) + name += self.query_dataset.get_filename("q") + name += f"k_{self.k}." + name += f"t_{self.num_threads}." + if self.reconstruct: + name += "rec." + else: + name += "knn." + return name + + def flat_name(self): + if self.flat_desc_name is not None: + return self.flat_desc_name + name = self.index_desc.flat_name() + name += self.query_dataset.get_filename("q") + name += f"k_{self.k}." + name += f"t_{self.num_threads}." + if self.reconstruct: + name += "rec." + else: + name += "knn." 
+ self.flat_desc_name = name + return name diff --git a/thirdparty/faiss/benchs/bench_fw/index.py b/thirdparty/faiss/benchs/bench_fw/index.py index 14f2158e6..6b6c2d93a 100644 --- a/thirdparty/faiss/benchs/bench_fw/index.py +++ b/thirdparty/faiss/benchs/bench_fw/index.py @@ -13,6 +13,7 @@ import faiss # @manual=//faiss/python:pyfaiss_gpu import numpy as np +from faiss.benchs.bench_fw.descriptors import IndexBaseDescriptor from faiss.contrib.evaluation import ( # @manual=//faiss/contrib:faiss_contrib_gpu knn_intersection_measure, @@ -49,35 +50,6 @@ class IndexBase: def set_io(self, benchmark_io): self.io = benchmark_io - @staticmethod - def param_dict_list_to_name(param_dict_list): - if not param_dict_list: - return "" - l = 0 - n = "" - for param_dict in param_dict_list: - n += IndexBase.param_dict_to_name(param_dict, f"cp{l}") - l += 1 - return n - - @staticmethod - def param_dict_to_name(param_dict, prefix="sp"): - if not param_dict: - return "" - n = prefix - for name, val in param_dict.items(): - if name == "snap": - continue - if name == "lsq_gpu" and val == 0: - continue - if name == "use_beam_LUT" and val == 0: - continue - n += f"_{name}_{val}" - if n == prefix: - return "" - n += "." - return n - @staticmethod def set_index_param_dict_list(index, param_dict_list, assert_same=False): if not param_dict_list: @@ -282,7 +254,7 @@ def get_knn_search_name( reconstruct: bool = False, ): name = self.get_index_name() - name += Index.param_dict_to_name(search_parameters) + name += IndexBaseDescriptor.param_dict_to_name(search_parameters) name += query_vectors.get_filename("q") name += f"k_{k}." name += f"t_{self.num_threads}." 
@@ -495,7 +467,7 @@ def range_search( radius: Optional[float] = None, ): logger.info("range_search: begin") - if search_parameters is not None and search_parameters["snap"] == 1: + if search_parameters is not None and search_parameters.get("snap") == 1: query_vectors = self.snap(query_vectors) filename = ( self.get_range_search_name( @@ -582,14 +554,21 @@ class Index(IndexBase): num_threads: int d: int metric: str - database_vectors: DatasetDescriptor - construction_params: List[Dict[str, int]] - search_params: Dict[str, int] + codec_name: Optional[str] = None + index_name: Optional[str] = None + database_vectors: Optional[DatasetDescriptor] = None + construction_params: Optional[List[Dict[str, int]]] = None + search_params: Optional[Dict[str, int]] = None + serialize_full_index: bool = False + + bucket: Optional[str] = None + index_path: Optional[str] = None cached_codec: ClassVar[OrderedDict[str, faiss.Index]] = OrderedDict() cached_index: ClassVar[OrderedDict[str, faiss.Index]] = OrderedDict() def __post_init__(self): + logger.info(f"Initializing metric_type to {self.metric}") if isinstance(self.metric, str): if self.metric == "IP": self.metric_type = faiss.METRIC_INNER_PRODUCT @@ -628,13 +607,31 @@ def get_codec(self): Index.cached_codec.popitem(last=False) return Index.cached_codec[codec_name] - def get_index_name(self): - name = self.get_codec_name() - assert self.database_vectors is not None - name += self.database_vectors.get_filename("xb") - return name + def get_codec_name(self) -> Optional[str]: + return self.codec_name + + def get_index_name(self) -> Optional[str]: + return self.index_name def fetch_index(self): + # read index from file if it is already available + if self.index_path: + index_filename = os.path.basename(self.index_path) + else: + index_filename = self.index_name + "index" + if self.io.file_exist(index_filename): + if self.index_path: + index = self.io.read_index( + index_filename, + self.bucket, + os.path.dirname(self.index_path), + ) + 
else: + index = self.io.read_index(index_filename) + assert self.d == index.d + assert self.metric_type == index.metric_type + return index, 0 + index = self.get_codec() index.reset() assert index.ntotal == 0 @@ -664,10 +661,15 @@ def fetch_index(self): ) assert index.ntotal == xb.shape[0] or index_ivf.ntotal == xb.shape[0] logger.info("Added vectors to index") + if self.serialize_full_index: + codec_size = self.io.write_index(index, index_filename) + assert codec_size is not None + return index, t def get_index(self): - index_name = self.get_index_name() + index_name = self.index_name + # TODO(kuarora) : retrieve file from bucket and path. if index_name not in Index.cached_index: Index.cached_index[index_name], _ = self.fetch_index() if len(Index.cached_index) > 3: @@ -776,13 +778,20 @@ def add_range_or_val(name, range): ) return op + def is_flat_index(self): + return self.get_index_name().startswith("Flat") + # IndexFromCodec, IndexFromQuantizer and IndexFromPreTransform # are used to wrap pre-trained Faiss indices (codecs) @dataclass class IndexFromCodec(Index): - path: str - bucket: Optional[str] = None + path: Optional[str] = None + + def __post_init__(self): + super().__post_init__() + if self.path is None: + raise ValueError("path is not set") def get_quantizer(self): if not self.is_ivf(): @@ -801,11 +810,8 @@ def get_pretransform(self): def get_model_name(self): return os.path.basename(self.path) - def get_codec_name(self): - assert self.path is not None - name = os.path.basename(self.path) - name += Index.param_dict_list_to_name(self.construction_params) - return name + def fetch_meta(self, dry_run=False): + return None, None def fetch_codec(self): codec = self.io.read_index( @@ -865,20 +871,15 @@ def get_codec(self): # IndexFromFactory is for creating and training indices from scratch @dataclass class IndexFromFactory(Index): - factory: str - training_vectors: DatasetDescriptor + factory: Optional[str] = None + training_vectors: 
Optional[DatasetDescriptor] = None - def get_codec_name(self): - assert self.factory is not None - name = f"{self.factory.replace(',', '_')}." - assert self.d is not None - assert self.metric is not None - name += f"d_{self.d}.{self.metric.upper()}." - if self.factory != "Flat": - assert self.training_vectors is not None - name += self.training_vectors.get_filename("xt") - name += Index.param_dict_list_to_name(self.construction_params) - return name + def __post_init__(self): + super().__post_init__() + if self.factory is None: + raise ValueError("factory is not set") + if self.factory != "Flat" and self.training_vectors is None: + raise ValueError(f"training_vectors is not set for {self.factory}") def fetch_meta(self, dry_run=False): meta_filename = self.get_codec_name() + "json" @@ -911,7 +912,7 @@ def fetch_codec(self, dry_run=False): assert codec_size is not None meta = { "training_time": training_time, - "training_size": self.training_vectors.num_vectors, + "training_size": self.training_vectors.num_vectors if self.training_vectors else 0, "codec_size": codec_size, "sa_code_size": self.get_sa_code_size(codec), "code_size": self.get_code_size(codec), diff --git a/thirdparty/faiss/benchs/bench_fw/optimize.py b/thirdparty/faiss/benchs/bench_fw/optimize.py index 473436ea6..ac6c45ab0 100644 --- a/thirdparty/faiss/benchs/bench_fw/optimize.py +++ b/thirdparty/faiss/benchs/bench_fw/optimize.py @@ -14,7 +14,7 @@ # ) from .benchmark import Benchmark -from .descriptors import DatasetDescriptor, IndexDescriptor +from .descriptors import DatasetDescriptor, IndexDescriptorClassic from .utils import dict_merge, filter_results, ParetoMetric, ParetoMode logger = logging.getLogger(__name__) @@ -78,7 +78,7 @@ def benchmark_and_filter_candidates( ) assert filtered index_descs = [ - IndexDescriptor( + IndexDescriptorClassic( factory=v["factory"], construction_params=v["construction_params"], search_params=v["search_params"], @@ -103,8 +103,8 @@ def optimize_quantizer( 
dry_run=False, ) - descs = [IndexDescriptor(factory="Flat"),] + [ - IndexDescriptor( + descs = [IndexDescriptorClassic(factory="Flat"),] + [ + IndexDescriptorClassic( factory="HNSW32", construction_params=[{"efConstruction": 2**i}], ) @@ -131,7 +131,7 @@ def optimize_ivf( training_vectors: DatasetDescriptor, database_vectors: DatasetDescriptor, query_vectors: DatasetDescriptor, - quantizers: Dict[int, List[IndexDescriptor]], + quantizers: Dict[int, List[IndexDescriptorClassic]], codecs: List[Tuple[str, str]], min_accuracy: float, ): @@ -159,7 +159,7 @@ def optimize_ivf( quantizer_desc.search_params, ) ivf_descs.append( - IndexDescriptor( + IndexDescriptorClassic( factory=f"{pretransform}IVF{nlist}({quantizer_desc.factory}),{fine_ivf}", construction_params=construction_params, ) @@ -188,7 +188,7 @@ def ivf_flat_nprobe_required_for_accuracy( ): _, results = self.benchmark_and_filter_candidates( index_descs=[ - IndexDescriptor(factory=f"IVF{nlist}(Flat),Flat"), + IndexDescriptorClassic(factory=f"IVF{nlist}(Flat),Flat"), ], training_vectors=training_vectors, database_vectors=database_vectors, @@ -226,7 +226,9 @@ def optimize_codec( [ (None, "Flat"), (None, "SQfp16"), + (None, "SQbf16"), (None, "SQ8"), + (None, "SQ8_direct_signed"), ] + [ (f"OPQ{M}_{M * dim}", f"PQ{M}x{b}") for M in [8, 12, 16, 32, 48, 64, 96, 128, 192, 256] @@ -254,7 +256,7 @@ def optimize_codec( _, filtered = self.benchmark_and_filter_candidates( index_descs=[ - IndexDescriptor( + IndexDescriptorClassic( factory=f"IVF{nlist},{pq}" if opq is None else f"{opq},IVF{nlist},{pq}", diff --git a/thirdparty/faiss/benchs/bench_fw_codecs.py b/thirdparty/faiss/benchs/bench_fw_codecs.py index 80741e23f..d3efc2da0 100644 --- a/thirdparty/faiss/benchs/bench_fw_codecs.py +++ b/thirdparty/faiss/benchs/bench_fw_codecs.py @@ -7,10 +7,10 @@ import argparse import os -from bench_fw.benchmark import Benchmark -from bench_fw.benchmark_io import BenchmarkIO -from bench_fw.descriptors import DatasetDescriptor, 
IndexDescriptor -from bench_fw.index import IndexFromFactory +from faiss.benchs.bench_fw.benchmark import Benchmark +from faiss.benchs.bench_fw.benchmark_io import BenchmarkIO +from faiss.benchs.bench_fw.descriptors import DatasetDescriptor, IndexDescriptorClassic +from faiss.benchs.bench_fw.index import IndexFromFactory logging.basicConfig(level=logging.INFO) @@ -107,7 +107,7 @@ def run_local(rp): database_vectors=database_vectors, query_vectors=query_vectors, index_descs=[ - IndexDescriptor( + IndexDescriptorClassic( factory=factory, construction_params=construction_params, training_size=training_size, diff --git a/thirdparty/faiss/benchs/bench_fw_ivf.py b/thirdparty/faiss/benchs/bench_fw_ivf.py index 8c84743e2..b0c108b7d 100644 --- a/thirdparty/faiss/benchs/bench_fw_ivf.py +++ b/thirdparty/faiss/benchs/bench_fw_ivf.py @@ -3,16 +3,20 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -import logging import argparse +import logging import os -from bench_fw.benchmark import Benchmark -from bench_fw.benchmark_io import BenchmarkIO -from bench_fw.descriptors import DatasetDescriptor, IndexDescriptor +from faiss.benchs.bench_fw.benchmark import Benchmark +from faiss.benchs.bench_fw.benchmark_io import BenchmarkIO +from faiss.benchs.bench_fw.descriptors import ( + DatasetDescriptor, + IndexDescriptorClassic, +) logging.basicConfig(level=logging.INFO) + def sift1M(bio): benchmark = Benchmark( num_threads=32, @@ -26,7 +30,7 @@ def sift1M(bio): namespace="std_q", tablename="sift1M" ), index_descs=[ - IndexDescriptor( + IndexDescriptorClassic( factory=f"IVF{2 ** nlist},Flat", ) for nlist in range(8, 15) @@ -34,8 +38,9 @@ def sift1M(bio): k=1, distance_metric="L2", ) - benchmark.set_io(bio) - benchmark.benchmark(result_file="result.json", local=False, train=True, reconstruct=False, knn=True, range=False) + benchmark.io = bio + benchmark.benchmark(result_file="result.json", local=True, train=True, 
reconstruct=False, knn=True, range=False) + def bigann(bio): for scale in [1, 2, 5, 10, 20, 50]: @@ -51,11 +56,11 @@ def bigann(bio): namespace="std_q", tablename="bigann1M" ), index_descs=[ - IndexDescriptor( + IndexDescriptorClassic( factory=f"IVF{2 ** nlist},Flat", ) for nlist in range(11, 19) ] + [ - IndexDescriptor( + IndexDescriptorClassic( factory=f"IVF{2 ** nlist}_HNSW32,Flat", construction_params=[None, {"efConstruction": 200, "efSearch": 40}], ) for nlist in range(11, 19) @@ -79,18 +84,18 @@ def ssnpp(bio): tablename="ssnpp_queries_10K.npy" ), index_descs=[ - IndexDescriptor( + IndexDescriptorClassic( factory=f"IVF{2 ** nlist},PQ256x4fs,Refine(SQfp16)", ) for nlist in range(9, 16) ] + [ - IndexDescriptor( + IndexDescriptorClassic( factory=f"IVF{2 ** nlist},Flat", ) for nlist in range(9, 16) ] + [ - IndexDescriptor( + IndexDescriptorClassic( factory=f"PQ256x4fs,Refine(SQfp16)", ), - IndexDescriptor( + IndexDescriptorClassic( factory=f"HNSW32", ), ], diff --git a/thirdparty/faiss/benchs/bench_fw_notebook.ipynb b/thirdparty/faiss/benchs/bench_fw_notebook.ipynb index 5752aaf5f..c38ed1106 100644 --- a/thirdparty/faiss/benchs/bench_fw_notebook.ipynb +++ b/thirdparty/faiss/benchs/bench_fw_notebook.ipynb @@ -1,529 +1,532 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "be081589-e1b2-4569-acb7-44203e273899", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "import itertools\n", - "from faiss.contrib.evaluation import OperatingPoints\n", - "from enum import Enum\n", - "from bench_fw.benchmark_io import BenchmarkIO as BIO\n", - "from bench_fw.utils import filter_results, ParetoMode, ParetoMetric\n", - "from copy import copy\n", - "import numpy as np\n", - "import datetime\n", - "import glob\n", - "import io\n", - "import json\n", - "from zipfile import ZipFile\n", - "import tabulate" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": 
"a6492e95-24c7-4425-bf0a-27e10e879ca6", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "root = \"/checkpoint/gsz/bench_fw/optimize/bigann\"\n", - "results = BIO(root).read_json(\"result_std_d_bigann10M.json\")\n", - "results.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0875d269-aef4-426d-83dd-866970f43777", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "results['experiments']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f080a6e2-1565-418b-8732-4adeff03a099", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def plot_metric(experiments, accuracy_title, cost_title, plot_space=False, plot=None):\n", - " if plot is None:\n", - " plot = plt.subplot()\n", - " x = {}\n", - " y = {}\n", - " for accuracy, space, time, k, v in experiments:\n", - " idx_name = v['index'] + (\"snap\" if 'search_params' in v and v['search_params'][\"snap\"] == 1 else \"\")\n", - " if idx_name not in x:\n", - " x[idx_name] = []\n", - " y[idx_name] = []\n", - " x[idx_name].append(accuracy)\n", - " if plot_space:\n", - " y[idx_name].append(space)\n", - " else:\n", - " y[idx_name].append(time)\n", - "\n", - " #plt.figure(figsize=(10,6))\n", - " #plt.title(accuracy_title)\n", - " plot.set_xlabel(accuracy_title)\n", - " plot.set_ylabel(cost_title)\n", - " plot.set_yscale(\"log\")\n", - " marker = itertools.cycle((\"o\", \"v\", \"^\", \"<\", \">\", \"s\", \"p\", \"P\", \"*\", \"h\", \"X\", \"D\")) \n", - " for index in x.keys():\n", - " plot.plot(x[index], y[index], marker=next(marker), label=index, linewidth=0)\n", - " plot.legend(bbox_to_anchor=(1, 1), loc='upper left')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "61007155-5edc-449e-835e-c141a01a2ae5", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# index local optima\n", - "accuracy_metric = \"knn_intersection\"\n", - "fr = filter_results(results, evaluation=\"knn\", 
accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1, min_accuracy=0.95)\n", - "plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"time (seconds, 32 cores)\", plot_space=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f9f94dcc-5abe-4cad-9619-f5d1d24fb8c1", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# global optima\n", - "accuracy_metric = \"knn_intersection\"\n", - "fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, min_accuracy=0.90, max_space=64, max_time=0, name_filter=lambda n: not n.startswith(\"Flat\"), pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n", - "plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"time (seconds, 32 cores)\", plot_space=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0c10f587-26ef-49ec-83a9-88f6a2a433e8", - "metadata": {}, - "outputs": [], - "source": [ - "def pretty_params(p):\n", - " p = copy(p)\n", - " if 'snap' in p and p['snap'] == 0:\n", - " del p['snap']\n", - " return p\n", - " \n", - "tabulate.tabulate([(accuracy, space, time, v['factory'], pretty_params(v['construction_params'][1]), pretty_params(v['search_params'])) \n", - " for accuracy, space, time, k, v in fr],\n", - " tablefmt=\"html\",\n", - " headers=[\"accuracy\",\"space\", \"time\", \"factory\", \"quantizer cfg\", \"search cfg\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "36e82084-18f6-4546-a717-163eb0224ee8", - "metadata": {}, - "outputs": [], - "source": [ - "# index local optima @ precision 0.8\n", - "precision = 0.8\n", - "accuracy_metric = lambda exp: range_search_recall_at_precision(exp, precision)\n", - "fr = filter_results(results, evaluation=\"weighted\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n", - 
"plot_metric(fr, accuracy_title=f\"range recall @ precision {precision}\", cost_title=\"time (seconds, 16 cores)\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aff79376-39f7-47c0-8b83-1efe5192bb7e", - "metadata": {}, - "outputs": [], - "source": [ - "# index local optima @ precision 0.2\n", - "precision = 0.2\n", - "accuracy_metric = lambda exp: range_search_recall_at_precision(exp, precision)\n", - "fr = filter_results(results, evaluation=\"weighted\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n", - "plot_metric(fr, accuracy_title=f\"range recall @ precision {precision}\", cost_title=\"time (seconds, 16 cores)\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b4834f1f-bbbe-4cae-9aa0-a459b0c842d1", - "metadata": {}, - "outputs": [], - "source": [ - "# global optima @ precision 0.8\n", - "precision = 0.8\n", - "accuracy_metric = lambda exp: range_search_recall_at_precision(exp, precision)\n", - "fr = filter_results(results, evaluation=\"weighted\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n", - "plot_metric(fr, accuracy_title=f\"range recall @ precision {precision}\", cost_title=\"time (seconds, 16 cores)\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9aead830-6209-4956-b7ea-4a5e0029d616", - "metadata": {}, - "outputs": [], - "source": [ - "def plot_range_search_pr_curves(experiments):\n", - " x = {}\n", - " y = {}\n", - " show = {\n", - " 'Flat': None,\n", - " }\n", - " for _, _, _, k, v in fr:\n", - " if \".weighted\" in k: # and v['index'] in show:\n", - " x[k] = v['range_search_pr']['recall']\n", - " y[k] = v['range_search_pr']['precision']\n", - " \n", - " plt.title(\"range search recall\")\n", - " plt.xlabel(\"recall\")\n", - " plt.ylabel(\"precision\")\n", - " for index in x.keys():\n", - " plt.plot(x[index], y[index], '.', 
label=index)\n", - " plt.legend(bbox_to_anchor=(1.0, 1.0), loc='upper left')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "92e45502-7a31-4a15-90df-fa3032d7d350", - "metadata": {}, - "outputs": [], - "source": [ - "precision = 0.8\n", - "accuracy_metric = lambda exp: range_search_recall_at_precision(exp, precision)\n", - "fr = filter_results(results, evaluation=\"weighted\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME_SPACE, scaling_factor=1)\n", - "plot_range_search_pr_curves(fr)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fdf8148a-0da6-4c5e-8d60-f8f85314574c", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "root = \"/checkpoint/gsz/bench_fw/ivf/bigann\"\n", - "scales = [1, 2, 5, 10, 20, 50]\n", - "fig, plots = plt.subplots(len(scales), sharex=True, figsize=(5,25))\n", - "fig.tight_layout()\n", - "for plot, scale in zip(plots, scales, strict=True):\n", - " results = BIO(root).read_json(f\"result{scale}.json\")\n", - " accuracy_metric = \"knn_intersection\"\n", - " fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, min_accuracy=0.9, pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n", - " plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"time (seconds, 64 cores)\", plot=plot)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e503828c-ee61-45f7-814b-cce6461109bc", - "metadata": {}, - "outputs": [], - "source": [ - "x = {}\n", - "y = {}\n", - "accuracy=0.9\n", - "root = \"/checkpoint/gsz/bench_fw/ivf/bigann\"\n", - "scales = [1, 2, 5, 10, 20, 50]\n", - "#fig, plots = plt.subplots(len(scales), sharex=True, figsize=(5,25))\n", - "#fig.tight_layout()\n", - "for scale in scales:\n", - " results = BIO(root).read_json(f\"result{scale}.json\")\n", - " scale *= 1_000_000\n", - " accuracy_metric = \"knn_intersection\"\n", - " fr = 
filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, min_accuracy=accuracy, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n", - " seen = set()\n", - " print(scale)\n", - " for _, _, _, _, exp in fr:\n", - " fact = exp[\"factory\"]\n", - " # \"HNSW\" in fact or \n", - " if fact in seen or fact in [\"Flat\", \"IVF512,Flat\", \"IVF1024,Flat\", \"IVF2048,Flat\"]:\n", - " continue\n", - " seen.add(fact)\n", - " if fact not in x:\n", - " x[fact] = []\n", - " y[fact] = []\n", - " x[fact].append(scale)\n", - " y[fact].append(exp[\"time\"] + exp[\"quantizer\"][\"time\"])\n", - " if (exp[\"knn_intersection\"] > 0.92):\n", - " print(fact)\n", - " print(exp[\"search_params\"])\n", - " print(exp[\"knn_intersection\"])\n", - "\n", - " #plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"time (seconds, 64 cores)\", plot=plot)\n", - " \n", - "plt.title(f\"recall @ 1 = {accuracy*100}%\")\n", - "plt.xlabel(\"database size\")\n", - "plt.ylabel(\"time\")\n", - "plt.xscale(\"log\")\n", - "plt.yscale(\"log\")\n", - "\n", - "marker = itertools.cycle((\"o\", \"v\", \"^\", \"<\", \">\", \"s\", \"p\", \"P\", \"*\", \"h\", \"X\", \"D\")) \n", - "for index in x.keys():\n", - " if \"HNSW\" in index:\n", - " plt.plot(x[index], y[index], label=index, linewidth=1, marker=next(marker), linestyle=\"dashed\")\n", - " else:\n", - " plt.plot(x[index], y[index], label=index, linewidth=1, marker=next(marker))\n", - "plt.legend(bbox_to_anchor=(1.0, 1.0), loc='upper left')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "37a99bb2-f998-461b-a345-7cc6e702cb3a", - "metadata": {}, - "outputs": [], - "source": [ - "# global optima\n", - "accuracy_metric = \"sym_recall\"\n", - "fr = filter_results(results, evaluation=\"rec\", accuracy_metric=accuracy_metric, time_metric=lambda e:e['encode_time'], min_accuracy=0.9, pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.SPACE, scaling_factor=1)\n", - 
"plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"space\", plot_space=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c973ce4e-3566-4f02-bd93-f113e3e0c791", - "metadata": {}, - "outputs": [], - "source": [ - "def pretty_time(s):\n", - " if s is None:\n", - " return \"None\"\n", - " s = int(s * 1000) / 1000\n", - " m, s = divmod(s, 60)\n", - " h, m = divmod(m, 60)\n", - " d, h = divmod(h, 24)\n", - " r = \"\"\n", - " if d > 0:\n", - " r += f\"{int(d)}d \"\n", - " if h > 0:\n", - " r += f\"{int(h)}h \"\n", - " if m > 0:\n", - " r += f\"{int(m)}m \"\n", - " if s > 0 or len(r) == 0:\n", - " r += f\"{s:.3f}s\"\n", - " return r\n", - "\n", - "def pretty_size(s):\n", - " if s > 1024 * 1024:\n", - " return f\"{s / 1024 / 1024:.1f}\".rstrip('0').rstrip('.') + \"MB\"\n", - " if s > 1024:\n", - " return f\"{s / 1024:.1f}\".rstrip('0').rstrip('.') + \"KB\"\n", - " return f\"{s}\"\n", - "\n", - "def pretty_mse(m):\n", - " if m is None:\n", - " return \"None\"\n", - " else:\n", - " return f\"{m:.6f}\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1ddcf226-fb97-4a59-9fc3-3ed8f7d5e703", - "metadata": {}, - "outputs": [], - "source": [ - "data = {}\n", - "root = \"/checkpoint/gsz/bench_fw/bigann\"\n", - "scales = [1, 2, 5, 10, 20, 50]\n", - "for scale in scales:\n", - " results = BIO(root).read_json(f\"result{scale}.json\")\n", - " accuracy_metric = \"knn_intersection\"\n", - " fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, min_accuracy=0, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n", - " d = {}\n", - " data[f\"{scale}M\"] = d\n", - " for _, _, _, _, exp in fr:\n", - " fact = exp[\"factory\"]\n", - " # \"HNSW\" in fact or \n", - " if fact in [\"Flat\", \"IVF512,Flat\", \"IVF1024,Flat\", \"IVF2048,Flat\"]:\n", - " continue\n", - " if fact not in d:\n", - " d[fact] = []\n", - " d[fact].append({\n", - " \"nprobe\": 
exp[\"search_params\"][\"nprobe\"],\n", - " \"recall\": exp[\"knn_intersection\"],\n", - " \"time\": exp[\"time\"] + exp[\"quantizer\"][\"time\"],\n", - " })\n", - "data\n", - "# with open(\"/checkpoint/gsz/bench_fw/codecs.json\", \"w\") as f:\n", - "# json.dump(data, f)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e54eebb6-0a9f-4a72-84d2-f12c5bd44510", - "metadata": {}, - "outputs": [], - "source": [ - "ds = \"deep1b\"\n", - "data = []\n", - "jss = []\n", - "root = f\"/checkpoint/gsz/bench_fw/codecs/{ds}\"\n", - "results = BIO(root).read_json(f\"result.json\")\n", - "for k, e in results[\"experiments\"].items():\n", - " if \"rec\" in k and e['factory'] != 'Flat': # and e['sym_recall'] > 0.0: # and \"PRQ\" in e['factory'] and e['sym_recall'] > 0.0:\n", - " code_size = results['indices'][e['codec']]['sa_code_size']\n", - " codec_size = results['indices'][e['codec']]['codec_size']\n", - " training_time = results['indices'][e['codec']]['training_time']\n", - " # training_size = results['indices'][e['codec']]['training_size']\n", - " cpu = e['cpu'] if 'cpu' in e else \"\"\n", - " ps = ', '.join([f\"{k}={v}\" for k,v in e['construction_params'][0].items()]) if e['construction_params'] else \" \"\n", - " eps = ', '.join([f\"{k}={v}\" for k,v in e['reconstruct_params'].items() if k != \"snap\"]) if e['reconstruct_params'] else \" \"\n", - " data.append((code_size, f\"|{e['factory']}|{ps}|{eps}|{code_size}|{pretty_size(codec_size)}|{pretty_time(training_time)}|{training_size}|{pretty_mse(e['mse'])}|{e['sym_recall']}|{e['asym_recall']}|{pretty_time(e['encode_time'])}|{pretty_time(e['decode_time'])}|{cpu}|\"))\n", - " jss.append({\n", - " 'factory': e['factory'],\n", - " 'parameters': e['construction_params'][0] if e['construction_params'] else \"\",\n", - " 'evaluation_params': e['reconstruct_params'],\n", - " 'code_size': code_size,\n", - " 'codec_size': codec_size,\n", - " 'training_time': training_time,\n", - " 'training_size': 
training_size,\n", - " 'mse': e['mse'],\n", - " 'sym_recall': e['sym_recall'],\n", - " 'asym_recall': e['asym_recall'],\n", - " 'encode_time': e['encode_time'],\n", - " 'decode_time': e['decode_time'],\n", - " 'cpu': cpu,\n", - " })\n", - "\n", - "print(\"|factory key|construction parameters|evaluation parameters|code size|codec size|training time|training size|mean squared error|sym recall @ 1|asym recall @ 1|encode time|decode time|cpu|\")\n", - "print(\"|-|-|-|-|-|-|-|-|-|\")\n", - "data.sort()\n", - "for d in data:\n", - " print(d[1])\n", - "\n", - "with open(f\"/checkpoint/gsz/bench_fw/codecs_{ds}_test.json\", \"w\") as f:\n", - " json.dump(jss, f)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d1216733-9670-407c-b3d2-5f87bce0321c", - "metadata": {}, - "outputs": [], - "source": [ - "def read_file(filename: str, keys):\n", - " results = []\n", - " with ZipFile(filename, \"r\") as zip_file:\n", - " for key in keys:\n", - " with zip_file.open(key, \"r\") as f:\n", - " if key in [\"D\", \"I\", \"R\", \"lims\"]:\n", - " results.append(np.load(f))\n", - " elif key in [\"P\"]:\n", - " t = io.TextIOWrapper(f)\n", - " results.append(json.load(t))\n", - " else:\n", - " raise AssertionError()\n", - " return results" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "56de051e-22db-4bef-b242-1ddabc9e0bb9", - "metadata": {}, - "outputs": [], - "source": [ - "ds = \"contriever\"\n", - "data = []\n", - "jss = []\n", - "root = f\"/checkpoint/gsz/bench_fw/codecs/{ds}\"\n", - "for lf in glob.glob(root + '/*rec*.zip'):\n", - " e, = read_file(lf, ['P'])\n", - " if e['factory'] != 'Flat': # and e['sym_recall'] > 0.0: # and \"PRQ\" in e['factory'] and e['sym_recall'] > 0.0:\n", - " code_size = e['codec_meta']['sa_code_size']\n", - " codec_size = e['codec_meta']['codec_size']\n", - " training_time = e['codec_meta']['training_time']\n", - " training_size = None # e['codec_meta']['training_size']\n", - " cpu = e['cpu'] if 'cpu' in e else 
\"\"\n", - " ps = ', '.join([f\"{k}={v}\" for k,v in e['construction_params'][0].items()]) if e['construction_params'] else \" \"\n", - " eps = ', '.join([f\"{k}={v}\" for k,v in e['reconstruct_params'].items() if k != \"snap\"]) if e['reconstruct_params'] else \" \"\n", - " if eps in ps and eps != \"encode_ils_iters=16\" and eps != \"max_beam_size=32\":\n", - " eps = \" \"\n", - " data.append((code_size, f\"|{e['factory']}|{ps}|{eps}|{code_size}|{pretty_size(codec_size)}|{pretty_time(training_time)}|{pretty_mse(e['mse'])}|{e['sym_recall']}|{e['asym_recall']}|{pretty_time(e['encode_time'])}|{pretty_time(e['decode_time'])}|{cpu}|\"))\n", - " eps = e['reconstruct_params']\n", - " del eps['snap']\n", - " params = copy(e['construction_params'][0]) if e['construction_params'] else {}\n", - " for k, v in e['reconstruct_params'].items():\n", - " params[k] = v\n", - " jss.append({\n", - " 'factory': e['factory'],\n", - " 'params': params,\n", - " 'construction_params': e['construction_params'][0] if e['construction_params'] else {},\n", - " 'evaluation_params': e['reconstruct_params'],\n", - " 'code_size': code_size,\n", - " 'codec_size': codec_size,\n", - " 'training_time': training_time,\n", - " # 'training_size': training_size,\n", - " 'mse': e['mse'],\n", - " 'sym_recall': e['sym_recall'],\n", - " 'asym_recall': e['asym_recall'],\n", - " 'encode_time': e['encode_time'],\n", - " 'decode_time': e['decode_time'],\n", - " 'cpu': cpu,\n", - " })\n", - "\n", - "print(\"|factory key|construction parameters|encode/decode parameters|code size|codec size|training time|mean squared error|sym recall @ 1|asym recall @ 1|encode time|decode time|cpu|\")\n", - "print(\"|-|-|-|-|-|-|-|-|-|\")\n", - "data.sort()\n", - "# for d in data:\n", - "# print(d[1])\n", - "\n", - "print(len(data))\n", - "\n", - "with open(f\"/checkpoint/gsz/bench_fw/codecs_{ds}_5.json\", \"w\") as f:\n", - " json.dump(jss, f)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python [conda 
env:.conda-faiss_from_source] *", - "language": "python", - "name": "conda-env-.conda-faiss_from_source-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 - } + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "be081589-e1b2-4569-acb7-44203e273899", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import itertools\n", + "from faiss.contrib.evaluation import OperatingPoints\n", + "from enum import Enum\n", + "from faiss.benchs.bench_fw.benchmark_io import BenchmarkIO as BIO\n", + "from faiss.benchs.bench_fw.utils import filter_results, ParetoMode, ParetoMetric\n", + "from copy import copy\n", + "import numpy as np\n", + "import datetime\n", + "import glob\n", + "import io\n", + "import json\n", + "from zipfile import ZipFile\n", + "import tabulate" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6492e95-24c7-4425-bf0a-27e10e879ca6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import getpass\n", + "username = getpass.getuser()\n", + "root = f\"/home/{username}/simsearch/data/ivf/results/sift1M\"\n", + "results = BIO(root).read_json(\"result.json\")\n", + "results.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0875d269-aef4-426d-83dd-866970f43777", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "results['experiments']" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "f080a6e2-1565-418b-8732-4adeff03a099", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def plot_metric(experiments, accuracy_title, cost_title, plot_space=False, plot=None):\n", + " if plot is None:\n", + " plot = plt.subplot()\n", + " 
x = {}\n", + " y = {}\n", + " for accuracy, space, time, k, v in experiments:\n", + " idx_name = v['index'] + (\"snap\" if 'search_params' in v and v['search_params'][\"snap\"] == 1 else \"\")\n", + " if idx_name not in x:\n", + " x[idx_name] = []\n", + " y[idx_name] = []\n", + " x[idx_name].append(accuracy)\n", + " if plot_space:\n", + " y[idx_name].append(space)\n", + " else:\n", + " y[idx_name].append(time)\n", + "\n", + " #plt.figure(figsize=(10,6))\n", + " #plt.title(accuracy_title)\n", + " plot.set_xlabel(accuracy_title)\n", + " plot.set_ylabel(cost_title)\n", + " plot.set_yscale(\"log\")\n", + " marker = itertools.cycle((\"o\", \"v\", \"^\", \"<\", \">\", \"s\", \"p\", \"P\", \"*\", \"h\", \"X\", \"D\")) \n", + " for index in x.keys():\n", + " plot.plot(x[index], y[index], marker=next(marker), label=index, linewidth=0)\n", + " plot.legend(bbox_to_anchor=(1, 1), loc='upper left')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61007155-5edc-449e-835e-c141a01a2ae5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# index local optima\n", + "accuracy_metric = \"knn_intersection\"\n", + "fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1, min_accuracy=0.95)\n", + "plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"time (seconds, 32 cores)\", plot_space=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f9f94dcc-5abe-4cad-9619-f5d1d24fb8c1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# global optima\n", + "accuracy_metric = \"knn_intersection\"\n", + "fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, min_accuracy=0.25, name_filter=lambda n: not n.startswith(\"Flat\"), pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n", + "#fr = filter_results(results, evaluation=\"knn\", 
accuracy_metric=accuracy_metric, min_accuracy=0.90, max_space=64, max_time=0, name_filter=lambda n: not n.startswith(\"Flat\"), pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n", + "plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"time (seconds, 32 cores)\", plot_space=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c10f587-26ef-49ec-83a9-88f6a2a433e8", + "metadata": {}, + "outputs": [], + "source": [ + "def pretty_params(p):\n", + " p = copy(p)\n", + " if 'snap' in p and p['snap'] == 0:\n", + " del p['snap']\n", + " return p\n", + " \n", + "tabulate.tabulate([(accuracy, space, time, v['factory'], pretty_params(v['construction_params'][1]), pretty_params(v['search_params'])) \n", + " for accuracy, space, time, k, v in fr],\n", + " tablefmt=\"html\",\n", + " headers=[\"accuracy\",\"space\", \"time\", \"factory\", \"quantizer cfg\", \"search cfg\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36e82084-18f6-4546-a717-163eb0224ee8", + "metadata": {}, + "outputs": [], + "source": [ + "# index local optima @ precision 0.8\n", + "precision = 0.8\n", + "accuracy_metric = lambda exp: range_search_recall_at_precision(exp, precision)\n", + "fr = filter_results(results, evaluation=\"weighted\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n", + "plot_metric(fr, accuracy_title=f\"range recall @ precision {precision}\", cost_title=\"time (seconds, 16 cores)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aff79376-39f7-47c0-8b83-1efe5192bb7e", + "metadata": {}, + "outputs": [], + "source": [ + "# index local optima @ precision 0.2\n", + "precision = 0.2\n", + "accuracy_metric = lambda exp: range_search_recall_at_precision(exp, precision)\n", + "fr = filter_results(results, evaluation=\"weighted\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.INDEX, 
pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n", + "plot_metric(fr, accuracy_title=f\"range recall @ precision {precision}\", cost_title=\"time (seconds, 16 cores)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4834f1f-bbbe-4cae-9aa0-a459b0c842d1", + "metadata": {}, + "outputs": [], + "source": [ + "# global optima @ precision 0.8\n", + "precision = 0.8\n", + "accuracy_metric = lambda exp: range_search_recall_at_precision(exp, precision)\n", + "fr = filter_results(results, evaluation=\"weighted\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n", + "plot_metric(fr, accuracy_title=f\"range recall @ precision {precision}\", cost_title=\"time (seconds, 16 cores)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9aead830-6209-4956-b7ea-4a5e0029d616", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_range_search_pr_curves(experiments):\n", + " x = {}\n", + " y = {}\n", + " show = {\n", + " 'Flat': None,\n", + " }\n", + " for _, _, _, k, v in fr:\n", + " if \".weighted\" in k: # and v['index'] in show:\n", + " x[k] = v['range_search_pr']['recall']\n", + " y[k] = v['range_search_pr']['precision']\n", + " \n", + " plt.title(\"range search recall\")\n", + " plt.xlabel(\"recall\")\n", + " plt.ylabel(\"precision\")\n", + " for index in x.keys():\n", + " plt.plot(x[index], y[index], '.', label=index)\n", + " plt.legend(bbox_to_anchor=(1.0, 1.0), loc='upper left')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92e45502-7a31-4a15-90df-fa3032d7d350", + "metadata": {}, + "outputs": [], + "source": [ + "precision = 0.8\n", + "accuracy_metric = lambda exp: range_search_recall_at_precision(exp, precision)\n", + "fr = filter_results(results, evaluation=\"weighted\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME_SPACE, scaling_factor=1)\n", + 
"plot_range_search_pr_curves(fr)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fdf8148a-0da6-4c5e-8d60-f8f85314574c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "root = \"/checkpoint/gsz/bench_fw/ivf/bigann\"\n", + "scales = [1, 2, 5, 10, 20, 50]\n", + "fig, plots = plt.subplots(len(scales), sharex=True, figsize=(5,25))\n", + "fig.tight_layout()\n", + "for plot, scale in zip(plots, scales, strict=True):\n", + " results = BIO(root).read_json(f\"result{scale}.json\")\n", + " accuracy_metric = \"knn_intersection\"\n", + " fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, min_accuracy=0.9, pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n", + " plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"time (seconds, 64 cores)\", plot=plot)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e503828c-ee61-45f7-814b-cce6461109bc", + "metadata": {}, + "outputs": [], + "source": [ + "x = {}\n", + "y = {}\n", + "accuracy=0.9\n", + "root = \"/checkpoint/gsz/bench_fw/ivf/bigann\"\n", + "scales = [1, 2, 5, 10, 20, 50]\n", + "#fig, plots = plt.subplots(len(scales), sharex=True, figsize=(5,25))\n", + "#fig.tight_layout()\n", + "for scale in scales:\n", + " results = BIO(root).read_json(f\"result{scale}.json\")\n", + " scale *= 1_000_000\n", + " accuracy_metric = \"knn_intersection\"\n", + " fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, min_accuracy=accuracy, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n", + " seen = set()\n", + " print(scale)\n", + " for _, _, _, _, exp in fr:\n", + " fact = exp[\"factory\"]\n", + " # \"HNSW\" in fact or \n", + " if fact in seen or fact in [\"Flat\", \"IVF512,Flat\", \"IVF1024,Flat\", \"IVF2048,Flat\"]:\n", + " continue\n", + " seen.add(fact)\n", + " if fact not in x:\n", + " x[fact] = []\n", + " y[fact] = []\n", + " 
x[fact].append(scale)\n", + " y[fact].append(exp[\"time\"] + exp[\"quantizer\"][\"time\"])\n", + " if (exp[\"knn_intersection\"] > 0.92):\n", + " print(fact)\n", + " print(exp[\"search_params\"])\n", + " print(exp[\"knn_intersection\"])\n", + "\n", + " #plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"time (seconds, 64 cores)\", plot=plot)\n", + " \n", + "plt.title(f\"recall @ 1 = {accuracy*100}%\")\n", + "plt.xlabel(\"database size\")\n", + "plt.ylabel(\"time\")\n", + "plt.xscale(\"log\")\n", + "plt.yscale(\"log\")\n", + "\n", + "marker = itertools.cycle((\"o\", \"v\", \"^\", \"<\", \">\", \"s\", \"p\", \"P\", \"*\", \"h\", \"X\", \"D\")) \n", + "for index in x.keys():\n", + " if \"HNSW\" in index:\n", + " plt.plot(x[index], y[index], label=index, linewidth=1, marker=next(marker), linestyle=\"dashed\")\n", + " else:\n", + " plt.plot(x[index], y[index], label=index, linewidth=1, marker=next(marker))\n", + "plt.legend(bbox_to_anchor=(1.0, 1.0), loc='upper left')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37a99bb2-f998-461b-a345-7cc6e702cb3a", + "metadata": {}, + "outputs": [], + "source": [ + "# global optima\n", + "accuracy_metric = \"sym_recall\"\n", + "fr = filter_results(results, evaluation=\"rec\", accuracy_metric=accuracy_metric, time_metric=lambda e:e['encode_time'], min_accuracy=0.9, pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.SPACE, scaling_factor=1)\n", + "plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"space\", plot_space=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c973ce4e-3566-4f02-bd93-f113e3e0c791", + "metadata": {}, + "outputs": [], + "source": [ + "def pretty_time(s):\n", + " if s is None:\n", + " return \"None\"\n", + " s = int(s * 1000) / 1000\n", + " m, s = divmod(s, 60)\n", + " h, m = divmod(m, 60)\n", + " d, h = divmod(h, 24)\n", + " r = \"\"\n", + " if d > 0:\n", + " r += f\"{int(d)}d \"\n", + " if h > 0:\n", + " r += f\"{int(h)}h 
\"\n", + " if m > 0:\n", + " r += f\"{int(m)}m \"\n", + " if s > 0 or len(r) == 0:\n", + " r += f\"{s:.3f}s\"\n", + " return r\n", + "\n", + "def pretty_size(s):\n", + " if s > 1024 * 1024:\n", + " return f\"{s / 1024 / 1024:.1f}\".rstrip('0').rstrip('.') + \"MB\"\n", + " if s > 1024:\n", + " return f\"{s / 1024:.1f}\".rstrip('0').rstrip('.') + \"KB\"\n", + " return f\"{s}\"\n", + "\n", + "def pretty_mse(m):\n", + " if m is None:\n", + " return \"None\"\n", + " else:\n", + " return f\"{m:.6f}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ddcf226-fb97-4a59-9fc3-3ed8f7d5e703", + "metadata": {}, + "outputs": [], + "source": [ + "data = {}\n", + "root = \"/checkpoint/gsz/bench_fw/bigann\"\n", + "scales = [1, 2, 5, 10, 20, 50]\n", + "for scale in scales:\n", + " results = BIO(root).read_json(f\"result{scale}.json\")\n", + " accuracy_metric = \"knn_intersection\"\n", + " fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, min_accuracy=0, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n", + " d = {}\n", + " data[f\"{scale}M\"] = d\n", + " for _, _, _, _, exp in fr:\n", + " fact = exp[\"factory\"]\n", + " # \"HNSW\" in fact or \n", + " if fact in [\"Flat\", \"IVF512,Flat\", \"IVF1024,Flat\", \"IVF2048,Flat\"]:\n", + " continue\n", + " if fact not in d:\n", + " d[fact] = []\n", + " d[fact].append({\n", + " \"nprobe\": exp[\"search_params\"][\"nprobe\"],\n", + " \"recall\": exp[\"knn_intersection\"],\n", + " \"time\": exp[\"time\"] + exp[\"quantizer\"][\"time\"],\n", + " })\n", + "data\n", + "# with open(\"/checkpoint/gsz/bench_fw/codecs.json\", \"w\") as f:\n", + "# json.dump(data, f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e54eebb6-0a9f-4a72-84d2-f12c5bd44510", + "metadata": {}, + "outputs": [], + "source": [ + "ds = \"deep1b\"\n", + "data = []\n", + "jss = []\n", + "root = f\"/checkpoint/gsz/bench_fw/codecs/{ds}\"\n", + "results = 
BIO(root).read_json(f\"result.json\")\n", + "for k, e in results[\"experiments\"].items():\n", + " if \"rec\" in k and e['factory'] != 'Flat': # and e['sym_recall'] > 0.0: # and \"PRQ\" in e['factory'] and e['sym_recall'] > 0.0:\n", + " code_size = results['indices'][e['codec']]['sa_code_size']\n", + " codec_size = results['indices'][e['codec']]['codec_size']\n", + " training_time = results['indices'][e['codec']]['training_time']\n", + " # training_size = results['indices'][e['codec']]['training_size']\n", + " cpu = e['cpu'] if 'cpu' in e else \"\"\n", + " ps = ', '.join([f\"{k}={v}\" for k,v in e['construction_params'][0].items()]) if e['construction_params'] else \" \"\n", + " eps = ', '.join([f\"{k}={v}\" for k,v in e['reconstruct_params'].items() if k != \"snap\"]) if e['reconstruct_params'] else \" \"\n", + " data.append((code_size, f\"|{e['factory']}|{ps}|{eps}|{code_size}|{pretty_size(codec_size)}|{pretty_time(training_time)}|{training_size}|{pretty_mse(e['mse'])}|{e['sym_recall']}|{e['asym_recall']}|{pretty_time(e['encode_time'])}|{pretty_time(e['decode_time'])}|{cpu}|\"))\n", + " jss.append({\n", + " 'factory': e['factory'],\n", + " 'parameters': e['construction_params'][0] if e['construction_params'] else \"\",\n", + " 'evaluation_params': e['reconstruct_params'],\n", + " 'code_size': code_size,\n", + " 'codec_size': codec_size,\n", + " 'training_time': training_time,\n", + " 'training_size': training_size,\n", + " 'mse': e['mse'],\n", + " 'sym_recall': e['sym_recall'],\n", + " 'asym_recall': e['asym_recall'],\n", + " 'encode_time': e['encode_time'],\n", + " 'decode_time': e['decode_time'],\n", + " 'cpu': cpu,\n", + " })\n", + "\n", + "print(\"|factory key|construction parameters|evaluation parameters|code size|codec size|training time|training size|mean squared error|sym recall @ 1|asym recall @ 1|encode time|decode time|cpu|\")\n", + "print(\"|-|-|-|-|-|-|-|-|-|\")\n", + "data.sort()\n", + "for d in data:\n", + " print(d[1])\n", + "\n", + "with 
open(f\"/checkpoint/gsz/bench_fw/codecs_{ds}_test.json\", \"w\") as f:\n", + " json.dump(jss, f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1216733-9670-407c-b3d2-5f87bce0321c", + "metadata": {}, + "outputs": [], + "source": [ + "def read_file(filename: str, keys):\n", + " results = []\n", + " with ZipFile(filename, \"r\") as zip_file:\n", + " for key in keys:\n", + " with zip_file.open(key, \"r\") as f:\n", + " if key in [\"D\", \"I\", \"R\", \"lims\"]:\n", + " results.append(np.load(f))\n", + " elif key in [\"P\"]:\n", + " t = io.TextIOWrapper(f)\n", + " results.append(json.load(t))\n", + " else:\n", + " raise AssertionError()\n", + " return results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56de051e-22db-4bef-b242-1ddabc9e0bb9", + "metadata": {}, + "outputs": [], + "source": [ + "ds = \"contriever\"\n", + "data = []\n", + "jss = []\n", + "root = f\"/checkpoint/gsz/bench_fw/codecs/{ds}\"\n", + "for lf in glob.glob(root + '/*rec*.zip'):\n", + " e, = read_file(lf, ['P'])\n", + " if e['factory'] != 'Flat': # and e['sym_recall'] > 0.0: # and \"PRQ\" in e['factory'] and e['sym_recall'] > 0.0:\n", + " code_size = e['codec_meta']['sa_code_size']\n", + " codec_size = e['codec_meta']['codec_size']\n", + " training_time = e['codec_meta']['training_time']\n", + " training_size = None # e['codec_meta']['training_size']\n", + " cpu = e['cpu'] if 'cpu' in e else \"\"\n", + " ps = ', '.join([f\"{k}={v}\" for k,v in e['construction_params'][0].items()]) if e['construction_params'] else \" \"\n", + " eps = ', '.join([f\"{k}={v}\" for k,v in e['reconstruct_params'].items() if k != \"snap\"]) if e['reconstruct_params'] else \" \"\n", + " if eps in ps and eps != \"encode_ils_iters=16\" and eps != \"max_beam_size=32\":\n", + " eps = \" \"\n", + " data.append((code_size, 
f\"|{e['factory']}|{ps}|{eps}|{code_size}|{pretty_size(codec_size)}|{pretty_time(training_time)}|{pretty_mse(e['mse'])}|{e['sym_recall']}|{e['asym_recall']}|{pretty_time(e['encode_time'])}|{pretty_time(e['decode_time'])}|{cpu}|\"))\n", + " eps = e['reconstruct_params']\n", + " del eps['snap']\n", + " params = copy(e['construction_params'][0]) if e['construction_params'] else {}\n", + " for k, v in e['reconstruct_params'].items():\n", + " params[k] = v\n", + " jss.append({\n", + " 'factory': e['factory'],\n", + " 'params': params,\n", + " 'construction_params': e['construction_params'][0] if e['construction_params'] else {},\n", + " 'evaluation_params': e['reconstruct_params'],\n", + " 'code_size': code_size,\n", + " 'codec_size': codec_size,\n", + " 'training_time': training_time,\n", + " # 'training_size': training_size,\n", + " 'mse': e['mse'],\n", + " 'sym_recall': e['sym_recall'],\n", + " 'asym_recall': e['asym_recall'],\n", + " 'encode_time': e['encode_time'],\n", + " 'decode_time': e['decode_time'],\n", + " 'cpu': cpu,\n", + " })\n", + "\n", + "print(\"|factory key|construction parameters|encode/decode parameters|code size|codec size|training time|mean squared error|sym recall @ 1|asym recall @ 1|encode time|decode time|cpu|\")\n", + "print(\"|-|-|-|-|-|-|-|-|-|\")\n", + "data.sort()\n", + "# for d in data:\n", + "# print(d[1])\n", + "\n", + "print(len(data))\n", + "\n", + "with open(f\"/checkpoint/gsz/bench_fw/codecs_{ds}_5.json\", \"w\") as f:\n", + " json.dump(jss, f)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "faiss_binary (local)", + "language": "python", + "name": "faiss_binary_local" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git 
a/thirdparty/faiss/benchs/bench_fw_optimize.py b/thirdparty/faiss/benchs/bench_fw_optimize.py index 31b56f9f5..11e625e23 100644 --- a/thirdparty/faiss/benchs/bench_fw_optimize.py +++ b/thirdparty/faiss/benchs/bench_fw_optimize.py @@ -7,9 +7,9 @@ import logging import os -from bench_fw.benchmark_io import BenchmarkIO -from bench_fw.descriptors import DatasetDescriptor -from bench_fw.optimize import Optimizer +from faiss.benchs.bench_fw.benchmark_io import BenchmarkIO +from faiss.benchs.bench_fw.descriptors import DatasetDescriptor +from faiss.benchs.bench_fw.optimize import Optimizer logging.basicConfig(level=logging.INFO) diff --git a/thirdparty/faiss/benchs/bench_fw_range.py b/thirdparty/faiss/benchs/bench_fw_range.py index f38de114f..0d4b65afa 100644 --- a/thirdparty/faiss/benchs/bench_fw_range.py +++ b/thirdparty/faiss/benchs/bench_fw_range.py @@ -3,28 +3,29 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -import logging import argparse +import logging import os -from bench_fw.benchmark import Benchmark -from bench_fw.benchmark_io import BenchmarkIO -from bench_fw.descriptors import DatasetDescriptor, IndexDescriptor +from faiss.benchs.bench_fw.benchmark import Benchmark +from faiss.benchs.bench_fw.benchmark_io import BenchmarkIO +from faiss.benchs.bench_fw.descriptors import DatasetDescriptor, IndexDescriptorClassic logging.basicConfig(level=logging.INFO) + def ssnpp(bio): benchmark = Benchmark( num_threads=32, training_vectors=DatasetDescriptor( - tablename="ssnpp_training_5M.npy", + tablename="training.npy", ), database_vectors=DatasetDescriptor( - tablename="ssnpp_xb_range_filtered_119201.npy", + tablename="database.npy", ), - query_vectors=DatasetDescriptor(tablename="ssnpp_xq_range_filtered_33615.npy"), + query_vectors=DatasetDescriptor(tablename="query.npy"), index_descs=[ - IndexDescriptor( + IndexDescriptorClassic( factory="Flat", range_metrics={ "weighted": [ @@ -56,7 +57,7 
@@ def ssnpp(bio): ] }, ), - IndexDescriptor( + IndexDescriptorClassic( factory="IVF262144(PQ256x4fs),PQ32", ), ], @@ -67,6 +68,7 @@ def ssnpp(bio): benchmark.set_io(bio) benchmark.benchmark("result.json", local=False, train=True, reconstruct=False, knn=False, range=True) + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('experiment') diff --git a/thirdparty/faiss/c_api/IndexScalarQuantizer_c.h b/thirdparty/faiss/c_api/IndexScalarQuantizer_c.h index becdb201e..5c6694695 100644 --- a/thirdparty/faiss/c_api/IndexScalarQuantizer_c.h +++ b/thirdparty/faiss/c_api/IndexScalarQuantizer_c.h @@ -26,6 +26,9 @@ typedef enum FaissQuantizerType { QT_fp16, QT_8bit_direct, ///< fast indexing of uint8s QT_6bit, ///< 6 bits per component + QT_bf16, + QT_8bit_direct_signed, ///< fast indexing of signed int8s ranging from [-128 + ///< to 127] } FaissQuantizerType; // forward declaration diff --git a/thirdparty/faiss/conda/faiss-gpu-raft/meta.yaml b/thirdparty/faiss/conda/faiss-gpu-raft/meta.yaml index c43e7656c..1dde8e986 100644 --- a/thirdparty/faiss/conda/faiss-gpu-raft/meta.yaml +++ b/thirdparty/faiss/conda/faiss-gpu-raft/meta.yaml @@ -48,21 +48,25 @@ outputs: - {{ compiler('cxx') }} - sysroot_linux-64 # [linux64] - llvm-openmp # [osx] - - cmake >=3.23.1 + - cmake >=3.24.0 - make # [not win] + - _openmp_mutex =4.5=2_kmp_llvm # [x86_64] + - mkl =2023 # [x86_64] - mkl-devel =2023 # [x86_64] - cuda-toolkit {{ cudatoolkit }} host: + - _openmp_mutex =4.5=2_kmp_llvm # [x86_64] - mkl =2023 # [x86_64] - openblas # [not x86_64] - - libraft =24.04 + - libraft =24.06 - cuda-version {{ cuda_constraints }} run: + - _openmp_mutex =4.5=2_kmp_llvm # [x86_64] - mkl =2023 # [x86_64] - openblas # [not x86_64] - cuda-cudart {{ cuda_constraints }} - libcublas {{ libcublas_constraints }} - - libraft =24.04 + - libraft =24.06 - cuda-version {{ cuda_constraints }} test: requires: @@ -85,13 +89,18 @@ outputs: - {{ compiler('cxx') }} - sysroot_linux-64 =2.17 # [linux64] - 
swig - - cmake >=3.23.1 + - cmake >=3.24.0 - make # [not win] + - _openmp_mutex =4.5=2_kmp_llvm # [x86_64] + - mkl =2023 # [x86_64] + - cuda-toolkit {{ cudatoolkit }} host: + - _openmp_mutex =4.5=2_kmp_llvm # [x86_64] - python {{ python }} - numpy >=1.19,<2 - {{ pin_subpackage('libfaiss', exact=True) }} run: + - _openmp_mutex =4.5=2_kmp_llvm # [x86_64] - python {{ python }} - numpy >=1.19,<2 - packaging diff --git a/thirdparty/faiss/conda/faiss-gpu/build-lib.sh b/thirdparty/faiss/conda/faiss-gpu/build-lib.sh index 2d25e9c5e..9957be96e 100755 --- a/thirdparty/faiss/conda/faiss-gpu/build-lib.sh +++ b/thirdparty/faiss/conda/faiss-gpu/build-lib.sh @@ -6,6 +6,12 @@ set -e +# Workaround for CUDA 11.4.4 builds. Moves all necessary headers to include root. +if [ -n "$FAISS_FLATTEN_CONDA_INCLUDES" ] && [ "$FAISS_FLATTEN_CONDA_INCLUDES" = "1" ]; then + cp -r -n "$CONDA_PREFIX/x86_64-conda-linux-gnu/sysroot/usr/include/"* "$CONDA_PREFIX/include/" + cp -r -n "$CONDA_PREFIX/x86_64-conda-linux-gnu/include/c++/11.2.0/"* "$CONDA_PREFIX/include/" + cp -r -n "$CONDA_PREFIX/x86_64-conda-linux-gnu/include/c++/11.2.0/x86_64-conda-linux-gnu/"* "$CONDA_PREFIX/include/" +fi # Build libfaiss.so/libfaiss_avx2.so/libfaiss_avx512.so cmake -B _build \ diff --git a/thirdparty/faiss/conda/faiss-gpu/meta.yaml b/thirdparty/faiss/conda/faiss-gpu/meta.yaml index b0df70718..05f7b5900 100644 --- a/thirdparty/faiss/conda/faiss-gpu/meta.yaml +++ b/thirdparty/faiss/conda/faiss-gpu/meta.yaml @@ -43,12 +43,13 @@ outputs: - {{ pin_compatible('libfaiss', exact=True) }} script_env: - CUDA_ARCHS + - FAISS_FLATTEN_CONDA_INCLUDES requirements: build: - {{ compiler('cxx') }} - sysroot_linux-64 # [linux64] - llvm-openmp # [osx] - - cmake >=3.23.1 + - cmake >=3.24.0 - make # [not win] - mkl-devel =2023 # [x86_64] - cuda-toolkit {{ cudatoolkit }} @@ -81,8 +82,9 @@ outputs: - {{ compiler('cxx') }} - sysroot_linux-64 =2.17 # [linux64] - swig - - cmake >=3.23.1 + - cmake >=3.24.0 - make # [not win] + - cuda-toolkit {{ 
cudatoolkit }} host: - python {{ python }} - numpy >=1.19,<2 diff --git a/thirdparty/faiss/conda/faiss/meta.yaml b/thirdparty/faiss/conda/faiss/meta.yaml index c4d66ca0d..79e7be953 100644 --- a/thirdparty/faiss/conda/faiss/meta.yaml +++ b/thirdparty/faiss/conda/faiss/meta.yaml @@ -39,7 +39,7 @@ outputs: - {{ compiler('cxx') }} - sysroot_linux-64 # [linux64] - llvm-openmp # [osx] - - cmake >=3.23.1 + - cmake >=3.24.0 - make # [not win] - mkl-devel =2023 # [x86_64] host: @@ -69,7 +69,7 @@ outputs: - {{ compiler('cxx') }} - sysroot_linux-64 =2.17 # [linux64] - swig - - cmake >=3.23.1 + - cmake >=3.24.0 - make # [not win] host: - python {{ python }} diff --git a/thirdparty/faiss/contrib/datasets.py b/thirdparty/faiss/contrib/datasets.py index f37a2fb6e..281f16e2f 100644 --- a/thirdparty/faiss/contrib/datasets.py +++ b/thirdparty/faiss/contrib/datasets.py @@ -6,6 +6,8 @@ import os import numpy as np import faiss +import getpass + from .vecs_io import fvecs_read, ivecs_read, bvecs_mmap, fvecs_mmap from .exhaustive_search import knn @@ -115,10 +117,12 @@ def get_groundtruth(self, k=100): # that directory is ############################################################################ +username = getpass.getuser() for dataset_basedir in ( '/datasets01/simsearch/041218/', - '/mnt/vol/gfsai-flash3-east/ai-group/datasets/simsearch/'): + '/mnt/vol/gfsai-flash3-east/ai-group/datasets/simsearch/', + f'/home/{username}/simsearch/data/'): if os.path.exists(dataset_basedir): break else: diff --git a/thirdparty/faiss/contrib/factory_tools.py b/thirdparty/faiss/contrib/factory_tools.py index da90e986f..dde312b02 100644 --- a/thirdparty/faiss/contrib/factory_tools.py +++ b/thirdparty/faiss/contrib/factory_tools.py @@ -56,6 +56,8 @@ def get_code_size(d, indexkey): return (d * 6 + 7) // 8 elif indexkey == 'SQfp16': return d * 2 + elif indexkey == 'SQbf16': + return d * 2 mo = re.match('PCAR?(\\d+),(.*)$', indexkey) if mo: @@ -123,6 +125,7 @@ def reverse_index_factory(index): 
faiss.ScalarQuantizer.QT_4bit: "4", faiss.ScalarQuantizer.QT_6bit: "6", faiss.ScalarQuantizer.QT_fp16: "fp16", + faiss.ScalarQuantizer.QT_bf16: "bf16", } return f"SQ{sqtypes[index.sq.qtype]}" diff --git a/thirdparty/faiss/contrib/vecs_io.py b/thirdparty/faiss/contrib/vecs_io.py index 5d18c0b16..9ef9e0ab6 100644 --- a/thirdparty/faiss/contrib/vecs_io.py +++ b/thirdparty/faiss/contrib/vecs_io.py @@ -14,7 +14,7 @@ def ivecs_read(fname): a = np.fromfile(fname, dtype='int32') - if sys.big_endian: + if sys.byteorder == 'big': a.byteswap(inplace=True) d = a[0] return a.reshape(-1, d + 1)[:, 1:].copy() @@ -25,7 +25,7 @@ def fvecs_read(fname): def ivecs_mmap(fname): - assert not sys.big_endian + assert sys.byteorder != 'big' a = np.memmap(fname, dtype='int32', mode='r') d = a[0] return a.reshape(-1, d + 1)[:, 1:] @@ -37,7 +37,7 @@ def fvecs_mmap(fname): def bvecs_mmap(fname): x = np.memmap(fname, dtype='uint8', mode='r') - if sys.big_endian: + if sys.byteorder == 'big': da = x[:4][::-1].copy() d = da.view('int32')[0] else: @@ -50,7 +50,7 @@ def ivecs_write(fname, m): m1 = np.empty((n, d + 1), dtype='int32') m1[:, 0] = d m1[:, 1:] = m - if sys.big_endian: + if sys.byteorder == 'big': m1.byteswap(inplace=True) m1.tofile(fname) diff --git a/thirdparty/faiss/faiss/IndexFlat.cpp b/thirdparty/faiss/faiss/IndexFlat.cpp index bb7367cd5..5f2465228 100644 --- a/thirdparty/faiss/faiss/IndexFlat.cpp +++ b/thirdparty/faiss/faiss/IndexFlat.cpp @@ -74,10 +74,18 @@ void IndexFlat::search( float_maxheap_array_t res = {size_t(n), size_t(k), labels, distances}; knn_jaccard(x, get_xb(), d, n, ntotal, &res, sel); } else { - FAISS_THROW_IF_NOT(!sel); - float_maxheap_array_t res = {size_t(n), size_t(k), labels, distances}; knn_extra_metrics( - x, get_xb(), d, n, ntotal, metric_type, metric_arg, &res); + x, + get_xb(), + d, + n, + ntotal, + metric_type, + metric_arg, + k, + distances, + labels, + sel); } } diff --git a/thirdparty/faiss/faiss/IndexHNSW.cpp b/thirdparty/faiss/faiss/IndexHNSW.cpp 
index 3325c8c0e..c0bb81c05 100644 --- a/thirdparty/faiss/faiss/IndexHNSW.cpp +++ b/thirdparty/faiss/faiss/IndexHNSW.cpp @@ -5,8 +5,6 @@ * LICENSE file in the root directory of this source tree. */ -// -*- c++ -*- - #include #include @@ -17,7 +15,10 @@ #include #include +#include +#include #include +#include #include #include @@ -68,52 +69,6 @@ HNSWStats hnsw_stats; namespace { -/* Wrap the distance computer into one that negates the - distances. This makes supporting INNER_PRODUCE search easier */ - -struct NegativeDistanceComputer : DistanceComputer { - /// owned by this - DistanceComputer* basedis; - - explicit NegativeDistanceComputer(DistanceComputer* basedis) - : basedis(basedis) {} - - void set_query(const float* x) override { - basedis->set_query(x); - } - - /// compute distance of vector i to current query - float operator()(idx_t i) override { - return -(*basedis)(i); - } - - void distances_batch_4( - const idx_t idx0, - const idx_t idx1, - const idx_t idx2, - const idx_t idx3, - float& dis0, - float& dis1, - float& dis2, - float& dis3) override { - basedis->distances_batch_4( - idx0, idx1, idx2, idx3, dis0, dis1, dis2, dis3); - dis0 = -dis0; - dis1 = -dis1; - dis2 = -dis2; - dis3 = -dis3; - } - - /// compute distance between two stored vectors - float symmetric_dis(idx_t i, idx_t j) override { - return -basedis->symmetric_dis(i, j); - } - - virtual ~NegativeDistanceComputer() { - delete basedis; - } -}; - DistanceComputer* storage_distance_computer(const Index* storage) { if (is_similarity_metric(storage->metric_type)) { return new NegativeDistanceComputer(storage->get_distance_computer()); @@ -192,7 +147,9 @@ void hnsw_add_vertices( int i1 = n; - for (int pt_level = hist.size() - 1; pt_level >= 0; pt_level--) { + for (int pt_level = hist.size() - 1; + pt_level >= !index_hnsw.init_level0; + pt_level--) { int i0 = i1 - hist[pt_level]; if (verbose) { @@ -228,7 +185,13 @@ void hnsw_add_vertices( continue; } - hnsw.add_with_locks(*dis, pt_level, pt_id, locks, 
vt); + hnsw.add_with_locks( + *dis, + pt_level, + pt_id, + locks, + vt, + index_hnsw.keep_max_size_level0 && (pt_level == 0)); if (prev_display >= 0 && i - i0 > prev_display + 10000) { prev_display = i - i0; @@ -248,7 +211,11 @@ void hnsw_add_vertices( } i1 = i0; } - FAISS_ASSERT(i1 == 0); + if (index_hnsw.init_level0) { + FAISS_ASSERT(i1 == 0); + } else { + FAISS_ASSERT((i1 - hist[0]) == 0); + } } if (verbose) { printf("Done in %.3f ms\n", getmillisecs() - t0); @@ -297,7 +264,8 @@ void hnsw_search( const SearchParameters* params_in) { FAISS_THROW_IF_NOT_MSG( index->storage, - "Please use IndexHNSWFlat (or variants) instead of IndexHNSW directly"); + "No storage index, please use IndexHNSWFlat (or variants) " + "instead of IndexHNSW directly"); const SearchParametersHNSW* params = nullptr; const HNSW& hnsw = index->hnsw; @@ -451,10 +419,18 @@ void IndexHNSW::search_level_0( float* distances, idx_t* labels, int nprobe, - int search_type) const { + int search_type, + const SearchParameters* params_in) const { FAISS_THROW_IF_NOT(k > 0); FAISS_THROW_IF_NOT(nprobe > 0); + const SearchParametersHNSW* params = nullptr; + + if (params_in) { + params = dynamic_cast(params_in); + FAISS_THROW_IF_NOT_MSG(params, "params type invalid"); + } + storage_idx_t ntotal = hnsw.levels.size(); using RH = HeapBlockResultHandler; @@ -481,13 +457,21 @@ void IndexHNSW::search_level_0( nearest_d + i * nprobe, search_type, search_stats, - vt); + vt, + params); res.end(); vt.advance(); } #pragma omp critical { hnsw_stats.combine(search_stats); } } + if (is_similarity_metric(this->metric_type)) { +// we need to revert the negated distances +#pragma omp parallel for + for (int64_t i = 0; i < k * n; i++) { + distances[i] = -distances[i]; + } + } } void IndexHNSW::init_level_0_from_knngraph( @@ -910,4 +894,86 @@ void IndexHNSW2Level::flip_to_ivf() { delete storage2l; } +/************************************************************** + * IndexHNSWCagra implementation + 
**************************************************************/ + +IndexHNSWCagra::IndexHNSWCagra() { + is_trained = true; +} + +IndexHNSWCagra::IndexHNSWCagra(int d, int M, MetricType metric) + : IndexHNSW( + (metric == METRIC_L2) + ? static_cast(new IndexFlatL2(d)) + : static_cast(new IndexFlatIP(d)), + M) { + FAISS_THROW_IF_NOT_MSG( + ((metric == METRIC_L2) || (metric == METRIC_INNER_PRODUCT)), + "unsupported metric type for IndexHNSWCagra"); + own_fields = true; + is_trained = true; + init_level0 = true; + keep_max_size_level0 = true; +} + +void IndexHNSWCagra::add(idx_t n, const float* x) { + FAISS_THROW_IF_NOT_MSG( + !base_level_only, + "Cannot add vectors when base_level_only is set to True"); + + IndexHNSW::add(n, x); +} + +void IndexHNSWCagra::search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params) const { + if (!base_level_only) { + IndexHNSW::search(n, x, k, distances, labels, params); + } else { + std::vector nearest(n); + std::vector nearest_d(n); + +#pragma omp for + for (idx_t i = 0; i < n; i++) { + std::unique_ptr dis( + storage_distance_computer(this->storage)); + dis->set_query(x + i * d); + nearest[i] = -1; + nearest_d[i] = std::numeric_limits::max(); + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution distrib(0, this->ntotal - 1); + + for (idx_t j = 0; j < num_base_level_search_entrypoints; j++) { + auto idx = distrib(gen); + auto distance = (*dis)(idx); + if (distance < nearest_d[i]) { + nearest[i] = idx; + nearest_d[i] = distance; + } + } + FAISS_THROW_IF_NOT_MSG( + nearest[i] >= 0, "Could not find a valid entrypoint."); + } + + search_level_0( + n, + x, + k, + nearest.data(), + nearest_d.data(), + distances, + labels, + 1, // n_probes + 1, // search_type + params); + } +} + } // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexHNSW.h b/thirdparty/faiss/faiss/IndexHNSW.h index e0b65fca9..71807c653 100644 --- 
a/thirdparty/faiss/faiss/IndexHNSW.h +++ b/thirdparty/faiss/faiss/IndexHNSW.h @@ -34,6 +34,18 @@ struct IndexHNSW : Index { bool own_fields = false; Index* storage = nullptr; + // When set to false, level 0 in the knn graph is not initialized. + // This option is used by GpuIndexCagra::copyTo(IndexHNSWCagra*) + // as level 0 knn graph is copied over from the index built by + // GpuIndexCagra. + bool init_level0 = true; + + // When set to true, all neighbors in level 0 are filled up + // to the maximum size allowed (2 * M). This option is used by + // IndexHHNSWCagra to create a full base layer graph that is + // used when GpuIndexCagra::copyFrom(IndexHNSWCagra*) is invoked. + bool keep_max_size_level0 = false; + explicit IndexHNSW(int d = 0, int M = 32, MetricType metric = METRIC_L2); explicit IndexHNSW(Index* storage, int M = 32); @@ -81,7 +93,8 @@ struct IndexHNSW : Index { float* distances, idx_t* labels, int nprobe = 1, - int search_type = 1) const; + int search_type = 1, + const SearchParameters* params = nullptr) const; /// alternative graph building void init_level_0_from_knngraph(int k, const float* D, const idx_t* I); @@ -148,4 +161,33 @@ struct IndexHNSW2Level : IndexHNSW { const SearchParameters* params = nullptr) const override; }; +struct IndexHNSWCagra : IndexHNSW { + IndexHNSWCagra(); + IndexHNSWCagra(int d, int M, MetricType metric = METRIC_L2); + + /// When set to true, the index is immutable. + /// This option is used to copy the knn graph from GpuIndexCagra + /// to the base level of IndexHNSWCagra without adding upper levels. + /// Doing so enables to search the HNSW index, but removes the + /// ability to add vectors. + bool base_level_only = false; + + /// When `base_level_only` is set to `True`, the search function + /// searches only the base level knn graph of the HNSW index. + /// This parameter selects the entry point by randomly selecting + /// some points and using the best one. 
+ int num_base_level_search_entrypoints = 32; + + void add(idx_t n, const float* x) override; + + /// entry point for search + void search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params = nullptr) const override; +}; + } // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexIVFFastScan.cpp b/thirdparty/faiss/faiss/IndexIVFFastScan.cpp index e3093e5fa..d93ac1481 100644 --- a/thirdparty/faiss/faiss/IndexIVFFastScan.cpp +++ b/thirdparty/faiss/faiss/IndexIVFFastScan.cpp @@ -974,12 +974,6 @@ void IndexIVFFastScan::search_implem_10( size_t* nlist_out, const NormTableScaler* scaler, const IVFSearchParameters* params) const { - // const size_t nprobe = params ? params->nprobe : this->nprobe; - // const size_t max_codes = params ? params->max_codes : this->max_codes; - // const IDSelector* sel = params ? params->sel : nullptr; - // const SearchParameters* quantizer_params = - // params ? params->quantizer_params : nullptr; - size_t dim12 = ksub * M2; AlignedTable dis_tables; AlignedTable biases; diff --git a/thirdparty/faiss/faiss/IndexNNDescent.cpp b/thirdparty/faiss/faiss/IndexNNDescent.cpp index 27bd6e33e..382e9c41c 100644 --- a/thirdparty/faiss/faiss/IndexNNDescent.cpp +++ b/thirdparty/faiss/faiss/IndexNNDescent.cpp @@ -58,35 +58,6 @@ using storage_idx_t = NNDescent::storage_idx_t; namespace { -/* Wrap the distance computer into one that negates the - distances. 
This makes supporting INNER_PRODUCE search easier */ - -struct NegativeDistanceComputer : DistanceComputer { - /// owned by this - DistanceComputer* basedis; - - explicit NegativeDistanceComputer(DistanceComputer* basedis) - : basedis(basedis) {} - - void set_query(const float* x) override { - basedis->set_query(x); - } - - /// compute distance of vector i to current query - float operator()(idx_t i) override { - return -(*basedis)(i); - } - - /// compute distance between two stored vectors - float symmetric_dis(idx_t i, idx_t j) override { - return -basedis->symmetric_dis(i, j); - } - - ~NegativeDistanceComputer() override { - delete basedis; - } -}; - DistanceComputer* storage_distance_computer(const Index* storage) { if (is_similarity_metric(storage->metric_type)) { return new NegativeDistanceComputer(storage->get_distance_computer()); diff --git a/thirdparty/faiss/faiss/IndexScalarQuantizer.cpp b/thirdparty/faiss/faiss/IndexScalarQuantizer.cpp index d7719e494..efdd0bc7d 100644 --- a/thirdparty/faiss/faiss/IndexScalarQuantizer.cpp +++ b/thirdparty/faiss/faiss/IndexScalarQuantizer.cpp @@ -32,7 +32,9 @@ IndexScalarQuantizer::IndexScalarQuantizer( MetricType metric) : IndexFlatCodes(0, d, metric), sq(d, qtype) { is_trained = qtype == ScalarQuantizer::QT_fp16 || - qtype == ScalarQuantizer::QT_8bit_direct; + qtype == ScalarQuantizer::QT_8bit_direct || + qtype == ScalarQuantizer::QT_bf16 || + qtype == ScalarQuantizer::QT_8bit_direct_signed; code_size = sq.code_size; } diff --git a/thirdparty/faiss/faiss/MetricType.h b/thirdparty/faiss/faiss/MetricType.h index 6904fa203..067cb142a 100644 --- a/thirdparty/faiss/faiss/MetricType.h +++ b/thirdparty/faiss/faiss/MetricType.h @@ -39,6 +39,10 @@ enum MetricType { METRIC_Canberra = 20, METRIC_BrayCurtis = 21, METRIC_JensenShannon = 22, + /// Squared Euclidean distance, ignoring NaNs + METRIC_NaNEuclidean = 24, + /// abs(x | y): the distance to a hyperplane + METRIC_ABS_INNER_PRODUCT = 25, }; /// all vector indices are this type 
diff --git a/thirdparty/faiss/faiss/gpu/GpuIcmEncoder.cu b/thirdparty/faiss/faiss/gpu/GpuIcmEncoder.cu index 434fae9e3..8bd60f91b 100644 --- a/thirdparty/faiss/faiss/gpu/GpuIcmEncoder.cu +++ b/thirdparty/faiss/faiss/gpu/GpuIcmEncoder.cu @@ -82,7 +82,7 @@ void GpuIcmEncoder::encode( size_t n, size_t ils_iters) const { size_t nshards = shards->size(); - size_t shard_size = (n + nshards - 1) / nshards; + size_t base_shard_size = n / nshards; auto codebooks = lsq->codebooks.data(); auto M = lsq->M; @@ -94,8 +94,14 @@ void GpuIcmEncoder::encode( // split input data auto fn = [=](int idx, IcmEncoderImpl* encoder) { - size_t i0 = idx * shard_size; - size_t ni = std::min(shard_size, n - i0); + size_t i0 = idx * base_shard_size + std::min(size_t(idx), n % nshards); + size_t ni = base_shard_size; + if (ni < n % nshards) { + ++ni; + } + if (ni <= 0) { // only if n < nshards + return; + } auto xi = x + i0 * d; auto ci = codes + i0 * M; std::mt19937 geni(idx + seed); // different seed for each shard diff --git a/thirdparty/faiss/faiss/impl/AuxIndexStructures.cpp b/thirdparty/faiss/faiss/impl/AuxIndexStructures.cpp index cebe8a1e2..e2b2791e5 100644 --- a/thirdparty/faiss/faiss/impl/AuxIndexStructures.cpp +++ b/thirdparty/faiss/faiss/impl/AuxIndexStructures.cpp @@ -236,4 +236,29 @@ size_t InterruptCallback::get_period_hint(size_t flops) { return std::max((size_t)10 * 10 * 1000 * 1000 / (flops + 1), (size_t)1); } +void TimeoutCallback::set_timeout(double timeout_in_seconds) { + timeout = timeout_in_seconds; + start = std::chrono::steady_clock::now(); +} + +bool TimeoutCallback::want_interrupt() { + if (timeout == 0) { + return false; + } + auto end = std::chrono::steady_clock::now(); + std::chrono::duration duration = end - start; + float elapsed_in_seconds = duration.count() / 1000.0; + if (elapsed_in_seconds > timeout) { + timeout = 0; + return true; + } + return false; +} + +void TimeoutCallback::reset(double timeout_in_seconds) { + auto tc(new faiss::TimeoutCallback()); + 
faiss::InterruptCallback::instance.reset(tc); + tc->set_timeout(timeout_in_seconds); +} + } // namespace faiss diff --git a/thirdparty/faiss/faiss/impl/AuxIndexStructures.h b/thirdparty/faiss/faiss/impl/AuxIndexStructures.h index f8b5cca84..7e12a1a3a 100644 --- a/thirdparty/faiss/faiss/impl/AuxIndexStructures.h +++ b/thirdparty/faiss/faiss/impl/AuxIndexStructures.h @@ -161,6 +161,14 @@ struct FAISS_API InterruptCallback { static size_t get_period_hint(size_t flops); }; +struct TimeoutCallback : InterruptCallback { + std::chrono::time_point start; + double timeout; + bool want_interrupt() override; + void set_timeout(double timeout_in_seconds); + static void reset(double timeout_in_seconds); +}; + /// set implementation optimized for fast access. struct VisitedTable { std::vector visited; diff --git a/thirdparty/faiss/faiss/impl/DistanceComputer.h b/thirdparty/faiss/faiss/impl/DistanceComputer.h index dc46d113f..5ac3a702c 100644 --- a/thirdparty/faiss/faiss/impl/DistanceComputer.h +++ b/thirdparty/faiss/faiss/impl/DistanceComputer.h @@ -59,6 +59,52 @@ struct DistanceComputer { virtual ~DistanceComputer() {} }; +/* Wrap the distance computer into one that negates the + distances. 
This makes supporting INNER_PRODUCT search easier */ + +struct NegativeDistanceComputer : DistanceComputer { + /// owned by this + DistanceComputer* basedis; + + explicit NegativeDistanceComputer(DistanceComputer* basedis) + : basedis(basedis) {} + + void set_query(const float* x) override { + basedis->set_query(x); + } + + /// compute distance of vector i to current query + float operator()(idx_t i) override { + return -(*basedis)(i); + } + + void distances_batch_4( + const idx_t idx0, + const idx_t idx1, + const idx_t idx2, + const idx_t idx3, + float& dis0, + float& dis1, + float& dis2, + float& dis3) override { + basedis->distances_batch_4( + idx0, idx1, idx2, idx3, dis0, dis1, dis2, dis3); + dis0 = -dis0; + dis1 = -dis1; + dis2 = -dis2; + dis3 = -dis3; + } + + /// compute distance between two stored vectors + float symmetric_dis(idx_t i, idx_t j) override { + return -basedis->symmetric_dis(i, j); + } + + virtual ~NegativeDistanceComputer() { + delete basedis; + } +}; + /************************************************************* * Specialized version of the DistanceComputer when we know that codes are * laid out in a flat index. diff --git a/thirdparty/faiss/faiss/impl/HNSW.cpp b/thirdparty/faiss/faiss/impl/HNSW.cpp index d8c822596..3ba5f72f6 100644 --- a/thirdparty/faiss/faiss/impl/HNSW.cpp +++ b/thirdparty/faiss/faiss/impl/HNSW.cpp @@ -7,6 +7,7 @@ #include +#include #include #include @@ -215,8 +216,8 @@ int HNSW::prepare_level_tab(size_t n, bool preset_levels) { if (pt_level > max_level) max_level = pt_level; offsets.push_back(offsets.back() + cum_nb_neighbors(pt_level + 1)); - neighbors.resize(offsets.back(), -1); } + neighbors.resize(offsets.back(), -1); return max_level; } @@ -229,7 +230,14 @@ void HNSW::shrink_neighbor_list( DistanceComputer& qdis, std::priority_queue& input, std::vector& output, - int max_size) { + int max_size, + bool keep_max_size_level0) { + // This prevents number of neighbors at + // level 0 from being shrunk to less than 2 * M. 
+ // This is essential in making sure + // `faiss::gpu::GpuIndexCagra::copyFrom(IndexHNSWCagra*)` is functional + std::vector outsiders; + while (input.size() > 0) { NodeDistFarther v1 = input.top(); input.pop(); @@ -250,8 +258,15 @@ void HNSW::shrink_neighbor_list( if (output.size() >= max_size) { return; } + } else if (keep_max_size_level0) { + outsiders.push_back(v1); } } + size_t idx = 0; + while (keep_max_size_level0 && (output.size() < max_size) && + (idx < outsiders.size())) { + output.push_back(outsiders[idx++]); + } } namespace { @@ -268,7 +283,8 @@ using NodeDistFarther = HNSW::NodeDistFarther; void shrink_neighbor_list( DistanceComputer& qdis, std::priority_queue& resultSet1, - int max_size) { + int max_size, + bool keep_max_size_level0 = false) { if (resultSet1.size() < max_size) { return; } @@ -280,7 +296,8 @@ void shrink_neighbor_list( resultSet1.pop(); } - HNSW::shrink_neighbor_list(qdis, resultSet, returnlist, max_size); + HNSW::shrink_neighbor_list( + qdis, resultSet, returnlist, max_size, keep_max_size_level0); for (NodeDistFarther curen2 : returnlist) { resultSet1.emplace(curen2.d, curen2.id); @@ -294,7 +311,8 @@ void add_link( DistanceComputer& qdis, storage_idx_t src, storage_idx_t dest, - int level) { + int level, + bool keep_max_size_level0 = false) { size_t begin, end; hnsw.neighbor_range(src, level, &begin, &end); if (hnsw.neighbors[end - 1] == -1) { @@ -319,7 +337,7 @@ void add_link( resultSet.emplace(qdis.symmetric_dis(src, neigh), neigh); } - shrink_neighbor_list(qdis, resultSet, end - begin); + shrink_neighbor_list(qdis, resultSet, end - begin, keep_max_size_level0); // ...and back size_t i = begin; @@ -429,7 +447,8 @@ void HNSW::add_links_starting_from( float d_nearest, int level, omp_lock_t* locks, - VisitedTable& vt) { + VisitedTable& vt, + bool keep_max_size_level0) { std::priority_queue link_targets; search_neighbors_to_add( @@ -438,13 +457,13 @@ void HNSW::add_links_starting_from( // but we can afford only this many neighbors int 
M = nb_neighbors(level); - ::faiss::shrink_neighbor_list(ptdis, link_targets, M); + ::faiss::shrink_neighbor_list(ptdis, link_targets, M, keep_max_size_level0); std::vector neighbors; neighbors.reserve(link_targets.size()); while (!link_targets.empty()) { storage_idx_t other_id = link_targets.top().id; - add_link(*this, ptdis, pt_id, other_id, level); + add_link(*this, ptdis, pt_id, other_id, level, keep_max_size_level0); neighbors.push_back(other_id); link_targets.pop(); } @@ -452,7 +471,7 @@ void HNSW::add_links_starting_from( omp_unset_lock(&locks[pt_id]); for (storage_idx_t other_id : neighbors) { omp_set_lock(&locks[other_id]); - add_link(*this, ptdis, other_id, pt_id, level); + add_link(*this, ptdis, other_id, pt_id, level, keep_max_size_level0); omp_unset_lock(&locks[other_id]); } omp_set_lock(&locks[pt_id]); @@ -467,7 +486,8 @@ void HNSW::add_with_locks( int pt_level, int pt_id, std::vector& locks, - VisitedTable& vt) { + VisitedTable& vt, + bool keep_max_size_level0) { // greedy search on upper levels storage_idx_t nearest; @@ -496,7 +516,14 @@ void HNSW::add_with_locks( for (; level >= 0; level--) { add_links_starting_from( - ptdis, pt_id, nearest, d_nearest, level, locks.data(), vt); + ptdis, + pt_id, + nearest, + d_nearest, + level, + locks.data(), + vt, + keep_max_size_level0); } omp_unset_lock(&locks[pt_id]); @@ -910,9 +937,12 @@ void HNSW::search_level_0( const float* nearest_d, int search_type, HNSWStats& search_stats, - VisitedTable& vt) const { + VisitedTable& vt, + const SearchParametersHNSW* params) const { const HNSW& hnsw = *this; + auto efSearch = params ? 
params->efSearch : hnsw.efSearch; int k = extract_k_from_ResultHandler(res); + if (search_type == 1) { int nres = 0; @@ -925,16 +955,24 @@ void HNSW::search_level_0( if (vt.get(cj)) continue; - int candidates_size = std::max(hnsw.efSearch, k); + int candidates_size = std::max(efSearch, k); MinimaxHeap candidates(candidates_size); candidates.push(cj, nearest_d[j]); nres = search_from_candidates( - hnsw, qdis, res, candidates, vt, search_stats, 0, nres); + hnsw, + qdis, + res, + candidates, + vt, + search_stats, + 0, + nres, + params); } } else if (search_type == 2) { - int candidates_size = std::max(hnsw.efSearch, int(k)); + int candidates_size = std::max(efSearch, int(k)); candidates_size = std::max(candidates_size, int(nprobe)); MinimaxHeap candidates(candidates_size); @@ -947,7 +985,7 @@ void HNSW::search_level_0( } search_from_candidates( - hnsw, qdis, res, candidates, vt, search_stats, 0); + hnsw, qdis, res, candidates, vt, search_stats, 0, 0, params); } } diff --git a/thirdparty/faiss/faiss/impl/HNSW.h b/thirdparty/faiss/faiss/impl/HNSW.h index 8261423cd..f3aacf8a5 100644 --- a/thirdparty/faiss/faiss/impl/HNSW.h +++ b/thirdparty/faiss/faiss/impl/HNSW.h @@ -184,7 +184,8 @@ struct HNSW { float d_nearest, int level, omp_lock_t* locks, - VisitedTable& vt); + VisitedTable& vt, + bool keep_max_size_level0 = false); /** add point pt_id on all levels <= pt_level and build the link * structure for them. 
*/ @@ -193,7 +194,8 @@ struct HNSW { int pt_level, int pt_id, std::vector& locks, - VisitedTable& vt); + VisitedTable& vt, + bool keep_max_size_level0 = false); /// search interface for 1 point, single thread HNSWStats search( @@ -211,7 +213,8 @@ struct HNSW { const float* nearest_d, int search_type, HNSWStats& search_stats, - VisitedTable& vt) const; + VisitedTable& vt, + const SearchParametersHNSW* params = nullptr) const; void reset(); @@ -224,7 +227,8 @@ struct HNSW { DistanceComputer& qdis, std::priority_queue& input, std::vector& output, - int max_size); + int max_size, + bool keep_max_size_level0 = false); void permute_entries(const idx_t* map); }; diff --git a/thirdparty/faiss/faiss/impl/NNDescent.cpp b/thirdparty/faiss/faiss/impl/NNDescent.cpp index b609aba39..5afcdaf5b 100644 --- a/thirdparty/faiss/faiss/impl/NNDescent.cpp +++ b/thirdparty/faiss/faiss/impl/NNDescent.cpp @@ -154,15 +154,20 @@ NNDescent::NNDescent(const int d, const int K) : K(K), d(d) { NNDescent::~NNDescent() {} void NNDescent::join(DistanceComputer& qdis) { + idx_t check_period = InterruptCallback::get_period_hint(d * search_L); + for (idx_t i0 = 0; i0 < (idx_t)ntotal; i0 += check_period) { + idx_t i1 = std::min(i0 + check_period, (idx_t)ntotal); #pragma omp parallel for default(shared) schedule(dynamic, 100) - for (int n = 0; n < ntotal; n++) { - graph[n].join([&](int i, int j) { - if (i != j) { - float dist = qdis.symmetric_dis(i, j); - graph[i].insert(j, dist); - graph[j].insert(i, dist); - } - }); + for (idx_t n = i0; n < i1; n++) { + graph[n].join([&](int i, int j) { + if (i != j) { + float dist = qdis.symmetric_dis(i, j); + graph[i].insert(j, dist); + graph[j].insert(i, dist); + } + }); + } + InterruptCallback::check(); } } diff --git a/thirdparty/faiss/faiss/impl/NSG.cpp b/thirdparty/faiss/faiss/impl/NSG.cpp index 1f30b576b..c97494334 100644 --- a/thirdparty/faiss/faiss/impl/NSG.cpp +++ b/thirdparty/faiss/faiss/impl/NSG.cpp @@ -25,35 +25,6 @@ namespace { // It needs to be smaller 
than 0 constexpr int EMPTY_ID = -1; -/* Wrap the distance computer into one that negates the - distances. This makes supporting INNER_PRODUCE search easier */ - -struct NegativeDistanceComputer : DistanceComputer { - /// owned by this - DistanceComputer* basedis; - - explicit NegativeDistanceComputer(DistanceComputer* basedis) - : basedis(basedis) {} - - void set_query(const float* x) override { - basedis->set_query(x); - } - - /// compute distance of vector i to current query - float operator()(idx_t i) override { - return -(*basedis)(i); - } - - /// compute distance between two stored vectors - float symmetric_dis(idx_t i, idx_t j) override { - return -basedis->symmetric_dis(i, j); - } - - ~NegativeDistanceComputer() override { - delete basedis; - } -}; - } // namespace DistanceComputer* storage_distance_computer(const Index* storage) { diff --git a/thirdparty/faiss/faiss/impl/ScalarQuantizer.cpp b/thirdparty/faiss/faiss/impl/ScalarQuantizer.cpp index 2c81a3558..449cded8c 100644 --- a/thirdparty/faiss/faiss/impl/ScalarQuantizer.cpp +++ b/thirdparty/faiss/faiss/impl/ScalarQuantizer.cpp @@ -75,6 +75,7 @@ void ScalarQuantizer::set_derived_sizes() { case QT_8bit: case QT_8bit_uniform: case QT_8bit_direct: + case QT_8bit_direct_signed: code_size = d; bits = 8; break; @@ -91,6 +92,10 @@ void ScalarQuantizer::set_derived_sizes() { code_size = d * 2; bits = 16; break; + case QT_bf16: + code_size = d * 2; + bits = 16; + break; } } @@ -127,6 +132,8 @@ void ScalarQuantizer::train(size_t n, const float* x) { break; case QT_fp16: case QT_8bit_direct: + case QT_bf16: + case QT_8bit_direct_signed: // no training necessary break; } diff --git a/thirdparty/faiss/faiss/impl/ScalarQuantizer.h b/thirdparty/faiss/faiss/impl/ScalarQuantizer.h index a6ac1a67c..2b4b856ad 100644 --- a/thirdparty/faiss/faiss/impl/ScalarQuantizer.h +++ b/thirdparty/faiss/faiss/impl/ScalarQuantizer.h @@ -31,7 +31,10 @@ struct ScalarQuantizer : Quantizer { QT_4bit_uniform, QT_fp16, QT_8bit_direct, ///< fast 
indexing of uint8s - QT_6bit, ///< 6 bits per component + QT_6bit, ///< 6 bits per component, + QT_bf16, + QT_8bit_direct_signed, ///< fast indexing of signed int8s ranging from + ///< [-128 to 127] }; QuantizerType qtype = QT_8bit; diff --git a/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec.h b/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec.h index 220de4cef..6a20a0ca8 100644 --- a/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec.h +++ b/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -227,6 +228,37 @@ struct QuantizerFP16<1> : SQuantizer { } }; +/******************************************************************* + * BF16 quantizer + *******************************************************************/ + +template +struct QuantizerBF16 {}; + +template <> +struct QuantizerBF16<1> : ScalarQuantizer::SQuantizer { + const size_t d; + + QuantizerBF16(size_t d, const std::vector& /* unused */) : d(d) {} + + void encode_vector(const float* x, uint8_t* code) const final { + for (size_t i = 0; i < d; i++) { + ((uint16_t*)code)[i] = encode_bf16(x[i]); + } + } + + void decode_vector(const uint8_t* code, float* x) const final { + for (size_t i = 0; i < d; i++) { + x[i] = decode_bf16(((uint16_t*)code)[i]); + } + } + + FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) + const { + return decode_bf16(((uint16_t*)code)[i]); + } +}; + /******************************************************************* * 8bit_direct quantizer *******************************************************************/ @@ -259,6 +291,38 @@ struct Quantizer8bitDirect<1> : SQuantizer { } }; +/******************************************************************* + * 8bit_direct_signed quantizer + *******************************************************************/ + +template +struct Quantizer8bitDirectSigned {}; + +template <> +struct Quantizer8bitDirectSigned<1> : ScalarQuantizer::SQuantizer { + const 
size_t d; + + Quantizer8bitDirectSigned(size_t d, const std::vector& /* unused */) + : d(d) {} + + void encode_vector(const float* x, uint8_t* code) const final { + for (size_t i = 0; i < d; i++) { + code[i] = (uint8_t)(x[i] + 128); + } + } + + void decode_vector(const uint8_t* code, float* x) const final { + for (size_t i = 0; i < d; i++) { + x[i] = code[i] - 128; + } + } + + FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) + const { + return code[i] - 128; + } +}; + template SQuantizer* select_quantizer_1( QuantizerType qtype, @@ -282,8 +346,12 @@ SQuantizer* select_quantizer_1( d, trained); case ScalarQuantizer::QT_fp16: return new QuantizerFP16(d, trained); + case ScalarQuantizer::QT_bf16: + return new QuantizerBF16(d, trained); case ScalarQuantizer::QT_8bit_direct: return new Quantizer8bitDirect(d, trained); + case ScalarQuantizer::QT_8bit_direct_signed: + return new Quantizer8bitDirectSigned(d, trained); } FAISS_THROW_MSG("unknown qtype"); } @@ -511,6 +579,10 @@ SQDistanceComputer* select_distance_computer( return new DCTemplate, Sim, SIMDWIDTH>( d, trained); + case ScalarQuantizer::QT_bf16: + return new DCTemplate, Sim, SIMDWIDTH>( + d, trained); + case ScalarQuantizer::QT_8bit_direct: if (d % 16 == 0) { return new DistanceComputerByte(d, trained); @@ -520,6 +592,12 @@ SQDistanceComputer* select_distance_computer( Sim, SIMDWIDTH>(d, trained); } + + case ScalarQuantizer::QT_8bit_direct_signed: + return new DCTemplate< + Quantizer8bitDirectSigned, + Sim, + SIMDWIDTH>(d, trained); } FAISS_THROW_MSG("unknown qtype"); return nullptr; @@ -613,6 +691,11 @@ InvertedListScanner* sel1_InvertedListScanner( QuantizerFP16, Similarity, SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); + case ScalarQuantizer::QT_bf16: + return sel2_InvertedListScanner, + Similarity, + SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); case ScalarQuantizer::QT_8bit_direct: if (sq->d % 16 == 0) { return sel2_InvertedListScanner< @@ -624,6 +707,11 @@ 
InvertedListScanner* sel1_InvertedListScanner( Similarity, SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); } + case ScalarQuantizer::QT_8bit_direct_signed: + return sel2_InvertedListScanner, + Similarity, + SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); } FAISS_THROW_MSG("unknown qtype"); diff --git a/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_avx.h b/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_avx.h index 6bc7a62dd..fc1ad255b 100644 --- a/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_avx.h +++ b/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_avx.h @@ -190,6 +190,33 @@ struct QuantizerFP16_avx<8> : public QuantizerFP16<1> { } }; +/******************************************************************* + * BF16 quantizer + *******************************************************************/ + +template +struct QuantizerBF16_avx {}; + +template <> +struct QuantizerBF16_avx<1> : public QuantizerBF16<1> { + QuantizerBF16_avx(size_t d, const std::vector& unused) + : QuantizerBF16<1>(d, unused) {} +}; + +template <> +struct QuantizerBF16_avx<8> : public QuantizerBF16<1> { + QuantizerBF16_avx(size_t d, const std::vector& trained) + : QuantizerBF16<1>(d, trained) {} + + FAISS_ALWAYS_INLINE __m256 + reconstruct_8_components(const uint8_t* code, int i) const { + __m128i code_128i = _mm_loadu_si128((const __m128i*)(code + 2 * i)); + __m256i code_256i = _mm256_cvtepu16_epi32(code_128i); + code_256i = _mm256_slli_epi32(code_256i, 16); + return _mm256_castsi256_ps(code_256i); + } +}; + /******************************************************************* * 8bit_direct quantizer *******************************************************************/ @@ -216,6 +243,34 @@ struct Quantizer8bitDirect_avx<8> : public Quantizer8bitDirect<1> { } }; +/******************************************************************* + * 8bit_direct_signed quantizer + *******************************************************************/ + +template +struct Quantizer8bitDirectSigned_avx {}; + 
+template <> +struct Quantizer8bitDirectSigned_avx<1> : public Quantizer8bitDirectSigned<1> { + Quantizer8bitDirectSigned_avx(size_t d, const std::vector& unused) + : Quantizer8bitDirectSigned(d, unused) {} +}; + +template <> +struct Quantizer8bitDirectSigned_avx<8> : public Quantizer8bitDirectSigned<1> { + Quantizer8bitDirectSigned_avx(size_t d, const std::vector& trained) + : Quantizer8bitDirectSigned<1>(d, trained) {} + + FAISS_ALWAYS_INLINE __m256 + reconstruct_8_components(const uint8_t* code, int i) const { + __m128i x8 = _mm_loadl_epi64((__m128i*)(code + i)); // 8 * int8 + __m256i y8 = _mm256_cvtepu8_epi32(x8); // 8 * int32 + __m256i c8 = _mm256_set1_epi32(128); + __m256i z8 = _mm256_sub_epi32(y8, c8); // subtract 128 from all lanes + return _mm256_cvtepi32_ps(z8); // 8 * float32 + } +}; + template SQuantizer* select_quantizer_1_avx( QuantizerType qtype, @@ -239,8 +294,12 @@ SQuantizer* select_quantizer_1_avx( d, trained); case QuantizerType::QT_fp16: return new QuantizerFP16_avx(d, trained); + case QuantizerType::QT_bf16: + return new QuantizerBF16_avx(d, trained); case QuantizerType::QT_8bit_direct: return new Quantizer8bitDirect_avx(d, trained); + case QuantizerType::QT_8bit_direct_signed: + return new Quantizer8bitDirectSigned_avx(d, trained); } FAISS_THROW_MSG("unknown qtype"); } @@ -581,6 +640,12 @@ SQDistanceComputer* select_distance_computer_avx( Sim, SIMDWIDTH>(d, trained); + case QuantizerType::QT_bf16: + return new DCTemplate_avx< + QuantizerBF16_avx, + Sim, + SIMDWIDTH>(d, trained); + case QuantizerType::QT_8bit_direct: if (d % 16 == 0) { return new DistanceComputerByte_avx(d, trained); @@ -590,6 +655,12 @@ SQDistanceComputer* select_distance_computer_avx( Sim, SIMDWIDTH>(d, trained); } + + case ScalarQuantizer::QT_8bit_direct_signed: + return new DCTemplate_avx< + Quantizer8bitDirectSigned_avx, + Sim, + SIMDWIDTH>(d, trained); } FAISS_THROW_MSG("unknown qtype"); return nullptr; @@ -659,6 +730,11 @@ InvertedListScanner* 
sel1_InvertedListScanner_avx( QuantizerFP16_avx, Similarity, SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); + case QuantizerType::QT_bf16: + return sel2_InvertedListScanner_avx, + Similarity, + SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); case QuantizerType::QT_8bit_direct: if (sq->d % 16 == 0) { return sel2_InvertedListScanner_avx< @@ -670,6 +746,11 @@ InvertedListScanner* sel1_InvertedListScanner_avx( Similarity, SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); } + case ScalarQuantizer::QT_8bit_direct_signed: + return sel2_InvertedListScanner_avx, + Similarity, + SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); } FAISS_THROW_MSG("unknown qtype"); diff --git a/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_avx512.h b/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_avx512.h index b93ba9465..64e4c4a56 100644 --- a/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_avx512.h +++ b/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_avx512.h @@ -204,6 +204,39 @@ struct QuantizerFP16_avx512<16> : public QuantizerFP16_avx<8> { } }; +/******************************************************************* + * BF16 quantizer + *******************************************************************/ + +template +struct QuantizerBF16_avx512 {}; + +template <> +struct QuantizerBF16_avx512<1> : public QuantizerBF16_avx<1> { + QuantizerBF16_avx512(size_t d, const std::vector& unused) + : QuantizerBF16_avx<1>(d, unused) {} +}; + +template <> +struct QuantizerBF16_avx512<8> : public QuantizerBF16_avx<8> { + QuantizerBF16_avx512(size_t d, const std::vector& trained) + : QuantizerBF16_avx<8>(d, trained) {} +}; + +template <> +struct QuantizerBF16_avx512<16> : public QuantizerBF16_avx<8> { + QuantizerBF16_avx512(size_t d, const std::vector& trained) + : QuantizerBF16_avx<8>(d, trained) {} + + FAISS_ALWAYS_INLINE __m512 + reconstruct_16_components(const uint8_t* code, int i) const { + __m256i code_256i = _mm256_loadu_si256((const __m256i*)(code + 2 * i)); + __m512i code_512i = 
_mm512_cvtepu16_epi32(code_256i); + code_512i = _mm512_slli_epi32(code_512i, 16); + return _mm512_castsi512_ps(code_512i); + } +}; + /******************************************************************* * 8bit_direct quantizer *******************************************************************/ @@ -236,6 +269,40 @@ struct Quantizer8bitDirect_avx512<16> : public Quantizer8bitDirect_avx<8> { } }; +/******************************************************************* + * 8bit_direct_signed quantizer + *******************************************************************/ + +template +struct Quantizer8bitDirectSigned_avx512 {}; + +template <> +struct Quantizer8bitDirectSigned_avx512<1> : public Quantizer8bitDirectSigned_avx<1> { + Quantizer8bitDirectSigned_avx512(size_t d, const std::vector& unused) + : Quantizer8bitDirectSigned_avx<1>(d, unused) {} +}; + +template <> +struct Quantizer8bitDirectSigned_avx512<8> : public Quantizer8bitDirectSigned_avx<8> { + Quantizer8bitDirectSigned_avx512(size_t d, const std::vector& trained) + : Quantizer8bitDirectSigned_avx<8>(d, trained) {} +}; + +template <> +struct Quantizer8bitDirectSigned_avx512<16> : public Quantizer8bitDirectSigned_avx<8> { + Quantizer8bitDirectSigned_avx512(size_t d, const std::vector& trained) + : Quantizer8bitDirectSigned_avx<8>(d, trained) {} + + FAISS_ALWAYS_INLINE __m512 + reconstruct_16_components(const uint8_t* code, int i) const { + __m256i x16 = _mm256_loadu_si256((__m256i*)(code + i)); // 16 * int8 + __m512i y16 = _mm512_cvtepu8_epi16(x16); // 16 * int32 + __m512i c16 = _mm512_set1_epi32(128); + __m512i z16 = _mm512_sub_epi32(y16, c16); // subtract 128 from all lanes + return _mm512_cvtepi32_ps(z16); // 16 * float32 + } +}; + template SQuantizer* select_quantizer_1_avx512( QuantizerType qtype, @@ -269,8 +336,12 @@ SQuantizer* select_quantizer_1_avx512( SIMDWIDTH>(d, trained); case QuantizerType::QT_fp16: return new QuantizerFP16_avx512(d, trained); + case QuantizerType::QT_bf16: + return new 
QuantizerBF16_avx512(d, trained); case QuantizerType::QT_8bit_direct: return new Quantizer8bitDirect_avx512(d, trained); + case QuantizerType::QT_8bit_direct_signed: + return new Quantizer8bitDirectSigned_avx512(d, trained); } FAISS_THROW_MSG("unknown qtype"); } @@ -653,6 +724,12 @@ SQDistanceComputer* select_distance_computer_avx512( Sim, SIMDWIDTH>(d, trained); + case QuantizerType::QT_bf16: + return new DCTemplate_avx512< + QuantizerBF16_avx512, + Sim, + SIMDWIDTH>(d, trained); + case QuantizerType::QT_8bit_direct: if (d % 16 == 0) { return new DistanceComputerByte_avx512( @@ -663,6 +740,12 @@ SQDistanceComputer* select_distance_computer_avx512( Sim, SIMDWIDTH>(d, trained); } + + case ScalarQuantizer::QT_8bit_direct_signed: + return new DCTemplate_avx512< + Quantizer8bitDirectSigned_avx512, + Sim, + SIMDWIDTH>(d, trained); } FAISS_THROW_MSG("unknown qtype"); return nullptr; @@ -732,6 +815,11 @@ InvertedListScanner* sel1_InvertedListScanner_avx512( QuantizerFP16_avx512, Similarity, SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); + case QuantizerType::QT_bf16: + return sel2_InvertedListScanner_avx512, + Similarity, + SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); case QuantizerType::QT_8bit_direct: if (sq->d % 16 == 0) { return sel2_InvertedListScanner_avx512< @@ -743,6 +831,11 @@ InvertedListScanner* sel1_InvertedListScanner_avx512( Similarity, SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); } + case ScalarQuantizer::QT_8bit_direct_signed: + return sel2_InvertedListScanner_avx512, + Similarity, + SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); } FAISS_THROW_MSG("unknown qtype"); diff --git a/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_neon.h b/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_neon.h index f272784e9..25cc36503 100644 --- a/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_neon.h +++ b/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_neon.h @@ -159,6 +159,33 @@ struct QuantizerFP16_neon<8> : public QuantizerFP16<1> { } }; 
+/******************************************************************* + * BF16 quantizer + *******************************************************************/ + +template +struct QuantizerBF16_neon {}; + +template <> +struct QuantizerBF16_neon<1> : public QuantizerBF16<1> { + QuantizerBF16_neon(size_t d, const std::vector& unused) + : QuantizerBF16<1>(d, unused) {} +}; + +template <> +struct QuantizerBF16_neon<8> : public QuantizerBF16<1> { + QuantizerBF16_neon(size_t d, const std::vector& trained) + : QuantizerBF16<1>(d, trained) {} + + FAISS_ALWAYS_INLINE float32x4x2_t + reconstruct_8_components(const uint8_t* code, int i) const { + uint16x4x2_t codei = vld1_u16_x2((const uint16_t*)(code + 2 * i)); + return {vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(codei.val[0]), 16)), + vreinterpretq_f32_u32( + vshlq_n_u32(vmovl_u16(codei.val[1]), 16))}; + } +}; + /******************************************************************* * 8bit_direct quantizer *******************************************************************/ @@ -179,13 +206,48 @@ struct Quantizer8bitDirect_neon<8> : public Quantizer8bitDirect<1> { FAISS_ALWAYS_INLINE float32x4x2_t reconstruct_8_components(const uint8_t* code, int i) const { - float32_t result[8] = {}; - for (size_t j = 0; j < 8; j++) { - result[j] = code[i + j]; - } - float32x4_t res1 = vld1q_f32(result); - float32x4_t res2 = vld1q_f32(result + 4); - return {res1, res2}; + uint8x8_t x8 = vld1_u8((const uint8_t*)(code + i)); + uint16x8_t y8 = vmovl_u8(x8); + uint16x4_t y8_0 = vget_low_u16(y8); + uint16x4_t y8_1 = vget_high_u16(y8); + + // convert uint16 -> uint32 -> fp32 + return {vcvtq_f32_u32(vmovl_u16(y8_0)), vcvtq_f32_u32(vmovl_u16(y8_1))}; + } +}; + +/******************************************************************* + * 8bit_direct_signed quantizer + *******************************************************************/ + +template +struct Quantizer8bitDirectSigned_neon {}; + +template <> +struct Quantizer8bitDirectSigned_neon<1> : public 
Quantizer8bitDirectSigned<1> { + Quantizer8bitDirectSigned_neon(size_t d, const std::vector& unused) + : Quantizer8bitDirectSigned(d, unused) {} +}; + +template <> +struct Quantizer8bitDirectSigned_neon<8> : public Quantizer8bitDirectSigned<1> { + Quantizer8bitDirectSigned_neon(size_t d, const std::vector& trained) + : Quantizer8bitDirectSigned<1>(d, trained) {} + + FAISS_ALWAYS_INLINE float32x4x2_t + reconstruct_8_components(const uint8_t* code, int i) const { + uint8x8_t x8 = vld1_u8((const uint8_t*)(code + i)); + uint16x8_t y8 = vmovl_u8(x8); // convert uint8 -> uint16 + uint16x4_t y8_0 = vget_low_u16(y8); + uint16x4_t y8_1 = vget_high_u16(y8); + + float32x4_t z8_0 = vcvtq_f32_u32( + vmovl_u16(y8_0)); // convert uint16 -> uint32 -> fp32 + float32x4_t z8_1 = vcvtq_f32_u32(vmovl_u16(y8_1)); + + // subtract 128 to convert into signed numbers + return {vsubq_f32(z8_0, vmovq_n_f32(128.0)), + vsubq_f32(z8_1, vmovq_n_f32(128.0))}; } }; @@ -212,8 +274,12 @@ SQuantizer* select_quantizer_1_neon( d, trained); case QuantizerType::QT_fp16: return new QuantizerFP16_neon(d, trained); + case QuantizerType::QT_bf16: + return new QuantizerBF16_neon(d, trained); case QuantizerType::QT_8bit_direct: return new Quantizer8bitDirect_neon(d, trained); + case QuantizerType::QT_8bit_direct_signed: + return new Quantizer8bitDirectSigned_neon(d, trained); } FAISS_THROW_MSG("unknown qtype"); } @@ -556,6 +622,12 @@ SQDistanceComputer* select_distance_computer_neon( Sim, SIMDWIDTH>(d, trained); + case QuantizerType::QT_bf16: + return new DCTemplate_neon< + QuantizerBF16_neon, + Sim, + SIMDWIDTH>(d, trained); + case QuantizerType::QT_8bit_direct: if (d % 16 == 0) { return new DistanceComputerByte_neon(d, trained); @@ -565,6 +637,12 @@ SQDistanceComputer* select_distance_computer_neon( Sim, SIMDWIDTH>(d, trained); } + + case ScalarQuantizer::QT_8bit_direct_signed: + return new DCTemplate_neon< + Quantizer8bitDirectSigned_neon, + Sim, + SIMDWIDTH>(d, trained); } FAISS_THROW_MSG("unknown qtype"); 
return nullptr; @@ -634,6 +712,11 @@ InvertedListScanner* sel1_InvertedListScanner_neon( QuantizerFP16_neon, Similarity, SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); + case QuantizerType::QT_bf16: + return sel2_InvertedListScanner_neon, + Similarity, + SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); case QuantizerType::QT_8bit_direct: if (sq->d % 16 == 0) { return sel2_InvertedListScanner_neon< @@ -645,6 +728,11 @@ InvertedListScanner* sel1_InvertedListScanner_neon( Similarity, SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); } + case ScalarQuantizer::QT_8bit_direct_signed: + return sel2_InvertedListScanner_neon, + Similarity, + SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); } FAISS_THROW_MSG("unknown qtype"); diff --git a/thirdparty/faiss/faiss/impl/code_distance/code_distance-avx2.h b/thirdparty/faiss/faiss/impl/code_distance/code_distance-avx2.h index 0aa1535b2..d37b02244 100644 --- a/thirdparty/faiss/faiss/impl/code_distance/code_distance-avx2.h +++ b/thirdparty/faiss/faiss/impl/code_distance/code_distance-avx2.h @@ -16,6 +16,11 @@ #include #include +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=78782 +#if defined(__GNUC__) && __GNUC__ < 9 +#define _mm_loadu_si64(x) (_mm_loadl_epi64((__m128i_u*)x)) +#endif + namespace { inline float horizontal_sum(const __m128 v) { diff --git a/thirdparty/faiss/faiss/impl/index_read.cpp b/thirdparty/faiss/faiss/impl/index_read.cpp index 165683715..8fe5ad8e4 100644 --- a/thirdparty/faiss/faiss/impl/index_read.cpp +++ b/thirdparty/faiss/faiss/impl/index_read.cpp @@ -5,8 +5,6 @@ * LICENSE file in the root directory of this source tree. 
*/ -// -*- c++ -*- - #include #include @@ -684,7 +682,11 @@ Index* read_index(IOReader* f, int io_flags) { Index* idx = nullptr; uint32_t h; READ1(h); - if (h == fourcc("IxFI") || h == fourcc("IxF2") || h == fourcc("IxFl")) { + if (h == fourcc("null")) { + // denotes a missing index, useful for some cases + return nullptr; + } else if ( + h == fourcc("IxFI") || h == fourcc("IxF2") || h == fourcc("IxFl")) { IndexFlat* idxf; if (h == fourcc("IxFI")) { idxf = new IndexFlatIP(); @@ -1137,7 +1139,7 @@ Index* read_index(IOReader* f, int io_flags) { idx = idxp; } else if ( h == fourcc("IHNf") || h == fourcc("IHNp") || h == fourcc("IHNs") || - h == fourcc("IHN2")) { + h == fourcc("IHN2") || h == fourcc("IHNc")) { IndexHNSW* idxhnsw = nullptr; if (h == fourcc("IHNf")) idxhnsw = new IndexHNSWFlat(); @@ -1147,10 +1149,18 @@ Index* read_index(IOReader* f, int io_flags) { idxhnsw = new IndexHNSWSQ(); if (h == fourcc("IHN2")) idxhnsw = new IndexHNSW2Level(); + if (h == fourcc("IHNc")) + idxhnsw = new IndexHNSWCagra(); read_index_header(idxhnsw, f); + if (h == fourcc("IHNc")) { + READ1(idxhnsw->keep_max_size_level0); + auto idx_hnsw_cagra = dynamic_cast(idxhnsw); + READ1(idx_hnsw_cagra->base_level_only); + READ1(idx_hnsw_cagra->num_base_level_search_entrypoints); + } read_HNSW(&idxhnsw->hnsw, f); idxhnsw->storage = read_index(f, io_flags); - idxhnsw->own_fields = true; + idxhnsw->own_fields = idxhnsw->storage != nullptr; if (h == fourcc("IHNp") && !(io_flags & IO_FLAG_PQ_SKIP_SDC_TABLE)) { dynamic_cast(idxhnsw->storage)->pq.compute_sdc_table(); } diff --git a/thirdparty/faiss/faiss/impl/index_write.cpp b/thirdparty/faiss/faiss/impl/index_write.cpp index 21fc0bb11..d57c6edbf 100644 --- a/thirdparty/faiss/faiss/impl/index_write.cpp +++ b/thirdparty/faiss/faiss/impl/index_write.cpp @@ -5,8 +5,6 @@ * LICENSE file in the root directory of this source tree. 
*/ -// -*- c++ -*- - #include #include @@ -556,8 +554,12 @@ static void write_ivf_header(const IndexIVF* ivf, IOWriter* f) { write_direct_map(&ivf->direct_map, f); } -void write_index(const Index* idx, IOWriter* f) { - if (const IndexFlat* idxf = dynamic_cast(idx)) { +void write_index(const Index* idx, IOWriter* f, int io_flags) { + if (idx == nullptr) { + // eg. for a storage component of HNSW that is set to nullptr + uint32_t h = fourcc("null"); + WRITE1(h); + } else if (const IndexFlat* idxf = dynamic_cast(idx)) { uint32_t h = fourcc(idxf->metric_type == METRIC_INNER_PRODUCT ? "IxFI" : idxf->metric_type == METRIC_L2 ? "IxF2" @@ -945,12 +947,24 @@ void write_index(const Index* idx, IOWriter* f) { : dynamic_cast(idx) ? fourcc("IHNp") : dynamic_cast(idx) ? fourcc("IHNs") : dynamic_cast(idx) ? fourcc("IHN2") + : dynamic_cast(idx) ? fourcc("IHNc") : 0; FAISS_THROW_IF_NOT(h != 0); WRITE1(h); write_index_header(idxhnsw, f); + if (h == fourcc("IHNc")) { + WRITE1(idxhnsw->keep_max_size_level0); + auto idx_hnsw_cagra = dynamic_cast(idxhnsw); + WRITE1(idx_hnsw_cagra->base_level_only); + WRITE1(idx_hnsw_cagra->num_base_level_search_entrypoints); + } write_HNSW(&idxhnsw->hnsw, f); - write_index(idxhnsw->storage, f); + if (io_flags & IO_FLAG_SKIP_STORAGE) { + uint32_t n4 = fourcc("null"); + WRITE1(n4); + } else { + write_index(idxhnsw->storage, f); + } } else if (const IndexNSG* idxnsg = dynamic_cast(idx)) { uint32_t h = dynamic_cast(idx) ? fourcc("INSf") : dynamic_cast(idx) ? 
fourcc("INSp") @@ -1030,14 +1044,15 @@ void write_index(const Index* idx, IOWriter* f) { } } -void write_index(const Index* idx, FILE* f) { + +void write_index(const Index* idx, FILE* f, int io_flags) { FileIOWriter writer(f); - write_index(idx, &writer); + write_index(idx, &writer, io_flags); } -void write_index(const Index* idx, const char* fname) { +void write_index(const Index* idx, const char* fname, int io_flags) { FileIOWriter writer(fname); - write_index(idx, &writer); + write_index(idx, &writer, io_flags); } // write index for offset-only index diff --git a/thirdparty/faiss/faiss/index_factory.cpp b/thirdparty/faiss/faiss/index_factory.cpp index 7416c41b0..78a810529 100644 --- a/thirdparty/faiss/faiss/index_factory.cpp +++ b/thirdparty/faiss/faiss/index_factory.cpp @@ -142,8 +142,12 @@ std::map sq_types = { {"SQ4", ScalarQuantizer::QT_4bit}, {"SQ6", ScalarQuantizer::QT_6bit}, {"SQfp16", ScalarQuantizer::QT_fp16}, + {"SQbf16", ScalarQuantizer::QT_bf16}, + {"SQ8_direct_signed", ScalarQuantizer::QT_8bit_direct_signed}, + {"SQ8_direct", ScalarQuantizer::QT_8bit_direct}, }; -const std::string sq_pattern = "(SQ4|SQ8|SQ6|SQfp16)"; +const std::string sq_pattern = + "(SQ4|SQ8|SQ6|SQfp16|SQbf16|SQ8_direct_signed|SQ8_direct)"; std::map aq_search_type = { {"_Nfloat", AdditiveQuantizer::ST_norm_float}, diff --git a/thirdparty/faiss/faiss/index_io.h b/thirdparty/faiss/faiss/index_io.h index 7ce6faf3a..a78b1493f 100644 --- a/thirdparty/faiss/faiss/index_io.h +++ b/thirdparty/faiss/faiss/index_io.h @@ -5,8 +5,6 @@ * LICENSE file in the root directory of this source tree. 
*/ -// -*- c++ -*- - // I/O code for indexes #ifndef FAISS_INDEX_IO_H @@ -37,9 +35,12 @@ struct IOReader; struct IOWriter; struct InvertedLists; -void write_index(const Index* idx, const char* fname); -void write_index(const Index* idx, FILE* f); -void write_index(const Index* idx, IOWriter* writer); +/// skip the storage for graph-based indexes +const int IO_FLAG_SKIP_STORAGE = 1; + +void write_index(const Index* idx, const char* fname, int io_flags = 0); +void write_index(const Index* idx, FILE* f, int io_flags = 0); +void write_index(const Index* idx, IOWriter* writer, int io_flags = 0); void write_index_binary(const IndexBinary* idx, const char* fname); void write_index_binary(const IndexBinary* idx, FILE* f); diff --git a/thirdparty/faiss/faiss/invlists/InvertedLists.cpp b/thirdparty/faiss/faiss/invlists/InvertedLists.cpp index c8501b230..acf08c55b 100644 --- a/thirdparty/faiss/faiss/invlists/InvertedLists.cpp +++ b/thirdparty/faiss/faiss/invlists/InvertedLists.cpp @@ -5,8 +5,6 @@ * LICENSE file in the root directory of this source tree. */ -// -*- c++ -*- - #include #include @@ -75,18 +73,10 @@ InvertedListsIterator::~InvertedListsIterator() {} ******************************************/ InvertedLists::InvertedLists(size_t nlist, size_t code_size) - : nlist(nlist), code_size(code_size), use_iterator(false) {} + : nlist(nlist), code_size(code_size) {} InvertedLists::~InvertedLists() {} -bool InvertedLists::is_empty(size_t list_no, void* inverted_list_context) - const { - return use_iterator ? 
!std::unique_ptr( - get_iterator(list_no, inverted_list_context)) - ->is_available() - : list_size(list_no) == 0; -} - idx_t InvertedLists::get_single_id(size_t list_no, size_t offset) const { assert(offset < list_size(list_no)); const idx_t* ids = get_ids(list_no); @@ -169,12 +159,6 @@ void InvertedLists::reset() { } } -InvertedListsIterator* InvertedLists::get_iterator( - size_t /*list_no*/, - void* /*inverted_list_context*/) const { - FAISS_THROW_MSG("get_iterator is not supported"); -} - void InvertedLists::merge_from(InvertedLists* oivf, size_t add_id) { #pragma omp parallel for for (idx_t i = 0; i < nlist; i++) { @@ -324,6 +308,54 @@ size_t InvertedLists::compute_ntotal() const { return tot; } +bool InvertedLists::is_empty(size_t list_no, void* inverted_list_context) + const { + if (use_iterator) { + return !std::unique_ptr( + get_iterator(list_no, inverted_list_context)) + ->is_available(); + } else { + FAISS_THROW_IF_NOT(inverted_list_context == nullptr); + return list_size(list_no) == 0; + } +} + +// implemnent iterator on top of get_codes / get_ids +namespace { + +struct CodeArrayIterator : InvertedListsIterator { + size_t list_size; + size_t code_size; + InvertedLists::ScopedCodes codes; + InvertedLists::ScopedIds ids; + size_t idx = 0; + + CodeArrayIterator(const InvertedLists* il, size_t list_no) + : list_size(il->list_size(list_no)), + code_size(il->code_size), + codes(il, list_no), + ids(il, list_no) {} + + bool is_available() const override { + return idx < list_size; + } + void next() override { + idx++; + } + std::pair get_id_and_codes() override { + return {ids[idx], codes.get() + code_size * idx}; + } +}; + +} // namespace + +InvertedListsIterator* InvertedLists::get_iterator( + size_t list_no, + void* inverted_list_context) const { + FAISS_THROW_IF_NOT(inverted_list_context == nullptr); + return new CodeArrayIterator(this, list_no); +} + /***************************************** * ArrayInvertedLists implementation 
******************************************/ @@ -366,6 +398,12 @@ size_t ArrayInvertedLists::list_size(size_t list_no) const { return ids[list_no].size(); } +bool ArrayInvertedLists::is_empty(size_t list_no, void* inverted_list_context) + const { + FAISS_THROW_IF_NOT(inverted_list_context == nullptr); + return ids[list_no].size() == 0; +} + const uint8_t* ArrayInvertedLists::get_codes(size_t list_no) const { assert(list_no < nlist); return codes[list_no].data(); diff --git a/thirdparty/faiss/faiss/invlists/InvertedLists.h b/thirdparty/faiss/faiss/invlists/InvertedLists.h index 951df3376..bd4220017 100644 --- a/thirdparty/faiss/faiss/invlists/InvertedLists.h +++ b/thirdparty/faiss/faiss/invlists/InvertedLists.h @@ -67,7 +67,9 @@ struct InvertedListsIterator { struct InvertedLists { size_t nlist; ///< number of possible key values size_t code_size; ///< code size per vector in bytes - bool use_iterator; + + /// request to use iterator rather than get_codes / get_ids + bool use_iterator = false; InvertedLists(size_t nlist, size_t code_size); @@ -80,9 +82,6 @@ struct InvertedLists { /************************* * Read only functions */ - // check if the list is empty - bool is_empty(size_t list_no, void* inverted_list_context) const; - /// get the size of a list virtual size_t list_size(size_t list_no) const = 0; @@ -95,11 +94,6 @@ struct InvertedLists { // get the segment minimal number of a list (continuous storage can be regarded as 1-segment storage) virtual size_t get_segment_offset(size_t list_no, size_t segment_no) const; - /// get iterable for lists that use_iterator - virtual InvertedListsIterator* get_iterator( - size_t list_no, - void* inverted_list_context) const; - /** get the codes for an inverted list * must be released by release_codes * @@ -154,6 +148,18 @@ struct InvertedLists { /// a list can be -1 hence the signed long virtual void prefetch_lists(const idx_t* list_nos, int nlist) const; + /***************************************** + * Iterator 
interface (with context) */ + + /// check if the list is empty + virtual bool is_empty(size_t list_no, void* inverted_list_context = nullptr) + const; + + /// get iterable for lists that use_iterator + virtual InvertedListsIterator* get_iterator( + size_t list_no, + void* inverted_list_context = nullptr) const; + /************************* * writing functions */ @@ -372,6 +378,9 @@ struct ArrayInvertedLists : InvertedLists { /// permute the inverted lists, map maps new_id to old_id void permute_invlists(const idx_t* map); + bool is_empty(size_t list_no, void* inverted_list_context = nullptr) + const override; + ~ArrayInvertedLists() override; }; diff --git a/thirdparty/faiss/faiss/utils/bf16.h b/thirdparty/faiss/faiss/utils/bf16.h new file mode 100644 index 000000000..ff0fbe898 --- /dev/null +++ b/thirdparty/faiss/faiss/utils/bf16.h @@ -0,0 +1,36 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include + +namespace faiss { + +namespace { + +union fp32_bits { + uint32_t as_u32; + float as_f32; +}; + +} // namespace + +inline uint16_t encode_bf16(const float f) { + // Round off + fp32_bits fp; + fp.as_f32 = f; + return static_cast((fp.as_u32 + 0x8000) >> 16); +} + +inline float decode_bf16(const uint16_t v) { + fp32_bits fp; + fp.as_u32 = (uint32_t(v) << 16); + return fp.as_f32; +} + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/utils/extra_distances-inl.h b/thirdparty/faiss/faiss/utils/extra_distances-inl.h index 4df72b0d7..25ce3643c 100644 --- a/thirdparty/faiss/faiss/utils/extra_distances-inl.h +++ b/thirdparty/faiss/faiss/utils/extra_distances-inl.h @@ -8,6 +8,7 @@ /** In this file are the implementations of extra metrics beyond L2 * and inner product */ +#include #include #include @@ -135,4 +136,35 @@ inline float VectorDistance::operator()( return accu_num / accu_den; } +template <> +inline float VectorDistance::operator()( + const float* x, + const float* y) const { + // https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.nan_euclidean_distances.html + float accu = 0; + size_t present = 0; + for (size_t i = 0; i < d; i++) { + if (!std::isnan(x[i]) && !std::isnan(y[i])) { + float diff = x[i] - y[i]; + accu += diff * diff; + present++; + } + } + if (present == 0) { + return NAN; + } + return float(d) / float(present) * accu; +} + +template <> +inline float VectorDistance::operator()( + const float* x, + const float* y) const { + float accu = 0; + for (size_t i = 0; i < d; i++) { + accu += fabs(x[i] * y[i]); + } + return accu; +} + } // namespace faiss diff --git a/thirdparty/faiss/faiss/utils/extra_distances.cpp b/thirdparty/faiss/faiss/utils/extra_distances.cpp index 520ed6737..0403ec82c 100644 --- a/thirdparty/faiss/faiss/utils/extra_distances.cpp +++ b/thirdparty/faiss/faiss/utils/extra_distances.cpp @@ -51,17 +51,19 @@ void pairwise_extra_distances_template( } } -template +template void 
knn_extra_metrics_template( VD vd, const float* x, const float* y, size_t nx, size_t ny, - HeapArray* res, + size_t k, + float* distances, + int64_t* labels, const IDSelector* sel = nullptr) { - size_t k = res->k; size_t d = vd.d; + using C = typename VD::C; size_t check_period = InterruptCallback::get_period_hint(ny * d); check_period *= omp_get_max_threads(); @@ -73,8 +75,8 @@ void knn_extra_metrics_template( const float* x_i = x + i * d; const float* y_j = y; size_t j; - float* simi = res->get_val(i); - int64_t* idxi = res->get_ids(i); + float* simi = distances + k * i; + int64_t* idxi = labels + k * i; // maxheap_heapify(k, simi, idxi); heap_heapify(k, simi, idxi); @@ -82,10 +84,7 @@ void knn_extra_metrics_template( if (!sel || sel->is_member(j)) { float disij = vd(x_i, y_j); - // if (disij < simi[0]) { - if ((!vd.is_similarity && (disij < simi[0])) || - (vd.is_similarity && (disij > simi[0]))) { - // maxheap_replace_top(k, simi, idxi, disij, j); + if (C::cmp(simi[0], disij)) { heap_replace_top(k, simi, idxi, disij, j); } } @@ -168,13 +167,14 @@ void pairwise_extra_distances( HANDLE_VAR(JensenShannon); HANDLE_VAR(Lp); HANDLE_VAR(Jaccard); + HANDLE_VAR(NaNEuclidean); + HANDLE_VAR(ABS_INNER_PRODUCT); #undef HANDLE_VAR default: FAISS_THROW_MSG("metric type not implemented"); } } -template void knn_extra_metrics( const float* x, const float* y, @@ -183,14 +183,16 @@ void knn_extra_metrics( size_t ny, MetricType mt, float metric_arg, - HeapArray* res, + size_t k, + float* distances, + int64_t* indexes, const IDSelector* sel) { switch (mt) { -#define HANDLE_VAR(kw) \ - case METRIC_##kw: { \ - VectorDistance vd = {(size_t)d, metric_arg}; \ - knn_extra_metrics_template(vd, x, y, nx, ny, res, sel); \ - break; \ +#define HANDLE_VAR(kw) \ + case METRIC_##kw: { \ + VectorDistance vd = {(size_t)d, metric_arg}; \ + knn_extra_metrics_template(vd, x, y, nx, ny, k, distances, indexes, sel); \ + break; \ } HANDLE_VAR(L2); HANDLE_VAR(L1); @@ -200,34 +202,14 @@ void 
knn_extra_metrics( HANDLE_VAR(JensenShannon); HANDLE_VAR(Lp); HANDLE_VAR(Jaccard); + HANDLE_VAR(NaNEuclidean); + HANDLE_VAR(ABS_INNER_PRODUCT); #undef HANDLE_VAR default: FAISS_THROW_MSG("metric type not implemented"); } } -template void knn_extra_metrics>( - const float* x, - const float* y, - size_t d, - size_t nx, - size_t ny, - MetricType mt, - float metric_arg, - HeapArray>* res, - const IDSelector* sel = nullptr); - -template void knn_extra_metrics>( - const float* x, - const float* y, - size_t d, - size_t nx, - size_t ny, - MetricType mt, - float metric_arg, - HeapArray>* res, - const IDSelector* sel = nullptr); - FlatCodesDistanceComputer* get_extra_distance_computer( size_t d, MetricType mt, @@ -249,6 +231,8 @@ FlatCodesDistanceComputer* get_extra_distance_computer( HANDLE_VAR(JensenShannon); HANDLE_VAR(Lp); HANDLE_VAR(Jaccard); + HANDLE_VAR(NaNEuclidean); + HANDLE_VAR(ABS_INNER_PRODUCT); #undef HANDLE_VAR default: FAISS_THROW_MSG("metric type not implemented"); diff --git a/thirdparty/faiss/faiss/utils/extra_distances.h b/thirdparty/faiss/faiss/utils/extra_distances.h index 800b85a92..d786279a3 100644 --- a/thirdparty/faiss/faiss/utils/extra_distances.h +++ b/thirdparty/faiss/faiss/utils/extra_distances.h @@ -34,7 +34,6 @@ void pairwise_extra_distances( int64_t ldb = -1, int64_t ldd = -1); -template void knn_extra_metrics( const float* x, const float* y, @@ -43,7 +42,9 @@ void knn_extra_metrics( size_t ny, MetricType mt, float metric_arg, - HeapArray* res, + size_t k, + float* distances, + int64_t* indexes, const IDSelector* sel = nullptr); /** get a DistanceComputer that refers to this type of distance and diff --git a/thirdparty/faiss/faiss/utils/simdlib_neon.h b/thirdparty/faiss/faiss/utils/simdlib_neon.h index 439a5210b..1bdf0ed01 100644 --- a/thirdparty/faiss/faiss/utils/simdlib_neon.h +++ b/thirdparty/faiss/faiss/utils/simdlib_neon.h @@ -170,14 +170,10 @@ static inline std::string elements_to_string(const char* fmt, const S& simd) { for (size_t i = 
0; i < N; ++i) { int bytesWritten = snprintf(ptr, sizeof(res) - (ptr - res), fmt, bytes[i]); - if (bytesWritten >= 0) { - ptr += bytesWritten; - } else { - break; - } + ptr += bytesWritten; } - // strip last , - + // The format usually contains a ',' separator so this is to remove the last + // separator. ptr[-1] = 0; return std::string(res); } diff --git a/thirdparty/faiss/tests/CMakeLists.txt b/thirdparty/faiss/tests/CMakeLists.txt index 14103c27c..4d1baf1c9 100644 --- a/thirdparty/faiss/tests/CMakeLists.txt +++ b/thirdparty/faiss/tests/CMakeLists.txt @@ -75,6 +75,8 @@ set(FAISS_TEST_SRC test_distances_if.cpp test_fastscan_perf.cpp test_disable_pq_sdc_tables.cpp + test_common_ivf_empty_index.cpp + test_callback.cpp ) add_executable(faiss_test ${FAISS_TEST_SRC}) diff --git a/thirdparty/faiss/tests/common_faiss_tests.py b/thirdparty/faiss/tests/common_faiss_tests.py index 8dc25edec..a8afe344e 100644 --- a/thirdparty/faiss/tests/common_faiss_tests.py +++ b/thirdparty/faiss/tests/common_faiss_tests.py @@ -49,7 +49,6 @@ def evalres(self, DI): for rank in 1, 10, 100: e[rank] = ((I[:, :rank] == self.gt.reshape(-1, 1)).sum() / float(self.nq)) - # print("1-recalls: %s" % e) return e diff --git a/thirdparty/faiss/tests/test_binary_hashindex.py b/thirdparty/faiss/tests/test_binary_hashindex.py index 2d3305057..e9a6eaca4 100644 --- a/thirdparty/faiss/tests/test_binary_hashindex.py +++ b/thirdparty/faiss/tests/test_binary_hashindex.py @@ -58,8 +58,6 @@ def test_hash(self): Lref, Dref, Iref = index_ref.range_search(xq, radius) - print("nb res: ", Lref[-1]) - index = faiss.IndexBinaryHash(d, 10) index.add(xb) # index.display() @@ -80,8 +78,6 @@ def test_hash(self): self.assertTrue(snew <= set(ref)) nfound.append(Lnew[-1]) ndis.append(stats.ndis) - print('nfound=', nfound) - print('ndis=', ndis) nfound = np.array(nfound) self.assertTrue(nfound[-1] == Lref[-1]) self.assertTrue(np.all(nfound[1:] >= nfound[:-1])) @@ -100,8 +96,6 @@ def test_multihash(self): Lref, Dref, Iref = 
index_ref.range_search(xq, radius) - print("nb res: ", Lref[-1]) - nfound = [] ndis = [] @@ -123,8 +117,6 @@ def test_multihash(self): self.assertTrue(snew <= set(ref)) nfound.append(Lnew[-1]) ndis.append(stats.ndis) - print('nfound=', nfound) - print('ndis=', ndis) nfound = np.array(nfound) # self.assertTrue(nfound[-1] == Lref[-1]) self.assertTrue(np.all(nfound[1:] >= nfound[:-1])) @@ -163,7 +155,6 @@ def test_hash_and_multihash(self): # no duplicates self.assertTrue(len(new) == len(snew)) nf += len(set(ref) & snew) - print('nfound', nh, nbit, nf) nfound[(nh, nbit)] = nf self.assertGreater(nfound[(nh, 4)], nfound[(nh, 7)]) @@ -175,7 +166,6 @@ def test_hash_and_multihash(self): np.testing.assert_array_equal(Inew, I2) np.testing.assert_array_equal(Dnew, D2) - print('nfound=', nfound) self.assertGreater(3, abs(nfound[(0, 7)] - nfound[(1, 7)])) self.assertGreater(nfound[(3, 7)], nfound[(1, 7)]) self.assertGreater(nfound[(5, 7)], nfound[(3, 7)]) diff --git a/thirdparty/faiss/tests/test_build_blocks.py b/thirdparty/faiss/tests/test_build_blocks.py index 0a97e6318..fdf9ad8bd 100644 --- a/thirdparty/faiss/tests/test_build_blocks.py +++ b/thirdparty/faiss/tests/test_build_blocks.py @@ -189,7 +189,6 @@ def test_l2(self): for d in 1, 2, 4, 8, 12, 16: x = rs.rand(d).astype('float32') for ny in 128, 129, 130: - print("d=%d ny=%d" % (d, ny)) y = rs.rand(ny, d).astype('float32') ref = ((x - y) ** 2).sum(1) new = np.zeros(ny, dtype='float32') @@ -204,7 +203,6 @@ def test_IP(self): for d in 1, 2, 4, 8, 12, 16: x = rs.rand(d).astype('float32') for ny in 128, 129, 130: - print("d=%d ny=%d" % (d, ny)) y = rs.rand(ny, d).astype('float32') ref = (x * y).sum(1) new = np.zeros(ny, dtype='float32') @@ -220,7 +218,6 @@ def test_0s(self): m = rs.rand(40, 20).astype('float32') m[5:10] = 0 comments = faiss.MatrixStats(m).comments - print(comments) assert 'has 5 copies' in comments assert '5 null vectors' in comments @@ -229,7 +226,6 @@ def test_copies(self): m = rs.rand(40, 
20).astype('float32') m[::2] = m[1::2] comments = faiss.MatrixStats(m).comments - print(comments) assert '20 vectors are distinct' in comments def test_dead_dims(self): @@ -237,7 +233,6 @@ def test_dead_dims(self): m = rs.rand(40, 20).astype('float32') m[:, 5:10] = 0 comments = faiss.MatrixStats(m).comments - print(comments) assert '5 dimensions are constant' in comments def test_rogue_means(self): @@ -245,7 +240,6 @@ def test_rogue_means(self): m = rs.rand(40, 20).astype('float32') m[:, 5:10] += 12345 comments = faiss.MatrixStats(m).comments - print(comments) assert '5 dimensions are too large wrt. their variance' in comments def test_normalized(self): @@ -253,7 +247,6 @@ def test_normalized(self): m = rs.rand(40, 20).astype('float32') faiss.normalize_L2(m) comments = faiss.MatrixStats(m).comments - print(comments) assert 'vectors are normalized' in comments def test_hash(self): @@ -300,7 +293,6 @@ def test_8bit_equiv(self): D, I = index.search(x[3:], 1) # assert D[0, 0] == Dref[0, 0] - # print(D[0, 0], ((x[3] - x[2]) ** 2).sum()) assert D[0, 0] == ((x[3] - x[2]) ** 2).sum() def test_6bit_equiv(self): @@ -314,8 +306,6 @@ def test_6bit_equiv(self): d, faiss.ScalarQuantizer.QT_6bit) index.train(trainset) - print('cs=', index.code_size) - x = rs.randint(64, size=(100, d)).astype('float32') # verify encoder / decoder @@ -330,7 +320,6 @@ def test_6bit_equiv(self): for i in range(20): for j in range(10): dis = ((y[i] - x2[I[i, j]]) ** 2).sum() - # print(dis, D[i, j]) assert abs(D[i, j] - dis) / dis < 1e-5 def test_reconstruct(self): @@ -371,7 +360,6 @@ def test_randint(self): x = faiss.randint(20000, vmax=100) assert np.all(x >= 0) and np.all(x < 100) c = np.bincount(x, minlength=100) - print(c) assert c.max() - c.min() < 50 * 2 def test_rand_vector(self): @@ -473,7 +461,6 @@ def do_test_array_type(self, dtype): """ tests swig_ptr and rev_swig_ptr for this type of array """ a = np.arange(12).astype(dtype) ptr = faiss.swig_ptr(a) - print(ptr) a2 = faiss.rev_swig_ptr(ptr, 
12) np.testing.assert_array_equal(a, a2) @@ -547,7 +534,6 @@ def subtest(self, d, K, metric): recalls += 1 break recall = 1.0 * recalls / (nb * K) - print('Metric: {}, knng accuracy: {}'.format(metric_names[metric], recall)) assert recall > 0.99 def test_small_nndescent(self): @@ -656,7 +642,6 @@ def do_test_bucket_sort_inplace( rows, _ = np.where(tab == b) rows.sort() tab2[lims[b]:lims[b + 1]].sort() - # print(rows, tab2[lims[b] : lims[b + 1]]) rows = set(rows) self.assertEqual(rows, set(tab2[lims[b]:lims[b + 1]])) diff --git a/thirdparty/faiss/tests/test_callback.cpp b/thirdparty/faiss/tests/test_callback.cpp new file mode 100644 index 000000000..cdfadf1d3 --- /dev/null +++ b/thirdparty/faiss/tests/test_callback.cpp @@ -0,0 +1,37 @@ +/** + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include +#include +#include + +TEST(TestCallback, timeout) { + int n = 1000; + int k = 100; + int d = 128; + int niter = 1000000000; + int seed = 42; + + std::vector vecs(n * d); + faiss::float_rand(vecs.data(), vecs.size(), seed); + + auto index(new faiss::IndexFlat(d)); + + faiss::ClusteringParameters cp; + cp.niter = niter; + cp.verbose = false; + + faiss::Clustering kmeans(d, k, cp); + + faiss::TimeoutCallback::reset(0.010); + EXPECT_THROW(kmeans.train(n, vecs.data(), *index), faiss::FaissException); + delete index; +} diff --git a/thirdparty/faiss/tests/test_callback_py.py b/thirdparty/faiss/tests/test_callback_py.py new file mode 100644 index 000000000..0ec176dd8 --- /dev/null +++ b/thirdparty/faiss/tests/test_callback_py.py @@ -0,0 +1,32 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest +import numpy as np +import faiss + + +class TestCallbackPy(unittest.TestCase): + def setUp(self) -> None: + super().setUp() + + def test_timeout(self) -> None: + n = 1000 + k = 100 + d = 128 + niter = 1_000_000_000 + + x = np.random.rand(n, d).astype('float32') + index = faiss.IndexFlat(d) + + cp = faiss.ClusteringParameters() + cp.niter = niter + cp.verbose = False + + kmeans = faiss.Clustering(d, k, cp) + + with self.assertRaises(RuntimeError): + with faiss.TimeoutGuard(0.010): + kmeans.train(x, index) diff --git a/thirdparty/faiss/tests/test_clustering.py b/thirdparty/faiss/tests/test_clustering.py index 2b81fc3e3..b1afc8523 100644 --- a/thirdparty/faiss/tests/test_clustering.py +++ b/thirdparty/faiss/tests/test_clustering.py @@ -110,9 +110,6 @@ def test_weighted(self): cdis2_first = cdis2[:5].sum() cdis2_last = cdis2[5:].sum() - print(cdis1_first, cdis1_last) - print(cdis2_first, cdis2_last) - # with the new clustering, the last should be much (*2) closer # to their centroids self.assertGreater(cdis1_last, cdis1_first * 2) diff --git a/thirdparty/faiss/tests/test_common_ivf_empty_index.cpp b/thirdparty/faiss/tests/test_common_ivf_empty_index.cpp new file mode 100644 index 000000000..a3e33031b --- /dev/null +++ b/thirdparty/faiss/tests/test_common_ivf_empty_index.cpp @@ -0,0 +1,144 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* This demonstrates how to query several independent IVF indexes with a trained + *index in common. This avoids to duplicate the coarse quantizer and metadata + *in memory. 
+ **/ + +namespace { + +int d = 64; + +} // namespace + +std::vector get_random_vectors(size_t n, int seed) { + std::vector x(n * d); + faiss::rand_smooth_vectors(n, d, x.data(), seed); + seed++; + return x; +} + +/** InvetedLists implementation that dispatches the search to an InvertedList + * object that is passed in at query time */ + +struct DispatchingInvertedLists : faiss::ReadOnlyInvertedLists { + DispatchingInvertedLists(size_t nlist, size_t code_size) + : faiss::ReadOnlyInvertedLists(nlist, code_size) { + use_iterator = true; + } + + faiss::InvertedListsIterator* get_iterator( + size_t list_no, + void* inverted_list_context = nullptr) const override { + assert(inverted_list_context); + auto il = + static_cast(inverted_list_context); + return il->get_iterator(list_no); + } + + using idx_t = faiss::idx_t; + + size_t list_size(size_t list_no) const override { + FAISS_THROW_MSG("use iterator interface"); + } + const uint8_t* get_codes(size_t list_no) const override { + FAISS_THROW_MSG("use iterator interface"); + } + const idx_t* get_ids(size_t list_no) const override { + FAISS_THROW_MSG("use iterator interface"); + } +}; + +TEST(COMMON, test_common_trained_index) { + int N = 3; // number of independent indexes + int nt = 500; // training vectors + int nb = 200; // nb database vectors per index + int nq = 10; // nb queries performed on each index + int k = 4; // restults requested per query + + // construct and build an "empty index": a trained index that does not + // itself hold any data + std::unique_ptr empty_index(dynamic_cast( + faiss::index_factory(d, "IVF32,PQ8np"))); + auto xt = get_random_vectors(nt, 123); + empty_index->train(nt, xt.data()); + empty_index->nprobe = 4; + + // reference run: build one index for each set of db / queries and record + // results + std::vector> ref_I(N); + + for (int i = 0; i < N; i++) { + // clone the empty index + std::unique_ptr index( + faiss::clone_index(empty_index.get())); + auto xb = get_random_vectors(nb, 1234 + 
i); + auto xq = get_random_vectors(nq, 12345 + i); + // add vectors and perform a search + index->add(nb, xb.data()); + std::vector D(k * nq); + std::vector I(k * nq); + index->search(nq, xq.data(), k, D.data(), I.data()); + // record result as reference + ref_I[i] = I; + } + + // build a set of inverted lists for each independent index + std::vector sub_invlists; + + for (int i = 0; i < N; i++) { + // swap in other inverted lists + sub_invlists.emplace_back(empty_index->nlist, empty_index->code_size); + faiss::InvertedLists* invlists = &sub_invlists.back(); + + // replace_invlists swaps in a new InvertedLists for an existing index + empty_index->replace_invlists(invlists, false); + empty_index->reset(); // reset id counter to 0 + // populate inverted lists + auto xb = get_random_vectors(nb, 1234 + i); + empty_index->add(nb, xb.data()); + } + + // perform search dispatching to the sub-invlists. At search time, we don't + // use replace_invlists because that would wreak havoc in a multithreaded + // context + DispatchingInvertedLists di(empty_index->nlist, empty_index->code_size); + empty_index->replace_invlists(&di, false); + + std::vector> new_I(N); + + // run searches in the independent indexes but with a common empty_index +#pragma omp parallel for + for (int i = 0; i < N; i++) { + auto xq = get_random_vectors(nq, 12345 + i); + std::vector D(k * nq); + std::vector I(k * nq); + + // here we set to what sub-index the queries should be directed + faiss::SearchParametersIVF params; + params.nprobe = empty_index->nprobe; + params.inverted_list_context = &sub_invlists[i]; + + empty_index->search(nq, xq.data(), k, D.data(), I.data(), ¶ms); + new_I[i] = I; + } + + // compare with reference reslt + for (int i = 0; i < N; i++) { + ASSERT_EQ(ref_I[i], new_I[i]); + } +} diff --git a/thirdparty/faiss/tests/test_contrib.py b/thirdparty/faiss/tests/test_contrib.py index 0e7cbbfb0..05a2c4ac8 100644 --- a/thirdparty/faiss/tests/test_contrib.py +++ 
b/thirdparty/faiss/tests/test_contrib.py @@ -147,7 +147,6 @@ def test_query_iterator(self, metric=faiss.METRIC_L2): xb = ds.get_database() D, I = faiss.knn(xq, xb, 10, metric=metric) threshold = float(D[:, -1].mean()) - print(threshold) index = faiss.IndexFlat(32, metric) index.add(xb) @@ -251,7 +250,6 @@ def test_precision_recall(self): Inew = np.hstack(Inew) precision, recall = evaluation.range_PR(lims_ref, Iref, lims_new, Inew) - print(precision, recall) self.assertEqual(precision, 0.6) self.assertEqual(recall, 0.6) diff --git a/thirdparty/faiss/tests/test_contrib_with_scipy.py b/thirdparty/faiss/tests/test_contrib_with_scipy.py index cb81bb623..4f89e2fc1 100644 --- a/thirdparty/faiss/tests/test_contrib_with_scipy.py +++ b/thirdparty/faiss/tests/test_contrib_with_scipy.py @@ -44,7 +44,6 @@ def test_sparse_routines(self): faiss.normalize_L2(xt) mask = np.abs(xt) > 0.045 - # print("fraction:", mask.sum() / mask.size) # around 10% non-zeros xt[np.logical_not(mask)] = 0 centroids = ds.get_queries() @@ -72,7 +71,6 @@ def test_sparse_kmeans(self): faiss.normalize_L2(xt) mask = np.abs(xt) > 0.045 - # print("fraction:", mask.sum() / mask.size) # around 10% non-zeros xt[np.logical_not(mask)] = 0 km = faiss.Kmeans(ds.d, 50) diff --git a/thirdparty/faiss/tests/test_extra_distances.py b/thirdparty/faiss/tests/test_extra_distances.py index a474dd6ba..fcaf4d383 100644 --- a/thirdparty/faiss/tests/test_extra_distances.py +++ b/thirdparty/faiss/tests/test_extra_distances.py @@ -94,6 +94,33 @@ def test_jaccard(self): new_dis = faiss.pairwise_distances(xq, yb, faiss.METRIC_Jaccard) self.assertTrue(np.allclose(ref_dis, new_dis)) + def test_nan_euclidean(self): + xq, yb = self.make_example() + ref_dis = np.array([ + [scipy.spatial.distance.sqeuclidean(x, y) for y in yb] + for x in xq + ]) + new_dis = faiss.pairwise_distances(xq, yb, faiss.METRIC_NaNEuclidean) + self.assertTrue(np.allclose(ref_dis, new_dis)) + + x = [[3, np.nan, np.nan, 6]] + q = [[1, np.nan, np.nan, 5]] + dis = [(4 
/ 2 * ((3 - 1)**2 + (6 - 5)**2))] + new_dis = faiss.pairwise_distances(x, q, faiss.METRIC_NaNEuclidean) + self.assertTrue(np.allclose(new_dis, dis)) + + x = [[np.nan] * 4] + q = [[np.nan] * 4] + new_dis = faiss.pairwise_distances(x, q, faiss.METRIC_NaNEuclidean) + self.assertTrue(np.isnan(new_dis[0])) + + def test_abs_inner_product(self): + xq, yb = self.make_example() + dis = faiss.pairwise_distances(xq, yb, faiss.METRIC_ABS_INNER_PRODUCT) + + gt_dis = np.abs(xq @ yb.T) + np.testing.assert_allclose(dis, gt_dis, atol=1e-5) + class TestKNN(unittest.TestCase): """ test that the knn search gives the same as distance matrix + argmin """ diff --git a/thirdparty/faiss/tests/test_fast_scan.py b/thirdparty/faiss/tests/test_fast_scan.py index b061ee3af..cfe9636fe 100644 --- a/thirdparty/faiss/tests/test_fast_scan.py +++ b/thirdparty/faiss/tests/test_fast_scan.py @@ -34,7 +34,6 @@ def test_PQ4_accuracy(self): nq = Iref.shape[0] recall_at_1 = (Iref[:, 0] == Ia[:, 0]).sum() / nq assert recall_at_1 > 0.6 - # print(f'recall@1 = {recall_at_1:.3f}') # This is an experiment to see if we can catch performance @@ -498,7 +497,6 @@ def subtest_accuracy(self, aq, st, implem, metric_type='L2'): recall_ref = (Iref == gt).sum() / nq recall = (Ia == gt).sum() / nq - print(aq, st, implem, metric_type, recall_ref, recall) assert abs(recall_ref - recall) < 0.05 def xx_test_accuracy(self): @@ -531,7 +529,6 @@ def subtest_from_idxaq(self, implem, metric): nq = Iref.shape[0] recall_ref = (Iref == gt).sum() / nq recall1 = (I1 == gt).sum() / nq - print(recall_ref, recall1) assert abs(recall_ref - recall1) < 0.05 def xx_test_from_idxaq(self): diff --git a/thirdparty/faiss/tests/test_graph_based.py b/thirdparty/faiss/tests/test_graph_based.py index dd4212d71..c769e03ad 100644 --- a/thirdparty/faiss/tests/test_graph_based.py +++ b/thirdparty/faiss/tests/test_graph_based.py @@ -133,6 +133,42 @@ def test_ndis_stats(self): Dhnsw, Ihnsw = index.search(self.xq, 1) self.assertGreater(stats.ndis, 
len(self.xq) * index.hnsw.efSearch) + def test_io_no_storage(self): + d = self.xq.shape[1] + index = faiss.IndexHNSWFlat(d, 16) + index.add(self.xb) + + Dref, Iref = index.search(self.xq, 5) + + # test writing without storage + index2 = faiss.deserialize_index( + faiss.serialize_index(index, faiss.IO_FLAG_SKIP_STORAGE) + ) + self.assertEqual(index2.storage, None) + self.assertRaises( + RuntimeError, + index2.search, self.xb, 1) + + # make sure we can store an index with empty storage + index4 = faiss.deserialize_index( + faiss.serialize_index(index2)) + + # add storage afterwards + index.storage = faiss.clone_index(index.storage) + index.own_fields = True + + Dnew, Inew = index.search(self.xq, 5) + np.testing.assert_array_equal(Dnew, Dref) + np.testing.assert_array_equal(Inew, Iref) + + if False: + # test reading without storage + # not implemented because it is hard to skip over an index + index3 = faiss.deserialize_index( + faiss.serialize_index(index), faiss.IO_FLAG_SKIP_STORAGE + ) + self.assertEquals(index3.storage, None) + class TestNSG(unittest.TestCase): @@ -209,7 +245,6 @@ def subtest_add(self, build_type, thresh, metric=faiss.METRIC_L2): Dnsg, Insg = index.search(self.xq, 1) recalls = (Iref == Insg).sum() - print('metric: {}, nb equal: {}'.format(metrics[metric], recalls)) self.assertGreaterEqual(recalls, thresh) self.subtest_connectivity(index, self.xb.shape[0]) self.subtest_io_and_clone(index, Dnsg, Insg) @@ -230,7 +265,6 @@ def subtest_build(self, knn_graph, thresh, metric=faiss.METRIC_L2): Dnsg, Insg = index.search(self.xq, 1) recalls = (Iref == Insg).sum() - print('metric: {}, nb equal: {}'.format(metrics[metric], recalls)) self.assertGreaterEqual(recalls, thresh) self.subtest_connectivity(index, self.xb.shape[0]) @@ -286,7 +320,6 @@ def test_reset(self): index.add(self.xb) Dnsg, Insg = index.search(self.xq, 1) recalls = (Iref == Insg).sum() - print('metric: {}, nb equal: {}'.format(metrics[metric], recalls)) self.assertGreaterEqual(recalls, 475) 
self.subtest_connectivity(index, self.xb.shape[0]) @@ -294,7 +327,6 @@ def test_reset(self): index.add(self.xb) Dnsg, Insg = index.search(self.xq, 1) recalls = (Iref == Insg).sum() - print('metric: {}, nb equal: {}'.format(metrics[metric], recalls)) self.assertGreaterEqual(recalls, 475) self.subtest_connectivity(index, self.xb.shape[0]) @@ -335,7 +367,6 @@ def test_nsg_pq(self): # test accuracy recalls = (Iref == I).sum() - print("IndexNSGPQ", recalls) self.assertGreaterEqual(recalls, 190) # 193 # test I/O @@ -361,7 +392,6 @@ def test_nsg_sq(self): # test accuracy recalls = (Iref == I).sum() - print("IndexNSGSQ", recalls) self.assertGreaterEqual(recalls, 405) # 411 # test I/O @@ -395,7 +425,6 @@ def test_nndescentflat(self): # test accuracy recalls = (Iref == I).sum() - print("IndexNNDescentFlat", recalls) self.assertGreaterEqual(recalls, 450) # 462 # do some IO tests diff --git a/thirdparty/faiss/tests/test_index.py b/thirdparty/faiss/tests/test_index.py index f46c6a94b..43db906e4 100644 --- a/thirdparty/faiss/tests/test_index.py +++ b/thirdparty/faiss/tests/test_index.py @@ -327,7 +327,7 @@ def test_4variants_ivf(self): D, I = index.search(xq, 10) nok['flat'] = (I[:, 0] == I_ref[:, 0]).sum() - for qname in "QT_4bit QT_4bit_uniform QT_8bit QT_8bit_uniform QT_fp16".split(): + for qname in "QT_4bit QT_4bit_uniform QT_8bit QT_8bit_uniform QT_fp16 QT_bf16".split(): qtype = getattr(faiss.ScalarQuantizer, qname) index = faiss.IndexIVFScalarQuantizer(quantizer, d, ncent, qtype, faiss.METRIC_L2) @@ -338,7 +338,6 @@ def test_4variants_ivf(self): D, I = index.search(xq, 10) nok[qname] = (I[:, 0] == I_ref[:, 0]).sum() - print(nok, nq) self.assertGreaterEqual(nok['flat'], nq * 0.6) # The tests below are a bit fragile, it happens that the @@ -350,6 +349,7 @@ def test_4variants_ivf(self): self.assertGreaterEqual(nok['QT_8bit'], nok['QT_8bit_uniform']) self.assertGreaterEqual(nok['QT_4bit'], nok['QT_4bit_uniform']) self.assertGreaterEqual(nok['QT_fp16'], nok['QT_8bit']) + 
self.assertGreaterEqual(nok['QT_bf16'], nok['QT_8bit']) def test_4variants(self): d = 32 @@ -365,7 +365,7 @@ def test_4variants(self): nok = {} - for qname in "QT_4bit QT_4bit_uniform QT_8bit QT_8bit_uniform QT_fp16".split(): + for qname in "QT_4bit QT_4bit_uniform QT_8bit QT_8bit_uniform QT_fp16 QT_bf16".split(): qtype = getattr(faiss.ScalarQuantizer, qname) index = faiss.IndexScalarQuantizer(d, qtype, faiss.METRIC_L2) index.train(xt) @@ -373,13 +373,12 @@ def test_4variants(self): D, I = index.search(xq, 10) nok[qname] = (I[:, 0] == I_ref[:, 0]).sum() - print(nok, nq) - self.assertGreaterEqual(nok['QT_8bit'], nq * 0.9) self.assertGreaterEqual(nok['QT_8bit'], nok['QT_4bit']) self.assertGreaterEqual(nok['QT_8bit'], nok['QT_8bit_uniform']) self.assertGreaterEqual(nok['QT_4bit'], nok['QT_4bit_uniform']) self.assertGreaterEqual(nok['QT_fp16'], nok['QT_8bit']) + self.assertGreaterEqual(nok['QT_bf16'], nq * 0.9) class TestRangeSearch(unittest.TestCase): @@ -442,7 +441,6 @@ def norm1(x): recons_err = np.mean(norm1(R_flat - xb[I_flat])) - print('Reconstruction error = %.3f' % recons_err) if eps is not None: self.assertLessEqual(recons_err, eps) @@ -638,7 +636,6 @@ def test_reconstuct_after_add(self): # should not raise an exception index.reconstruct(5) - print(index.ntotal) index.reconstruct(150) diff --git a/thirdparty/faiss/tests/test_index_accuracy.py b/thirdparty/faiss/tests/test_index_accuracy.py index 3f7bfbd30..2c5cf7b90 100644 --- a/thirdparty/faiss/tests/test_index_accuracy.py +++ b/thirdparty/faiss/tests/test_index_accuracy.py @@ -56,7 +56,6 @@ def test_ivf_kmeans(self): Dref, Iref = ivfk.search(ev.xq, 100) ivfk.parallel_mode = 1 Dnew, Inew = ivfk.search(ev.xq, 100) - print((Iref != Inew).sum(), Iref.size) assert (Iref != Inew).sum() < Iref.size / 5000.0 assert np.all(Dref == Dnew) @@ -136,8 +135,6 @@ def test_polysemous(self): res = ev.launch("Polysemous ht=%d" % index.polysemous_ht, index) e_polysemous = ev.evalres(res) - print(e_baseline, e_polysemous, 
index.polysemous_ht) - print(stats.n_hamming_pass, stats.ncode) # The randu dataset is difficult, so we are not too picky on # the results. Here we assert that we have < 10 % loss when # computing full PQ on fewer than 20% of the data. @@ -248,7 +245,6 @@ def subtest(self, mt): index.nprobe = 4 # hopefully more robust than 1 D, I = index.search(xq, 10) ninter = faiss.eval_intersection(I, gt_I) - print("(%d, %s): %d, " % (mt, repr(qname), ninter)) assert abs(ninter - self.ref_results[(mt, qname)]) <= 10 if qname == "6bit": @@ -264,7 +260,6 @@ def subtest(self, mt): radius = float(D[:, -1].max()) else: radius = float(D[:, -1].min()) - # print("radius", radius) lims, D3, I3 = index.range_search(xq, radius) ntot = ndiff = 0 @@ -278,14 +273,11 @@ def subtest(self, mt): Iref = set(I2[i, mask]) ndiff += len(Inew ^ Iref) ntot += len(Iref) - # print("ndiff %d / %d" % (ndiff, ntot)) assert ndiff < ntot * 0.01 for pm in 1, 2: - # print("parallel_mode=%d" % pm) index.parallel_mode = pm lims4, D4, I4 = index.range_search(xq, radius) - # print("sizes", lims4[1:] - lims4[:-1]) for qno in range(len(lims) - 1): Iref = I3[lims[qno]: lims[qno + 1]] Inew = I4[lims4[qno]: lims4[qno + 1]] @@ -320,7 +312,7 @@ def test_parallel_mode(self): class TestSQByte(unittest.TestCase): - def subtest_8bit_direct(self, metric_type, d): + def subtest_8bit_direct(self, metric_type, d, quantizer_type): xt, xb, xq = get_dataset_2(d, 500, 1000, 30) # rescale everything to get integer @@ -332,16 +324,28 @@ def rescale(x): x[x > 255] = 255 return x - xt = rescale(xt) - xb = rescale(xb) - xq = rescale(xq) + def rescale_signed(x): + x = np.floor((x - tmin) * 256 / (tmax - tmin)) + x[x < 0] = 0 + x[x > 255] = 255 + x -= 128 + return x + + if quantizer_type == faiss.ScalarQuantizer.QT_8bit_direct_signed: + xt = rescale_signed(xt) + xb = rescale_signed(xb) + xq = rescale_signed(xq) + else: + xt = rescale(xt) + xb = rescale(xb) + xq = rescale(xq) gt_index = faiss.IndexFlat(d, metric_type) gt_index.add(xb) Dref, 
Iref = gt_index.search(xq, 10) index = faiss.IndexScalarQuantizer( - d, faiss.ScalarQuantizer.QT_8bit_direct, metric_type + d, quantizer_type, metric_type ) index.add(xb) D, I = index.search(xq, 10) @@ -361,7 +365,7 @@ def rescale(x): Dref, Iref = gt_index.search(xq, 10) index = faiss.IndexIVFScalarQuantizer( - quantizer, d, nlist, faiss.ScalarQuantizer.QT_8bit_direct, + quantizer, d, nlist, quantizer_type, metric_type ) index.nprobe = 4 @@ -374,9 +378,10 @@ def rescale(x): assert np.all(D == Dref) def test_8bit_direct(self): - for d in 13, 16, 24: - for metric_type in faiss.METRIC_L2, faiss.METRIC_INNER_PRODUCT: - self.subtest_8bit_direct(metric_type, d) + for quantizer in faiss.ScalarQuantizer.QT_8bit_direct, faiss.ScalarQuantizer.QT_8bit_direct_signed: + for d in 13, 16, 24: + for metric_type in faiss.METRIC_L2, faiss.METRIC_INNER_PRODUCT: + self.subtest_8bit_direct(metric_type, d, quantizer) class TestNNDescent(unittest.TestCase): @@ -485,7 +490,6 @@ def subtest(self, mt): D, I = index.search(xq, 10) ninter = faiss.eval_intersection(I, gt_I) - print("(%d, %s): %d, " % (mt, by_residual, ninter)) assert abs(ninter - self.ref_results[mt, by_residual]) <= 3 @@ -499,10 +503,6 @@ def subtest(self, mt): index.polysemous_ht = 20 D, I = index.search(xq, 10) ninter = faiss.eval_intersection(I, gt_I) - print( - "(%d, %s, %d): %d, " - % (mt, by_residual, index.polysemous_ht, ninter) - ) # polysemous behaves bizarrely on ARM assert ( @@ -516,7 +516,6 @@ def subtest(self, mt): radius = float(D[:, -1].max()) else: radius = float(D[:, -1].min()) - print("radius", radius) lims, D3, I3 = index.range_search(xq, radius) ntot = ndiff = 0 @@ -530,7 +529,6 @@ def subtest(self, mt): Iref = set(I2[i, mask]) ndiff += len(Inew ^ Iref) ntot += len(Iref) - print("ndiff %d / %d" % (ndiff, ntot)) assert ndiff < ntot * 0.02 def test_IVFPQ_non8bit(self): @@ -555,7 +553,6 @@ def test_IVFPQ_non8bit(self): D, I = index.search(xq, 10) ninter[v] = faiss.eval_intersection(I, gt_I) - print("ninter=", 
ninter) # this should be the case but we don't observe # that... Probavly too few test points # assert ninter['2x8'] > ninter['8x2'] @@ -623,9 +620,6 @@ def test_OPQ(self): res = ev.launch("OPQ", index) e_opq = ev.evalres(res) - print("e_pq=%s" % e_pq) - print("e_opq=%s" % e_opq) - # verify that OPQ better than PQ for r in 1, 10, 100: assert e_opq[r] > e_pq[r] @@ -656,7 +650,6 @@ def test_OIVFPQ(self): # verify same on OIVFPQ for r in 1, 10, 100: - print(e_oivfpq[r], e_ivfpq[r]) assert e_oivfpq[r] >= e_ivfpq[r] @@ -758,9 +751,6 @@ def test_sh(self): ninter = faiss.eval_intersection(I, gt_I) key = (nbit, tt, period) - print("(%d, %s, %g): %d, " % (nbit, repr(tt), period, - ninter)) - print(abs(ninter - self.ref_results[key])) assert abs(ninter - self.ref_results[key]) <= 14 @@ -799,7 +789,6 @@ def do_test(self, metric): # check that with refinement, the recall@10 is the same as # the original recall@100 recall2 = (I2 == Iref[:, :1]).sum() - # print("recalls", recall1, recall2) self.assertEqual(recall1, recall2) def test_IP(self): diff --git a/thirdparty/faiss/tests/test_index_binary.py b/thirdparty/faiss/tests/test_index_binary.py index b505e0ba1..7820cb662 100644 --- a/thirdparty/faiss/tests/test_index_binary.py +++ b/thirdparty/faiss/tests/test_index_binary.py @@ -100,6 +100,9 @@ def test_flat(self): index.add(self.xb) D, I = index.search(self.xq, 3) + I2 = index.assign(x=self.xq, k=3, labels=None) + assert np.all(I == I2) + for i in range(nq): for j, dj in zip(I[i], D[i]): ref_dis = binary_dis(self.xq[i], self.xb[j]) @@ -139,7 +142,6 @@ def test_range_search(self): self.assertTrue(set(range_res) <= set(I[i])) nt2 += 1 # in case of equality we have a problem with ties - print('nb tests', nt1, nt2) # nb tests is actually low... 
self.assertTrue(nt1 > 19 and nt2 > 19) @@ -284,8 +286,6 @@ def test_ivf_nprobe(self): ref_index.add(xb) ref_D, ref_I = ref_index.search(xq, k) - print(D[0], ref_D[0]) - print(I[0], ref_I[0]) assert np.all(D == ref_D) # assert np.all(I == ref_I) # id may be different diff --git a/thirdparty/faiss/tests/test_index_composite.py b/thirdparty/faiss/tests/test_index_composite.py index a760c0cf0..8d9b441ad 100644 --- a/thirdparty/faiss/tests/test_index_composite.py +++ b/thirdparty/faiss/tests/test_index_composite.py @@ -168,8 +168,6 @@ def test_remove_id_map_2(self): index.remove_ids(remove_set) index.add_with_ids(X[5:, :], idx[5:]) - print (index.search(X, 1)) - for i in range(10): _, searchres = index.search(X[i:i + 1, :], 1) if idx[i] in remove_set: @@ -954,7 +952,6 @@ def do_test(self, factory_string): index.nprobe = 10 Dref, Iref = index.search(ds.get_queries(), 10) - #print(index.search_and_return_codes) D, I, codes = index.search_and_return_codes( ds.get_queries(), 10, include_listnos=True) diff --git a/thirdparty/faiss/tests/test_io.py b/thirdparty/faiss/tests/test_io.py index dc8ac3dcf..99dfe6084 100644 --- a/thirdparty/faiss/tests/test_io.py +++ b/thirdparty/faiss/tests/test_io.py @@ -102,7 +102,6 @@ def test_buf_read(self): reader = faiss.BufferedIOReader(reader, bsz) y = np.zeros_like(x) - print('nbytes=', y.nbytes) reader(faiss.swig_ptr(y), y.nbytes, 1) np.testing.assert_array_equal(x, y) diff --git a/thirdparty/faiss/tests/test_ivf_index.cpp b/thirdparty/faiss/tests/test_ivf_index.cpp index 28e572e39..21ed0abdc 100644 --- a/thirdparty/faiss/tests/test_ivf_index.cpp +++ b/thirdparty/faiss/tests/test_ivf_index.cpp @@ -6,12 +6,14 @@ */ #include +#include #include #include #include #include #include #include +#include #include diff --git a/thirdparty/faiss/tests/test_ivflib.py b/thirdparty/faiss/tests/test_ivflib.py index f19c3da45..0a3fb8c87 100644 --- a/thirdparty/faiss/tests/test_ivflib.py +++ b/thirdparty/faiss/tests/test_ivflib.py @@ -125,7 +125,6 @@ def 
test_range_search_with_parameters(self): Dpre, _ = index.search(xq, 15) radius = float(np.median(Dpre[:, -1])) - print("Radius=", radius) stats = faiss.cvar.indexIVF_stats stats.reset() Lref, Dref, Iref = index.range_search(xq, radius) diff --git a/thirdparty/faiss/tests/test_local_search_quantizer.py b/thirdparty/faiss/tests/test_local_search_quantizer.py index 22231358e..797592981 100644 --- a/thirdparty/faiss/tests/test_local_search_quantizer.py +++ b/thirdparty/faiss/tests/test_local_search_quantizer.py @@ -196,7 +196,6 @@ def test_update_codebooks_with_double(self): err_float = eval_codec(lsq, xb) # 6533.377 vs 25457.99 - print(err_double, err_float) self.assertLess(err_double, err_float) def test_compute_binary_terms(self): @@ -314,7 +313,7 @@ def test_icm_encode(self): n, 1) - # do icm encoding without pre-computed unary and binary terms in Python + # do icm encoding without pre-computed unary and bianry terms in Python codebooks = faiss.vector_float_to_array(lsq.codebooks) codebooks = codebooks.reshape(M, K, d).copy() ref_codes = icm_encode_ref(x, codebooks, codes) @@ -348,7 +347,6 @@ def test_training(self): pq.train(xt) err_pq = eval_codec(pq, xb) - print(err_lsq, err_pq) self.assertLess(err_lsq, err_pq) @@ -463,7 +461,6 @@ def eval_index_accuracy(self, factory_key): index.nprobe = nprobe D, I = index.search(ds.get_queries(), 10) inter = faiss.eval_intersection(I, ds.get_groundtruth(10)) - # print("nprobe=", nprobe, "inter=", inter) inters.append(inter) inters = np.array(inters) @@ -528,7 +525,6 @@ def test_codec(self): pq.train(xt) err_pq = eval_codec(pq, xb) - print(err_plsq, err_pq) self.assertLess(err_plsq, err_pq) def test_with_lsq(self): @@ -549,7 +545,6 @@ def test_with_lsq(self): lsq.train(xt) err_lsq = eval_codec(lsq, xb) - print(err_plsq, err_lsq) self.assertEqual(err_plsq, err_lsq) def test_lut(self): @@ -664,7 +659,6 @@ def test_index_accuracy2(self): """check that the error is in the same ballpark as LSQ.""" inter1 = 
self.eval_index_accuracy("IVF32,PLSQ2x2x5_Nqint8") inter2 = self.eval_index_accuracy("IVF32,LSQ4x5_Nqint8") - # print(inter1, inter2) # 381 vs 374 self.assertGreaterEqual(inter1 * 1.1, inter2) def test_factory(self): diff --git a/thirdparty/faiss/tests/test_merge_index.py b/thirdparty/faiss/tests/test_merge_index.py index 4417f57fe..bdcc813f1 100644 --- a/thirdparty/faiss/tests/test_merge_index.py +++ b/thirdparty/faiss/tests/test_merge_index.py @@ -72,7 +72,6 @@ def do_test_merge(self, index_type): index.merge_from(indexes[i], index.ntotal) _D, I = index.search(xq, k) - print(I[:5, :6]) ndiff = (I != Iref).sum() print('%d / %d differences' % (ndiff, nq * k)) diff --git a/thirdparty/faiss/tests/test_meta_index.py b/thirdparty/faiss/tests/test_meta_index.py index d53cad48f..d0896e8ba 100644 --- a/thirdparty/faiss/tests/test_meta_index.py +++ b/thirdparty/faiss/tests/test_meta_index.py @@ -82,10 +82,8 @@ def test_shards(self): k = 32 ref_index = faiss.IndexFlatL2(d) - print('ref search') ref_index.add(xb) _Dref, Iref = ref_index.search(xq, k) - print(Iref[:5, :6]) shard_index = faiss.IndexShards(d) shard_index_2 = faiss.IndexShards(d, True, False) @@ -109,7 +107,6 @@ def test_shards(self): for test_no in range(3): with_threads = test_no == 1 - print('shard search test_no = %d' % test_no) if with_threads: remember_nt = faiss.omp_get_max_threads() faiss.omp_set_num_threads(1) @@ -122,14 +119,10 @@ def test_shards(self): else: _D, I = shard_index_2.search(xq, k) - print(I[:5, :6]) - if with_threads: faiss.omp_set_num_threads(remember_nt) ndiff = (I != Iref).sum() - - print('%d / %d differences' % (ndiff, nq * k)) assert (ndiff < nq * k / 1000.) 
def test_shards_ivf(self): diff --git a/thirdparty/faiss/tests/test_partition.py b/thirdparty/faiss/tests/test_partition.py index 02de7e8c2..fd41eabe1 100644 --- a/thirdparty/faiss/tests/test_partition.py +++ b/thirdparty/faiss/tests/test_partition.py @@ -49,7 +49,6 @@ def do_partition(self, n, q, maxval=None, seed=None): if seed is None: for i in range(50): self.do_partition(n, q, maxval, i + 1234) - # print("seed=", seed) rs = np.random.RandomState(seed) if maxval is None: vals = rs.rand(n).astype('float32') @@ -95,7 +94,6 @@ def do_partition(self, n, q, maxval=None, seed=None): if seed is None: for i in range(50): self.do_partition(n, q, maxval, i + 1234) - # print("seed=", seed) rs = np.random.RandomState(seed) if maxval is None: vals = rs.rand(n).astype('float32') @@ -148,7 +146,6 @@ def do_partition(self, n, q, maxval=65536, seed=None): for i in range(50): self.do_partition(n, q, maxval, i + 1234) - # print("seed=", seed) rs = np.random.RandomState(seed) vals = rs.randint(maxval, size=n).astype('uint16') ids = (rs.permutation(n) + 12345).astype('int64') @@ -160,7 +157,6 @@ def do_partition(self, n, q, maxval=65536, seed=None): tab_a = faiss.AlignedTableUint16() faiss.copy_array_to_AlignedTable(vals, tab_a) - # print("tab a type", tab_a.get()) if type(q) == int: faiss.CMax_uint16_partition_fuzzy( tab_a.get(), sp(ids), n, q, q, None) @@ -196,7 +192,6 @@ def do_partition(self, n, q, maxval=65536, seed=None): if seed is None: for i in range(50): self.do_partition(n, q, maxval, i + 1234) - # print("seed=", seed) rs = np.random.RandomState(seed) vals = rs.randint(maxval, size=n).astype('uint16') ids = (rs.permutation(n) + 12345).astype('int64') @@ -209,7 +204,6 @@ def do_partition(self, n, q, maxval=65536, seed=None): vals_inv = (65535 - vals).astype('uint16') faiss.copy_array_to_AlignedTable(vals_inv, tab_a) - # print("tab a type", tab_a.get()) if type(q) == int: faiss.CMin_uint16_partition_fuzzy( tab_a.get(), sp(ids), n, q, q, None) diff --git 
a/thirdparty/faiss/tests/test_product_quantizer.py b/thirdparty/faiss/tests/test_product_quantizer.py index 1cdee7f14..f531cab2a 100644 --- a/thirdparty/faiss/tests/test_product_quantizer.py +++ b/thirdparty/faiss/tests/test_product_quantizer.py @@ -26,7 +26,6 @@ def test_pq(self): x2 = pq.decode(codes) diff = ((x - x2)**2).sum() - # print("diff=", diff) # diff= 4418.0562 self.assertGreater(5000, diff) @@ -71,7 +70,6 @@ def do_test_codec(self, nbit): def test_codec(self): for i in range(16): - print("Testing nbits=%d" % (i + 1)) self.do_test_codec(i + 1) diff --git a/thirdparty/faiss/tests/test_residual_quantizer.py b/thirdparty/faiss/tests/test_residual_quantizer.py index e37ee3efe..f4381607e 100644 --- a/thirdparty/faiss/tests/test_residual_quantizer.py +++ b/thirdparty/faiss/tests/test_residual_quantizer.py @@ -211,7 +211,6 @@ def test_training(self): # in practice RQ is often better than PQ but it does not the case here, so just check # that we are within some factor. - # print(err_pq, err_rq) self.assertLess(err_rq, err_pq * 1.2) def test_beam_size(self): @@ -321,10 +320,8 @@ def retrain_AQ_codebook(index, xt): x_decoded = index.sa_decode(codes_packed) MSE = ((xt - x_decoded) ** 2).sum() / n - # print(f"Initial MSE on training set: {MSE:g}") codes = unpack_codes(index.rq, codes_packed) - # print("ref codes", codes[0]) codebook_offsets = faiss.vector_to_array(rq.codebook_offsets) # build sparse code matrix (represented as a dense matrix) @@ -343,7 +340,6 @@ def retrain_AQ_codebook(index, xt): B, residuals, rank, singvals = scipy.linalg.lstsq(C, xt, ) MSE = ((C @ B - xt) ** 2).sum() / n - # print(f"MSE after retrainining: {MSE:g}") # replace codebook # faiss.copy_array_to_vector(B.astype('float32').ravel(), index.rq.codebooks) @@ -503,7 +499,6 @@ def test_reestimate_codebook_2(self): xt_decoded = ir.sa_decode(ir.sa_encode(xt)) err_after_refined = ((xt - xt_decoded) ** 2).sum() - # print(err_before, err_after_refined) # ref run 7474.98 / 7006.1777 
self.assertGreater(err_before, err_after_refined * 1.06) @@ -781,7 +776,6 @@ def test_search_L2(self): else: inter_2 = faiss.eval_intersection(I2, gt) self.assertGreaterEqual(inter_ref, inter_2) - # print(st, inter_ref, inter_2) ########################################################### @@ -814,7 +808,6 @@ def do_test_accuracy(self, by_residual, st): index.nprobe = nprobe D, I = index.search(ds.get_queries(), 10) inter = faiss.eval_intersection(I, ds.get_groundtruth(10)) - # print(st, "nprobe=", nprobe, "inter=", inter) inters.append(inter) # do a little I/O test @@ -909,18 +902,13 @@ def do_test_accuracy_IP(self, by_residual): D, I = index.search(ds.get_queries(), 10) index.rq.search_type = faiss.AdditiveQuantizer.ST_LUT_nonorm D2, I2 = index.search(ds.get_queries(), 10) - # print(D[:5] - D2[:5]) - # print(I[:5]) np.testing.assert_array_almost_equal(D, D2, decimal=5) # there are many ties because the codes are so short self.assertLess((I != I2).sum(), I.size * 0.1) # D2, I2 = index2.search(ds.get_queries(), 10) - # print(D[:5]) - # print(D2[:5]) inter = faiss.eval_intersection(I, ds.get_groundtruth(10)) - # print("nprobe=", nprobe, "inter=", inter) inters.append(inter) self.assertTrue(np.all(inters[1:4] >= inters[:3])) @@ -979,8 +967,6 @@ def beam_search_encode_step_tab(codes, L, distances, codebook_cross_prods_i, for b in range(beam_size): dotprods[i, b, :] += cb[codes[i, b, j]] - # print("dps", dotprods[:3, :2, :4]) - new_distances += 2 * dotprods cent_distances = new_distances @@ -1166,7 +1152,6 @@ def test_codec(self): pq.train(xt) err_pq = eval_codec(pq, xb) - # print(err_prq, err_pq) self.assertLess(err_prq, err_pq) def test_with_rq(self): @@ -1187,7 +1172,6 @@ def test_with_rq(self): rq.train(xt) err_rq = eval_codec(rq, xb) - # print(err_prq, err_rq) self.assertEqual(err_prq, err_rq) @@ -1271,7 +1255,6 @@ def test_index_accuracy2(self): """check that the error is in the same ballpark as RQ.""" inter1 = self.eval_index_accuracy("IVF100,PRQ2x2x5_Nqint8") 
inter2 = self.eval_index_accuracy("IVF100,RQ4x5_Nqint8") - # print(inter1, inter2) # 392 vs 374 self.assertGreaterEqual(inter1 * 1.1, inter2) def test_factory(self): diff --git a/thirdparty/faiss/tests/test_rowwise_minmax.py b/thirdparty/faiss/tests/test_rowwise_minmax.py index dbd14de38..53e6c00b1 100644 --- a/thirdparty/faiss/tests/test_rowwise_minmax.py +++ b/thirdparty/faiss/tests/test_rowwise_minmax.py @@ -45,7 +45,6 @@ def compare_train_vs_train_inplace(self, factory_key): # make sure that the reconstruction error is not crazy reconstruction_err = ((x - decoded) ** 2).sum() - print(reconstruction_err) self.assertLess(reconstruction_err, 0.6) diff --git a/thirdparty/faiss/tests/test_search_params.py b/thirdparty/faiss/tests/test_search_params.py index 22b845c2e..886ffc0c6 100644 --- a/thirdparty/faiss/tests/test_search_params.py +++ b/thirdparty/faiss/tests/test_search_params.py @@ -465,7 +465,6 @@ def test_12_92(self): sp = faiss.swig_ptr selr.find_sorted_ids_bounds( len(ids), sp(ids), sp(j01[:1]), sp(j01[1:])) - print(j01) assert j01[0] >= j01[1] diff --git a/thirdparty/faiss/tests/test_standalone_codec.py b/thirdparty/faiss/tests/test_standalone_codec.py index 7fdcf6849..391b88b9d 100644 --- a/thirdparty/faiss/tests/test_standalone_codec.py +++ b/thirdparty/faiss/tests/test_standalone_codec.py @@ -151,7 +151,6 @@ def compare_accuracy(self, lowac, highac, max_errs=(1e10, 1e10)): err = ((x - x2) ** 2).sum() errs.append(err) - print(errs) self.assertGreater(errs[0], errs[1]) self.assertGreater(max_errs[0], errs[0]) @@ -174,6 +173,9 @@ def test_SQ2(self): def test_SQ3(self): self.compare_accuracy('SQ8', 'SQfp16') + def test_SQ4(self): + self.compare_accuracy('SQ8', 'SQbf16') + def test_PQ(self): self.compare_accuracy('PQ6x8np', 'PQ8x8np') @@ -214,7 +216,6 @@ def test_repeats(self): code = repeats.encode(swig_ptr(vec)) vec2 = np.zeros(dim, dtype='float32') repeats.decode(code, swig_ptr(vec2)) - # print(vec2) assert np.all(vec == vec2) def 
test_ZnSphereCodec_encode_centroid(self): @@ -222,7 +223,6 @@ def test_ZnSphereCodec_encode_centroid(self): r2 = 5 ref_codec = faiss.ZnSphereCodec(dim, r2) codec = faiss.ZnSphereCodecRec(dim, r2) - # print(ref_codec.nv, codec.nv) assert ref_codec.nv == codec.nv s = set() for i in range(ref_codec.nv): @@ -237,7 +237,6 @@ def test_ZnSphereCodecRec(self): dim = 16 r2 = 6 codec = faiss.ZnSphereCodecRec(dim, r2) - # print("nv=", codec.nv) for i in range(codec.nv): c = np.zeros(dim, dtype='float32') codec.decode(i, swig_ptr(c)) @@ -300,15 +299,10 @@ def test_rw(self): for i in range(nbyte): self.assertTrue(((bignum >> (i * 8)) & 255) == bs[i]) - #for i in range(nbyte): - # print(bin(bs[i] + 256)[3:], end=' ') - # print() - br = faiss.BitstringReader(swig_ptr(bs), nbyte) for nbit, xref in ctrl: xnew = br.read(nbit) - # print('nbit %d xref %x xnew %x' % (nbit, xref, xnew)) self.assertTrue(xnew == xref) def test_arrays(self): diff --git a/thirdparty/faiss/tutorial/cpp/1-Flat.cpp b/thirdparty/faiss/tutorial/cpp/1-Flat.cpp index 819e41957..147fa89bc 100644 --- a/thirdparty/faiss/tutorial/cpp/1-Flat.cpp +++ b/thirdparty/faiss/tutorial/cpp/1-Flat.cpp @@ -83,10 +83,10 @@ int main() { printf("\n"); } - printf("I (5 last results)=\n"); + printf("D (5 last results)=\n"); for (int i = nq - 5; i < nq; i++) { for (int j = 0; j < k; j++) - printf("%5zd ", I[i * k + j]); + printf("%5f ", D[i * k + j]); printf("\n"); } diff --git a/thirdparty/faiss/tutorial/cpp/2-IVFFlat.cpp b/thirdparty/faiss/tutorial/cpp/2-IVFFlat.cpp index febd5be04..86530ae98 100644 --- a/thirdparty/faiss/tutorial/cpp/2-IVFFlat.cpp +++ b/thirdparty/faiss/tutorial/cpp/2-IVFFlat.cpp @@ -61,13 +61,10 @@ int main() { printf("\n"); } - index.nprobe = 10; - index.search(nq, xq, k, D, I); - - printf("I=\n"); + printf("D=\n"); for (int i = nq - 5; i < nq; i++) { for (int j = 0; j < k; j++) - printf("%5zd ", I[i * k + j]); + printf("%5f ", D[i * k + j]); printf("\n"); } diff --git a/thirdparty/faiss/tutorial/cpp/6-HNSW.cpp 
b/thirdparty/faiss/tutorial/cpp/6-HNSW.cpp new file mode 100644 index 000000000..9bd8cd3fa --- /dev/null +++ b/thirdparty/faiss/tutorial/cpp/6-HNSW.cpp @@ -0,0 +1,73 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +#include + +using idx_t = faiss::idx_t; + +int main() { + int d = 64; // dimension + int nb = 100000; // database size + int nq = 10000; // nb of queries + + std::mt19937 rng; + std::uniform_real_distribution<> distrib; + + float* xb = new float[d * nb]; + float* xq = new float[d * nq]; + + for (int i = 0; i < nb; i++) { + for (int j = 0; j < d; j++) + xb[d * i + j] = distrib(rng); + xb[d * i] += i / 1000.; + } + + for (int i = 0; i < nq; i++) { + for (int j = 0; j < d; j++) + xq[d * i + j] = distrib(rng); + xq[d * i] += i / 1000.; + } + + int k = 4; + + faiss::IndexHNSWFlat index(d, 32); + index.add(nb, xb); + + { // search xq + idx_t* I = new idx_t[k * nq]; + float* D = new float[k * nq]; + + index.search(nq, xq, k, D, I); + + printf("I=\n"); + for (int i = nq - 5; i < nq; i++) { + for (int j = 0; j < k; j++) + printf("%5zd ", I[i * k + j]); + printf("\n"); + } + + printf("D=\n"); + for (int i = nq - 5; i < nq; i++) { + for (int j = 0; j < k; j++) + printf("%5f ", D[i * k + j]); + printf("\n"); + } + + delete[] I; + delete[] D; + } + + delete[] xb; + delete[] xq; + + return 0; +} diff --git a/thirdparty/faiss/tutorial/cpp/7-PQFastScan.cpp b/thirdparty/faiss/tutorial/cpp/7-PQFastScan.cpp new file mode 100644 index 000000000..4cdfea052 --- /dev/null +++ b/thirdparty/faiss/tutorial/cpp/7-PQFastScan.cpp @@ -0,0 +1,75 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+#include <cassert> +#include <cstdio> +#include <cstdlib> +#include <random> + +#include <faiss/IndexPQFastScan.h>
+#include <cassert> +#include <cstdio> +#include <cstdlib> +#include <random> + +#include <faiss/IndexPQFastScan.h> +#include <faiss/IndexRefine.h>
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +#include +#include +#include +using idx_t = faiss::idx_t; + +int main() { + int d = 64; // dimension + int nb = 100000; // database size + int nq = 10000; // nb of queries + + std::mt19937 rng; + std::uniform_real_distribution<> distrib; + + float* xb = new float[(int)(d * nb)]; + float* xq = new float[(int)(d * nq)]; + + for (int i = 0; i < nb; i++) { + for (int j = 0; j < d; j++) { + xb[d * i + j] = distrib(rng); + } + xb[d * i] += i / 1000.; + } + + for (int i = 0; i < nq; i++) { + for (int j = 0; j < d; j++) { + xq[d * i + j] = distrib(rng); + } + xq[d * i] += i / 1000.; + } + + // Constructing the refine PQ index with SQfp16 with index factory + faiss::Index* index_fp16; + index_fp16 = faiss::index_factory( + d, "PQ32x4fs,Refine(SQfp16)", faiss::METRIC_L2); + index_fp16->train(nb, xb); + index_fp16->add(nb, xb); + + // Constructing the refine PQ index with SQ8 + faiss::Index* index_sq8; + index_sq8 = + faiss::index_factory(d, "PQ32x4fs,Refine(SQ8)", faiss::METRIC_L2); + index_sq8->train(nb, xb); + index_sq8->add(nb, xb); + + int k = 10; + { // search xq + idx_t* I_fp16 = new idx_t[(int)(k * nq)]; + float* D_fp16 = new float[(int)(k * nq)]; + idx_t* I_sq8 = new idx_t[(int)(k * nq)]; + float* D_sq8 = new float[(int)(k * nq)]; + + // Parameterization on k factor while doing search for index refinement + float k_factor = 3; + faiss::IndexRefineSearchParameters* params = + new faiss::IndexRefineSearchParameters(); + params->k_factor = k_factor; + + // Perform index search using different index refinement + index_fp16->search(nq, xq, k, D_fp16, I_fp16, params); + index_sq8->search(nq, xq, k, D_sq8, I_sq8, params); + + printf("I_fp16=\n"); + for (int i = nq - 5; i < nq; i++) { + for (int j = 0; j < k; j++) { + printf("%5zd ", I_fp16[i * k + j]); + } + printf("\n"); + } + + 
printf("I_sq8=\n"); + for (int i = nq - 5; i < nq; i++) { + for (int j = 0; j < k; j++) { + printf("%5zd ", I_sq8[i * k + j]); + } + printf("\n"); + } + + delete[] I_fp16; + delete[] D_fp16; + delete[] I_sq8; + delete[] D_sq8; + delete params; + + delete index_fp16; + delete index_sq8; + } + + delete[] xb; + delete[] xq; + + return 0; +} diff --git a/thirdparty/faiss/tutorial/cpp/CMakeLists.txt b/thirdparty/faiss/tutorial/cpp/CMakeLists.txt index 7361b33a0..f964b3dda 100644 --- a/thirdparty/faiss/tutorial/cpp/CMakeLists.txt +++ b/thirdparty/faiss/tutorial/cpp/CMakeLists.txt @@ -18,3 +18,15 @@ target_link_libraries(4-GPU PRIVATE faiss) add_executable(5-Multiple-GPUs EXCLUDE_FROM_ALL 5-Multiple-GPUs.cpp) target_link_libraries(5-Multiple-GPUs PRIVATE faiss) + +add_executable(6-HNSW EXCLUDE_FROM_ALL 6-HNSW.cpp) +target_link_libraries(6-HNSW PRIVATE faiss) + +add_executable(7-PQFastScan EXCLUDE_FROM_ALL 7-PQFastScan.cpp) +target_link_libraries(7-PQFastScan PRIVATE faiss) + +add_executable(8-PQFastScanRefine EXCLUDE_FROM_ALL 8-PQFastScanRefine.cpp) +target_link_libraries(8-PQFastScanRefine PRIVATE faiss) + +add_executable(9-RefineComparison EXCLUDE_FROM_ALL 9-RefineComparison.cpp) +target_link_libraries(9-RefineComparison PRIVATE faiss) diff --git a/thirdparty/faiss/tutorial/python/7-PQFastScan.py b/thirdparty/faiss/tutorial/python/7-PQFastScan.py new file mode 100644 index 000000000..34d7a34ac --- /dev/null +++ b/thirdparty/faiss/tutorial/python/7-PQFastScan.py @@ -0,0 +1,35 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import faiss +import numpy as np + +d = 64 # dimension +nb = 100000 # database size +nq = 10000 # nb of queries +np.random.seed(1234) # make reproducible +xb = np.random.random((nb, d)).astype('float32') # 64-dim *nb queries +xb[:, 0] += np.arange(nb) / 1000. 
+xq = np.random.random((nq, d)).astype('float32') +xq[:, 0] += np.arange(nq) / 1000. + +m = 8 # 8 specifies that the number of sub-vector is 8 +k = 4 # number of dimension in etracted vector +n_bit = 4 # 4 specifies that each sub-vector is encoded as 4 bits +bbs = 32 # build block size ( bbs % 32 == 0 ) for PQ +index = faiss.IndexPQFastScan(d, m, n_bit, faiss.METRIC_L2, bbs) +# construct FastScan Index + +assert not index.is_trained +index.train(xb) # Train vectors data index within mockup database +assert index.is_trained + +index.add(xb) +D, I = index.search(xb[:5], k) # sanity check +print(I) +print(D) +index.nprobe = 10 # make comparable with experiment above +D, I = index.search(xq, k) # search +print(I[-5:]) # neighbors of the 5 last queries diff --git a/thirdparty/faiss/tutorial/python/8-PQFastScanRefine.py b/thirdparty/faiss/tutorial/python/8-PQFastScanRefine.py new file mode 100644 index 000000000..115a036fa --- /dev/null +++ b/thirdparty/faiss/tutorial/python/8-PQFastScanRefine.py @@ -0,0 +1,38 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import faiss +import numpy as np + +d = 64 # dimension +nb = 100000 # database size +nq = 10000 # nb of queries +np.random.seed(1234) # make reproducible +xb = np.random.random((nb, d)).astype('float32') # 64-dim *nb queries +xb[:, 0] += np.arange(nb) / 1000. +xq = np.random.random((nq, d)).astype('float32') +xq[:, 0] += np.arange(nq) / 1000. 
+ +m = 8 # 8 specifies that the number of sub-vector is 8 +k = 4 # number of dimension in etracted vector +n_bit = 4 # 4 specifies that each sub-vector is encoded as 4 bits +bbs = 32 # build block size ( bbs % 32 == 0 ) for PQ + +index = faiss.IndexPQFastScan(d, m, n_bit, faiss.METRIC_L2) +index_refine = faiss.IndexRefineFlat(index) +# construct FastScan and run index refinement + +assert not index_refine.is_trained +index_refine.train(xb) # Train vectors data index within mockup database +assert index_refine.is_trained + +index_refine.add(xb) +params = faiss.IndexRefineSearchParameters(k_factor=3) +D, I = index_refine.search(xq[:5], 10, params=params) +print(I) +print(D) +index.nprobe = 10 # make comparable with experiment above +D, I = index.search(xq[:5], k) # search +print(I[-5:]) diff --git a/thirdparty/faiss/tutorial/python/9-RefineComparison.py b/thirdparty/faiss/tutorial/python/9-RefineComparison.py new file mode 100644 index 000000000..6fa69f33d --- /dev/null +++ b/thirdparty/faiss/tutorial/python/9-RefineComparison.py @@ -0,0 +1,42 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import faiss + +from faiss.contrib.evaluation import knn_intersection_measure +from faiss.contrib import datasets + +# 64-dim vectors, 50000 vectors in the training, 100000 in database, +# 10000 in queries, dtype ('float32') +ds = datasets.SyntheticDataset(64, 50000, 100000, 10000) +d = 64 # dimension + +# Constructing the refine PQ index with SQfp16 with index factory +index_fp16 = faiss.index_factory(d, 'PQ32x4fs,Refine(SQfp16)') +index_fp16.train(ds.get_train()) +index_fp16.add(ds.get_database()) + +# Constructing the refine PQ index with SQ8 +index_sq8 = faiss.index_factory(d, 'PQ32x4fs,Refine(SQ8)') +index_sq8.train(ds.get_train()) +index_sq8.add(ds.get_database()) + +# Parameterization on k factor while doing search for index refinement +k_factor = 3.0 +params = faiss.IndexRefineSearchParameters(k_factor=k_factor) + +# Perform index search using different index refinement +D_fp16, I_fp16 = index_fp16.search(ds.get_queries(), 100, params=params) +D_sq8, I_sq8 = index_sq8.search(ds.get_queries(), 100, params=params) + +# Calculating knn intersection measure for different index types on refinement +KIM_fp16 = knn_intersection_measure(I_fp16, ds.get_groundtruth()) +KIM_sq8 = knn_intersection_measure(I_sq8, ds.get_groundtruth()) + +# KNN intersection measure accuracy shows that choosing SQ8 impacts accuracy +assert (KIM_fp16 > KIM_sq8) + +print(I_sq8[:5]) +print(I_fp16[:5])