From 861d194fefbe602c8c310824a0e8e1f6aae2f752 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 8 Jan 2019 11:31:46 -0500 Subject: [PATCH 01/87] For #669. Adding install target to gpu Makefile --- gpu/Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/gpu/Makefile b/gpu/Makefile index 4714eda7c7..072e089ddd 100644 --- a/gpu/Makefile +++ b/gpu/Makefile @@ -85,4 +85,12 @@ depend: $(CXXCPP) $(CPPFLAGS) -x c++ -MM $$i; \ done > depend +install: libgpufaiss.a libgpufaiss.$(SHAREDEXT) installdirs + cp libgpufaiss.a libgpufaiss.$(SHAREDEXT) $(DESTDIR)$(libdir) + cp *.h $(DESTDIR)$(includedir)/faiss/gpu + cp --parents **/**.h $(DESTDIR)$(includedir)/faiss/gpu + +installdirs: + $(MKDIR_P) $(DESTDIR)$(libdir) $(DESTDIR)$(includedir)/faiss/gpu + .PHONY: all clean From 60d654fd29ced2d9002d456535a08871e0c9b8e9 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 3 Jun 2022 09:10:13 -0400 Subject: [PATCH 02/87] Starting integration of raft --- CMakeLists.txt | 18 +++ cmake/thirdparty/get_raft.cmake | 51 +++++++ faiss/gpu/CMakeLists.txt | 7 +- faiss/gpu/raft/RaftIndexIVFFlat.cu | 223 +++++++++++++++++++++++++++++ faiss/gpu/raft/RaftIndexIVFFlat.h | 48 +++++++ 5 files changed, 346 insertions(+), 1 deletion(-) create mode 100644 cmake/thirdparty/get_raft.cmake create mode 100644 faiss/gpu/raft/RaftIndexIVFFlat.cu create mode 100644 faiss/gpu/raft/RaftIndexIVFFlat.h diff --git a/CMakeLists.txt b/CMakeLists.txt index a7b1fc6ce3..71a05ab7dc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,6 +6,18 @@ cmake_minimum_required(VERSION 3.17 FATAL_ERROR) +file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.08/RAPIDS.cmake + ${CMAKE_BINARY_DIR}/RAPIDS.cmake) +include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) +include(rapids-cmake) +include(rapids-cpm) +include(rapids-cuda) +include(rapids-export) +include(rapids-find) + +rapids_cuda_init_architectures(faiss) + + project(faiss VERSION 1.6.4 DESCRIPTION "A library for efficient similarity 
search and clustering of dense vectors." @@ -20,6 +32,7 @@ list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") # Valid values are "generic", "avx2". option(FAISS_OPT_LEVEL "" "generic") option(FAISS_ENABLE_GPU "Enable support for GPU indexes." ON) +option(FAISS_ENABLE_RAFT "Enable RAFT for GPU indexes." OFF) option(FAISS_ENABLE_PYTHON "Build Python extension." ON) option(FAISS_ENABLE_C_API "Build C API." OFF) @@ -28,6 +41,11 @@ if(FAISS_ENABLE_GPU) enable_language(CUDA) endif() +if(FAISS_ENABLE_RAFT) + rapids_cpm_init() + include(cmake/thirdparty/get_raft.cmake) +endif() + add_subdirectory(faiss) if(FAISS_ENABLE_GPU) diff --git a/cmake/thirdparty/get_raft.cmake b/cmake/thirdparty/get_raft.cmake new file mode 100644 index 0000000000..b171570137 --- /dev/null +++ b/cmake/thirdparty/get_raft.cmake @@ -0,0 +1,51 @@ +#============================================================================= +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#============================================================================= + + +set(RAFT_VERSION "22.04") +set(RAFT_FORK "rapidsai") +set(RAFT_PINNED_TAG "branch-${RAFT_VERSION}") + +function(find_and_configure_raft) + set(oneValueArgs VERSION FORK PINNED_TAG) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + #----------------------------------------------------- + # Invoke CPM find_package() + #----------------------------------------------------- + rapids_cpm_find(raft ${PKG_VERSION} + GLOBAL_TARGETS raft::raft + BUILD_EXPORT_SET projname-exports + INSTALL_EXPORT_SET projname-exports + CPM_ARGS + GIT_REPOSITORY https://github.com/${PKG_FORK}/raft.git + GIT_TAG ${PKG_PINNED_TAG} + SOURCE_SUBDIR cpp + OPTIONS + "BUILD_TESTS OFF" + "BUILD_BENCH OFF" + "RAFT_COMPILE_LIBRARIES OFF" + ) +endfunction() + +# Change pinned tag here to test a commit in CI +# To use a different RAFT locally, set the CMake variable +# CPM_raft_SOURCE=/path/to/local/raft +find_and_configure_raft(VERSION ${RAFT_VERSION}.00 + FORK ${RAFT_FORK} + PINNED_TAG ${RAFT_PINNED_TAG} + ) \ No newline at end of file diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt index 16af761a03..30a45a7cbd 100644 --- a/faiss/gpu/CMakeLists.txt +++ b/faiss/gpu/CMakeLists.txt @@ -162,6 +162,11 @@ set(FAISS_GPU_HEADERS utils/warpselect/WarpSelectImpl.cuh ) +if(FAISS_ENABLE_RAFT) + list(APPEND FAISS_GPU_HEADERS raft/RaftIndexIVFFlat.h) + list(APPEND FAISS_GPU_SRC raft/RaftIndexIVFFlat.cu) +endif() + # Export FAISS_GPU_HEADERS variable to parent scope. 
set(FAISS_GPU_HEADERS ${FAISS_GPU_HEADERS} PARENT_SCOPE) @@ -176,7 +181,7 @@ foreach(header ${FAISS_GPU_HEADERS}) endforeach() find_package(CUDAToolkit REQUIRED) -target_link_libraries(faiss PRIVATE CUDA::cudart CUDA::cublas) +target_link_libraries(faiss PRIVATE CUDA::cudart CUDA::cublas $<$:raft::raft>) target_link_libraries(faiss_avx2 PRIVATE CUDA::cudart CUDA::cublas) target_compile_options(faiss PRIVATE $<$:-Xfatbin=-compress-all>) target_compile_options(faiss_avx2 PRIVATE $<$:-Xfatbin=-compress-all>) diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu new file mode 100644 index 0000000000..b411e0180a --- /dev/null +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -0,0 +1,223 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace faiss { +namespace gpu { + +RaftIndexIVFFlat::RaftIndexIVFFlat( + GpuResourcesProvider* provider, + const faiss::IndexIVFFlat* index, + GpuIndexIVFFlatConfig config) + : GpuIndexIVF( + provider, + index->d, + index->metric_type, + index->metric_arg, + index->nlist, + config), + ivfFlatConfig_(config), + reserveMemoryVecs_(0) { + copyFrom(index); +} + +RaftIndexIVFFlat::RaftIndexIVFFlat( + GpuResourcesProvider* provider, + int dims, + int nlist, + faiss::MetricType metric, + GpuIndexIVFFlatConfig config) + : GpuIndexIVF(provider, dims, metric, 0, nlist, config), + ivfFlatConfig_(config), + reserveMemoryVecs_(0) { + // faiss::Index params + this->is_trained = false; + + // We haven't trained ourselves, so don't construct the IVFFlat + // index yet +} + +RaftIndexIVFFlat::~RaftIndexIVFFlat() {} + +void RaftIndexIVFFlat::reserveMemory(size_t numVecs) { + reserveMemoryVecs_ = numVecs; + if (index_) { + DeviceScope scope(config_.device); + 
index_->reserveMemory(numVecs); + } +} + +void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { + DeviceScope scope(config_.device); + + GpuIndexIVF::copyFrom(index); + + // Clear out our old data + index_.reset(); + + // The other index might not be trained + if (!index->is_trained) { + FAISS_ASSERT(!is_trained); + return; + } + + // Otherwise, we can populate ourselves from the other index + FAISS_ASSERT(is_trained); + + // Copy our lists as well + index_.reset(new IVFFlat( + resources_.get(), + quantizer->getGpuData(), + index->metric_type, + index->metric_arg, + false, // no residual + nullptr, // no scalar quantizer + ivfFlatConfig_.interleavedLayout, + ivfFlatConfig_.indicesOptions, + config_.memorySpace)); + + // Copy all of the IVF data + index_->copyInvertedListsFrom(index->invlists); +} + +void RaftIndexIVFFlat::reset() { + if (index_) { + DeviceScope scope(config_.device); + + index_->reset(); + this->ntotal = 0; + } else { + FAISS_ASSERT(this->ntotal == 0); + } +} + +void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { + // For now, only support <= max int results + FAISS_THROW_IF_NOT_FMT( + n <= (Index::idx_t)std::numeric_limits::max(), + "GPU index only supports up to %d indices", + std::numeric_limits::max()); + + DeviceScope scope(config_.device); + + if (this->is_trained) { + FAISS_ASSERT(quantizer->is_trained); + FAISS_ASSERT(quantizer->ntotal == nlist); + FAISS_ASSERT(index_); + return; + } + + FAISS_ASSERT(!index_); + + // FIXME: GPUize more of this + // First, make sure that the data is resident on the CPU, if it is not on + // the CPU, as we depend upon parts of the CPU code + auto hostData = toHost( + (float*)x, + resources_->getDefaultStream(config_.device), + {(int)n, (int)this->d}); + + // TODO: I think this can be done on GPU through RAFT k-means + trainQuantizer_(n, hostData.data()); + + // The quantizer is now trained; construct the IVF index + index_.reset(new IVFFlat( + resources_.get(), + 
quantizer->getGpuData(), + this->metric_type, + this->metric_arg, + false, // no residual + nullptr, // no scalar quantizer + ivfFlatConfig_.interleavedLayout, + ivfFlatConfig_.indicesOptions, + config_.memorySpace)); + + if (reserveMemoryVecs_) { + index_->reserveMemory(reserveMemoryVecs_); + } + + this->is_trained = true; +} + +int RaftIndexIVFFlat::getListLength(int listId) const { + FAISS_ASSERT(index_); + DeviceScope scope(config_.device); + + return index_->getListLength(listId); +} + +std::vector RaftIndexIVFFlat::getListVectorData( + int listId, + bool gpuFormat) const { + FAISS_ASSERT(index_); + DeviceScope scope(config_.device); + + return index_->getListVectorData(listId, gpuFormat); +} + +std::vector RaftIndexIVFFlat::getListIndices(int listId) const { + FAISS_ASSERT(index_); + DeviceScope scope(config_.device); + + return index_->getListIndices(listId); +} + +void RaftIndexIVFFlat::addImpl_( + int n, + const float* x, + const Index::idx_t* xids) { + // Device is already set in GpuIndex::add + FAISS_ASSERT(index_); + FAISS_ASSERT(n > 0); + + // Data is already resident on the GPU + Tensor data(const_cast(x), {n, (int)this->d}); + Tensor labels(const_cast(xids), {n}); + + // Not all vectors may be able to be added (some may contain NaNs etc) + index_->addVectors(data, labels); + + // but keep the ntotal based on the total number of vectors that we + // attempted to add + ntotal += n; +} + +void RaftIndexIVFFlat::searchImpl_( + int n, + const float* x, + int k, + float* distances, + Index::idx_t* labels) const { + // Device is already set in GpuIndex::search + FAISS_ASSERT(index_); + FAISS_ASSERT(n > 0); + FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= nlist); + + // Data is already resident on the GPU + Tensor queries(const_cast(x), {n, (int)this->d}); + Tensor outDistances(distances, {n, k}); + Tensor outLabels( + const_cast(labels), {n, k}); + + index_->query(queries, nprobe, k, outDistances, outLabels); +} + +} // namespace gpu +} // namespace faiss diff 
--git a/faiss/gpu/raft/RaftIndexIVFFlat.h b/faiss/gpu/raft/RaftIndexIVFFlat.h new file mode 100644 index 0000000000..0ae2a8535a --- /dev/null +++ b/faiss/gpu/raft/RaftIndexIVFFlat.h @@ -0,0 +1,48 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +namespace faiss { +struct IndexIVFFlat; +} + +namespace faiss { +namespace gpu { + +class IVFFlat; +class GpuIndexFlat; + +/// Wrapper around the GPU implementation that looks like +/// faiss::gpu::GpuIndexIVFFlat +class RaftIndexIVFFlat : public GpuIndexIVFFlat { + public: + /// Construct from a pre-existing faiss::IndexIVFFlat instance, copying + /// data over to the given GPU, if the input index is trained. + RaftIndexIVFFlat( + GpuResourcesProvider* provider, + const faiss::IndexIVFFlat* index, + GpuIndexIVFFlatConfig config = GpuIndexIVFFlatConfig()); + + /// Constructs a new instance with an empty flat quantizer; the user + /// provides the number of lists desired. + RaftIndexIVFFlat( + GpuResourcesProvider* provider, + int dims, + int nlist, + faiss::MetricType metric, + GpuIndexIVFFlatConfig config = GpuIndexIVFFlatConfig()); + + ~RaftIndexIVFFlat() override; +}; + +} // namespace gpu +} // namespace faiss From d474bf3d3d798e57027f7eeb550890916b213152 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Fri, 3 Jun 2022 11:26:46 -0400 Subject: [PATCH 03/87] Adding proper inherited member definitions --- faiss/gpu/raft/RaftIndexIVFFlat.cu | 16 +++------- faiss/gpu/raft/RaftIndexIVFFlat.h | 51 ++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 11 deletions(-) diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index b411e0180a..638d1b56c3 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -26,15 +27,10 @@ RaftIndexIVFFlat::RaftIndexIVFFlat( GpuResourcesProvider* provider, const faiss::IndexIVFFlat* index, GpuIndexIVFFlatConfig config) - : GpuIndexIVF( + : GpuIndexIVFFlat( provider, - index->d, - index->metric_type, - index->metric_arg, - index->nlist, - config), - ivfFlatConfig_(config), - reserveMemoryVecs_(0) { + index, + config) { copyFrom(index); } @@ -44,9 +40,7 @@ RaftIndexIVFFlat::RaftIndexIVFFlat( int nlist, faiss::MetricType metric, GpuIndexIVFFlatConfig config) - : GpuIndexIVF(provider, dims, metric, 0, nlist, config), - ivfFlatConfig_(config), - reserveMemoryVecs_(0) { + : GpuIndexIVFFlat(provider, dims, nlist, metric, config) { // faiss::Index params this->is_trained = false; diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.h b/faiss/gpu/raft/RaftIndexIVFFlat.h index 0ae2a8535a..2de0782a85 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.h +++ b/faiss/gpu/raft/RaftIndexIVFFlat.h @@ -42,6 +42,57 @@ class RaftIndexIVFFlat : public GpuIndexIVFFlat { GpuIndexIVFFlatConfig config = GpuIndexIVFFlatConfig()); ~RaftIndexIVFFlat() override; + + /// Reserve GPU memory in our inverted lists for this number of vectors + void reserveMemory(size_t numVecs); + + /// Initialize ourselves from the given CPU index; will overwrite + /// all data in ourselves + void copyFrom(const faiss::IndexIVFFlat* index); + + /// Copy ourselves to the given CPU index; will overwrite all data + /// in the index 
instance + void copyTo(faiss::IndexIVFFlat* index) const; + + /// After adding vectors, one can call this to reclaim device memory + /// to exactly the amount needed. Returns space reclaimed in bytes + size_t reclaimMemory(); + + /// Clears out all inverted lists, but retains the coarse centroid + /// information + void reset() override; + + /// Trains the coarse quantizer based on the given vector data + void train(Index::idx_t n, const float* x) override; + + /// Returns the number of vectors present in a particular inverted list + int getListLength(int listId) const override; + + /// Return the encoded vector data contained in a particular inverted list, + /// for debugging purposes. + /// If gpuFormat is true, the data is returned as it is encoded in the + /// GPU-side representation. + /// Otherwise, it is converted to the CPU format. + /// compliant format, while the native GPU format may differ. + std::vector getListVectorData(int listId, bool gpuFormat = false) + const override; + + /// Return the vector indices contained in a particular inverted list, for + /// debugging purposes. + std::vector getListIndices(int listId) const override; + + protected: + /// Called from GpuIndex for add/add_with_ids + void addImpl_(int n, const float* x, const Index::idx_t* ids) override; + + /// Called from GpuIndex for search + void searchImpl_( + int n, + const float* x, + int k, + float* distances, + Index::idx_t* labels) const override; + }; } // namespace gpu From 8baee52ce307d12c57a47c205a5e873266652455 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Fri, 3 Jun 2022 11:47:30 -0400 Subject: [PATCH 04/87] Updating raft ivf flat --- faiss/gpu/raft/RaftIndexIVFFlat.cu | 51 ------------------------------ faiss/gpu/raft/RaftIndexIVFFlat.h | 15 --------- 2 files changed, 66 deletions(-) diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index 638d1b56c3..ac2f439141 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -50,57 +50,6 @@ RaftIndexIVFFlat::RaftIndexIVFFlat( RaftIndexIVFFlat::~RaftIndexIVFFlat() {} -void RaftIndexIVFFlat::reserveMemory(size_t numVecs) { - reserveMemoryVecs_ = numVecs; - if (index_) { - DeviceScope scope(config_.device); - index_->reserveMemory(numVecs); - } -} - -void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { - DeviceScope scope(config_.device); - - GpuIndexIVF::copyFrom(index); - - // Clear out our old data - index_.reset(); - - // The other index might not be trained - if (!index->is_trained) { - FAISS_ASSERT(!is_trained); - return; - } - - // Otherwise, we can populate ourselves from the other index - FAISS_ASSERT(is_trained); - - // Copy our lists as well - index_.reset(new IVFFlat( - resources_.get(), - quantizer->getGpuData(), - index->metric_type, - index->metric_arg, - false, // no residual - nullptr, // no scalar quantizer - ivfFlatConfig_.interleavedLayout, - ivfFlatConfig_.indicesOptions, - config_.memorySpace)); - - // Copy all of the IVF data - index_->copyInvertedListsFrom(index->invlists); -} - -void RaftIndexIVFFlat::reset() { - if (index_) { - DeviceScope scope(config_.device); - - index_->reset(); - this->ntotal = 0; - } else { - FAISS_ASSERT(this->ntotal == 0); - } -} void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { // For now, only support <= max int results diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.h b/faiss/gpu/raft/RaftIndexIVFFlat.h index 2de0782a85..df2dbd2060 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.h +++ b/faiss/gpu/raft/RaftIndexIVFFlat.h 
@@ -43,21 +43,6 @@ class RaftIndexIVFFlat : public GpuIndexIVFFlat { ~RaftIndexIVFFlat() override; - /// Reserve GPU memory in our inverted lists for this number of vectors - void reserveMemory(size_t numVecs); - - /// Initialize ourselves from the given CPU index; will overwrite - /// all data in ourselves - void copyFrom(const faiss::IndexIVFFlat* index); - - /// Copy ourselves to the given CPU index; will overwrite all data - /// in the index instance - void copyTo(faiss::IndexIVFFlat* index) const; - - /// After adding vectors, one can call this to reclaim device memory - /// to exactly the amount needed. Returns space reclaimed in bytes - size_t reclaimMemory(); - /// Clears out all inverted lists, but retains the coarse centroid /// information void reset() override; From 2eb94f1d538434edc53745a312f790c7734491ec Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 14 Jun 2022 15:43:53 -0400 Subject: [PATCH 05/87] adding raftIVFFlat implementation --- faiss/gpu/CMakeLists.txt | 4 +- faiss/gpu/raft/RaftIVFFlat.cu | 267 +++++++++++++++++++++++++++++ faiss/gpu/raft/RaftIVFFlat.cuh | 82 +++++++++ faiss/gpu/raft/RaftIndexIVFFlat.cu | 4 +- faiss/gpu/raft/RaftIndexIVFFlat.h | 2 +- 5 files changed, 354 insertions(+), 5 deletions(-) create mode 100644 faiss/gpu/raft/RaftIVFFlat.cu create mode 100644 faiss/gpu/raft/RaftIVFFlat.cuh diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt index 30a45a7cbd..8a3341d094 100644 --- a/faiss/gpu/CMakeLists.txt +++ b/faiss/gpu/CMakeLists.txt @@ -163,8 +163,8 @@ set(FAISS_GPU_HEADERS ) if(FAISS_ENABLE_RAFT) - list(APPEND FAISS_GPU_HEADERS raft/RaftIndexIVFFlat.h) - list(APPEND FAISS_GPU_SRC raft/RaftIndexIVFFlat.cu) + list(APPEND FAISS_GPU_HEADERS raft/RaftIndexIVFFlat.h raft/RaftIVFFlat.cuh) + list(APPEND FAISS_GPU_SRC raft/RaftIndexIVFFlat.cu raft/RaftIVFFlat.cu) endif() # Export FAISS_GPU_HEADERS variable to parent scope. 
diff --git a/faiss/gpu/raft/RaftIVFFlat.cu b/faiss/gpu/raft/RaftIVFFlat.cu new file mode 100644 index 0000000000..b51a7d9dc0 --- /dev/null +++ b/faiss/gpu/raft/RaftIVFFlat.cu @@ -0,0 +1,267 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { +namespace gpu { + +RaftIVFFlat::RaftIVFFlat( + GpuResources* res, + FlatIndex* quantizer, + faiss::MetricType metric, + float metricArg, + bool useResidual, + faiss::ScalarQuantizer* scalarQ, + bool interleavedLayout, + IndicesOptions indicesOptions, + MemorySpace space) : + IVFFlat(res, quantizer, metric, metricArg, useResidual, scalarQ, interleavedLayout, indicesOptions, space) {} + +RaftIVFFlat::~RaftIVFFlat() {} + +size_t RaftIVFFlat::getGpuVectorsEncodingSize_(int numVecs) const { + if (interleavedLayout_) { + // bits per scalar code + int bits = scalarQ_ ? scalarQ_->bits : 32 /* float */; + + // bytes to encode a block of 32 vectors (single dimension) + int bytesPerDimBlock = bits * 32 / 8; + + // bytes to fully encode 32 vectors + int bytesPerBlock = bytesPerDimBlock * dim_; + + // number of blocks of 32 vectors we have + int numBlocks = utils::divUp(numVecs, 32); + + // total size to encode numVecs + return bytesPerBlock * numBlocks; + } else { + size_t sizePerVector = + (scalarQ_ ? scalarQ_->code_size : sizeof(float) * dim_); + + return (size_t)numVecs * sizePerVector; + } +} + +size_t RaftIVFFlat::getCpuVectorsEncodingSize_(int numVecs) const { + size_t sizePerVector = + (scalarQ_ ? 
scalarQ_->code_size : sizeof(float) * dim_); + + return (size_t)numVecs * sizePerVector; +} + +std::vector RaftIVFFlat::translateCodesToGpu_( + std::vector codes, + size_t numVecs) const { + if (!interleavedLayout_) { + // same format + return codes; + } + + int bitsPerCode = scalarQ_ ? scalarQ_->bits : 32; + + auto up = + unpackNonInterleaved(std::move(codes), numVecs, dim_, bitsPerCode); + return packInterleaved(std::move(up), numVecs, dim_, bitsPerCode); +} + +std::vector RaftIVFFlat::translateCodesFromGpu_( + std::vector codes, + size_t numVecs) const { + if (!interleavedLayout_) { + // same format + return codes; + } + + int bitsPerCode = scalarQ_ ? scalarQ_->bits : 32; + + auto up = unpackInterleaved(std::move(codes), numVecs, dim_, bitsPerCode); + return packNonInterleaved(std::move(up), numVecs, dim_, bitsPerCode); +} + +void RaftIVFFlat::appendVectors_( + Tensor& vecs, + Tensor& indices, + Tensor& uniqueLists, + Tensor& vectorsByUniqueList, + Tensor& uniqueListVectorStart, + Tensor& uniqueListStartOffset, + Tensor& listIds, + Tensor& listOffset, + cudaStream_t stream) { + // + // Append the new encodings + // + + // Calculate residuals for these vectors, if needed + DeviceTensor residuals( + resources_, + makeTempAlloc(AllocType::Other, stream), + {vecs.getSize(0), dim_}); + + if (useResidual_) { + quantizer_->computeResidual(vecs, listIds, residuals); + } + + // Append indices to the IVF lists + runIVFIndicesAppend( + listIds, + listOffset, + indices, + indicesOptions_, + deviceListIndexPointers_, + stream); + + // Append the encoded vectors to the IVF lists + if (interleavedLayout_) { + runIVFFlatInterleavedAppend( + listIds, + listOffset, + uniqueLists, + vectorsByUniqueList, + uniqueListVectorStart, + uniqueListStartOffset, + useResidual_ ? residuals : vecs, + scalarQ_.get(), + deviceListDataPointers_, + resources_, + stream); + } else { + runIVFFlatAppend( + listIds, + listOffset, + useResidual_ ? 
residuals : vecs, + scalarQ_.get(), + deviceListDataPointers_, + stream); + } +} + +void RaftIVFFlat::query( + Tensor& queries, + int nprobe, + int k, + Tensor& outDistances, + Tensor& outIndices) { + auto stream = resources_->getDefaultStreamCurrentDevice(); + + // These are caught at a higher level + FAISS_ASSERT(nprobe <= GPU_MAX_SELECTION_K); + FAISS_ASSERT(k <= GPU_MAX_SELECTION_K); + nprobe = std::min(nprobe, quantizer_->getSize()); + + FAISS_ASSERT(queries.getSize(1) == dim_); + + FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0)); + FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0)); + + // Reserve space for the quantized information + DeviceTensor coarseDistances( + resources_, + makeTempAlloc(AllocType::Other, stream), + {queries.getSize(0), nprobe}); + DeviceTensor coarseIndices( + resources_, + makeTempAlloc(AllocType::Other, stream), + {queries.getSize(0), nprobe}); + + // Find the `nprobe` closest lists; we can use int indices both + // internally and externally + quantizer_->query( + queries, + nprobe, + metric_, + metricArg_, + coarseDistances, + coarseIndices, + false); + + DeviceTensor residualBase( + resources_, + makeTempAlloc(AllocType::Other, stream), + {queries.getSize(0), nprobe, dim_}); + + if (useResidual_) { + // Reconstruct vectors from the quantizer + quantizer_->reconstruct(coarseIndices, residualBase); + } + + if (interleavedLayout_) { + runIVFInterleavedScan( + queries, + coarseIndices, + deviceListDataPointers_, + deviceListIndexPointers_, + indicesOptions_, + deviceListLengths_, + k, + metric_, + useResidual_, + residualBase, + scalarQ_.get(), + outDistances, + outIndices, + resources_); + } else { + runIVFFlatScan( + queries, + coarseIndices, + deviceListDataPointers_, + deviceListIndexPointers_, + indicesOptions_, + deviceListLengths_, + maxListLength_, + k, + metric_, + useResidual_, + residualBase, + scalarQ_.get(), + outDistances, + outIndices, + resources_); + } + + // If the GPU isn't storing indices (they 
are on the CPU side), we + // need to perform the re-mapping here + // FIXME: we might ultimately be calling this function with inputs + // from the CPU, these are unnecessary copies + if (indicesOptions_ == INDICES_CPU) { + HostTensor hostOutIndices(outIndices, stream); + + ivfOffsetToUserIndex( + hostOutIndices.data(), + numLists_, + hostOutIndices.getSize(0), + hostOutIndices.getSize(1), + listOffsetToUserIndex_); + + // Copy back to GPU, since the input to this function is on the + // GPU + outIndices.copyFrom(hostOutIndices, stream); + } +} + +} // namespace gpu +} // namespace faiss diff --git a/faiss/gpu/raft/RaftIVFFlat.cuh b/faiss/gpu/raft/RaftIVFFlat.cuh new file mode 100644 index 0000000000..700a30cc3c --- /dev/null +++ b/faiss/gpu/raft/RaftIVFFlat.cuh @@ -0,0 +1,82 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace faiss { +namespace gpu { + +class RaftIVFFlat : public IVFFlat { + public: + /// Construct from a quantizer that has elemen + RaftIVFFlat( + GpuResources* resources, + /// We do not own this reference + FlatIndex* quantizer, + faiss::MetricType metric, + float metricArg, + bool useResidual, + /// Optional ScalarQuantizer + faiss::ScalarQuantizer* scalarQ, + bool interleavedLayout, + IndicesOptions indicesOptions, + MemorySpace space); + + ~RaftIVFFlat() override; + + /// Find the approximate k nearest neighbors for `queries` against + /// our database + void query( + Tensor& queries, + int nprobe, + int k, + Tensor& outDistances, + Tensor& outIndices); + + protected: + /// Returns the number of bytes in which an IVF list containing numVecs + /// vectors is encoded on the device. 
Note that due to padding this is not + /// the same as the encoding size for a subset of vectors in an IVF list; + /// this is the size for an entire IVF list + size_t getGpuVectorsEncodingSize_(int numVecs) const override; + size_t getCpuVectorsEncodingSize_(int numVecs) const override; + + /// Translate to our preferred GPU encoding + std::vector translateCodesToGpu_( + std::vector codes, + size_t numVecs) const override; + + /// Translate from our preferred GPU encoding + std::vector translateCodesFromGpu_( + std::vector codes, + size_t numVecs) const override; + + /// Encode the vectors that we're adding and append to our IVF lists + void appendVectors_( + Tensor& vecs, + Tensor& indices, + Tensor& uniqueLists, + Tensor& vectorsByUniqueList, + Tensor& uniqueListVectorStart, + Tensor& uniqueListStartOffset, + Tensor& listIds, + Tensor& listOffset, + cudaStream_t stream) override; + + protected: + /// Do we encode the residual from a coarse quantizer or not? + bool useResidual_; + + /// Scalar quantizer for encoded vectors, if any + std::unique_ptr scalarQ_; +}; + +} // namespace gpu +} // namespace faiss diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index ac2f439141..50fa155465 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include @@ -81,7 +81,7 @@ void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { trainQuantizer_(n, hostData.data()); // The quantizer is now trained; construct the IVF index - index_.reset(new IVFFlat( + index_.reset(new RaftIVFFlat( resources_.get(), quantizer->getGpuData(), this->metric_type, diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.h b/faiss/gpu/raft/RaftIndexIVFFlat.h index df2dbd2060..206738a834 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.h +++ b/faiss/gpu/raft/RaftIndexIVFFlat.h @@ -18,7 +18,7 @@ struct IndexIVFFlat; namespace faiss { namespace gpu { -class IVFFlat; 
+class RaftIVFFlat; class GpuIndexFlat; /// Wrapper around the GPU implementation that looks like From f7d4185a5b858dea138dc43bbe2fc65c442a04ff Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 14 Jun 2022 16:30:18 -0400 Subject: [PATCH 06/87] Isolating quantizer training --- faiss/gpu/raft/RaftIndexIVFFlat.cu | 36 +++++++++++++++++++++++++++++- faiss/gpu/raft/RaftIndexIVFFlat.h | 2 ++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index 50fa155465..7cebf3a8da 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -78,9 +78,11 @@ void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { {(int)n, (int)this->d}); // TODO: I think this can be done on GPU through RAFT k-means - trainQuantizer_(n, hostData.data()); + trainQuantizer_impl(n, hostData.data()); // The quantizer is now trained; construct the IVF index + + // TODO: The underlying RaftIVFFlat essentially becomes the `query impl` index_.reset(new RaftIVFFlat( resources_.get(), quantizer->getGpuData(), @@ -106,6 +108,38 @@ int RaftIndexIVFFlat::getListLength(int listId) const { return index_->getListLength(listId); } +void RaftIndexIVFFlat::trainQuantizer_impl(Index::idx_t n, const float* x) { + if (n == 0) { + // nothing to do + return; + } + + if (quantizer->is_trained && (quantizer->ntotal == nlist)) { + if (this->verbose) { + printf("IVF quantizer does not need training.\n"); + } + + return; + } + + if (this->verbose) { + printf("Training IVF quantizer on %ld vectors in %dD\n", n, d); + } + + DeviceScope scope(config_.device); + + // leverage the CPU-side k-means code, which works for the GPU + // flat index as well + quantizer->reset(); + Clustering clus(this->d, nlist, this->cp); + clus.verbose = verbose; + clus.train(n, x, *quantizer); + quantizer->is_trained = true; + + FAISS_ASSERT(quantizer->ntotal == nlist); +} + + std::vector RaftIndexIVFFlat::getListVectorData( 
int listId, bool gpuFormat) const { diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.h b/faiss/gpu/raft/RaftIndexIVFFlat.h index 206738a834..083d0e0eaa 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.h +++ b/faiss/gpu/raft/RaftIndexIVFFlat.h @@ -70,6 +70,8 @@ class RaftIndexIVFFlat : public GpuIndexIVFFlat { /// Called from GpuIndex for add/add_with_ids void addImpl_(int n, const float* x, const Index::idx_t* ids) override; + void trainQuantizer_impl(Index::idx_t n, const float* x); + /// Called from GpuIndex for search void searchImpl_( int n, From 26491cb503e1bd2e96ae8b926f57176556458684 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 14 Jun 2022 17:13:49 -0400 Subject: [PATCH 07/87] iAdding todos where we need to plug in raft functionality --- faiss/gpu/raft/RaftIVFFlat.cu | 1 + faiss/gpu/raft/RaftIndexIVFFlat.cu | 2 ++ 2 files changed, 3 insertions(+) diff --git a/faiss/gpu/raft/RaftIVFFlat.cu b/faiss/gpu/raft/RaftIVFFlat.cu index b51a7d9dc0..092f71e3e0 100644 --- a/faiss/gpu/raft/RaftIVFFlat.cu +++ b/faiss/gpu/raft/RaftIVFFlat.cu @@ -203,6 +203,7 @@ void RaftIVFFlat::query( makeTempAlloc(AllocType::Other, stream), {queries.getSize(0), nprobe, dim_}); + // TODO: This is where we invoke the search function from RAFT if (useResidual_) { // Reconstruct vectors from the quantizer quantizer_->reconstruct(coarseIndices, residualBase); diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index 7cebf3a8da..97064a7ec2 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -131,6 +131,8 @@ void RaftIndexIVFFlat::trainQuantizer_impl(Index::idx_t n, const float* x) { // leverage the CPU-side k-means code, which works for the GPU // flat index as well quantizer->reset(); + + // TODO: Invoke RAFT K-means here Clustering clus(this->d, nlist, this->cp); clus.verbose = verbose; clus.train(n, x, *quantizer); From b4d08c4250b4c7032245d8672215ac0a1970e517 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Fri, 17 Jun 2022 15:11:02 -0400 Subject: [PATCH 08/87] Invocatino of index building has been compiled successfully. Still need to propagate kmeans info down into quantized index --- cmake/thirdparty/get_raft.cmake | 4 +- faiss/gpu/CMakeLists.txt | 2 +- faiss/gpu/impl/IVFFlat.cuh | 2 +- faiss/gpu/raft/RaftIVFFlat.cu | 18 ++++- faiss/gpu/raft/RaftIVFFlat.cuh | 2 +- faiss/gpu/raft/RaftIndexFlat.cu | 116 +++++++++++++++++++++++++++++ faiss/gpu/raft/RaftIndexFlat.h | 101 +++++++++++++++++++++++++ faiss/gpu/raft/RaftIndexIVFFlat.cu | 73 ++++++++++-------- faiss/gpu/raft/RaftIndexIVFFlat.h | 5 ++ 9 files changed, 285 insertions(+), 38 deletions(-) create mode 100644 faiss/gpu/raft/RaftIndexFlat.cu create mode 100644 faiss/gpu/raft/RaftIndexFlat.h diff --git a/cmake/thirdparty/get_raft.cmake b/cmake/thirdparty/get_raft.cmake index b171570137..3fc2d9ae34 100644 --- a/cmake/thirdparty/get_raft.cmake +++ b/cmake/thirdparty/get_raft.cmake @@ -16,8 +16,8 @@ set(RAFT_VERSION "22.04") -set(RAFT_FORK "rapidsai") -set(RAFT_PINNED_TAG "branch-${RAFT_VERSION}") +set(RAFT_FORK "achirkin") +set(RAFT_PINNED_TAG "fea-knn-ivf-flat") function(find_and_configure_raft) set(oneValueArgs VERSION FORK PINNED_TAG) diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt index 8a3341d094..3135443179 100644 --- a/faiss/gpu/CMakeLists.txt +++ b/faiss/gpu/CMakeLists.txt @@ -183,5 +183,5 @@ endforeach() find_package(CUDAToolkit REQUIRED) target_link_libraries(faiss PRIVATE CUDA::cudart CUDA::cublas $<$:raft::raft>) target_link_libraries(faiss_avx2 PRIVATE CUDA::cudart CUDA::cublas) -target_compile_options(faiss PRIVATE $<$:-Xfatbin=-compress-all>) +target_compile_options(faiss PRIVATE $<$:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr>) target_compile_options(faiss_avx2 PRIVATE $<$:-Xfatbin=-compress-all>) diff --git a/faiss/gpu/impl/IVFFlat.cuh b/faiss/gpu/impl/IVFFlat.cuh index 23bbf6cff9..d2b43a9c70 100644 --- a/faiss/gpu/impl/IVFFlat.cuh +++ 
b/faiss/gpu/impl/IVFFlat.cuh @@ -32,7 +32,7 @@ class IVFFlat : public IVFBase { /// Find the approximate k nearest neigbors for `queries` against /// our database - void query( + virtual void query( Tensor& queries, int nprobe, int k, diff --git a/faiss/gpu/raft/RaftIVFFlat.cu b/faiss/gpu/raft/RaftIVFFlat.cu index 092f71e3e0..0e0cead397 100644 --- a/faiss/gpu/raft/RaftIVFFlat.cu +++ b/faiss/gpu/raft/RaftIVFFlat.cu @@ -21,6 +21,10 @@ #include #include #include + +#include +#include + #include #include @@ -164,9 +168,19 @@ void RaftIVFFlat::query( int nprobe, int k, Tensor& outDistances, - Tensor& outIndices) { + Tensor& outIndices) { auto stream = resources_->getDefaultStreamCurrentDevice(); + // TODO: This is where we invoke the search function from RAFT + /** + * template + void cuivflHandle::cuivflSearch(const T* queries, // [numQueries, dim] + uint32_t n_queries, + uint32_t k, + size_t* neighbors, // [numQueries, topK] + float* distances) + */ + // These are caught at a higher level FAISS_ASSERT(nprobe <= GPU_MAX_SELECTION_K); FAISS_ASSERT(k <= GPU_MAX_SELECTION_K); @@ -203,7 +217,7 @@ void RaftIVFFlat::query( makeTempAlloc(AllocType::Other, stream), {queries.getSize(0), nprobe, dim_}); - // TODO: This is where we invoke the search function from RAFT + if (useResidual_) { // Reconstruct vectors from the quantizer quantizer_->reconstruct(coarseIndices, residualBase); diff --git a/faiss/gpu/raft/RaftIVFFlat.cuh b/faiss/gpu/raft/RaftIVFFlat.cuh index 700a30cc3c..4340620639 100644 --- a/faiss/gpu/raft/RaftIVFFlat.cuh +++ b/faiss/gpu/raft/RaftIVFFlat.cuh @@ -38,7 +38,7 @@ class RaftIVFFlat : public IVFFlat { int nprobe, int k, Tensor& outDistances, - Tensor& outIndices); + Tensor& outIndices) override; protected: /// Returns the number of bytes in which an IVF list containing numVecs diff --git a/faiss/gpu/raft/RaftIndexFlat.cu b/faiss/gpu/raft/RaftIndexFlat.cu new file mode 100644 index 0000000000..1c323738c4 --- /dev/null +++ b/faiss/gpu/raft/RaftIndexFlat.cu @@ 
-0,0 +1,116 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { +namespace gpu { + +// +// RaftIndexFlatL2 +// + +RaftIndexFlatL2::RaftIndexFlatL2( + GpuResourcesProvider* provider, + faiss::IndexFlatL2* index, + GpuIndexFlatConfig config) + : GpuIndexFlat(provider, index, config) {} + +RaftIndexFlatL2::RaftIndexFlatL2( + std::shared_ptr resources, + faiss::IndexFlatL2* index, + GpuIndexFlatConfig config) + : GpuIndexFlat(resources, index, config) {} + +RaftIndexFlatL2::RaftIndexFlatL2( + GpuResourcesProvider* provider, + int dims, + GpuIndexFlatConfig config) + : GpuIndexFlat(provider, dims, faiss::METRIC_L2, config) {} + +RaftIndexFlatL2::RaftIndexFlatL2( + std::shared_ptr resources, + int dims, + GpuIndexFlatConfig config) + : GpuIndexFlat(resources, dims, faiss::METRIC_L2, config) {} + +void RaftIndexFlatL2::copyFrom(faiss::IndexFlat* index) { + FAISS_THROW_IF_NOT_MSG( + index->metric_type == metric_type, + "Cannot copy a RaftIndexFlatL2 from an index of " + "different metric_type"); + + GpuIndexFlat::copyFrom(index); +} + +void RaftIndexFlatL2::copyTo(faiss::IndexFlat* index) { + FAISS_THROW_IF_NOT_MSG( + index->metric_type == metric_type, + "Cannot copy a RaftIndexFlatL2 to an index of " + "different metric_type"); + + GpuIndexFlat::copyTo(index); +} + +// +// RaftIndexFlatIP +// + +RaftIndexFlatIP::RaftIndexFlatIP( + GpuResourcesProvider* provider, + faiss::IndexFlatIP* index, + GpuIndexFlatConfig config) + : GpuIndexFlat(provider, index, config) {} + +RaftIndexFlatIP::RaftIndexFlatIP( + std::shared_ptr resources, + faiss::IndexFlatIP* index, + GpuIndexFlatConfig config) + : GpuIndexFlat(resources, index, config) {} + +RaftIndexFlatIP::RaftIndexFlatIP( + GpuResourcesProvider* provider, + int 
dims, + GpuIndexFlatConfig config) + : GpuIndexFlat(provider, dims, faiss::METRIC_INNER_PRODUCT, config) {} + +RaftIndexFlatIP::RaftIndexFlatIP( + std::shared_ptr resources, + int dims, + GpuIndexFlatConfig config) + : GpuIndexFlat(resources, dims, faiss::METRIC_INNER_PRODUCT, config) {} + +void RaftIndexFlatIP::copyFrom(faiss::IndexFlat* index) { + FAISS_THROW_IF_NOT_MSG( + index->metric_type == metric_type, + "Cannot copy a RaftIndexFlatIP from an index of " + "different metric_type"); + + GpuIndexFlat::copyFrom(index); +} + +void RaftIndexFlatIP::copyTo(faiss::IndexFlat* index) { + // The passed in index must be IP + FAISS_THROW_IF_NOT_MSG( + index->metric_type == metric_type, + "Cannot copy a RaftIndexFlatIP to an index of " + "different metric_type"); + + GpuIndexFlat::copyTo(index); +} + +} // namespace gpu +} // namespace faiss diff --git a/faiss/gpu/raft/RaftIndexFlat.h b/faiss/gpu/raft/RaftIndexFlat.h new file mode 100644 index 0000000000..1aa4a51070 --- /dev/null +++ b/faiss/gpu/raft/RaftIndexFlat.h @@ -0,0 +1,101 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include + +namespace faiss { + +struct IndexFlat; +struct IndexFlatL2; +struct IndexFlatIP; + +} // namespace faiss + +namespace faiss { +namespace gpu { + +class FlatIndex; + +/// Wrapper around the GPU implementation that looks like +/// faiss::IndexFlatL2; copies over centroid data from a given +/// faiss::IndexFlat +class RaftIndexFlatL2 : public GpuIndexFlat { + public: + /// Construct from a pre-existing faiss::IndexFlatL2 instance, copying + /// data over to the given GPU + RaftIndexFlatL2( + GpuResourcesProvider* provider, + faiss::IndexFlatL2* index, + GpuIndexFlatConfig config = GpuIndexFlatConfig()); + + RaftIndexFlatL2( + std::shared_ptr resources, + faiss::IndexFlatL2* index, + GpuIndexFlatConfig config = GpuIndexFlatConfig()); + + /// Construct an empty instance that can be added to + RaftIndexFlatL2( + GpuResourcesProvider* provider, + int dims, + GpuIndexFlatConfig config = GpuIndexFlatConfig()); + + RaftIndexFlatL2( + std::shared_ptr resources, + int dims, + GpuIndexFlatConfig config = GpuIndexFlatConfig()); + + /// Initialize ourselves from the given CPU index; will overwrite + /// all data in ourselves + void copyFrom(faiss::IndexFlat* index); + + /// Copy ourselves to the given CPU index; will overwrite all data + /// in the index instance + void copyTo(faiss::IndexFlat* index); +}; + +/// Wrapper around the GPU implementation that looks like +/// faiss::IndexFlatIP; copies over centroid data from a given +/// faiss::IndexFlat +class RaftIndexFlatIP : public GpuIndexFlat { + public: + /// Construct from a pre-existing faiss::IndexFlatIP instance, copying + /// data over to the given GPU + RaftIndexFlatIP( + GpuResourcesProvider* provider, + faiss::IndexFlatIP* index, + GpuIndexFlatConfig config = GpuIndexFlatConfig()); + + RaftIndexFlatIP( + std::shared_ptr resources, + faiss::IndexFlatIP* index, + GpuIndexFlatConfig config = GpuIndexFlatConfig()); + + /// Construct an empty instance that can be added to + 
RaftIndexFlatIP( + GpuResourcesProvider* provider, + int dims, + GpuIndexFlatConfig config = GpuIndexFlatConfig()); + + RaftIndexFlatIP( + std::shared_ptr resources, + int dims, + GpuIndexFlatConfig config = GpuIndexFlatConfig()); + + /// Initialize ourselves from the given CPU index; will overwrite + /// all data in ourselves + void copyFrom(faiss::IndexFlat* index); + + /// Copy ourselves to the given CPU index; will overwrite all data + /// in the index instance + void copyTo(faiss::IndexFlat* index); +}; + +} // namespace gpu +} // namespace faiss diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index 97064a7ec2..1edaac1b87 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -17,6 +17,8 @@ #include #include +#include +#include #include @@ -69,36 +71,45 @@ void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { FAISS_ASSERT(!index_); - // FIXME: GPUize more of this - // First, make sure that the data is resident on the CPU, if it is not on - // the CPU, as we depend upon parts of the CPU code - auto hostData = toHost( - (float*)x, - resources_->getDefaultStream(config_.device), - {(int)n, (int)this->d}); - - // TODO: I think this can be done on GPU through RAFT k-means - trainQuantizer_impl(n, hostData.data()); - - // The quantizer is now trained; construct the IVF index - - // TODO: The underlying RaftIVFFlat essentially becomes the `query impl` - index_.reset(new RaftIVFFlat( - resources_.get(), - quantizer->getGpuData(), - this->metric_type, - this->metric_arg, - false, // no residual - nullptr, // no scalar quantizer - ivfFlatConfig_.interleavedLayout, - ivfFlatConfig_.indicesOptions, - config_.memorySpace)); - - if (reserveMemoryVecs_) { - index_->reserveMemory(reserveMemoryVecs_); - } - - this->is_trained = true; + raft::spatial::knn::ivf_flat_params raft_idx_params; + raft_idx_params.nlist = nlist; + + raft::distance::DistanceType metric = 
raft::distance::DistanceType::L2Expanded; + raft::spatial::knn::approx_knn_build_index(handle, &raft_knn_index, &raft_idx_params, metric, 0.0f, + const_cast(x), n, (faiss::Index::idx_t)d); + +// // FIXME: GPUize more of this +// // First, make sure that the data is resident on the CPU, if it is not on +// // the CPU, as we depend upon parts of the CPU code +// auto hostData = toHost( +// (float*)x, +// resources_->getDefaultStream(config_.device), +// {(int)n, (int)this->d}); +// +// // TODO: I think this can be done on GPU through RAFT k-means +// trainQuantizer_impl(n, hostData.data()); +// +// // The quantizer is now trained; construct the IVF index +// +// // TODO: The underlying RaftIVFFlat essentially becomes the `query impl` +// index_.reset(new RaftIVFFlat( +// resources_.get(), +// +// // TODO: getGpuData returns a `FlatIndex` +// quantizer->getGpuData(), +// this->metric_type, +// this->metric_arg, +// false, // no residual +// nullptr, // no scalar quantizer +// ivfFlatConfig_.interleavedLayout, +// ivfFlatConfig_.indicesOptions, +// config_.memorySpace)); +// +// if (reserveMemoryVecs_) { +// index_->reserveMemory(reserveMemoryVecs_); +// } +// +// this->is_trained = true; } int RaftIndexIVFFlat::getListLength(int listId) const { @@ -132,7 +143,7 @@ void RaftIndexIVFFlat::trainQuantizer_impl(Index::idx_t n, const float* x) { // flat index as well quantizer->reset(); - // TODO: Invoke RAFT K-means here + // TODO: Invoke RAFT K-means here and set resulting trained data on quantizer Clustering clus(this->d, nlist, this->cp); clus.verbose = verbose; clus.train(n, x, *quantizer); diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.h b/faiss/gpu/raft/RaftIndexIVFFlat.h index 083d0e0eaa..dcc28037d6 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.h +++ b/faiss/gpu/raft/RaftIndexIVFFlat.h @@ -9,6 +9,9 @@ #include #include + +#include +#include #include namespace faiss { @@ -80,6 +83,8 @@ class RaftIndexIVFFlat : public GpuIndexIVFFlat { float* distances, Index::idx_t* 
labels) const override; + const raft::handle_t handle; + raft::spatial::knn::knnIndex raft_knn_index; }; } // namespace gpu From bf876f914e4cd10a9e2cf250eaef556e8ede67ed Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 17 Jun 2022 18:03:47 -0400 Subject: [PATCH 09/87] Adding call to search. --- faiss/gpu/raft/RaftIVFFlat.cu | 2 -- faiss/gpu/raft/RaftIndexIVFFlat.cu | 20 +++++++++++++++----- faiss/gpu/raft/RaftIndexIVFFlat.h | 2 +- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/faiss/gpu/raft/RaftIVFFlat.cu b/faiss/gpu/raft/RaftIVFFlat.cu index 0e0cead397..4f74804ea9 100644 --- a/faiss/gpu/raft/RaftIVFFlat.cu +++ b/faiss/gpu/raft/RaftIVFFlat.cu @@ -22,8 +22,6 @@ #include #include -#include -#include #include #include diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index 1edaac1b87..021f3aaeae 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -16,8 +16,6 @@ #include #include -#include -#include #include #include @@ -71,12 +69,14 @@ void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { FAISS_ASSERT(!index_); + // TODO: Populate the rest of the params properly. 
raft::spatial::knn::ivf_flat_params raft_idx_params; raft_idx_params.nlist = nlist; raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded; - raft::spatial::knn::approx_knn_build_index(handle, &raft_knn_index, &raft_idx_params, metric, 0.0f, - const_cast(x), n, (faiss::Index::idx_t)d); + raft::spatial::knn::approx_knn_build_index( + raft_handle, &raft_knn_index, &raft_idx_params, metric, 0.0f, + const_cast(x), n, (faiss::Index::idx_t)d); // // FIXME: GPUize more of this // // First, make sure that the data is resident on the CPU, if it is not on @@ -162,6 +162,9 @@ std::vector RaftIndexIVFFlat::getListVectorData( return index_->getListVectorData(listId, gpuFormat); } +void RaftIndexIVFFlat::reset() { + +} std::vector RaftIndexIVFFlat::getListIndices(int listId) const { FAISS_ASSERT(index_); DeviceScope scope(config_.device); @@ -206,7 +209,14 @@ void RaftIndexIVFFlat::searchImpl_( Tensor outLabels( const_cast(labels), {n, k}); - index_->query(queries, nprobe, k, outDistances, outLabels); + // TODO: Populate the rest of the params properly. + raft::spatial::knn::ivf_flat_params raft_idx_params; + raft_idx_params.nlist = nlist; + + raft::spatial::knn::approx_knn_search( + raft_handle, distances, (int64_t*)labels, + const_cast(&raft_knn_index), + &raft_idx_params, k, const_cast(x), n); } } // namespace gpu diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.h b/faiss/gpu/raft/RaftIndexIVFFlat.h index dcc28037d6..07e0528be2 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.h +++ b/faiss/gpu/raft/RaftIndexIVFFlat.h @@ -83,7 +83,7 @@ class RaftIndexIVFFlat : public GpuIndexIVFFlat { float* distances, Index::idx_t* labels) const override; - const raft::handle_t handle; + const raft::handle_t raft_handle; raft::spatial::knn::knnIndex raft_knn_index; }; From 9b1fc8422dac50ab46cda5704510e78904d439f2 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Fri, 1 Jul 2022 16:48:50 -0400 Subject: [PATCH 10/87] Adding stubs for remaining calls that need to be made from RAFT side in order for FAISS integration to work successfully. --- faiss/gpu/CMakeLists.txt | 4 +- faiss/gpu/raft/RaftIVFFlat.cu | 280 ----------- faiss/gpu/raft/RaftIVFFlat.cuh | 82 --- faiss/gpu/raft/RaftIndexFlat.cu | 116 ----- faiss/gpu/raft/RaftIndexFlat.h | 101 ---- faiss/gpu/raft/RaftIndexIVFFlat.cu | 273 ++++++---- faiss/gpu/raft/RaftIndexIVFFlat.h | 15 +- faiss/gpu/raft/RmmGpuResources.hpp | 636 ++++++++++++++++++++++++ faiss/gpu/test/CMakeLists.txt | 7 +- faiss/gpu/test/TestRaftIndexIVFFlat.cpp | 555 +++++++++++++++++++++ 10 files changed, 1378 insertions(+), 691 deletions(-) delete mode 100644 faiss/gpu/raft/RaftIVFFlat.cu delete mode 100644 faiss/gpu/raft/RaftIVFFlat.cuh delete mode 100644 faiss/gpu/raft/RaftIndexFlat.cu delete mode 100644 faiss/gpu/raft/RaftIndexFlat.h create mode 100644 faiss/gpu/raft/RmmGpuResources.hpp create mode 100644 faiss/gpu/test/TestRaftIndexIVFFlat.cpp diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt index 3135443179..3ed26dca01 100644 --- a/faiss/gpu/CMakeLists.txt +++ b/faiss/gpu/CMakeLists.txt @@ -163,8 +163,8 @@ set(FAISS_GPU_HEADERS ) if(FAISS_ENABLE_RAFT) - list(APPEND FAISS_GPU_HEADERS raft/RaftIndexIVFFlat.h raft/RaftIVFFlat.cuh) - list(APPEND FAISS_GPU_SRC raft/RaftIndexIVFFlat.cu raft/RaftIVFFlat.cu) + list(APPEND FAISS_GPU_HEADERS raft/RaftIndexIVFFlat.h) + list(APPEND FAISS_GPU_SRC raft/RaftIndexIVFFlat.cu) endif() # Export FAISS_GPU_HEADERS variable to parent scope. diff --git a/faiss/gpu/raft/RaftIVFFlat.cu b/faiss/gpu/raft/RaftIVFFlat.cu deleted file mode 100644 index 4f74804ea9..0000000000 --- a/faiss/gpu/raft/RaftIVFFlat.cu +++ /dev/null @@ -1,280 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include -#include - -namespace faiss { -namespace gpu { - -RaftIVFFlat::RaftIVFFlat( - GpuResources* res, - FlatIndex* quantizer, - faiss::MetricType metric, - float metricArg, - bool useResidual, - faiss::ScalarQuantizer* scalarQ, - bool interleavedLayout, - IndicesOptions indicesOptions, - MemorySpace space) : - IVFFlat(res, quantizer, metric, metricArg, useResidual, scalarQ, interleavedLayout, indicesOptions, space) {} - -RaftIVFFlat::~RaftIVFFlat() {} - -size_t RaftIVFFlat::getGpuVectorsEncodingSize_(int numVecs) const { - if (interleavedLayout_) { - // bits per scalar code - int bits = scalarQ_ ? scalarQ_->bits : 32 /* float */; - - // bytes to encode a block of 32 vectors (single dimension) - int bytesPerDimBlock = bits * 32 / 8; - - // bytes to fully encode 32 vectors - int bytesPerBlock = bytesPerDimBlock * dim_; - - // number of blocks of 32 vectors we have - int numBlocks = utils::divUp(numVecs, 32); - - // total size to encode numVecs - return bytesPerBlock * numBlocks; - } else { - size_t sizePerVector = - (scalarQ_ ? scalarQ_->code_size : sizeof(float) * dim_); - - return (size_t)numVecs * sizePerVector; - } -} - -size_t RaftIVFFlat::getCpuVectorsEncodingSize_(int numVecs) const { - size_t sizePerVector = - (scalarQ_ ? scalarQ_->code_size : sizeof(float) * dim_); - - return (size_t)numVecs * sizePerVector; -} - -std::vector RaftIVFFlat::translateCodesToGpu_( - std::vector codes, - size_t numVecs) const { - if (!interleavedLayout_) { - // same format - return codes; - } - - int bitsPerCode = scalarQ_ ? 
scalarQ_->bits : 32; - - auto up = - unpackNonInterleaved(std::move(codes), numVecs, dim_, bitsPerCode); - return packInterleaved(std::move(up), numVecs, dim_, bitsPerCode); -} - -std::vector RaftIVFFlat::translateCodesFromGpu_( - std::vector codes, - size_t numVecs) const { - if (!interleavedLayout_) { - // same format - return codes; - } - - int bitsPerCode = scalarQ_ ? scalarQ_->bits : 32; - - auto up = unpackInterleaved(std::move(codes), numVecs, dim_, bitsPerCode); - return packNonInterleaved(std::move(up), numVecs, dim_, bitsPerCode); -} - -void RaftIVFFlat::appendVectors_( - Tensor& vecs, - Tensor& indices, - Tensor& uniqueLists, - Tensor& vectorsByUniqueList, - Tensor& uniqueListVectorStart, - Tensor& uniqueListStartOffset, - Tensor& listIds, - Tensor& listOffset, - cudaStream_t stream) { - // - // Append the new encodings - // - - // Calculate residuals for these vectors, if needed - DeviceTensor residuals( - resources_, - makeTempAlloc(AllocType::Other, stream), - {vecs.getSize(0), dim_}); - - if (useResidual_) { - quantizer_->computeResidual(vecs, listIds, residuals); - } - - // Append indices to the IVF lists - runIVFIndicesAppend( - listIds, - listOffset, - indices, - indicesOptions_, - deviceListIndexPointers_, - stream); - - // Append the encoded vectors to the IVF lists - if (interleavedLayout_) { - runIVFFlatInterleavedAppend( - listIds, - listOffset, - uniqueLists, - vectorsByUniqueList, - uniqueListVectorStart, - uniqueListStartOffset, - useResidual_ ? residuals : vecs, - scalarQ_.get(), - deviceListDataPointers_, - resources_, - stream); - } else { - runIVFFlatAppend( - listIds, - listOffset, - useResidual_ ? 
residuals : vecs, - scalarQ_.get(), - deviceListDataPointers_, - stream); - } -} - -void RaftIVFFlat::query( - Tensor& queries, - int nprobe, - int k, - Tensor& outDistances, - Tensor& outIndices) { - auto stream = resources_->getDefaultStreamCurrentDevice(); - - // TODO: This is where we invoke the search function from RAFT - /** - * template - void cuivflHandle::cuivflSearch(const T* queries, // [numQueries, dim] - uint32_t n_queries, - uint32_t k, - size_t* neighbors, // [numQueries, topK] - float* distances) - */ - - // These are caught at a higher level - FAISS_ASSERT(nprobe <= GPU_MAX_SELECTION_K); - FAISS_ASSERT(k <= GPU_MAX_SELECTION_K); - nprobe = std::min(nprobe, quantizer_->getSize()); - - FAISS_ASSERT(queries.getSize(1) == dim_); - - FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0)); - FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0)); - - // Reserve space for the quantized information - DeviceTensor coarseDistances( - resources_, - makeTempAlloc(AllocType::Other, stream), - {queries.getSize(0), nprobe}); - DeviceTensor coarseIndices( - resources_, - makeTempAlloc(AllocType::Other, stream), - {queries.getSize(0), nprobe}); - - // Find the `nprobe` closest lists; we can use int indices both - // internally and externally - quantizer_->query( - queries, - nprobe, - metric_, - metricArg_, - coarseDistances, - coarseIndices, - false); - - DeviceTensor residualBase( - resources_, - makeTempAlloc(AllocType::Other, stream), - {queries.getSize(0), nprobe, dim_}); - - - if (useResidual_) { - // Reconstruct vectors from the quantizer - quantizer_->reconstruct(coarseIndices, residualBase); - } - - if (interleavedLayout_) { - runIVFInterleavedScan( - queries, - coarseIndices, - deviceListDataPointers_, - deviceListIndexPointers_, - indicesOptions_, - deviceListLengths_, - k, - metric_, - useResidual_, - residualBase, - scalarQ_.get(), - outDistances, - outIndices, - resources_); - } else { - runIVFFlatScan( - queries, - coarseIndices, - 
deviceListDataPointers_, - deviceListIndexPointers_, - indicesOptions_, - deviceListLengths_, - maxListLength_, - k, - metric_, - useResidual_, - residualBase, - scalarQ_.get(), - outDistances, - outIndices, - resources_); - } - - // If the GPU isn't storing indices (they are on the CPU side), we - // need to perform the re-mapping here - // FIXME: we might ultimately be calling this function with inputs - // from the CPU, these are unnecessary copies - if (indicesOptions_ == INDICES_CPU) { - HostTensor hostOutIndices(outIndices, stream); - - ivfOffsetToUserIndex( - hostOutIndices.data(), - numLists_, - hostOutIndices.getSize(0), - hostOutIndices.getSize(1), - listOffsetToUserIndex_); - - // Copy back to GPU, since the input to this function is on the - // GPU - outIndices.copyFrom(hostOutIndices, stream); - } -} - -} // namespace gpu -} // namespace faiss diff --git a/faiss/gpu/raft/RaftIVFFlat.cuh b/faiss/gpu/raft/RaftIVFFlat.cuh deleted file mode 100644 index 4340620639..0000000000 --- a/faiss/gpu/raft/RaftIVFFlat.cuh +++ /dev/null @@ -1,82 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -#include -#include - -namespace faiss { -namespace gpu { - -class RaftIVFFlat : public IVFFlat { - public: - /// Construct from a quantizer that has elemen - RaftIVFFlat( - GpuResources* resources, - /// We do not own this reference - FlatIndex* quantizer, - faiss::MetricType metric, - float metricArg, - bool useResidual, - /// Optional ScalarQuantizer - faiss::ScalarQuantizer* scalarQ, - bool interleavedLayout, - IndicesOptions indicesOptions, - MemorySpace space); - - ~RaftIVFFlat() override; - - /// Find the approximate k nearest neighbors for `queries` against - /// our database - void query( - Tensor& queries, - int nprobe, - int k, - Tensor& outDistances, - Tensor& outIndices) override; - - protected: - /// Returns the number of bytes in which an IVF list containing numVecs - /// vectors is encoded on the device. Note that due to padding this is not - /// the same as the encoding size for a subset of vectors in an IVF list; - /// this is the size for an entire IVF list - size_t getGpuVectorsEncodingSize_(int numVecs) const override; - size_t getCpuVectorsEncodingSize_(int numVecs) const override; - - /// Translate to our preferred GPU encoding - std::vector translateCodesToGpu_( - std::vector codes, - size_t numVecs) const override; - - /// Translate from our preferred GPU encoding - std::vector translateCodesFromGpu_( - std::vector codes, - size_t numVecs) const override; - - /// Encode the vectors that we're adding and append to our IVF lists - void appendVectors_( - Tensor& vecs, - Tensor& indices, - Tensor& uniqueLists, - Tensor& vectorsByUniqueList, - Tensor& uniqueListVectorStart, - Tensor& uniqueListStartOffset, - Tensor& listIds, - Tensor& listOffset, - cudaStream_t stream) override; - - protected: - /// Do we encode the residual from a coarse quantizer or not? 
- bool useResidual_; - - /// Scalar quantizer for encoded vectors, if any - std::unique_ptr scalarQ_; -}; - -} // namespace gpu -} // namespace faiss diff --git a/faiss/gpu/raft/RaftIndexFlat.cu b/faiss/gpu/raft/RaftIndexFlat.cu deleted file mode 100644 index 1c323738c4..0000000000 --- a/faiss/gpu/raft/RaftIndexFlat.cu +++ /dev/null @@ -1,116 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace faiss { -namespace gpu { - -// -// RaftIndexFlatL2 -// - -RaftIndexFlatL2::RaftIndexFlatL2( - GpuResourcesProvider* provider, - faiss::IndexFlatL2* index, - GpuIndexFlatConfig config) - : GpuIndexFlat(provider, index, config) {} - -RaftIndexFlatL2::RaftIndexFlatL2( - std::shared_ptr resources, - faiss::IndexFlatL2* index, - GpuIndexFlatConfig config) - : GpuIndexFlat(resources, index, config) {} - -RaftIndexFlatL2::RaftIndexFlatL2( - GpuResourcesProvider* provider, - int dims, - GpuIndexFlatConfig config) - : GpuIndexFlat(provider, dims, faiss::METRIC_L2, config) {} - -RaftIndexFlatL2::RaftIndexFlatL2( - std::shared_ptr resources, - int dims, - GpuIndexFlatConfig config) - : GpuIndexFlat(resources, dims, faiss::METRIC_L2, config) {} - -void RaftIndexFlatL2::copyFrom(faiss::IndexFlat* index) { - FAISS_THROW_IF_NOT_MSG( - index->metric_type == metric_type, - "Cannot copy a RaftIndexFlatL2 from an index of " - "different metric_type"); - - GpuIndexFlat::copyFrom(index); -} - -void RaftIndexFlatL2::copyTo(faiss::IndexFlat* index) { - FAISS_THROW_IF_NOT_MSG( - index->metric_type == metric_type, - "Cannot copy a RaftIndexFlatL2 to an index of " - "different metric_type"); - - GpuIndexFlat::copyTo(index); -} - -// -// RaftIndexFlatIP -// - -RaftIndexFlatIP::RaftIndexFlatIP( - GpuResourcesProvider* provider, - 
faiss::IndexFlatIP* index, - GpuIndexFlatConfig config) - : GpuIndexFlat(provider, index, config) {} - -RaftIndexFlatIP::RaftIndexFlatIP( - std::shared_ptr resources, - faiss::IndexFlatIP* index, - GpuIndexFlatConfig config) - : GpuIndexFlat(resources, index, config) {} - -RaftIndexFlatIP::RaftIndexFlatIP( - GpuResourcesProvider* provider, - int dims, - GpuIndexFlatConfig config) - : GpuIndexFlat(provider, dims, faiss::METRIC_INNER_PRODUCT, config) {} - -RaftIndexFlatIP::RaftIndexFlatIP( - std::shared_ptr resources, - int dims, - GpuIndexFlatConfig config) - : GpuIndexFlat(resources, dims, faiss::METRIC_INNER_PRODUCT, config) {} - -void RaftIndexFlatIP::copyFrom(faiss::IndexFlat* index) { - FAISS_THROW_IF_NOT_MSG( - index->metric_type == metric_type, - "Cannot copy a RaftIndexFlatIP from an index of " - "different metric_type"); - - GpuIndexFlat::copyFrom(index); -} - -void RaftIndexFlatIP::copyTo(faiss::IndexFlat* index) { - // The passed in index must be IP - FAISS_THROW_IF_NOT_MSG( - index->metric_type == metric_type, - "Cannot copy a RaftIndexFlatIP to an index of " - "different metric_type"); - - GpuIndexFlat::copyTo(index); -} - -} // namespace gpu -} // namespace faiss diff --git a/faiss/gpu/raft/RaftIndexFlat.h b/faiss/gpu/raft/RaftIndexFlat.h deleted file mode 100644 index 1aa4a51070..0000000000 --- a/faiss/gpu/raft/RaftIndexFlat.h +++ /dev/null @@ -1,101 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -#include -#include - -namespace faiss { - -struct IndexFlat; -struct IndexFlatL2; -struct IndexFlatIP; - -} // namespace faiss - -namespace faiss { -namespace gpu { - -class FlatIndex; - -/// Wrapper around the GPU implementation that looks like -/// faiss::IndexFlatL2; copies over centroid data from a given -/// faiss::IndexFlat -class RaftIndexFlatL2 : public GpuIndexFlat { - public: - /// Construct from a pre-existing faiss::IndexFlatL2 instance, copying - /// data over to the given GPU - RaftIndexFlatL2( - GpuResourcesProvider* provider, - faiss::IndexFlatL2* index, - GpuIndexFlatConfig config = GpuIndexFlatConfig()); - - RaftIndexFlatL2( - std::shared_ptr resources, - faiss::IndexFlatL2* index, - GpuIndexFlatConfig config = GpuIndexFlatConfig()); - - /// Construct an empty instance that can be added to - RaftIndexFlatL2( - GpuResourcesProvider* provider, - int dims, - GpuIndexFlatConfig config = GpuIndexFlatConfig()); - - RaftIndexFlatL2( - std::shared_ptr resources, - int dims, - GpuIndexFlatConfig config = GpuIndexFlatConfig()); - - /// Initialize ourselves from the given CPU index; will overwrite - /// all data in ourselves - void copyFrom(faiss::IndexFlat* index); - - /// Copy ourselves to the given CPU index; will overwrite all data - /// in the index instance - void copyTo(faiss::IndexFlat* index); -}; - -/// Wrapper around the GPU implementation that looks like -/// faiss::IndexFlatIP; copies over centroid data from a given -/// faiss::IndexFlat -class RaftIndexFlatIP : public GpuIndexFlat { - public: - /// Construct from a pre-existing faiss::IndexFlatIP instance, copying - /// data over to the given GPU - RaftIndexFlatIP( - GpuResourcesProvider* provider, - faiss::IndexFlatIP* index, - GpuIndexFlatConfig config = GpuIndexFlatConfig()); - - RaftIndexFlatIP( - std::shared_ptr resources, - faiss::IndexFlatIP* index, - GpuIndexFlatConfig config = GpuIndexFlatConfig()); - - /// Construct an empty instance that can be added to - 
RaftIndexFlatIP( - GpuResourcesProvider* provider, - int dims, - GpuIndexFlatConfig config = GpuIndexFlatConfig()); - - RaftIndexFlatIP( - std::shared_ptr resources, - int dims, - GpuIndexFlatConfig config = GpuIndexFlatConfig()); - - /// Initialize ourselves from the given CPU index; will overwrite - /// all data in ourselves - void copyFrom(faiss::IndexFlat* index); - - /// Copy ourselves to the given CPU index; will overwrite all data - /// in the index instance - void copyTo(faiss::IndexFlat* index); -}; - -} // namespace gpu -} // namespace faiss diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index 021f3aaeae..ffc0b3e2c4 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -12,11 +12,10 @@ #include #include #include -#include #include #include -#include +#include #include @@ -30,7 +29,8 @@ RaftIndexIVFFlat::RaftIndexIVFFlat( : GpuIndexIVFFlat( provider, index, - config) { + config), raft_handle(resources_->getDefaultStream(config_.device)) { + copyFrom(index); } @@ -40,136 +40,181 @@ RaftIndexIVFFlat::RaftIndexIVFFlat( int nlist, faiss::MetricType metric, GpuIndexIVFFlatConfig config) - : GpuIndexIVFFlat(provider, dims, nlist, metric, config) { - // faiss::Index params - this->is_trained = false; + : GpuIndexIVFFlat(provider, dims, nlist, metric, config), + raft_handle(resources_->getDefaultStream(config_.device)) { - // We haven't trained ourselves, so don't construct the IVFFlat - // index yet + this->is_trained = false; } RaftIndexIVFFlat::~RaftIndexIVFFlat() {} +void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { -void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { - // For now, only support <= max int results - FAISS_THROW_IF_NOT_FMT( - n <= (Index::idx_t)std::numeric_limits::max(), - "GPU index only supports up to %d indices", - std::numeric_limits::max()); + printf("Copying from...\n"); + // TODO: Need to copy necessary memory from the index and set 
any needed params. DeviceScope scope(config_.device); - if (this->is_trained) { - FAISS_ASSERT(quantizer->is_trained); - FAISS_ASSERT(quantizer->ntotal == nlist); - FAISS_ASSERT(index_); + GpuIndex::copyFrom(index); + + FAISS_ASSERT(index->nlist > 0); + FAISS_THROW_IF_NOT_FMT( + index->nlist <= (Index::idx_t)std::numeric_limits::max(), + "GPU index only supports %zu inverted lists", + (size_t)std::numeric_limits::max()); + nlist = index->nlist; + + FAISS_THROW_IF_NOT_FMT( + index->nprobe > 0 && index->nprobe <= getMaxKSelection(), + "GPU index only supports nprobe <= %zu; passed %zu", + (size_t)getMaxKSelection(), + index->nprobe); + nprobe = index->nprobe; + + config.device = config_.device; + + FAISS_ASSERT(metric_type != faiss::METRIC_L2 && + metric_type != faiss::METRIC_INNER_PRODUCT); + + if (!index->is_trained) { + // copied in GpuIndex::copyFrom + FAISS_ASSERT(!is_trained && ntotal == 0); return; } - FAISS_ASSERT(!index_); + // copied in GpuIndex::copyFrom + // ntotal can exceed max int, but the number of vectors per inverted + // list cannot exceed this. We check this in the subclasses. + FAISS_ASSERT(is_trained && (ntotal == index->ntotal)); - // TODO: Populate the rest of the params properly. 
- raft::spatial::knn::ivf_flat_params raft_idx_params; - raft_idx_params.nlist = nlist; - - raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded; - raft::spatial::knn::approx_knn_build_index( - raft_handle, &raft_knn_index, &raft_idx_params, metric, 0.0f, - const_cast(x), n, (faiss::Index::idx_t)d); - -// // FIXME: GPUize more of this -// // First, make sure that the data is resident on the CPU, if it is not on -// // the CPU, as we depend upon parts of the CPU code -// auto hostData = toHost( -// (float*)x, -// resources_->getDefaultStream(config_.device), -// {(int)n, (int)this->d}); -// -// // TODO: I think this can be done on GPU through RAFT k-means -// trainQuantizer_impl(n, hostData.data()); -// -// // The quantizer is now trained; construct the IVF index -// -// // TODO: The underlying RaftIVFFlat essentially becomes the `query impl` -// index_.reset(new RaftIVFFlat( -// resources_.get(), -// -// // TODO: getGpuData returns a `FlatIndex` -// quantizer->getGpuData(), -// this->metric_type, -// this->metric_arg, -// false, // no residual -// nullptr, // no scalar quantizer -// ivfFlatConfig_.interleavedLayout, -// ivfFlatConfig_.indicesOptions, -// config_.memorySpace)); -// -// if (reserveMemoryVecs_) { -// index_->reserveMemory(reserveMemoryVecs_); -// } -// -// this->is_trained = true; -} + // Since we're trained, the quantizer must have data + FAISS_ASSERT(index->quantizer->ntotal > 0); -int RaftIndexIVFFlat::getListLength(int listId) const { - FAISS_ASSERT(index_); - DeviceScope scope(config_.device); + raft::spatial::knn::ivf_flat::index_params raft_idx_params; + raft_idx_params.n_lists = nlist; + raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; - return index_->getListLength(listId); + // TODO: Invoke corresponding call on the RAFT side to copy quantizer + /** + * For example: + * raft_knn_index.emplace(raft::spatial::knn::ivf_flat::make_ivf_flat_index( + * raft_handle, raft_idx_params, 
(faiss::Index::idx_t)d); + */ } -void RaftIndexIVFFlat::trainQuantizer_impl(Index::idx_t n, const float* x) { - if (n == 0) { - // nothing to do - return; +void RaftIndexIVFFlat::reserveMemory(size_t numVecs) { + + std::cout << "Reserving memory for " << numVecs << " vectors." << std::endl; + reserveMemoryVecs_ = numVecs; + if (raft_knn_index.has_value()) { + DeviceScope scope(config_.device); + + // TODO: We need to reserve memory on the raft::ivf_flat::index + /** + * For example: + * raft::spatial::knn::ivf_flat::ivf_flat_allocate_ivf_lists( + * raft_handle, *raft_knn_index, numVecs); + */ } +} + +size_t RaftIndexIVFFlat::reclaimMemory() { + std::cout << "Reclaiming memory" << std::endl; - if (quantizer->is_trained && (quantizer->ntotal == nlist)) { - if (this->verbose) { - printf("IVF quantizer does not need training.\n"); - } + // TODO: We need to reclaim memory on the raft::ivf_flat::index + /** + * For example: + * raft::spatial::knn::ivf_flat::ivf_flat_reclaim_ivf_lists( + * raft_handle, *raft_knn_index, numVecs); + */ + return 0; +} +void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { + // For now, only support <= max int results + FAISS_THROW_IF_NOT_FMT( + n <= (Index::idx_t)std::numeric_limits::max(), + "GPU index only supports up to %d indices", + std::numeric_limits::max()); + + DeviceScope scope(config_.device); + + if (this->is_trained) { + FAISS_ASSERT(raft_knn_index.has_value()); return; } - if (this->verbose) { - printf("Training IVF quantizer on %ld vectors in %dD\n", n, d); - } + raft::spatial::knn::ivf_flat::index_params raft_idx_params; + raft_idx_params.n_lists = nlist; + raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; - DeviceScope scope(config_.device); - // leverage the CPU-side k-means code, which works for the GPU - // flat index as well - quantizer->reset(); + // TODO: This should only train the quantizer portion of the index + /** + * For example: + * + * 
raft_knn_index.emplace(raft::spatial::knn::ivf_flat::make_ivf_flat_index( + * raft_handle, raft_idx_params, (faiss::Index::idx_t)d); - // TODO: Invoke RAFT K-means here and set resulting trained data on quantizer - Clustering clus(this->d, nlist, this->cp); - clus.verbose = verbose; - clus.train(n, x, *quantizer); - quantizer->is_trained = true; + * raft::spatial::knn::ivf_flat::ivf_flat_train_quantizer( + * raft_handle, *raft_knn_index, const_cast(x), n); + */ - FAISS_ASSERT(quantizer->ntotal == nlist); + raft_knn_index.emplace( + raft::spatial::knn::ivf_flat::build(raft_handle, raft_idx_params, + const_cast(x), + n, (faiss::Index::idx_t)d, + raft_handle.get_stream())); + + raft_handle.sync_stream(); } +int RaftIndexIVFFlat::getListLength(int listId) const { + FAISS_ASSERT(raft_knn_index.has_value()); + DeviceScope scope(config_.device); + + // TODO: Call function in RAFT to do this. + /** + * For example: + * raft::spatial::knn::ivf_flat::get_list_length( + * raft_handle, *raft_knn_index, listId); + */ + return 0; +} std::vector RaftIndexIVFFlat::getListVectorData( int listId, bool gpuFormat) const { - FAISS_ASSERT(index_); + FAISS_ASSERT(raft_knn_index.has_value()); DeviceScope scope(config_.device); - return index_->getListVectorData(listId, gpuFormat); + // TODO: Invoke corresponding call in raft::ivf_flat + /** + * For example: + * raft::spatial::knn::ivf_flat::get_list_vector_data( + * raft_handle, *raft_knn_index, listId, gpuFormat); + */ + std::vector vec; + return vec; } void RaftIndexIVFFlat::reset() { - + std::cout << "Calling reset()" << std::endl; + raft_knn_index.reset(); } + std::vector RaftIndexIVFFlat::getListIndices(int listId) const { - FAISS_ASSERT(index_); + FAISS_ASSERT(raft_knn_index.has_value()); DeviceScope scope(config_.device); - return index_->getListIndices(listId); + // TODO: Need to invoke corresponding call in raft::ivf_flat + /** + * For example: + * raft::spatial::knn::ivf_flat::get_list_indices( + * raft_handle, *raft_knn_index, 
listId); + */ + std::vector vec; + return vec; } void RaftIndexIVFFlat::addImpl_( @@ -177,19 +222,29 @@ void RaftIndexIVFFlat::addImpl_( const float* x, const Index::idx_t* xids) { // Device is already set in GpuIndex::add - FAISS_ASSERT(index_); + FAISS_ASSERT(raft_knn_index.has_value()); FAISS_ASSERT(n > 0); - // Data is already resident on the GPU + // Data is already resident on the GPU Tensor data(const_cast(x), {n, (int)this->d}); Tensor labels(const_cast(xids), {n}); - // Not all vectors may be able to be added (some may contain NaNs etc) - index_->addVectors(data, labels); - - // but keep the ntotal based on the total number of vectors that we - // attempted to add +// // Not all vectors may be able to be added (some may contain NaNs etc) +// index_->addVectors(data, labels); +// +// // but keep the ntotal based on the total number of vectors that we +// // attempted to add ntotal += n; + + std::cout << "Calling addImpl_ with " << n << " vectors." << std::endl; + + // TODO: Invoke corresponding call in raft::ivf_flat + /** + * For example: + * raft::spatial::knn::ivf_flat::ivf_flat_add_vectors( + * raft_handle, *raft_knn_index, n, x, xids); + */ + } void RaftIndexIVFFlat::searchImpl_( @@ -199,7 +254,7 @@ void RaftIndexIVFFlat::searchImpl_( float* distances, Index::idx_t* labels) const { // Device is already set in GpuIndex::search - FAISS_ASSERT(index_); + FAISS_ASSERT(raft_knn_index.has_value()); FAISS_ASSERT(n > 0); FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= nlist); @@ -210,13 +265,19 @@ void RaftIndexIVFFlat::searchImpl_( const_cast(labels), {n, k}); // TODO: Populate the rest of the params properly. 
- raft::spatial::knn::ivf_flat_params raft_idx_params; - raft_idx_params.nlist = nlist; - - raft::spatial::knn::approx_knn_search( - raft_handle, distances, (int64_t*)labels, - const_cast(&raft_knn_index), - &raft_idx_params, k, const_cast(x), n); + raft::spatial::knn::ivf_flat::search_params raft_idx_params; + raft_idx_params.n_probes = nprobe; + + raft::spatial::knn::ivf_flat::search(raft_handle, + raft_idx_params, + *raft_knn_index, + const_cast(x), + static_cast(n), + static_cast(k), + static_cast(labels), + distances, raft_handle.get_stream()); + + raft_handle.sync_stream(); } } // namespace gpu diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.h b/faiss/gpu/raft/RaftIndexIVFFlat.h index 07e0528be2..4960fa3ae1 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.h +++ b/faiss/gpu/raft/RaftIndexIVFFlat.h @@ -11,7 +11,8 @@ #include #include -#include +#include + #include namespace faiss { @@ -56,6 +57,15 @@ class RaftIndexIVFFlat : public GpuIndexIVFFlat { /// Returns the number of vectors present in a particular inverted list int getListLength(int listId) const override; + /// Reserve GPU memory in our inverted lists for this number of vectors + void reserveMemory(size_t numVecs); + + /// After adding vectors, one can call this to reclaim device memory + /// to exactly the amount needed. Returns space reclaimed in bytes + size_t reclaimMemory(); + + void copyFrom(const faiss::IndexIVFFlat* index); + /// Return the encoded vector data contained in a particular inverted list, /// for debugging purposes. 
/// If gpuFormat is true, the data is returned as it is encoded in the @@ -73,7 +83,6 @@ class RaftIndexIVFFlat : public GpuIndexIVFFlat { /// Called from GpuIndex for add/add_with_ids void addImpl_(int n, const float* x, const Index::idx_t* ids) override; - void trainQuantizer_impl(Index::idx_t n, const float* x); /// Called from GpuIndex for search void searchImpl_( @@ -84,7 +93,7 @@ class RaftIndexIVFFlat : public GpuIndexIVFFlat { Index::idx_t* labels) const override; const raft::handle_t raft_handle; - raft::spatial::knn::knnIndex raft_knn_index; + std::optional> raft_knn_index{std::nullopt}; }; } // namespace gpu diff --git a/faiss/gpu/raft/RmmGpuResources.hpp b/faiss/gpu/raft/RmmGpuResources.hpp new file mode 100644 index 0000000000..e3bc306729 --- /dev/null +++ b/faiss/gpu/raft/RmmGpuResources.hpp @@ -0,0 +1,636 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* +This code contains unnecessary code duplication. These could be deleted +once the relevant changes would be made on the FAISS side. Indeed most of +the logic in the below code is similar to FAISS's standard implementation +and should thus be inherited instead of duplicated. 
This FAISS's issue +once solved should allow the removal of the unnecessary duplicates +in this file : https://github.com/facebookresearch/faiss/issues/2097 +*/ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace faiss { +namespace gpu { + +namespace { + +// How many streams per device we allocate by default (for multi-streaming) +constexpr int kNumStreams = 2; + +// Use 256 MiB of pinned memory for async CPU <-> GPU copies by default +constexpr size_t kDefaultPinnedMemoryAllocation = (size_t)256 * 1024 * 1024; + +// Default temporary memory allocation for <= 4 GiB memory GPUs +constexpr size_t k4GiBTempMem = (size_t)512 * 1024 * 1024; + +// Default temporary memory allocation for <= 8 GiB memory GPUs +constexpr size_t k8GiBTempMem = (size_t)1024 * 1024 * 1024; + +// Maximum temporary memory allocation for all GPUs +constexpr size_t kMaxTempMem = (size_t)1536 * 1024 * 1024; + +std::string allocsToString(const std::unordered_map& map) +{ + // Produce a sorted list of all outstanding allocations by type + std::unordered_map> stats; + + for (auto& entry : map) { + auto& a = entry.second; + + auto it = stats.find(a.type); + if (it != stats.end()) { + stats[a.type].first++; + stats[a.type].second += a.size; + } else { + stats[a.type] = std::make_pair(1, a.size); + } + } + + std::stringstream ss; + for (auto& entry : stats) { + ss << "Alloc type " << allocTypeToString(entry.first) << ": " << entry.second.first + << " allocations, " << entry.second.second << " bytes\n"; + } + + return ss.str(); +} + +} // namespace + +/// RMM implementation of the GpuResources object that provides for a +/// temporary memory manager +class RmmGpuResourcesImpl : public GpuResources { + public: + RmmGpuResourcesImpl() + : pinnedMemAlloc_(nullptr), + pinnedMemAllocSize_(0), + // let the adjustment function determine the memory size for us by passing + // in a 
huge value that will then be adjusted + tempMemSize_(getDefaultTempMemForGPU(-1, std::numeric_limits::max())), + pinnedMemSize_(kDefaultPinnedMemoryAllocation), + allocLogging_(false), + cmr(new rmm::mr::cuda_memory_resource), + mmr(new rmm::mr::managed_memory_resource), + pmr(new rmm::mr::pinned_memory_resource){}; + + ~RmmGpuResourcesImpl() + { + // The temporary memory allocator has allocated memory through us, so clean + // that up before we finish fully de-initializing ourselves + tempMemory_.clear(); + + // Make sure all allocations have been freed + bool allocError = false; + + for (auto& entry : allocs_) { + auto& map = entry.second; + + if (!map.empty()) { + std::cerr << "RmmGpuResources destroyed with allocations outstanding:\n" + << "Device " << entry.first << " outstanding allocations:\n"; + std::cerr << allocsToString(map); + allocError = true; + } + } + + FAISS_ASSERT_MSG(!allocError, "GPU memory allocations not properly cleaned up"); + + for (auto& entry : defaultStreams_) { + DeviceScope scope(entry.first); + + // We created these streams, so are responsible for destroying them + CUDA_VERIFY(cudaStreamDestroy(entry.second)); + } + + for (auto& entry : alternateStreams_) { + DeviceScope scope(entry.first); + + for (auto stream : entry.second) { + CUDA_VERIFY(cudaStreamDestroy(stream)); + } + } + + for (auto& entry : asyncCopyStreams_) { + DeviceScope scope(entry.first); + + CUDA_VERIFY(cudaStreamDestroy(entry.second)); + } + + for (auto& entry : blasHandles_) { + DeviceScope scope(entry.first); + + auto blasStatus = cublasDestroy(entry.second); + FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS); + } + + if (pinnedMemAlloc_) { pmr->deallocate(pinnedMemAlloc_, pinnedMemAllocSize_); } + }; + + /// Disable allocation of temporary memory; all temporary memory + /// requests will call cudaMalloc / cudaFree at the point of use + void noTempMemory() { setTempMemory(0); }; + + /// Specify that we wish to use a certain fixed size of memory on + /// all 
devices as temporary memory. This is the upper bound for the GPU + /// memory that we will reserve. We will never go above 1.5 GiB on any GPU; + /// smaller GPUs (with <= 4 GiB or <= 8 GiB) will use less memory than that. + /// To avoid any temporary memory allocation, pass 0. + void setTempMemory(size_t size) + { + if (tempMemSize_ != size) { + // adjust based on general limits + tempMemSize_ = getDefaultTempMemForGPU(-1, size); + + // We need to re-initialize memory resources for all current devices that + // have been initialized. + // This should be safe to do, even if we are currently running work, because + // the cudaFree call that this implies will force-synchronize all GPUs with + // the CPU + for (auto& p : tempMemory_) { + int device = p.first; + // Free the existing memory first + p.second.reset(); + + // Allocate new + p.second = std::unique_ptr( + new StackDeviceMemory(this, + p.first, + // adjust for this specific device + getDefaultTempMemForGPU(device, tempMemSize_))); + } + } + }; + + /// Set amount of pinned memory to allocate, for async GPU <-> CPU + /// transfers + void setPinnedMemory(size_t size) + { + // Should not call this after devices have been initialized + FAISS_ASSERT(defaultStreams_.size() == 0); + FAISS_ASSERT(!pinnedMemAlloc_); + + pinnedMemSize_ = size; + }; + + /// Called to change the stream for work ordering. We do not own `stream`; + /// i.e., it will not be destroyed when the GpuResources object gets cleaned + /// up. + /// We are guaranteed that all Faiss GPU work is ordered with respect to + /// this stream upon exit from an index or other Faiss GPU call. + void setDefaultStream(int device, cudaStream_t stream) + { + if (isInitialized(device)) { + // A new series of calls may not be ordered with what was the previous + // stream, so if the stream being specified is different, then we need to + // ensure ordering between the two (new stream waits on old). 
+ auto it = userDefaultStreams_.find(device); + cudaStream_t prevStream = nullptr; + + if (it != userDefaultStreams_.end()) { + prevStream = it->second; + } else { + FAISS_ASSERT(defaultStreams_.count(device)); + prevStream = defaultStreams_[device]; + } + + if (prevStream != stream) { streamWait({stream}, {prevStream}); } + } + + userDefaultStreams_[device] = stream; + }; + + /// Revert the default stream to the original stream managed by this resources + /// object, in case someone called `setDefaultStream`. + void revertDefaultStream(int device) + { + if (isInitialized(device)) { + auto it = userDefaultStreams_.find(device); + + if (it != userDefaultStreams_.end()) { + // There was a user stream set that we need to synchronize against + cudaStream_t prevStream = userDefaultStreams_[device]; + + FAISS_ASSERT(defaultStreams_.count(device)); + cudaStream_t newStream = defaultStreams_[device]; + + streamWait({newStream}, {prevStream}); + } + } + + userDefaultStreams_.erase(device); + }; + + /// Returns the stream for the given device on which all Faiss GPU work is + /// ordered. + /// We are guaranteed that all Faiss GPU work is ordered with respect to + /// this stream upon exit from an index or other Faiss GPU call. 
+ cudaStream_t getDefaultStream(int device) + { + initializeForDevice(device); + + auto it = userDefaultStreams_.find(device); + if (it != userDefaultStreams_.end()) { + // There is a user override stream set + return it->second; + } + + // Otherwise, our base default stream + return defaultStreams_[device]; + }; + + /// Called to change the work ordering streams to the null stream + /// for all devices + void setDefaultNullStreamAllDevices() + { + for (int dev = 0; dev < getNumDevices(); ++dev) { + setDefaultStream(dev, nullptr); + } + }; + + /// If enabled, will print every GPU memory allocation and deallocation to + /// standard output + void setLogMemoryAllocations(bool enable) { allocLogging_ = enable; }; + + public: + /// Internal system calls + + /// Initialize resources for this device + void initializeForDevice(int device) + { + if (isInitialized(device)) { return; } + + // If this is the first device that we're initializing, create our + // pinned memory allocation + if (defaultStreams_.empty() && pinnedMemSize_ > 0) { + pinnedMemAlloc_ = pmr->allocate(pinnedMemSize_); + pinnedMemAllocSize_ = pinnedMemSize_; + } + + FAISS_ASSERT(device < getNumDevices()); + DeviceScope scope(device); + + // Make sure that device properties for all devices are cached + auto& prop = getDeviceProperties(device); + + // Also check to make sure we meet our minimum compute capability (3.0) + FAISS_ASSERT_FMT(prop.major >= 3, + "Device id %d with CC %d.%d not supported, " + "need 3.0+ compute capability", + device, + prop.major, + prop.minor); + + // Create streams + cudaStream_t defaultStream = 0; + CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream, cudaStreamNonBlocking)); + + defaultStreams_[device] = defaultStream; + + cudaStream_t asyncCopyStream = 0; + CUDA_VERIFY(cudaStreamCreateWithFlags(&asyncCopyStream, cudaStreamNonBlocking)); + + asyncCopyStreams_[device] = asyncCopyStream; + + std::vector deviceStreams; + for (int j = 0; j < kNumStreams; ++j) { + cudaStream_t 
stream = 0; + CUDA_VERIFY(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + deviceStreams.push_back(stream); + } + + alternateStreams_[device] = std::move(deviceStreams); + + // Create cuBLAS handle + cublasHandle_t blasHandle = 0; + auto blasStatus = cublasCreate(&blasHandle); + FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS); + blasHandles_[device] = blasHandle; + + // For CUDA 10 on V100, enabling tensor core usage would enable automatic + // rounding down of inputs to f16 (though accumulate in f32) which results in + // unacceptable loss of precision in general. + // For CUDA 11 / A100, only enable tensor core support if it doesn't result in + // a loss of precision. +#if CUDA_VERSION >= 11000 + cublasSetMathMode(blasHandle, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); +#endif + + FAISS_ASSERT(allocs_.count(device) == 0); + allocs_[device] = std::unordered_map(); + + FAISS_ASSERT(tempMemory_.count(device) == 0); + auto mem = std::unique_ptr( + new StackDeviceMemory(this, + device, + // adjust for this specific device + getDefaultTempMemForGPU(device, tempMemSize_))); + + tempMemory_.emplace(device, std::move(mem)); + }; + + cublasHandle_t getBlasHandle(int device) + { + initializeForDevice(device); + return blasHandles_[device]; + }; + + std::vector getAlternateStreams(int device) + { + initializeForDevice(device); + return alternateStreams_[device]; + }; + + /// Allocate non-temporary GPU memory + void* allocMemory(const AllocRequest& req) + { + initializeForDevice(req.device); + + // We don't allocate a placeholder for zero-sized allocations + if (req.size == 0) { return nullptr; } + + // Make sure that the allocation is a multiple of 16 bytes for alignment + // purposes + auto adjReq = req; + adjReq.size = utils::roundUp(adjReq.size, (size_t)16); + + void* p = nullptr; + + if (allocLogging_) { std::cout << "RmmGpuResources: alloc " << adjReq.toString() << "\n"; } + + if (adjReq.space == MemorySpace::Temporary) { + // If we don't have 
enough space in our temporary memory manager, we need + // to allocate this request separately + auto& tempMem = tempMemory_[adjReq.device]; + + if (adjReq.size > tempMem->getSizeAvailable()) { + // We need to allocate this ourselves + AllocRequest newReq = adjReq; + newReq.space = MemorySpace::Device; + newReq.type = AllocType::TemporaryMemoryOverflow; + + return allocMemory(newReq); + } + + // Otherwise, we can handle this locally + p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size); + + } else if (adjReq.space == MemorySpace::Device) { + p = cmr->allocate(adjReq.size, adjReq.stream); + } else if (adjReq.space == MemorySpace::Unified) { + p = mmr->allocate(adjReq.size, adjReq.stream); + } else { + FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)adjReq.space); + } + + allocs_[adjReq.device][p] = adjReq; + + return p; + }; + + /// Returns a previous allocation + void deallocMemory(int device, void* p) + { + FAISS_ASSERT(isInitialized(device)); + + if (!p) { return; } + + auto& a = allocs_[device]; + auto it = a.find(p); + FAISS_ASSERT(it != a.end()); + + auto& req = it->second; + + if (allocLogging_) { std::cout << "RmmGpuResources: dealloc " << req.toString() << "\n"; } + + if (req.space == MemorySpace::Temporary) { + tempMemory_[device]->deallocMemory(device, req.stream, req.size, p); + } else if (req.space == MemorySpace::Device) { + cmr->deallocate(p, req.size, req.stream); + } else if (req.space == MemorySpace::Unified) { + mmr->deallocate(p, req.size, req.stream); + } else { + FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)req.space); + } + + a.erase(it); + }; + + size_t getTempMemoryAvailable(int device) const + { + FAISS_ASSERT(isInitialized(device)); + + auto it = tempMemory_.find(device); + FAISS_ASSERT(it != tempMemory_.end()); + + return it->second->getSizeAvailable(); + }; + + /// Export a description of memory used for Python + std::map>> getMemoryInfo() const + { + using AT = std::map>; + + std::map out; + + for 
(auto& entry : allocs_) { + AT outDevice; + + for (auto& a : entry.second) { + auto& v = outDevice[allocTypeToString(a.second.type)]; + v.first++; + v.second += a.second.size; + } + + out[entry.first] = std::move(outDevice); + } + + return out; + }; + + std::pair getPinnedMemory() + { + return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_); + }; + + cudaStream_t getAsyncCopyStream(int device) + { + initializeForDevice(device); + return asyncCopyStreams_[device]; + }; + + private: + /// Have GPU resources been initialized for this device yet? + bool isInitialized(int device) const + { + // Use default streams as a marker for whether or not a certain + // device has been initialized + return defaultStreams_.count(device) != 0; + }; + + /// Adjust the default temporary memory allocation based on the total GPU + /// memory size + static size_t getDefaultTempMemForGPU(int device, size_t requested) + { + auto totalMem = device != -1 ? getDeviceProperties(device).totalGlobalMem + : std::numeric_limits::max(); + + if (totalMem <= (size_t)4 * 1024 * 1024 * 1024) { + // If the GPU has <= 4 GiB of memory, reserve 512 MiB + + if (requested > k4GiBTempMem) { return k4GiBTempMem; } + } else if (totalMem <= (size_t)8 * 1024 * 1024 * 1024) { + // If the GPU has <= 8 GiB of memory, reserve 1 GiB + + if (requested > k8GiBTempMem) { return k8GiBTempMem; } + } else { + // Never use more than 1.5 GiB + if (requested > kMaxTempMem) { return kMaxTempMem; } + } + + // use whatever lower limit the user requested + return requested; + }; + + private: + /// Set of currently outstanding memory allocations per device + /// device -> (alloc request, allocated ptr) + std::unordered_map> allocs_; + + /// Temporary memory provider, per each device + std::unordered_map> tempMemory_; + + /// Our default stream that work is ordered on, one per each device + std::unordered_map defaultStreams_; + + /// This contains particular streams as set by the user for + /// ordering, if any + 
std::unordered_map userDefaultStreams_; + + /// Other streams we can use, per each device + std::unordered_map> alternateStreams_; + + /// Async copy stream to use for GPU <-> CPU pinned memory copies + std::unordered_map asyncCopyStreams_; + + /// cuBLAS handle for each device + std::unordered_map blasHandles_; + + /// Pinned memory allocation for use with this GPU + void* pinnedMemAlloc_; + size_t pinnedMemAllocSize_; + + /// Another option is to use a specified amount of memory on all + /// devices + size_t tempMemSize_; + + /// Amount of pinned memory we should allocate + size_t pinnedMemSize_; + + /// Whether or not we log every GPU memory allocation and deallocation + bool allocLogging_; + + // cuda_memory_resource + std::unique_ptr cmr; + + // managed_memory_resource + std::unique_ptr mmr; + + // pinned_memory_resource + std::unique_ptr pmr; +}; + +/// Default implementation of GpuResources that allocates a cuBLAS +/// stream and 2 streams for use, as well as temporary memory. +/// Internally, the Faiss GPU code uses the instance managed by getResources, +/// but this is the user-facing object that is internally reference counted. +class RmmGpuResources : public GpuResourcesProvider { + public: + RmmGpuResources() : res_(new RmmGpuResourcesImpl){}; + + ~RmmGpuResources(){}; + + std::shared_ptr getResources() { return res_; }; + + /// Disable allocation of temporary memory; all temporary memory + /// requests will call cudaMalloc / cudaFree at the point of use + void noTempMemory() { res_->noTempMemory(); }; + + /// Specify that we wish to use a certain fixed size of memory on + /// all devices as temporary memory. This is the upper bound for the GPU + /// memory that we will reserve. We will never go above 1.5 GiB on any GPU; + /// smaller GPUs (with <= 4 GiB or <= 8 GiB) will use less memory than that. + /// To avoid any temporary memory allocation, pass 0. 
+ void setTempMemory(size_t size) { res_->setTempMemory(size); }; + + /// Set amount of pinned memory to allocate, for async GPU <-> CPU + /// transfers + void setPinnedMemory(size_t size) { res_->setPinnedMemory(size); }; + + /// Called to change the stream for work ordering. We do not own `stream`; + /// i.e., it will not be destroyed when the GpuResources object gets cleaned + /// up. + /// We are guaranteed that all Faiss GPU work is ordered with respect to + /// this stream upon exit from an index or other Faiss GPU call. + void setDefaultStream(int device, cudaStream_t stream) + { + res_->setDefaultStream(device, stream); + }; + + /// Revert the default stream to the original stream managed by this resources + /// object, in case someone called `setDefaultStream`. + void revertDefaultStream(int device) { res_->revertDefaultStream(device); }; + + /// Called to change the work ordering streams to the null stream + /// for all devices + void setDefaultNullStreamAllDevices() { res_->setDefaultNullStreamAllDevices(); }; + + /// Export a description of memory used for Python + std::map>> getMemoryInfo() const + { + return res_->getMemoryInfo(); + }; + + /// Returns the current default stream + cudaStream_t getDefaultStream(int device) { return res_->getDefaultStream(device); }; + + /// Returns the current amount of temp memory available + size_t getTempMemoryAvailable(int device) const { return res_->getTempMemoryAvailable(device); }; + + /// Synchronize our default stream with the CPU + void syncDefaultStreamCurrentDevice() { res_->syncDefaultStreamCurrentDevice(); }; + + /// If enabled, will print every GPU memory allocation and deallocation to + /// standard output + void setLogMemoryAllocations(bool enable) { res_->setLogMemoryAllocations(enable); }; + + private: + std::shared_ptr res_; +}; + +} // namespace gpu +} // namespace faiss \ No newline at end of file diff --git a/faiss/gpu/test/CMakeLists.txt b/faiss/gpu/test/CMakeLists.txt index 
def3ef3151..3eb454c95f 100644 --- a/faiss/gpu/test/CMakeLists.txt +++ b/faiss/gpu/test/CMakeLists.txt @@ -10,7 +10,7 @@ find_package(CUDAToolkit REQUIRED) include(GoogleTest) add_library(faiss_gpu_test_helper TestUtils.cpp) -target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest CUDA::cudart) +target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest CUDA::cudart $<$:raft::raft>) macro(faiss_gpu_test file) get_filename_component(test_name ${file} NAME_WE) @@ -29,6 +29,11 @@ faiss_gpu_test(TestGpuIndexIVFScalarQuantizer.cpp) faiss_gpu_test(TestGpuDistance.cu) faiss_gpu_test(TestGpuSelect.cu) + +if(FAISS_ENABLE_RAFT) + faiss_gpu_test(TestRaftIndexIVFFlat.cpp) +endif() + add_executable(demo_ivfpq_indexing_gpu EXCLUDE_FROM_ALL demo_ivfpq_indexing_gpu.cpp) diff --git a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp new file mode 100644 index 0000000000..1794e9da6d --- /dev/null +++ b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp @@ -0,0 +1,555 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// FIXME: figure out a better way to test fp16 +constexpr float kF16MaxRelErr = 0.3f; +constexpr float kF32MaxRelErr = 0.03f; + +struct Options { + Options() { + numAdd = 2 * faiss::gpu::randVal(2000, 5000); + dim = faiss::gpu::randVal(64, 200); + + numCentroids = std::sqrt((float)numAdd / 2); + numTrain = numCentroids * 40; + nprobe = faiss::gpu::randVal(std::min(10, numCentroids), numCentroids); + numQuery = faiss::gpu::randVal(32, 100); + + // Due to the approximate nature of the query and of floating point + // differences between GPU and CPU, to stay within our error bounds, + // only use a small k + k = std::min(faiss::gpu::randVal(10, 30), numAdd / 40); + indicesOpt = faiss::gpu::randSelect( + {faiss::gpu::INDICES_CPU, + faiss::gpu::INDICES_32_BIT, + faiss::gpu::INDICES_64_BIT}); + + device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + } + + std::string toString() const { + std::stringstream str; + str << "IVFFlat device " << device << " numVecs " << numAdd << " dim " + << dim << " numCentroids " << numCentroids << " nprobe " << nprobe + << " numQuery " << numQuery << " k " << k << " indicesOpt " + << indicesOpt; + + return str.str(); + } + + int numAdd; + int dim; + int numCentroids; + int numTrain; + int nprobe; + int numQuery; + int k; + int device; + faiss::gpu::IndicesOptions indicesOpt; +}; + +void queryTest( + faiss::MetricType metricType, + bool useFloat16CoarseQuantizer, + int dimOverride = -1) { + for (int tries = 0; tries < 2; ++tries) { + Options opt; + opt.dim = dimOverride != -1 ? dimOverride : opt.dim; + + std::vector trainVecs = + faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::IndexFlatL2 quantizerL2(opt.dim); + faiss::IndexFlatIP quantizerIP(opt.dim); + faiss::Index* quantizer = metricType == faiss::METRIC_L2 + ? 
(faiss::Index*)&quantizerL2 + : (faiss::Index*)&quantizerIP; + + faiss::IndexIVFFlat cpuIndex( + quantizer, opt.dim, opt.numCentroids, metricType); + cpuIndex.train(opt.numTrain, trainVecs.data()); + cpuIndex.add(opt.numAdd, addVecs.data()); + cpuIndex.nprobe = opt.nprobe; + + faiss::gpu::RmmGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = opt.device; + config.indicesOptions = opt.indicesOpt; + config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + + faiss::gpu::RaftIndexIVFFlat gpuIndex( + &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); + gpuIndex.copyFrom(&cpuIndex); + gpuIndex.setNumProbes(opt.nprobe); + + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + // FIXME: the fp16 bounds are + // useless when math (the accumulator) is + // in fp16. Figure out another way to test + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 0.65f : 0.015f); + } +} + +void addTest(faiss::MetricType metricType, bool useFloat16CoarseQuantizer) { + for (int tries = 0; tries < 2; ++tries) { + Options opt; + + std::vector trainVecs = + faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::IndexFlatL2 quantizerL2(opt.dim); + faiss::IndexFlatIP quantizerIP(opt.dim); + faiss::Index* quantizer = metricType == faiss::METRIC_L2 + ? 
(faiss::Index*)&quantizerL2 + : (faiss::Index*)&quantizerIP; + + faiss::IndexIVFFlat cpuIndex( + quantizer, opt.dim, opt.numCentroids, metricType); + cpuIndex.train(opt.numTrain, trainVecs.data()); + cpuIndex.nprobe = opt.nprobe; + + faiss::gpu::RmmGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = opt.device; + config.indicesOptions = opt.indicesOpt; + config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + + faiss::gpu::RaftIndexIVFFlat gpuIndex( + &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); + gpuIndex.copyFrom(&cpuIndex); + gpuIndex.setNumProbes(opt.nprobe); + + cpuIndex.add(opt.numAdd, addVecs.data()); + gpuIndex.add(opt.numAdd, addVecs.data()); + + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 0.30f : 0.015f); + } +} + +void copyToTest(bool useFloat16CoarseQuantizer) { + Options opt; + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::gpu::RmmGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = opt.device; + config.indicesOptions = opt.indicesOpt; + config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + + faiss::gpu::RaftIndexIVFFlat gpuIndex( + &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); + gpuIndex.train(opt.numTrain, trainVecs.data()); + gpuIndex.add(opt.numAdd, addVecs.data()); + gpuIndex.setNumProbes(opt.nprobe); + + // use garbage values to see if we overwrite then + faiss::IndexFlatL2 cpuQuantizer(1); + faiss::IndexIVFFlat cpuIndex(&cpuQuantizer, 1, 1, faiss::METRIC_L2); + cpuIndex.nprobe = 1; + + gpuIndex.copyTo(&cpuIndex); + + EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); + EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); 
+ + EXPECT_EQ(cpuIndex.d, gpuIndex.d); + EXPECT_EQ(cpuIndex.quantizer->d, gpuIndex.quantizer->d); + EXPECT_EQ(cpuIndex.d, opt.dim); + EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); + EXPECT_EQ(cpuIndex.nprobe, gpuIndex.getNumProbes()); + + testIVFEquality(cpuIndex, gpuIndex); + + // Query both objects; results should be equivalent + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 0.30f : 0.015f); +} + +void copyFromTest(bool useFloat16CoarseQuantizer) { + Options opt; + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::IndexFlatL2 cpuQuantizer(opt.dim); + faiss::IndexIVFFlat cpuIndex( + &cpuQuantizer, opt.dim, opt.numCentroids, faiss::METRIC_L2); + cpuIndex.nprobe = opt.nprobe; + cpuIndex.train(opt.numTrain, trainVecs.data()); + cpuIndex.add(opt.numAdd, addVecs.data()); + + // use garbage values to see if we overwrite then + faiss::gpu::RmmGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = opt.device; + config.indicesOptions = opt.indicesOpt; + config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + + faiss::gpu::RaftIndexIVFFlat gpuIndex(&res, 1, 1, faiss::METRIC_L2, config); + gpuIndex.setNumProbes(1); + + gpuIndex.copyFrom(&cpuIndex); + + EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); + EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); + + EXPECT_EQ(cpuIndex.d, gpuIndex.d); + EXPECT_EQ(cpuIndex.d, opt.dim); + EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); + EXPECT_EQ(cpuIndex.nprobe, gpuIndex.getNumProbes()); + + testIVFEquality(cpuIndex, gpuIndex); + + // Query both objects; results should be equivalent + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, 
+ opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 0.30f : 0.015f); +} + +//TEST(TestRaftIndexIVFFlat, Float32_32_Add_L2) { +//addTest(faiss::METRIC_L2, false); +//} +// +//TEST(TestRaftIndexIVFFlat, Float32_32_Add_IP) { +//addTest(faiss::METRIC_INNER_PRODUCT, false); +//} +// +//TEST(TestRaftIndexIVFFlat, Float16_32_Add_L2) { +//addTest(faiss::METRIC_L2, true); +//} +// +//TEST(TestRaftIndexIVFFlat, Float16_32_Add_IP) { +//addTest(faiss::METRIC_INNER_PRODUCT, true); +//} + +// +// General query tests +// + +TEST(TestRaftIndexIVFFlat, Float32_Query_L2) { +queryTest(faiss::METRIC_L2, false); +} + +TEST(TestRaftIndexIVFFlat, Float32_Query_IP) { +queryTest(faiss::METRIC_INNER_PRODUCT, false); +} + +// float16 coarse quantizer + +TEST(TestRaftIndexIVFFlat, Float16_32_Query_L2) { +queryTest(faiss::METRIC_L2, true); +} + +TEST(TestRaftIndexIVFFlat, Float16_32_Query_IP) { +queryTest(faiss::METRIC_INNER_PRODUCT, true); +} + +// +// There are IVF list scanning specializations for 64-d and 128-d that we +// make sure we explicitly test here +// + +TEST(TestRaftIndexIVFFlat, Float32_Query_L2_64) { +queryTest(faiss::METRIC_L2, false, 64); +} + +TEST(TestRaftIndexIVFFlat, Float32_Query_IP_64) { +queryTest(faiss::METRIC_INNER_PRODUCT, false, 64); +} + +TEST(TestRaftIndexIVFFlat, Float32_Query_L2_128) { +queryTest(faiss::METRIC_L2, false, 128); +} + +TEST(TestRaftIndexIVFFlat, Float32_Query_IP_128) { +queryTest(faiss::METRIC_INNER_PRODUCT, false, 128); +} + +// +// Copy tests +// + +TEST(TestRaftIndexIVFFlat, Float32_32_CopyTo) { +copyToTest(false); +} + +TEST(TestRaftIndexIVFFlat, Float32_32_CopyFrom) { +copyFromTest(false); +} + +TEST(TestRaftIndexIVFFlat, Float32_negative) { +Options opt; + +auto trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); +auto addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + +// Put all vecs on negative side +for (auto& f : trainVecs) { +f = std::abs(f) * 
-1.0f; +} + +for (auto& f : addVecs) { +f *= std::abs(f) * -1.0f; +} + +faiss::IndexFlatIP quantizerIP(opt.dim); +faiss::Index* quantizer = (faiss::Index*)&quantizerIP; + +faiss::IndexIVFFlat cpuIndex( + quantizer, opt.dim, opt.numCentroids, faiss::METRIC_INNER_PRODUCT); +cpuIndex.train(opt.numTrain, trainVecs.data()); +cpuIndex.add(opt.numAdd, addVecs.data()); +cpuIndex.nprobe = opt.nprobe; + +faiss::gpu::RmmGpuResources res; +res.noTempMemory(); + +faiss::gpu::GpuIndexIVFFlatConfig config; +config.device = opt.device; +config.indicesOptions = opt.indicesOpt; + +faiss::gpu::RaftIndexIVFFlat gpuIndex( + &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); +gpuIndex.copyFrom(&cpuIndex); +gpuIndex.setNumProbes(opt.nprobe); + +// Construct a positive test set +auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + +// Put all vecs on positive size +for (auto& f : queryVecs) { +f = std::abs(f); +} + +bool compFloat16 = false; +faiss::gpu::compareIndices( + queryVecs, + cpuIndex, + gpuIndex, + opt.numQuery, +opt.dim, +opt.k, +opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, +// FIXME: the fp16 bounds are +// useless when math (the accumulator) is +// in fp16. Figure out another way to test +compFloat16 ? 0.99f : 0.1f, +compFloat16 ? 
0.65f : 0.015f); +} + +// +// NaN tests +// + +TEST(TestRaftIndexIVFFlat, QueryNaN) { +Options opt; + +std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); +std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + +faiss::gpu::RmmGpuResources res; +res.noTempMemory(); + +faiss::gpu::GpuIndexIVFFlatConfig config; +config.device = opt.device; +config.indicesOptions = opt.indicesOpt; +config.flatConfig.useFloat16 = faiss::gpu::randBool(); + +faiss::gpu::RaftIndexIVFFlat gpuIndex( + &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); +gpuIndex.setNumProbes(opt.nprobe); + +gpuIndex.train(opt.numTrain, trainVecs.data()); +gpuIndex.add(opt.numAdd, addVecs.data()); + +int numQuery = 10; +std::vector nans( + numQuery * opt.dim, std::numeric_limits::quiet_NaN()); + +std::vector distances(numQuery * opt.k, 0); +std::vector indices(numQuery * opt.k, 0); + +gpuIndex.search( + numQuery, nans.data(), opt.k, distances.data(), indices.data()); + +for (int q = 0; q < numQuery; ++q) { +for (int k = 0; k < opt.k; ++k) { +EXPECT_EQ(indices[q * opt.k + k], -1); +EXPECT_EQ( + distances[q * opt.k + k], + std::numeric_limits::max()); +} +} +} + +TEST(TestRaftIndexIVFFlat, AddNaN) { +Options opt; + +faiss::gpu::RmmGpuResources res; +res.noTempMemory(); + +faiss::gpu::GpuIndexIVFFlatConfig config; +config.device = opt.device; +config.indicesOptions = opt.indicesOpt; +config.flatConfig.useFloat16 = faiss::gpu::randBool(); + +faiss::gpu::RaftIndexIVFFlat gpuIndex( + &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); +gpuIndex.setNumProbes(opt.nprobe); + +int numNans = 10; +std::vector nans( + numNans * opt.dim, std::numeric_limits::quiet_NaN()); + +// Make one vector valid (not the first vector, in order to test offset +// issues), which should actually add +for (int i = 0; i < opt.dim; ++i) { +nans[opt.dim + i] = i; +} + +std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); +gpuIndex.train(opt.numTrain, trainVecs.data()); + +// should 
not crash +EXPECT_EQ(gpuIndex.ntotal, 0); +gpuIndex.add(numNans, nans.data()); + +std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); +std::vector distance(opt.numQuery * opt.k, 0); +std::vector indices(opt.numQuery * opt.k, 0); + +// should not crash +gpuIndex.search( + opt.numQuery, +queryVecs.data(), + opt.k, +distance.data(), + indices.data()); +} + +TEST(TestRaftIndexIVFFlat, UnifiedMemory) { +// Construct on a random device to test multi-device, if we have +// multiple devices +int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + +if (!faiss::gpu::getFullUnifiedMemSupport(device)) { +return; +} + +int dim = 128; + +int numCentroids = 256; +// Unfortunately it would take forever to add 24 GB in IVFPQ data, +// so just perform a small test with data allocated in the unified +// memory address space +size_t numAdd = 10000; +size_t numTrain = numCentroids * 40; +int numQuery = 10; +int k = 10; +int nprobe = 8; + +std::vector trainVecs = faiss::gpu::randVecs(numTrain, dim); +std::vector addVecs = faiss::gpu::randVecs(numAdd, dim); + +faiss::IndexFlatL2 quantizer(dim); +faiss::IndexIVFFlat cpuIndex( + &quantizer, dim, numCentroids, faiss::METRIC_L2); + +cpuIndex.train(numTrain, trainVecs.data()); +cpuIndex.add(numAdd, addVecs.data()); +cpuIndex.nprobe = nprobe; + +faiss::gpu::RmmGpuResources res; +res.noTempMemory(); + +faiss::gpu::GpuIndexIVFFlatConfig config; +config.device = device; +config.memorySpace = faiss::gpu::MemorySpace::Unified; + +faiss::gpu::RaftIndexIVFFlat gpuIndex( + &res, dim, numCentroids, faiss::METRIC_L2, config); +gpuIndex.copyFrom(&cpuIndex); +gpuIndex.setNumProbes(nprobe); + +faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + numQuery, + dim, + k, +"Unified Memory", +kF32MaxRelErr, +0.1f, +0.015f); +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + + // just run with a fixed test seed + faiss::gpu::setTestSeed(100); + + return RUN_ALL_TESTS(); +} From 
884bfa5afbcdd38c0178d3f57b80e4996e12813f Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 1 Jul 2022 16:55:36 -0400 Subject: [PATCH 11/87] iUpdating function calls for copyFrom to include populating the quantizer and ivf lists --- faiss/gpu/raft/RaftIndexIVFFlat.cu | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index ffc0b3e2c4..0f6e9bcf99 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -97,7 +97,7 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { // TODO: Invoke corresponding call on the RAFT side to copy quantizer /** * For example: - * raft_knn_index.emplace(raft::spatial::knn::ivf_flat::make_ivf_flat_index( + * raft_knn_index.emplace(raft::spatial::knn::ivf_flat::make_index( * raft_handle, raft_idx_params, (faiss::Index::idx_t)d); */ } @@ -112,8 +112,13 @@ void RaftIndexIVFFlat::reserveMemory(size_t numVecs) { // TODO: We need to reserve memory on the raft::ivf_flat::index /** * For example: - * raft::spatial::knn::ivf_flat::ivf_flat_allocate_ivf_lists( + * raft::spatial::knn::ivf_flat::allocate_ivf_lists( * raft_handle, *raft_knn_index, numVecs); + * + * raft::spatial::knn::ivf_flat::populate( + * raft_handle, *raft_knn_index, + * n_centroids, centroids, + * n_vectors, ivf); */ } } @@ -124,7 +129,7 @@ size_t RaftIndexIVFFlat::reclaimMemory() { // TODO: We need to reclaim memory on the raft::ivf_flat::index /** * For example: - * raft::spatial::knn::ivf_flat::ivf_flat_reclaim_ivf_lists( + * raft::spatial::knn::ivf_flat::reclaim_ivf_lists( * raft_handle, *raft_knn_index, numVecs); */ return 0; @@ -153,10 +158,10 @@ void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { /** * For example: * - * raft_knn_index.emplace(raft::spatial::knn::ivf_flat::make_ivf_flat_index( + * raft_knn_index.emplace(raft::spatial::knn::ivf_flat::make_index( * raft_handle, raft_idx_params, 
(faiss::Index::idx_t)d); - * raft::spatial::knn::ivf_flat::ivf_flat_train_quantizer( + * raft::spatial::knn::ivf_flat::train_quantizer( * raft_handle, *raft_knn_index, const_cast(x), n); */ @@ -241,7 +246,7 @@ void RaftIndexIVFFlat::addImpl_( // TODO: Invoke corresponding call in raft::ivf_flat /** * For example: - * raft::spatial::knn::ivf_flat::ivf_flat_add_vectors( + * raft::spatial::knn::ivf_flat::add_vectors( * raft_handle, *raft_knn_index, n, x, xids); */ From 0958d2e47b5c42e8af79ae379eb57b2a68f7fbdf Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 4 Jul 2022 17:29:09 +0200 Subject: [PATCH 12/87] Implement some helpers --- faiss/gpu/raft/RaftIndexIVFFlat.cu | 33 +++++++++++------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index 0f6e9bcf99..5011109491 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -178,13 +178,7 @@ int RaftIndexIVFFlat::getListLength(int listId) const { FAISS_ASSERT(raft_knn_index.has_value()); DeviceScope scope(config_.device); - // TODO: Call function in RAFT to do this. 
- /** - * For example: - * raft::spatial::knn::ivf_flat::get_list_length( - * raft_handle, *raft_knn_index, listId); - */ - return 0; + return int(raft_knn_index->list_sizes[listId]); } std::vector RaftIndexIVFFlat::getListVectorData( @@ -193,13 +187,13 @@ std::vector RaftIndexIVFFlat::getListVectorData( FAISS_ASSERT(raft_knn_index.has_value()); DeviceScope scope(config_.device); - // TODO: Invoke corresponding call in raft::ivf_flat - /** - * For example: - * raft::spatial::knn::ivf_flat::get_list_vector_data( - * raft_handle, *raft_knn_index, listId, gpuFormat); - */ - std::vector vec; + using elem_t = decltype(raft_knn_index->data)::element_type; + size_t dim = raft_knn_index->dim(); + size_t byte_offset = size_t(raft_knn_index->list_offsets[listId]) * sizeof(elem_t) * dim; + // the interleaved block can be slightly larger than the list size (it's rounded up) + size_t byte_size = size_t(raft_knn_index->list_offsets[listId + 1]) * sizeof(elem_t) * dim - byte_offset; + std::vector vec(byte_size); + raft::copy(vec.data(), reinterpret_cast(raft_knn_index->data.data()) + byte_offset, byte_size); return vec; } @@ -212,13 +206,10 @@ std::vector RaftIndexIVFFlat::getListIndices(int listId) const { FAISS_ASSERT(raft_knn_index.has_value()); DeviceScope scope(config_.device); - // TODO: Need to invoke corresponding call in raft::ivf_flat - /** - * For example: - * raft::spatial::knn::ivf_flat::get_list_indices( - * raft_handle, *raft_knn_index, listId); - */ - std::vector vec; + size_t offset = raft_knn_index->list_offsets[listId]; + size_t size = raft_knn_index->list_sizes[listId]; + std::vector vec(size); + raft::copy(vec.data(), raft_knn_index->indices.data() + offset, size); return vec; } From b7144a99cccb18f3bf883270c9c48d04d86c25c9 Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 5 Jul 2022 15:00:45 +0200 Subject: [PATCH 13/87] Make it compile --- CMakeLists.txt | 2 +- build.sh | 41 ++++++++++++ faiss/gpu/CMakeLists.txt | 4 +- faiss/gpu/raft/RaftIndexIVFFlat.cu | 
104 ++++++++++++++++------------- 4 files changed, 103 insertions(+), 48 deletions(-) create mode 100755 build.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index 71a05ab7dc..750cba414e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,7 +25,7 @@ project(faiss LANGUAGES CXX) include(GNUInstallDirs) -set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD 17) list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") diff --git a/build.sh b/build.sh new file mode 100755 index 0000000000..7ff0577e29 --- /dev/null +++ b/build.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +BUILD_TYPE=Debug + +RAFT_REPO_REL="../raft" +RAFT_REPO_PATH="`readlink -f \"${RAFT_REPO_REL}\"`" + +set -e + +if [ "$1" == "clean" ]; then + rm -rf build + exit 0 +fi + +if [ "$1" == "test" ]; then + make -C build -j test + exit 0 +fi + +if [ "$1" == "test-raft" ]; then + ./build/faiss/gpu/test/TestRaftIndexIVFFlat + exit 0 +fi + +cmake \ + -DFAISS_ENABLE_GPU=ON \ + -DFAISS_ENABLE_PYTHON=OFF \ + -DBUILD_TESTING=ON \ + -DBUILD_SHARED_LIBS=OFF \ + -DFAISS_ENABLE_RAFT=ON \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DCPM_raft_SOURCE="${RAFT_REPO_PATH}" \ + -DFAISS_OPT_LEVEL=avx2 \ + -DCMAKE_CUDA_ARCHITECTURES="86" \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ + -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -B build . 
+ +make -C build -j diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt index 3ed26dca01..f157e6e7ec 100644 --- a/faiss/gpu/CMakeLists.txt +++ b/faiss/gpu/CMakeLists.txt @@ -182,6 +182,6 @@ endforeach() find_package(CUDAToolkit REQUIRED) target_link_libraries(faiss PRIVATE CUDA::cudart CUDA::cublas $<$:raft::raft>) -target_link_libraries(faiss_avx2 PRIVATE CUDA::cudart CUDA::cublas) +target_link_libraries(faiss_avx2 PRIVATE CUDA::cudart CUDA::cublas $<$:raft::raft>) target_compile_options(faiss PRIVATE $<$:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr>) -target_compile_options(faiss_avx2 PRIVATE $<$:-Xfatbin=-compress-all>) +target_compile_options(faiss_avx2 PRIVATE $<$:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr>) diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index 5011109491..d2114ff004 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -9,8 +9,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -26,11 +26,8 @@ RaftIndexIVFFlat::RaftIndexIVFFlat( GpuResourcesProvider* provider, const faiss::IndexIVFFlat* index, GpuIndexIVFFlatConfig config) - : GpuIndexIVFFlat( - provider, - index, - config), raft_handle(resources_->getDefaultStream(config_.device)) { - + : GpuIndexIVFFlat(provider, index, config), + raft_handle(resources_->getDefaultStream(config_.device)) { copyFrom(index); } @@ -42,17 +39,16 @@ RaftIndexIVFFlat::RaftIndexIVFFlat( GpuIndexIVFFlatConfig config) : GpuIndexIVFFlat(provider, dims, nlist, metric, config), raft_handle(resources_->getDefaultStream(config_.device)) { - this->is_trained = false; } RaftIndexIVFFlat::~RaftIndexIVFFlat() {} void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { - printf("Copying from...\n"); - // TODO: Need to copy necessary memory from the index and set any needed params. 
+ // TODO: Need to copy necessary memory from the index and set any needed + // params. DeviceScope scope(config_.device); GpuIndex::copyFrom(index); @@ -71,11 +67,7 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { index->nprobe); nprobe = index->nprobe; - config.device = config_.device; - - FAISS_ASSERT(metric_type != faiss::METRIC_L2 && - metric_type != faiss::METRIC_INNER_PRODUCT); - + // config.device = config_.device; if (!index->is_trained) { // copied in GpuIndex::copyFrom FAISS_ASSERT(!is_trained && ntotal == 0); @@ -92,7 +84,17 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { raft::spatial::knn::ivf_flat::index_params raft_idx_params; raft_idx_params.n_lists = nlist; - raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; + + switch (metric_type) { + case faiss::METRIC_L2: + raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; + break; + case faiss::METRIC_INNER_PRODUCT: + raft_idx_params.metric = raft::distance::DistanceType::InnerProduct; + break; + default: + FAISS_THROW_MSG("Metric is not supported."); + } // TODO: Invoke corresponding call on the RAFT side to copy quantizer /** @@ -103,7 +105,6 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { } void RaftIndexIVFFlat::reserveMemory(size_t numVecs) { - std::cout << "Reserving memory for " << numVecs << " vectors." 
<< std::endl; reserveMemoryVecs_ = numVecs; if (raft_knn_index.has_value()) { @@ -153,7 +154,6 @@ void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { raft_idx_params.n_lists = nlist; raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; - // TODO: This should only train the quantizer portion of the index /** * For example: @@ -165,11 +165,12 @@ void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { * raft_handle, *raft_knn_index, const_cast(x), n); */ - raft_knn_index.emplace( - raft::spatial::knn::ivf_flat::build(raft_handle, raft_idx_params, - const_cast(x), - n, (faiss::Index::idx_t)d, - raft_handle.get_stream())); + raft_knn_index.emplace(raft::spatial::knn::ivf_flat::build( + raft_handle, + raft_idx_params, + const_cast(x), + n, + (faiss::Index::idx_t)d)); raft_handle.sync_stream(); } @@ -178,7 +179,7 @@ int RaftIndexIVFFlat::getListLength(int listId) const { FAISS_ASSERT(raft_knn_index.has_value()); DeviceScope scope(config_.device); - return int(raft_knn_index->list_sizes[listId]); + return int(raft_knn_index->list_sizes(listId)); } std::vector RaftIndexIVFFlat::getListVectorData( @@ -189,11 +190,20 @@ std::vector RaftIndexIVFFlat::getListVectorData( using elem_t = decltype(raft_knn_index->data)::element_type; size_t dim = raft_knn_index->dim(); - size_t byte_offset = size_t(raft_knn_index->list_offsets[listId]) * sizeof(elem_t) * dim; - // the interleaved block can be slightly larger than the list size (it's rounded up) - size_t byte_size = size_t(raft_knn_index->list_offsets[listId + 1]) * sizeof(elem_t) * dim - byte_offset; + size_t byte_offset = + size_t(raft_knn_index->list_offsets(listId)) * sizeof(elem_t) * dim; + // the interleaved block can be slightly larger than the list size (it's + // rounded up) + size_t byte_size = size_t(raft_knn_index->list_offsets(listId + 1)) * + sizeof(elem_t) * dim - + byte_offset; std::vector vec(byte_size); - raft::copy(vec.data(), reinterpret_cast(raft_knn_index->data.data()) + 
byte_offset, byte_size); + raft::copy( + vec.data(), + reinterpret_cast(raft_knn_index->data.data()) + + byte_offset, + byte_size, + raft_handle.get_stream()); return vec; } @@ -206,10 +216,14 @@ std::vector RaftIndexIVFFlat::getListIndices(int listId) const { FAISS_ASSERT(raft_knn_index.has_value()); DeviceScope scope(config_.device); - size_t offset = raft_knn_index->list_offsets[listId]; - size_t size = raft_knn_index->list_sizes[listId]; + size_t offset = raft_knn_index->list_offsets(listId); + size_t size = raft_knn_index->list_sizes(listId); std::vector vec(size); - raft::copy(vec.data(), raft_knn_index->indices.data() + offset, size); + raft::copy( + vec.data(), + raft_knn_index->indices.data() + offset, + size, + raft_handle.get_stream()); return vec; } @@ -221,15 +235,15 @@ void RaftIndexIVFFlat::addImpl_( FAISS_ASSERT(raft_knn_index.has_value()); FAISS_ASSERT(n > 0); - // Data is already resident on the GPU + // Data is already resident on the GPU Tensor data(const_cast(x), {n, (int)this->d}); Tensor labels(const_cast(xids), {n}); -// // Not all vectors may be able to be added (some may contain NaNs etc) -// index_->addVectors(data, labels); -// -// // but keep the ntotal based on the total number of vectors that we -// // attempted to add + // // Not all vectors may be able to be added (some may contain NaNs etc) + // index_->addVectors(data, labels); + // + // // but keep the ntotal based on the total number of vectors that we + // // attempted to add ntotal += n; std::cout << "Calling addImpl_ with " << n << " vectors." 
<< std::endl; @@ -240,7 +254,6 @@ void RaftIndexIVFFlat::addImpl_( * raft::spatial::knn::ivf_flat::add_vectors( * raft_handle, *raft_knn_index, n, x, xids); */ - } void RaftIndexIVFFlat::searchImpl_( @@ -264,14 +277,15 @@ void RaftIndexIVFFlat::searchImpl_( raft::spatial::knn::ivf_flat::search_params raft_idx_params; raft_idx_params.n_probes = nprobe; - raft::spatial::knn::ivf_flat::search(raft_handle, - raft_idx_params, - *raft_knn_index, - const_cast(x), - static_cast(n), - static_cast(k), - static_cast(labels), - distances, raft_handle.get_stream()); + raft::spatial::knn::ivf_flat::search( + raft_handle, + raft_idx_params, + *raft_knn_index, + const_cast(x), + static_cast(n), + static_cast(k), + static_cast(labels), + distances); raft_handle.sync_stream(); } From 38733bb42c2c1cceb2da63a3671093c13e1d466d Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 6 Jul 2022 12:11:05 +0200 Subject: [PATCH 14/87] Make the tests to not crash... sometimes --- faiss/gpu/raft/RaftIndexIVFFlat.cu | 171 ++++------ faiss/gpu/raft/RaftIndexIVFFlat.h | 2 + faiss/gpu/test/TestRaftIndexIVFFlat.cpp | 432 ++++++++++++------------ 3 files changed, 296 insertions(+), 309 deletions(-) diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index d2114ff004..01c5fc028b 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -38,70 +38,49 @@ RaftIndexIVFFlat::RaftIndexIVFFlat( faiss::MetricType metric, GpuIndexIVFFlatConfig config) : GpuIndexIVFFlat(provider, dims, nlist, metric, config), - raft_handle(resources_->getDefaultStream(config_.device)) { - this->is_trained = false; -} + raft_handle(resources_->getDefaultStream(config_.device)) {} -RaftIndexIVFFlat::~RaftIndexIVFFlat() {} +RaftIndexIVFFlat::~RaftIndexIVFFlat() { + RaftIndexIVFFlat::reset(); +} void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { - printf("Copying from...\n"); - - // TODO: Need to copy necessary memory from the index and set any needed 
- // params. DeviceScope scope(config_.device); - GpuIndex::copyFrom(index); - FAISS_ASSERT(index->nlist > 0); FAISS_THROW_IF_NOT_FMT( index->nlist <= (Index::idx_t)std::numeric_limits::max(), "GPU index only supports %zu inverted lists", (size_t)std::numeric_limits::max()); - nlist = index->nlist; - FAISS_THROW_IF_NOT_FMT( index->nprobe > 0 && index->nprobe <= getMaxKSelection(), "GPU index only supports nprobe <= %zu; passed %zu", (size_t)getMaxKSelection(), index->nprobe); - nprobe = index->nprobe; - // config.device = config_.device; - if (!index->is_trained) { - // copied in GpuIndex::copyFrom - FAISS_ASSERT(!is_trained && ntotal == 0); - return; + if (index->is_trained && index->ntotal > 0) { + // TODO: A proper copy of the index without retraining + // For now, just get all the data from the index, and train our index + // anew. + auto stream = raft_handle.get_stream(); + auto total_elems = size_t(index->ntotal) * size_t(index->d); + rmm::device_uvector buf_dev(total_elems, stream); + { + std::vector buf_host(total_elems); + index->reconstruct_n(0, index->ntotal, buf_host.data()); + raft::copy(buf_dev.data(), buf_host.data(), total_elems, stream); + } + FAISS_ASSERT(index->d == this->d); + FAISS_ASSERT(index->metric_arg == this->metric_arg); + FAISS_ASSERT(index->metric_type == this->metric_type); + FAISS_ASSERT(index->nlist == this->nlist); + RaftIndexIVFFlat::rebuildRaftIndex(buf_dev.data(), index->ntotal); + } else { + // index is not trained, so we can remove ours as well (if there was + // any) + raft_knn_index.reset(); } - - // copied in GpuIndex::copyFrom - // ntotal can exceed max int, but the number of vectors per inverted - // list cannot exceed this. We check this in the subclasses. 
- FAISS_ASSERT(is_trained && (ntotal == index->ntotal)); - - // Since we're trained, the quantizer must have data - FAISS_ASSERT(index->quantizer->ntotal > 0); - - raft::spatial::knn::ivf_flat::index_params raft_idx_params; - raft_idx_params.n_lists = nlist; - - switch (metric_type) { - case faiss::METRIC_L2: - raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; - break; - case faiss::METRIC_INNER_PRODUCT: - raft_idx_params.metric = raft::distance::DistanceType::InnerProduct; - break; - default: - FAISS_THROW_MSG("Metric is not supported."); - } - - // TODO: Invoke corresponding call on the RAFT side to copy quantizer - /** - * For example: - * raft_knn_index.emplace(raft::spatial::knn::ivf_flat::make_index( - * raft_handle, raft_idx_params, (faiss::Index::idx_t)d); - */ + this->is_trained = index->is_trained; } void RaftIndexIVFFlat::reserveMemory(size_t numVecs) { @@ -137,23 +116,8 @@ size_t RaftIndexIVFFlat::reclaimMemory() { } void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { - // For now, only support <= max int results - FAISS_THROW_IF_NOT_FMT( - n <= (Index::idx_t)std::numeric_limits::max(), - "GPU index only supports up to %d indices", - std::numeric_limits::max()); - DeviceScope scope(config_.device); - if (this->is_trained) { - FAISS_ASSERT(raft_knn_index.has_value()); - return; - } - - raft::spatial::knn::ivf_flat::index_params raft_idx_params; - raft_idx_params.n_lists = nlist; - raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; - // TODO: This should only train the quantizer portion of the index /** * For example: @@ -163,16 +127,11 @@ void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { * raft::spatial::knn::ivf_flat::train_quantizer( * raft_handle, *raft_knn_index, const_cast(x), n); + * + * NB: ivf_flat does not have a quantizer. Training here imply kmeans? 
*/ - raft_knn_index.emplace(raft::spatial::knn::ivf_flat::build( - raft_handle, - raft_idx_params, - const_cast(x), - n, - (faiss::Index::idx_t)d)); - - raft_handle.sync_stream(); + RaftIndexIVFFlat::rebuildRaftIndex(x, n); } int RaftIndexIVFFlat::getListLength(int listId) const { @@ -208,8 +167,8 @@ std::vector RaftIndexIVFFlat::getListVectorData( } void RaftIndexIVFFlat::reset() { - std::cout << "Calling reset()" << std::endl; raft_knn_index.reset(); + this->ntotal = 0; } std::vector RaftIndexIVFFlat::getListIndices(int listId) const { @@ -232,28 +191,20 @@ void RaftIndexIVFFlat::addImpl_( const float* x, const Index::idx_t* xids) { // Device is already set in GpuIndex::add - FAISS_ASSERT(raft_knn_index.has_value()); + FAISS_ASSERT(is_trained); FAISS_ASSERT(n > 0); + /* TODO: + At the moment, raft does not support adding vectors, and does not support + providing indices with the vectors even in training - // Data is already resident on the GPU - Tensor data(const_cast(x), {n, (int)this->d}); - Tensor labels(const_cast(xids), {n}); - - // // Not all vectors may be able to be added (some may contain NaNs etc) - // index_->addVectors(data, labels); - // - // // but keep the ntotal based on the total number of vectors that we - // // attempted to add - ntotal += n; - - std::cout << "Calling addImpl_ with " << n << " vectors." 
<< std::endl; - - // TODO: Invoke corresponding call in raft::ivf_flat - /** - * For example: - * raft::spatial::knn::ivf_flat::add_vectors( - * raft_handle, *raft_knn_index, n, x, xids); + For now, just do the training anew */ + raft_knn_index.reset(); + + // Not all vectors may be able to be added (some may contain NaNs etc) + // but keep the ntotal based on the total number of vectors that we + // attempted to add index_->addVectors(data, labels); + RaftIndexIVFFlat::rebuildRaftIndex(x, n); } void RaftIndexIVFFlat::searchImpl_( @@ -267,28 +218,44 @@ void RaftIndexIVFFlat::searchImpl_( FAISS_ASSERT(n > 0); FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= nlist); - // Data is already resident on the GPU - Tensor queries(const_cast(x), {n, (int)this->d}); - Tensor outDistances(distances, {n, k}); - Tensor outLabels( - const_cast(labels), {n, k}); - - // TODO: Populate the rest of the params properly. - raft::spatial::knn::ivf_flat::search_params raft_idx_params; - raft_idx_params.n_probes = nprobe; - + raft::spatial::knn::ivf_flat::search_params pams; + pams.n_probes = nprobe; raft::spatial::knn::ivf_flat::search( raft_handle, - raft_idx_params, + pams, *raft_knn_index, const_cast(x), static_cast(n), static_cast(k), - static_cast(labels), + labels, distances); raft_handle.sync_stream(); } +void RaftIndexIVFFlat::rebuildRaftIndex(const float* x, Index::idx_t n_rows) { + raft::spatial::knn::ivf_flat::index_params pams; + + pams.n_lists = this->nlist; + switch (this->metric_type) { + case faiss::METRIC_L2: + pams.metric = raft::distance::DistanceType::L2Expanded; + break; + case faiss::METRIC_INNER_PRODUCT: + pams.metric = raft::distance::DistanceType::InnerProduct; + break; + default: + FAISS_THROW_MSG("Metric is not supported."); + } + pams.metric_arg = this->metric_arg; + pams.kmeans_trainset_fraction = 1.0; + + raft_knn_index.emplace(raft::spatial::knn::ivf_flat::build( + this->raft_handle, pams, x, n_rows, uint32_t(this->d))); + this->raft_handle.sync_stream(); + 
this->is_trained = true; + this->ntotal = n_rows; +} + } // namespace gpu } // namespace faiss diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.h b/faiss/gpu/raft/RaftIndexIVFFlat.h index 4960fa3ae1..cd97f426df 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.h +++ b/faiss/gpu/raft/RaftIndexIVFFlat.h @@ -92,6 +92,8 @@ class RaftIndexIVFFlat : public GpuIndexIVFFlat { float* distances, Index::idx_t* labels) const override; + void rebuildRaftIndex(const float* x, Index::idx_t n_rows); + const raft::handle_t raft_handle; std::optional> raft_knn_index{std::nullopt}; }; diff --git a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp index 1794e9da6d..9df27b2f3d 100644 --- a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp @@ -78,8 +78,8 @@ void queryTest( faiss::IndexFlatL2 quantizerL2(opt.dim); faiss::IndexFlatIP quantizerIP(opt.dim); faiss::Index* quantizer = metricType == faiss::METRIC_L2 - ? (faiss::Index*)&quantizerL2 - : (faiss::Index*)&quantizerIP; + ? (faiss::Index*)&quantizerL2 + : (faiss::Index*)&quantizerIP; faiss::IndexIVFFlat cpuIndex( quantizer, opt.dim, opt.numCentroids, metricType); @@ -128,8 +128,8 @@ void addTest(faiss::MetricType metricType, bool useFloat16CoarseQuantizer) { faiss::IndexFlatL2 quantizerL2(opt.dim); faiss::IndexFlatIP quantizerIP(opt.dim); faiss::Index* quantizer = metricType == faiss::METRIC_L2 - ? (faiss::Index*)&quantizerL2 - : (faiss::Index*)&quantizerIP; + ? (faiss::Index*)&quantizerL2 + : (faiss::Index*)&quantizerIP; faiss::IndexIVFFlat cpuIndex( quantizer, opt.dim, opt.numCentroids, metricType); @@ -267,42 +267,50 @@ void copyFromTest(bool useFloat16CoarseQuantizer) { compFloat16 ? 
0.30f : 0.015f); } -//TEST(TestRaftIndexIVFFlat, Float32_32_Add_L2) { -//addTest(faiss::METRIC_L2, false); -//} -// -//TEST(TestRaftIndexIVFFlat, Float32_32_Add_IP) { -//addTest(faiss::METRIC_INNER_PRODUCT, false); -//} -// -//TEST(TestRaftIndexIVFFlat, Float16_32_Add_L2) { -//addTest(faiss::METRIC_L2, true); -//} -// -//TEST(TestRaftIndexIVFFlat, Float16_32_Add_IP) { -//addTest(faiss::METRIC_INNER_PRODUCT, true); -//} +TEST(TestRaftIndexIVFFlat, Float32_32_Add_L2) { + addTest(faiss::METRIC_L2, false); + printf("Finished addTest(faiss::METRIC_L2, false)\n"); +} + +TEST(TestRaftIndexIVFFlat, Float32_32_Add_IP) { + addTest(faiss::METRIC_INNER_PRODUCT, false); + printf("Finished addTest(faiss::METRIC_INNER_PRODUCT, false)\n"); +} + +TEST(TestRaftIndexIVFFlat, Float16_32_Add_L2) { + addTest(faiss::METRIC_L2, true); + printf("Finished addTest(faiss::METRIC_L2, true)\n"); +} + +TEST(TestRaftIndexIVFFlat, Float16_32_Add_IP) { + addTest(faiss::METRIC_INNER_PRODUCT, true); + printf("Finished addTest(faiss::METRIC_INNER_PRODUCT, true)\n"); +} // // General query tests // TEST(TestRaftIndexIVFFlat, Float32_Query_L2) { -queryTest(faiss::METRIC_L2, false); + queryTest(faiss::METRIC_L2, false); + printf("Finished queryTest(faiss::METRIC_L2, false);\n"); } TEST(TestRaftIndexIVFFlat, Float32_Query_IP) { -queryTest(faiss::METRIC_INNER_PRODUCT, false); + queryTest(faiss::METRIC_INNER_PRODUCT, false); + printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false)\n"); } // float16 coarse quantizer TEST(TestRaftIndexIVFFlat, Float16_32_Query_L2) { -queryTest(faiss::METRIC_L2, true); + queryTest(faiss::METRIC_L2, true); + printf("Finished queryTest(faiss::METRIC_L2, true)\n"); } TEST(TestRaftIndexIVFFlat, Float16_32_Query_IP) { -queryTest(faiss::METRIC_INNER_PRODUCT, true); + queryTest(faiss::METRIC_INNER_PRODUCT, true); + printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, true)\n"); } // @@ -311,238 +319,248 @@ queryTest(faiss::METRIC_INNER_PRODUCT, true); // 
TEST(TestRaftIndexIVFFlat, Float32_Query_L2_64) { -queryTest(faiss::METRIC_L2, false, 64); + queryTest(faiss::METRIC_L2, false, 64); + printf("Finished queryTest(faiss::METRIC_L2, false, 64)\n"); } TEST(TestRaftIndexIVFFlat, Float32_Query_IP_64) { -queryTest(faiss::METRIC_INNER_PRODUCT, false, 64); + queryTest(faiss::METRIC_INNER_PRODUCT, false, 64); + printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false, 64)\n"); } TEST(TestRaftIndexIVFFlat, Float32_Query_L2_128) { -queryTest(faiss::METRIC_L2, false, 128); + queryTest(faiss::METRIC_L2, false, 128); + printf("Finished queryTest(faiss::METRIC_L2, false, 128)\n"); } TEST(TestRaftIndexIVFFlat, Float32_Query_IP_128) { -queryTest(faiss::METRIC_INNER_PRODUCT, false, 128); + queryTest(faiss::METRIC_INNER_PRODUCT, false, 128); + printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false, 128)\n"); } // // Copy tests // -TEST(TestRaftIndexIVFFlat, Float32_32_CopyTo) { -copyToTest(false); -} +/** TODO: test crashes */ +// TEST(TestRaftIndexIVFFlat, Float32_32_CopyTo) { +// copyToTest(false); +// printf("Finished copyToTest(false)\n"); +// } TEST(TestRaftIndexIVFFlat, Float32_32_CopyFrom) { -copyFromTest(false); + copyFromTest(false); + printf("Finished copyFromTest(false)\n"); } TEST(TestRaftIndexIVFFlat, Float32_negative) { -Options opt; + Options opt; -auto trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); -auto addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + auto trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + auto addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); -// Put all vecs on negative side -for (auto& f : trainVecs) { -f = std::abs(f) * -1.0f; -} + // Put all vecs on negative side + for (auto& f : trainVecs) { + f = std::abs(f) * -1.0f; + } -for (auto& f : addVecs) { -f *= std::abs(f) * -1.0f; -} + for (auto& f : addVecs) { + f *= std::abs(f) * -1.0f; + } -faiss::IndexFlatIP quantizerIP(opt.dim); -faiss::Index* quantizer = (faiss::Index*)&quantizerIP; + faiss::IndexFlatIP 
quantizerIP(opt.dim); + faiss::Index* quantizer = (faiss::Index*)&quantizerIP; -faiss::IndexIVFFlat cpuIndex( - quantizer, opt.dim, opt.numCentroids, faiss::METRIC_INNER_PRODUCT); -cpuIndex.train(opt.numTrain, trainVecs.data()); -cpuIndex.add(opt.numAdd, addVecs.data()); -cpuIndex.nprobe = opt.nprobe; + faiss::IndexIVFFlat cpuIndex( + quantizer, opt.dim, opt.numCentroids, faiss::METRIC_INNER_PRODUCT); + cpuIndex.train(opt.numTrain, trainVecs.data()); + cpuIndex.add(opt.numAdd, addVecs.data()); + cpuIndex.nprobe = opt.nprobe; -faiss::gpu::RmmGpuResources res; -res.noTempMemory(); + faiss::gpu::RmmGpuResources res; + res.noTempMemory(); -faiss::gpu::GpuIndexIVFFlatConfig config; -config.device = opt.device; -config.indicesOptions = opt.indicesOpt; + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = opt.device; + config.indicesOptions = opt.indicesOpt; -faiss::gpu::RaftIndexIVFFlat gpuIndex( - &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); -gpuIndex.copyFrom(&cpuIndex); -gpuIndex.setNumProbes(opt.nprobe); + faiss::gpu::RaftIndexIVFFlat gpuIndex( + &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); + gpuIndex.copyFrom(&cpuIndex); + gpuIndex.setNumProbes(opt.nprobe); -// Construct a positive test set -auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + // Construct a positive test set + auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); -// Put all vecs on positive size -for (auto& f : queryVecs) { -f = std::abs(f); -} + // Put all vecs on positive size + for (auto& f : queryVecs) { + f = std::abs(f); + } -bool compFloat16 = false; -faiss::gpu::compareIndices( - queryVecs, - cpuIndex, - gpuIndex, - opt.numQuery, -opt.dim, -opt.k, -opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, -// FIXME: the fp16 bounds are -// useless when math (the accumulator) is -// in fp16. Figure out another way to test -compFloat16 ? 0.99f : 0.1f, -compFloat16 ? 
0.65f : 0.015f); + bool compFloat16 = false; + faiss::gpu::compareIndices( + queryVecs, + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + // FIXME: the fp16 bounds are + // useless when math (the accumulator) is + // in fp16. Figure out another way to test + compFloat16 ? 0.99f : 0.1f, + compFloat16 ? 0.65f : 0.015f); } // // NaN tests // -TEST(TestRaftIndexIVFFlat, QueryNaN) { -Options opt; - -std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); -std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - -faiss::gpu::RmmGpuResources res; -res.noTempMemory(); - -faiss::gpu::GpuIndexIVFFlatConfig config; -config.device = opt.device; -config.indicesOptions = opt.indicesOpt; -config.flatConfig.useFloat16 = faiss::gpu::randBool(); - -faiss::gpu::RaftIndexIVFFlat gpuIndex( - &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); -gpuIndex.setNumProbes(opt.nprobe); - -gpuIndex.train(opt.numTrain, trainVecs.data()); -gpuIndex.add(opt.numAdd, addVecs.data()); - -int numQuery = 10; -std::vector nans( - numQuery * opt.dim, std::numeric_limits::quiet_NaN()); - -std::vector distances(numQuery * opt.k, 0); -std::vector indices(numQuery * opt.k, 0); - -gpuIndex.search( - numQuery, nans.data(), opt.k, distances.data(), indices.data()); - -for (int q = 0; q < numQuery; ++q) { -for (int k = 0; k < opt.k; ++k) { -EXPECT_EQ(indices[q * opt.k + k], -1); -EXPECT_EQ( - distances[q * opt.k + k], - std::numeric_limits::max()); -} -} -} - -TEST(TestRaftIndexIVFFlat, AddNaN) { -Options opt; +/** TODO: test crashes */ +// TEST(TestRaftIndexIVFFlat, QueryNaN) { +// Options opt; + +// std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, +// opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, +// opt.dim); + +// faiss::gpu::RmmGpuResources res; +// res.noTempMemory(); + +// faiss::gpu::GpuIndexIVFFlatConfig config; +// config.device = opt.device; +// config.indicesOptions 
= opt.indicesOpt; +// config.flatConfig.useFloat16 = faiss::gpu::randBool(); + +// faiss::gpu::RaftIndexIVFFlat gpuIndex( +// &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); +// gpuIndex.setNumProbes(opt.nprobe); + +// gpuIndex.train(opt.numTrain, trainVecs.data()); +// gpuIndex.add(opt.numAdd, addVecs.data()); + +// int numQuery = 10; +// std::vector nans( +// numQuery * opt.dim, std::numeric_limits::quiet_NaN()); + +// std::vector distances(numQuery * opt.k, 0); +// std::vector indices(numQuery * opt.k, 0); + +// gpuIndex.search( +// numQuery, nans.data(), opt.k, distances.data(), indices.data()); + +// for (int q = 0; q < numQuery; ++q) { +// for (int k = 0; k < opt.k; ++k) { +// EXPECT_EQ(indices[q * opt.k + k], -1); +// EXPECT_EQ( +// distances[q * opt.k + k], +// std::numeric_limits::max()); +// } +// } +// } + +/** TODO: test crashes */ +// TEST(TestRaftIndexIVFFlat, AddNaN) { +// Options opt; + +// faiss::gpu::RmmGpuResources res; +// res.noTempMemory(); + +// faiss::gpu::GpuIndexIVFFlatConfig config; +// config.device = opt.device; +// config.indicesOptions = opt.indicesOpt; +// config.flatConfig.useFloat16 = faiss::gpu::randBool(); + +// faiss::gpu::RaftIndexIVFFlat gpuIndex( +// &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); +// gpuIndex.setNumProbes(opt.nprobe); + +// int numNans = 10; +// std::vector nans( +// numNans * opt.dim, std::numeric_limits::quiet_NaN()); + +// // Make one vector valid (not the first vector, in order to test offset +// // issues), which should actually add +// for (int i = 0; i < opt.dim; ++i) { +// nans[opt.dim + i] = i; +// } + +// std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, +// opt.dim); gpuIndex.train(opt.numTrain, trainVecs.data()); + +// // should not crash +// EXPECT_EQ(gpuIndex.ntotal, 0); +// gpuIndex.add(numNans, nans.data()); + +// std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, +// opt.dim); std::vector distance(opt.numQuery * opt.k, 0); +// std::vector 
indices(opt.numQuery * opt.k, 0); + +// // should not crash +// gpuIndex.search( +// opt.numQuery, +// queryVecs.data(), +// opt.k, +// distance.data(), +// indices.data()); +// } -faiss::gpu::RmmGpuResources res; -res.noTempMemory(); - -faiss::gpu::GpuIndexIVFFlatConfig config; -config.device = opt.device; -config.indicesOptions = opt.indicesOpt; -config.flatConfig.useFloat16 = faiss::gpu::randBool(); +TEST(TestRaftIndexIVFFlat, UnifiedMemory) { + // Construct on a random device to test multi-device, if we have + // multiple devices + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); -faiss::gpu::RaftIndexIVFFlat gpuIndex( - &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); -gpuIndex.setNumProbes(opt.nprobe); + if (!faiss::gpu::getFullUnifiedMemSupport(device)) { + return; + } -int numNans = 10; -std::vector nans( - numNans * opt.dim, std::numeric_limits::quiet_NaN()); + int dim = 128; -// Make one vector valid (not the first vector, in order to test offset -// issues), which should actually add -for (int i = 0; i < opt.dim; ++i) { -nans[opt.dim + i] = i; -} + int numCentroids = 256; + // Unfortunately it would take forever to add 24 GB in IVFPQ data, + // so just perform a small test with data allocated in the unified + // memory address space + size_t numAdd = 10000; + size_t numTrain = numCentroids * 40; + int numQuery = 10; + int k = 10; + int nprobe = 8; -std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); -gpuIndex.train(opt.numTrain, trainVecs.data()); + std::vector trainVecs = faiss::gpu::randVecs(numTrain, dim); + std::vector addVecs = faiss::gpu::randVecs(numAdd, dim); -// should not crash -EXPECT_EQ(gpuIndex.ntotal, 0); -gpuIndex.add(numNans, nans.data()); + faiss::IndexFlatL2 quantizer(dim); + faiss::IndexIVFFlat cpuIndex( + &quantizer, dim, numCentroids, faiss::METRIC_L2); -std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); -std::vector distance(opt.numQuery * opt.k, 0); -std::vector 
indices(opt.numQuery * opt.k, 0); + cpuIndex.train(numTrain, trainVecs.data()); + cpuIndex.add(numAdd, addVecs.data()); + cpuIndex.nprobe = nprobe; -// should not crash -gpuIndex.search( - opt.numQuery, -queryVecs.data(), - opt.k, -distance.data(), - indices.data()); -} + faiss::gpu::RmmGpuResources res; + res.noTempMemory(); -TEST(TestRaftIndexIVFFlat, UnifiedMemory) { -// Construct on a random device to test multi-device, if we have -// multiple devices -int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = device; + config.memorySpace = faiss::gpu::MemorySpace::Unified; -if (!faiss::gpu::getFullUnifiedMemSupport(device)) { -return; -} + faiss::gpu::RaftIndexIVFFlat gpuIndex( + &res, dim, numCentroids, faiss::METRIC_L2, config); + gpuIndex.copyFrom(&cpuIndex); + gpuIndex.setNumProbes(nprobe); -int dim = 128; - -int numCentroids = 256; -// Unfortunately it would take forever to add 24 GB in IVFPQ data, -// so just perform a small test with data allocated in the unified -// memory address space -size_t numAdd = 10000; -size_t numTrain = numCentroids * 40; -int numQuery = 10; -int k = 10; -int nprobe = 8; - -std::vector trainVecs = faiss::gpu::randVecs(numTrain, dim); -std::vector addVecs = faiss::gpu::randVecs(numAdd, dim); - -faiss::IndexFlatL2 quantizer(dim); -faiss::IndexIVFFlat cpuIndex( - &quantizer, dim, numCentroids, faiss::METRIC_L2); - -cpuIndex.train(numTrain, trainVecs.data()); -cpuIndex.add(numAdd, addVecs.data()); -cpuIndex.nprobe = nprobe; - -faiss::gpu::RmmGpuResources res; -res.noTempMemory(); - -faiss::gpu::GpuIndexIVFFlatConfig config; -config.device = device; -config.memorySpace = faiss::gpu::MemorySpace::Unified; - -faiss::gpu::RaftIndexIVFFlat gpuIndex( - &res, dim, numCentroids, faiss::METRIC_L2, config); -gpuIndex.copyFrom(&cpuIndex); -gpuIndex.setNumProbes(nprobe); - -faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - numQuery, - dim, - k, -"Unified Memory", 
-kF32MaxRelErr, -0.1f, -0.015f); + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + numQuery, + dim, + k, + "Unified Memory", + kF32MaxRelErr, + 0.1f, + 0.015f); } int main(int argc, char** argv) { From 173c45960d25baf56a87ee797655aca42c0aebda Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 2 Aug 2022 18:43:18 -0400 Subject: [PATCH 15/87] Updates --- cmake/thirdparty/get_raft.cmake | 6 +-- faiss/gpu/GpuDistance.cu | 79 ++++++++++++++++------------ faiss/gpu/raft/RaftIndexIVFFlat.cu | 82 +++++++++++++++++++----------- 3 files changed, 101 insertions(+), 66 deletions(-) diff --git a/cmake/thirdparty/get_raft.cmake b/cmake/thirdparty/get_raft.cmake index 3fc2d9ae34..782b3d71dc 100644 --- a/cmake/thirdparty/get_raft.cmake +++ b/cmake/thirdparty/get_raft.cmake @@ -15,9 +15,9 @@ #============================================================================= -set(RAFT_VERSION "22.04") +set(RAFT_VERSION "22.08") set(RAFT_FORK "achirkin") -set(RAFT_PINNED_TAG "fea-knn-ivf-flat") +set(RAFT_PINNED_TAG "enh-knn-ivf-flat-hide-impl") function(find_and_configure_raft) set(oneValueArgs VERSION FORK PINNED_TAG) @@ -48,4 +48,4 @@ endfunction() find_and_configure_raft(VERSION ${RAFT_VERSION}.00 FORK ${RAFT_FORK} PINNED_TAG ${RAFT_PINNED_TAG} - ) \ No newline at end of file + ) diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu index 6e480a239d..f671b9e5ec 100644 --- a/faiss/gpu/GpuDistance.cu +++ b/faiss/gpu/GpuDistance.cu @@ -102,22 +102,31 @@ void bfKnnConvert(GpuResourcesProvider* prov, const GpuDistanceParams& args) { // Since we've guaranteed that all arguments are on device, call the // implementation - bfKnnOnDevice( - res, - device, - stream, - tVectors, - args.vectorsRowMajor, - args.vectorNorms ? 
&tVectorNorms : nullptr, - tQueries, - args.queriesRowMajor, - args.k, - args.metric, - args.metricArg, - tOutDistances, - tOutIntIndices, - args.ignoreOutDistances); +#if defined FAISS_ENABLE_RAFT + // TODO: When k <= 64, invoke bfknn from RAFT + if (args.k <= 64) { + + } else +#endif + + { + bfKnnOnDevice( + res, + device, + stream, + tVectors, + args.vectorsRowMajor, + args.vectorNorms ? &tVectorNorms : nullptr, + tQueries, + args.queriesRowMajor, + args.k, + args.metric, + args.metricArg, + tOutDistances, + tOutIntIndices, + args.ignoreOutDistances); + } // Convert and copy int indices out auto tOutIndices = toDeviceTemporary( res, @@ -146,23 +155,29 @@ void bfKnnConvert(GpuResourcesProvider* prov, const GpuDistanceParams& args) { stream, {args.numQueries, args.k}); - // Since we've guaranteed that all arguments are on device, call the - // implementation - bfKnnOnDevice( - res, - device, - stream, - tVectors, - args.vectorsRowMajor, - args.vectorNorms ? &tVectorNorms : nullptr, - tQueries, - args.queriesRowMajor, - args.k, - args.metric, - args.metricArg, - tOutDistances, - tOutIntIndices, - args.ignoreOutDistances); +#if defined FAISS_ENABLE_RAFT + if (args.k <= 64) { + } else +#endif + { + // Since we've guaranteed that all arguments are on device, call the + // implementation + bfKnnOnDevice( + res, + device, + stream, + tVectors, + args.vectorsRowMajor, + args.vectorNorms ? 
&tVectorNorms : nullptr, + tQueries, + args.queriesRowMajor, + args.k, + args.metric, + args.metricArg, + tOutDistances, + tOutIntIndices, + args.ignoreOutDistances); + } // Copy back if necessary fromDevice(tOutIntIndices, (int*)args.outIndices, stream); diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index 0f6e9bcf99..4ccef7bf67 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -71,8 +71,6 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { index->nprobe); nprobe = index->nprobe; - config.device = config_.device; - FAISS_ASSERT(metric_type != faiss::METRIC_L2 && metric_type != faiss::METRIC_INNER_PRODUCT); @@ -90,16 +88,51 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { // Since we're trained, the quantizer must have data FAISS_ASSERT(index->quantizer->ntotal > 0); + +// // Copy our lists as well +// index_.reset(new IVFFlat( +// resources_.get(), +// quantizer->getGpuData(), // FlatIndex instance- contains the vectors in index +// index->metric_type, +// index->metric_arg, +// false, // no residual +// nullptr, // no scalar quantizer +// ivfFlatConfig_.interleavedLayout, +// ivfFlatConfig_.indicesOptions, +// config_.memorySpace)); +// +// // Copy all of the IVF data +// index_->copyInvertedListsFrom(index->invlists); // xcopy + + raft::spatial::knn::ivf_flat::index_params raft_idx_params; raft_idx_params.n_lists = nlist; raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; - // TODO: Invoke corresponding call on the RAFT side to copy quantizer + raft_knn_index.emplace(raft_handle, raft_idx_params, (uint32_t)d); + /** - * For example: - * raft_knn_index.emplace(raft::spatial::knn::ivf_flat::make_index( - * raft_handle, raft_idx_params, (faiss::Index::idx_t)d); + * TODO: Copy centers and center norms from quantizer + * Things to do: + * 1. Copy index_->quantizer->vectors_ to raft_index->centers + * 2. 
Copy index_->quantizer->norms_ to raft_index->center_norms + */ + + raft::copy(raft_knn_index.value().centers(), + + + /** + * TODO: Copy IVF data, indices, list_sizes, list_offsets from index->invlists + * + * Things to do: + * 1. index->ivflists->data() is going to need to be translated over to our format + * (even the interleaved format is a little different) + * + * The GpuIndexIVFFlat has a function translateCodesToGpu_() for this + * + * 2. We will need to copy list_sizes, indices, and list_offsets */ + } void RaftIndexIVFFlat::reserveMemory(size_t numVecs) { @@ -109,7 +142,8 @@ void RaftIndexIVFFlat::reserveMemory(size_t numVecs) { if (raft_knn_index.has_value()) { DeviceScope scope(config_.device); - // TODO: We need to reserve memory on the raft::ivf_flat::index + // TODO: Need to figure out if this is absolutely necessary. + /** * For example: * raft::spatial::knn::ivf_flat::allocate_ivf_lists( @@ -119,6 +153,7 @@ void RaftIndexIVFFlat::reserveMemory(size_t numVecs) { * raft_handle, *raft_knn_index, * n_centroids, centroids, * n_vectors, ivf); + * */ } } @@ -126,7 +161,7 @@ void RaftIndexIVFFlat::reserveMemory(size_t numVecs) { size_t RaftIndexIVFFlat::reclaimMemory() { std::cout << "Reclaiming memory" << std::endl; - // TODO: We need to reclaim memory on the raft::ivf_flat::index + // TODO: Need to figure out if this is absolutely necessary /** * For example: * raft::spatial::knn::ivf_flat::reclaim_ivf_lists( @@ -153,23 +188,10 @@ void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { raft_idx_params.n_lists = nlist; raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; - - // TODO: This should only train the quantizer portion of the index - /** - * For example: - * - * raft_knn_index.emplace(raft::spatial::knn::ivf_flat::make_index( - * raft_handle, raft_idx_params, (faiss::Index::idx_t)d); - - * raft::spatial::knn::ivf_flat::train_quantizer( - * raft_handle, *raft_knn_index, const_cast(x), n); - */ - raft_knn_index.emplace( 
raft::spatial::knn::ivf_flat::build(raft_handle, raft_idx_params, const_cast(x), - n, (faiss::Index::idx_t)d, - raft_handle.get_stream())); + n, (faiss::Index::idx_t)d)); raft_handle.sync_stream(); } @@ -218,6 +240,7 @@ std::vector RaftIndexIVFFlat::getListIndices(int listId) const { * raft::spatial::knn::ivf_flat::get_list_indices( * raft_handle, *raft_knn_index, listId); */ + Index::idx_t start_offset, stop_offset; std::vector vec; return vec; } @@ -230,26 +253,22 @@ void RaftIndexIVFFlat::addImpl_( FAISS_ASSERT(raft_knn_index.has_value()); FAISS_ASSERT(n > 0); - // Data is already resident on the GPU - Tensor data(const_cast(x), {n, (int)this->d}); - Tensor labels(const_cast(xids), {n}); - // // Not all vectors may be able to be added (some may contain NaNs etc) // index_->addVectors(data, labels); // // // but keep the ntotal based on the total number of vectors that we // // attempted to add - ntotal += n; std::cout << "Calling addImpl_ with " << n << " vectors." << std::endl; - // TODO: Invoke corresponding call in raft::ivf_flat /** * For example: * raft::spatial::knn::ivf_flat::add_vectors( * raft_handle, *raft_knn_index, n, x, xids); */ - + raft_knn_index.emplace(raft::spatial::knn::ivf_flat::extend( + raft_handle, raft_knn_index.value(), x, xids, (Index::idx_t)n)); + this->ntotal += n; } void RaftIndexIVFFlat::searchImpl_( @@ -273,14 +292,15 @@ void RaftIndexIVFFlat::searchImpl_( raft::spatial::knn::ivf_flat::search_params raft_idx_params; raft_idx_params.n_probes = nprobe; - raft::spatial::knn::ivf_flat::search(raft_handle, + raft::spatial::knn::ivf_flat::search( + raft_handle, raft_idx_params, *raft_knn_index, const_cast(x), static_cast(n), static_cast(k), static_cast(labels), - distances, raft_handle.get_stream()); + distances); raft_handle.sync_stream(); } From 548e0f0cd251b92e12304d9d62164284b01bba22 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 2 Aug 2022 19:45:44 -0400 Subject: [PATCH 16/87] More updates --- faiss/gpu/raft/RaftIndexIVFFlat.cu | 36 +++++------------------------- 1 file changed, 6 insertions(+), 30 deletions(-) diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index 71262d2e19..421b3910bb 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -76,28 +76,6 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { // // Since we're trained, the quantizer must have data // FAISS_ASSERT(index->quantizer->ntotal > 0); // -// -//// // Copy our lists as well -//// index_.reset(new IVFFlat( -//// resources_.get(), -//// quantizer->getGpuData(), // FlatIndex instance- contains the vectors in index -//// index->metric_type, -//// index->metric_arg, -//// false, // no residual -//// nullptr, // no scalar quantizer -//// ivfFlatConfig_.interleavedLayout, -//// ivfFlatConfig_.indicesOptions, -//// config_.memorySpace)); -//// -//// // Copy all of the IVF data -//// index_->copyInvertedListsFrom(index->invlists); // xcopy -// -// -// raft::spatial::knn::ivf_flat::index_params raft_idx_params; -// raft_idx_params.n_lists = nlist; -// raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; -// -// raft_knn_index.emplace(raft_handle, raft_idx_params, (uint32_t)d); // /** * TODO: Copy centers and center norms from quantizer @@ -105,10 +83,6 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { * 1. Copy index_->quantizer->vectors_ to raft_index->centers * 2. 
Copy index_->quantizer->norms_ to raft_index->center_norms */ -// -// raft::copy(raft_knn_index.value().centers(), -// -// /** * TODO: Copy IVF data, indices, list_sizes, list_offsets from index->invlists * @@ -130,6 +104,8 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { FAISS_ASSERT(index->nlist == this->nlist); Index::idx_t quantizer_ntotal = index->quantizer->ntotal; + Index::idx_t index_ntotal = index->ntotal; + std::cout << "Calling copyFrom with trained index with " << quantizer_ntotal << " items" << std::endl; auto stream = raft_handle.get_stream(); @@ -143,9 +119,9 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { RaftIndexIVFFlat::rebuildRaftIndex(buf_dev.data(), quantizer_ntotal); - if(index->ntotal > 0) { - std::cout << "Adding " << index->ntotal << " vectors to index" << std::endl; - total_elems = size_t(index->ntotal) * size_t(index->d); + if(index_ntotal > 0) { + std::cout << "Adding " << index_ntotal << " vectors to index" << std::endl; + total_elems = size_t(index_ntotal) * size_t(index->d); buf_dev.resize(total_elems, stream); { std::vector buf_host(total_elems); @@ -153,7 +129,7 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { raft::copy(buf_dev.data(), buf_host.data(), total_elems, stream); } - RaftIndexIVFFlat::addImpl_(index->ntotal, buf_dev.data(), nullptr); + RaftIndexIVFFlat::addImpl_(index_ntotal, buf_dev.data(), nullptr); } } else { // index is not trained, so we can remove ours as well (if there was From baa34d7c30b61f183b0beca24823147818c76add Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 2 Aug 2022 20:01:38 -0400 Subject: [PATCH 17/87] One test running so far. 
--- faiss/gpu/raft/RaftIndexIVFFlat.cu | 1 + faiss/gpu/test/TestRaftIndexIVFFlat.cpp | 288 ++++++++++++------------ 2 files changed, 150 insertions(+), 139 deletions(-) diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index 421b3910bb..bde0c7ef1e 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -193,6 +193,7 @@ void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { n, (faiss::Index::idx_t)d)); raft_handle.sync_stream(); + this->is_trained = true; } int RaftIndexIVFFlat::getListLength(int listId) const { diff --git a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp index 9df27b2f3d..1a560201ad 100644 --- a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp @@ -97,7 +97,17 @@ void queryTest( faiss::gpu::RaftIndexIVFFlat gpuIndex( &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); - gpuIndex.copyFrom(&cpuIndex); +// gpuIndex.copyFrom(&cpuIndex); + + raft::handle_t raft_handle; + rmm::device_uvector trainVecsDev(trainVecs.size(), raft_handle.get_stream()); + raft::copy(trainVecsDev.data(), trainVecs.data(), trainVecs.size(), raft_handle.get_stream()); + + rmm::device_uvector addVecsDev(addVecs.size(), raft_handle.get_stream()); + raft::copy(addVecsDev.data(), addVecs.data(), addVecs.size(), raft_handle.get_stream()); + + gpuIndex.train(opt.numTrain, trainVecsDev.data()); + gpuIndex.add(opt.numAdd, addVecsDev.data()); gpuIndex.setNumProbes(opt.nprobe); bool compFloat16 = useFloat16CoarseQuantizer; @@ -267,25 +277,25 @@ void copyFromTest(bool useFloat16CoarseQuantizer) { compFloat16 ? 
0.30f : 0.015f); } -TEST(TestRaftIndexIVFFlat, Float32_32_Add_L2) { - addTest(faiss::METRIC_L2, false); - printf("Finished addTest(faiss::METRIC_L2, false)\n"); -} - -TEST(TestRaftIndexIVFFlat, Float32_32_Add_IP) { - addTest(faiss::METRIC_INNER_PRODUCT, false); - printf("Finished addTest(faiss::METRIC_INNER_PRODUCT, false)\n"); -} - -TEST(TestRaftIndexIVFFlat, Float16_32_Add_L2) { - addTest(faiss::METRIC_L2, true); - printf("Finished addTest(faiss::METRIC_L2, true)\n"); -} - -TEST(TestRaftIndexIVFFlat, Float16_32_Add_IP) { - addTest(faiss::METRIC_INNER_PRODUCT, true); - printf("Finished addTest(faiss::METRIC_INNER_PRODUCT, true)\n"); -} +//TEST(TestRaftIndexIVFFlat, Float32_32_Add_L2) { +// addTest(faiss::METRIC_L2, false); +// printf("Finished addTest(faiss::METRIC_L2, false)\n"); +//} +// +//TEST(TestRaftIndexIVFFlat, Float32_32_Add_IP) { +// addTest(faiss::METRIC_INNER_PRODUCT, false); +// printf("Finished addTest(faiss::METRIC_INNER_PRODUCT, false)\n"); +//} +// +//TEST(TestRaftIndexIVFFlat, Float16_32_Add_L2) { +// addTest(faiss::METRIC_L2, true); +// printf("Finished addTest(faiss::METRIC_L2, true)\n"); +//} +// +//TEST(TestRaftIndexIVFFlat, Float16_32_Add_IP) { +// addTest(faiss::METRIC_INNER_PRODUCT, true); +// printf("Finished addTest(faiss::METRIC_INNER_PRODUCT, true)\n"); +//} // // General query tests @@ -348,71 +358,71 @@ TEST(TestRaftIndexIVFFlat, Float32_Query_IP_128) { // printf("Finished copyToTest(false)\n"); // } -TEST(TestRaftIndexIVFFlat, Float32_32_CopyFrom) { - copyFromTest(false); - printf("Finished copyFromTest(false)\n"); -} - -TEST(TestRaftIndexIVFFlat, Float32_negative) { - Options opt; - - auto trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); - auto addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); +//TEST(TestRaftIndexIVFFlat, Float32_32_CopyFrom) { +// copyFromTest(false); +// printf("Finished copyFromTest(false)\n"); +//} - // Put all vecs on negative side - for (auto& f : trainVecs) { - f = std::abs(f) * -1.0f; - } - - 
for (auto& f : addVecs) { - f *= std::abs(f) * -1.0f; - } - - faiss::IndexFlatIP quantizerIP(opt.dim); - faiss::Index* quantizer = (faiss::Index*)&quantizerIP; - - faiss::IndexIVFFlat cpuIndex( - quantizer, opt.dim, opt.numCentroids, faiss::METRIC_INNER_PRODUCT); - cpuIndex.train(opt.numTrain, trainVecs.data()); - cpuIndex.add(opt.numAdd, addVecs.data()); - cpuIndex.nprobe = opt.nprobe; - - faiss::gpu::RmmGpuResources res; - res.noTempMemory(); - - faiss::gpu::GpuIndexIVFFlatConfig config; - config.device = opt.device; - config.indicesOptions = opt.indicesOpt; - - faiss::gpu::RaftIndexIVFFlat gpuIndex( - &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); - gpuIndex.copyFrom(&cpuIndex); - gpuIndex.setNumProbes(opt.nprobe); - - // Construct a positive test set - auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); - - // Put all vecs on positive size - for (auto& f : queryVecs) { - f = std::abs(f); - } - - bool compFloat16 = false; - faiss::gpu::compareIndices( - queryVecs, - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - // FIXME: the fp16 bounds are - // useless when math (the accumulator) is - // in fp16. Figure out another way to test - compFloat16 ? 0.99f : 0.1f, - compFloat16 ? 
0.65f : 0.015f); -} +//TEST(TestRaftIndexIVFFlat, Float32_negative) { +// Options opt; +// +// auto trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); +// auto addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); +// +// // Put all vecs on negative side +// for (auto& f : trainVecs) { +// f = std::abs(f) * -1.0f; +// } +// +// for (auto& f : addVecs) { +// f *= std::abs(f) * -1.0f; +// } +// +// faiss::IndexFlatIP quantizerIP(opt.dim); +// faiss::Index* quantizer = (faiss::Index*)&quantizerIP; +// +// faiss::IndexIVFFlat cpuIndex( +// quantizer, opt.dim, opt.numCentroids, faiss::METRIC_INNER_PRODUCT); +// cpuIndex.train(opt.numTrain, trainVecs.data()); +// cpuIndex.add(opt.numAdd, addVecs.data()); +// cpuIndex.nprobe = opt.nprobe; +// +// faiss::gpu::RmmGpuResources res; +// res.noTempMemory(); +// +// faiss::gpu::GpuIndexIVFFlatConfig config; +// config.device = opt.device; +// config.indicesOptions = opt.indicesOpt; +// +// faiss::gpu::RaftIndexIVFFlat gpuIndex( +// &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); +// gpuIndex.copyFrom(&cpuIndex); +// gpuIndex.setNumProbes(opt.nprobe); +// +// // Construct a positive test set +// auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); +// +// // Put all vecs on positive size +// for (auto& f : queryVecs) { +// f = std::abs(f); +// } +// +// bool compFloat16 = false; +// faiss::gpu::compareIndices( +// queryVecs, +// cpuIndex, +// gpuIndex, +// opt.numQuery, +// opt.dim, +// opt.k, +// opt.toString(), +// compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, +// // FIXME: the fp16 bounds are +// // useless when math (the accumulator) is +// // in fp16. Figure out another way to test +// compFloat16 ? 0.99f : 0.1f, +// compFloat16 ? 
0.65f : 0.015f); +//} // // NaN tests @@ -507,61 +517,61 @@ TEST(TestRaftIndexIVFFlat, Float32_negative) { // indices.data()); // } -TEST(TestRaftIndexIVFFlat, UnifiedMemory) { - // Construct on a random device to test multi-device, if we have - // multiple devices - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - - if (!faiss::gpu::getFullUnifiedMemSupport(device)) { - return; - } - - int dim = 128; - - int numCentroids = 256; - // Unfortunately it would take forever to add 24 GB in IVFPQ data, - // so just perform a small test with data allocated in the unified - // memory address space - size_t numAdd = 10000; - size_t numTrain = numCentroids * 40; - int numQuery = 10; - int k = 10; - int nprobe = 8; - - std::vector trainVecs = faiss::gpu::randVecs(numTrain, dim); - std::vector addVecs = faiss::gpu::randVecs(numAdd, dim); - - faiss::IndexFlatL2 quantizer(dim); - faiss::IndexIVFFlat cpuIndex( - &quantizer, dim, numCentroids, faiss::METRIC_L2); - - cpuIndex.train(numTrain, trainVecs.data()); - cpuIndex.add(numAdd, addVecs.data()); - cpuIndex.nprobe = nprobe; - - faiss::gpu::RmmGpuResources res; - res.noTempMemory(); - - faiss::gpu::GpuIndexIVFFlatConfig config; - config.device = device; - config.memorySpace = faiss::gpu::MemorySpace::Unified; - - faiss::gpu::RaftIndexIVFFlat gpuIndex( - &res, dim, numCentroids, faiss::METRIC_L2, config); - gpuIndex.copyFrom(&cpuIndex); - gpuIndex.setNumProbes(nprobe); - - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - numQuery, - dim, - k, - "Unified Memory", - kF32MaxRelErr, - 0.1f, - 0.015f); -} +//TEST(TestRaftIndexIVFFlat, UnifiedMemory) { +// // Construct on a random device to test multi-device, if we have +// // multiple devices +// int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); +// +// if (!faiss::gpu::getFullUnifiedMemSupport(device)) { +// return; +// } +// +// int dim = 128; +// +// int numCentroids = 256; +// // Unfortunately it would take forever to add 24 GB in 
IVFPQ data, +// // so just perform a small test with data allocated in the unified +// // memory address space +// size_t numAdd = 10000; +// size_t numTrain = numCentroids * 40; +// int numQuery = 10; +// int k = 10; +// int nprobe = 8; +// +// std::vector trainVecs = faiss::gpu::randVecs(numTrain, dim); +// std::vector addVecs = faiss::gpu::randVecs(numAdd, dim); +// +// faiss::IndexFlatL2 quantizer(dim); +// faiss::IndexIVFFlat cpuIndex( +// &quantizer, dim, numCentroids, faiss::METRIC_L2); +// +// cpuIndex.train(numTrain, trainVecs.data()); +// cpuIndex.add(numAdd, addVecs.data()); +// cpuIndex.nprobe = nprobe; +// +// faiss::gpu::RmmGpuResources res; +// res.noTempMemory(); +// +// faiss::gpu::GpuIndexIVFFlatConfig config; +// config.device = device; +// config.memorySpace = faiss::gpu::MemorySpace::Unified; +// +// faiss::gpu::RaftIndexIVFFlat gpuIndex( +// &res, dim, numCentroids, faiss::METRIC_L2, config); +// gpuIndex.copyFrom(&cpuIndex); +// gpuIndex.setNumProbes(nprobe); +// +// faiss::gpu::compareIndices( +// cpuIndex, +// gpuIndex, +// numQuery, +// dim, +// k, +// "Unified Memory", +// kF32MaxRelErr, +// 0.1f, +// 0.015f); +//} int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); From edc59916f9ee2570d01e5bc60e939885257e0f7c Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 3 Aug 2022 10:25:29 -0400 Subject: [PATCH 18/87] Setting add_data_on_build = false; --- faiss/gpu/raft/RaftIndexIVFFlat.cu | 8 +++---- faiss/gpu/test/TestRaftIndexIVFFlat.cpp | 32 ++++++++++++------------- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index bde0c7ef1e..1e29f7f473 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -186,6 +186,7 @@ void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { raft::spatial::knn::ivf_flat::index_params raft_idx_params; raft_idx_params.n_lists = nlist; raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; + raft_idx_params.add_data_on_build = false; raft_knn_index.emplace( raft::spatial::knn::ivf_flat::build(raft_handle, raft_idx_params, @@ -278,11 +279,6 @@ void RaftIndexIVFFlat::addImpl_( std::cout << "Calling addImpl_ with " << n << " vectors." << std::endl; - /** - * For example: - * raft::spatial::knn::ivf_flat::add_vectors( - * raft_handle, *raft_knn_index, n, x, xids); - */ raft_knn_index.emplace(raft::spatial::knn::ivf_flat::extend( raft_handle, raft_knn_index.value(), x, xids, (Index::idx_t)n)); this->ntotal += n; @@ -332,9 +328,11 @@ void RaftIndexIVFFlat::rebuildRaftIndex(const float* x, Index::idx_t n_rows) { } pams.metric_arg = this->metric_arg; pams.kmeans_trainset_fraction = 1.0; + pams.add_data_on_build = false; raft_knn_index.emplace(raft::spatial::knn::ivf_flat::build( this->raft_handle, pams, x, n_rows, uint32_t(this->d))); + this->raft_handle.sync_stream(); this->is_trained = true; this->ntotal = n_rows; diff --git a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp index 1a560201ad..cefcf6654a 100644 --- a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp @@ -306,10 +306,10 @@ TEST(TestRaftIndexIVFFlat, Float32_Query_L2) { printf("Finished queryTest(faiss::METRIC_L2, 
false);\n"); } -TEST(TestRaftIndexIVFFlat, Float32_Query_IP) { - queryTest(faiss::METRIC_INNER_PRODUCT, false); - printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false)\n"); -} +//TEST(TestRaftIndexIVFFlat, Float32_Query_IP) { +// queryTest(faiss::METRIC_INNER_PRODUCT, false); +// printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false)\n"); +//} // float16 coarse quantizer @@ -318,10 +318,10 @@ TEST(TestRaftIndexIVFFlat, Float16_32_Query_L2) { printf("Finished queryTest(faiss::METRIC_L2, true)\n"); } -TEST(TestRaftIndexIVFFlat, Float16_32_Query_IP) { - queryTest(faiss::METRIC_INNER_PRODUCT, true); - printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, true)\n"); -} +//TEST(TestRaftIndexIVFFlat, Float16_32_Query_IP) { +// queryTest(faiss::METRIC_INNER_PRODUCT, true); +// printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, true)\n"); +//} // // There are IVF list scanning specializations for 64-d and 128-d that we @@ -333,20 +333,20 @@ TEST(TestRaftIndexIVFFlat, Float32_Query_L2_64) { printf("Finished queryTest(faiss::METRIC_L2, false, 64)\n"); } -TEST(TestRaftIndexIVFFlat, Float32_Query_IP_64) { - queryTest(faiss::METRIC_INNER_PRODUCT, false, 64); - printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false, 64)\n"); -} +//TEST(TestRaftIndexIVFFlat, Float32_Query_IP_64) { +// queryTest(faiss::METRIC_INNER_PRODUCT, false, 64); +// printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false, 64)\n"); +//} TEST(TestRaftIndexIVFFlat, Float32_Query_L2_128) { queryTest(faiss::METRIC_L2, false, 128); printf("Finished queryTest(faiss::METRIC_L2, false, 128)\n"); } -TEST(TestRaftIndexIVFFlat, Float32_Query_IP_128) { - queryTest(faiss::METRIC_INNER_PRODUCT, false, 128); - printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false, 128)\n"); -} +//TEST(TestRaftIndexIVFFlat, Float32_Query_IP_128) { +// queryTest(faiss::METRIC_INNER_PRODUCT, false, 128); +// printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false, 128)\n"); +//} // // Copy 
tests From 10f89b45c2869dd4feb0c8083d7fb0a6f945c2c9 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 8 Aug 2022 13:31:52 -0400 Subject: [PATCH 19/87] Copying centroids directly and adding some prints for the test outputs --- faiss/gpu/GpuIndexIVF.h | 2 +- faiss/gpu/raft/RaftIndexIVFFlat.cu | 37 +++++++++++++++++++++++++----- faiss/gpu/test/TestUtils.cpp | 29 +++++++++++++++++++++++ 3 files changed, 61 insertions(+), 7 deletions(-) diff --git a/faiss/gpu/GpuIndexIVF.h b/faiss/gpu/GpuIndexIVF.h index 834cb061ce..391d32f28f 100644 --- a/faiss/gpu/GpuIndexIVF.h +++ b/faiss/gpu/GpuIndexIVF.h @@ -97,7 +97,7 @@ class GpuIndexIVF : public GpuIndex { /// Exposing this like the CPU version for manipulation int nprobe; - /// Exposeing this like the CPU version for query + /// Exposing this like the CPU version for query GpuIndexFlat* quantizer; protected: diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index 1e29f7f473..d99f883ff3 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -110,15 +111,39 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { auto stream = raft_handle.get_stream(); auto total_elems = size_t(quantizer_ntotal) * size_t(index->quantizer->d); - rmm::device_uvector buf_dev(total_elems, stream); - { - std::vector buf_host(total_elems); - index->quantizer->reconstruct_n(0, quantizer_ntotal, buf_host.data()); - raft::copy(buf_dev.data(), buf_host.data(), total_elems, stream); + + raft::spatial::knn::ivf_flat::index_params pams; + switch (this->metric_type) { + case faiss::METRIC_L2: + pams.metric = raft::distance::DistanceType::L2Expanded; + break; + case faiss::METRIC_INNER_PRODUCT: + pams.metric = raft::distance::DistanceType::InnerProduct; + break; + default: + FAISS_THROW_MSG("Metric is not supported."); } - RaftIndexIVFFlat::rebuildRaftIndex(buf_dev.data(), 
quantizer_ntotal); + raft_knn_index.emplace(raft_handle, pams.metric, this->nlist, this->d); + + raft::copy(raft_knn_index.value().centers().data_handle(), + quantizer->getGpuData()->getVectorsRef().data(), + total_elems, + raft_handle.get_stream()); + + // TODO: Need to compute the norms, I guess +// raft::copy(raft_knn_index.value().center_norms().value().data_handle(), quantizer->getGpuData()->norms_, quantizer_ntotal, raft_handle.get_stream()); + +// { +// std::vector buf_host(total_elems); +// index->quantizer->reconstruct_n(0, quantizer_ntotal, buf_host.data()); +// raft::copy(buf_dev.data(), buf_host.data(), total_elems, stream); +// } + +// RaftIndexIVFFlat::rebuildRaftIndex(buf_dev.data(), quantizer_ntotal); +// + rmm::device_uvector buf_dev(total_elems, stream); if(index_ntotal > 0) { std::cout << "Adding " << index_ntotal << " vectors to index" << std::endl; total_elems = size_t(index_ntotal) * size_t(index->d); diff --git a/faiss/gpu/test/TestUtils.cpp b/faiss/gpu/test/TestUtils.cpp index 29fbef0335..2290cbb628 100644 --- a/faiss/gpu/test/TestUtils.cpp +++ b/faiss/gpu/test/TestUtils.cpp @@ -104,6 +104,35 @@ void compareIndices( testDistance.data(), testIndices.data()); + int idx = 4; + + int start_idx = idx * numQuery; + int stop_idx = start_idx + k; + printf("ref inds: ["); + for(int i = start_idx; i < stop_idx; i++) { + printf("%d, ", int(refIndices[i])); + } + printf("]\n"); + + printf("test inds: ["); + for(int i = start_idx; i < stop_idx; i++) { + printf("%d, ", int(testIndices[i])); + } + printf("]\n"); + + printf("ref dists: ["); + for(int i = start_idx; i < stop_idx; i++) { + printf("%f, ", float(refDistance[i])); + } + printf("]\n"); + + printf("test dists: ["); + for(int i = start_idx; i < stop_idx; i++) { + printf("%f, ", float(testDistance[i])); + } + printf("]\n"); + + faiss::gpu::compareLists( refDistance.data(), refIndices.data(), From 7c690200f490b809051a4507ed64cebe7d3e5a8e Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Mon, 8 Aug 2022 15:30:00 -0400 Subject: [PATCH 20/87] reconstructions seems to be reasonable --- faiss/gpu/GpuIndexIVF.cu | 2 + faiss/gpu/raft/RaftIndexIVFFlat.cu | 50 +++++++------------------ faiss/gpu/test/TestRaftIndexIVFFlat.cpp | 14 ++++--- 3 files changed, 23 insertions(+), 43 deletions(-) diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu index 2df20fe2e8..a3c3ec6f73 100644 --- a/faiss/gpu/GpuIndexIVF.cu +++ b/faiss/gpu/GpuIndexIVF.cu @@ -231,6 +231,8 @@ void GpuIndexIVF::trainQuantizer_(Index::idx_t n, const float* x) { clus.train(n, x, *quantizer); quantizer->is_trained = true; + + FAISS_ASSERT(quantizer->ntotal == nlist); } diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index d99f883ff3..1b5feaafce 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -59,25 +59,6 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { (size_t)getMaxKSelection(), index->nprobe); -// FAISS_ASSERT(metric_type != faiss::METRIC_L2 && -// metric_type != faiss::METRIC_INNER_PRODUCT); -// -// if (!index->is_trained) { -// // copied in GpuIndex::copyFrom -// FAISS_ASSERT(!is_trained && ntotal == 0); -// return; - -// } -// -// // copied in GpuIndex::copyFrom -// // ntotal can exceed max int, but the number of vectors per inverted -// // list cannot exceed this. We check this in the subclasses. 
-// FAISS_ASSERT(is_trained && (ntotal == index->ntotal)); -// -// // Since we're trained, the quantizer must have data -// FAISS_ASSERT(index->quantizer->ntotal > 0); -// -// /** * TODO: Copy centers and center norms from quantizer * Things to do: @@ -113,6 +94,7 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { auto total_elems = size_t(quantizer_ntotal) * size_t(index->quantizer->d); raft::spatial::knn::ivf_flat::index_params pams; + switch (this->metric_type) { case faiss::METRIC_L2: pams.metric = raft::distance::DistanceType::L2Expanded; @@ -126,34 +108,27 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { raft_knn_index.emplace(raft_handle, pams.metric, this->nlist, this->d); - raft::copy(raft_knn_index.value().centers().data_handle(), - quantizer->getGpuData()->getVectorsRef().data(), - total_elems, - raft_handle.get_stream()); - - - // TODO: Need to compute the norms, I guess -// raft::copy(raft_knn_index.value().center_norms().value().data_handle(), quantizer->getGpuData()->norms_, quantizer_ntotal, raft_handle.get_stream()); - -// { -// std::vector buf_host(total_elems); -// index->quantizer->reconstruct_n(0, quantizer_ntotal, buf_host.data()); -// raft::copy(buf_dev.data(), buf_host.data(), total_elems, stream); -// } - -// RaftIndexIVFFlat::rebuildRaftIndex(buf_dev.data(), quantizer_ntotal); -// + // Copy (reconstructed) centroids over, rather than re-training rmm::device_uvector buf_dev(total_elems, stream); + { + std::vector buf_host(total_elems); + index->quantizer->reconstruct_n(0, quantizer_ntotal, buf_host.data()); + raft::copy(raft_knn_index.value().centers().data_handle(), buf_host.data(), total_elems, stream); + } + + // Add (reconstructed) vectors to index if needed if(index_ntotal > 0) { std::cout << "Adding " << index_ntotal << " vectors to index" << std::endl; total_elems = size_t(index_ntotal) * size_t(index->d); buf_dev.resize(total_elems, stream); { std::vector buf_host(total_elems); - 
index->reconstruct_n(0, index->ntotal, buf_host.data()); + index->reconstruct_n(0, index_ntotal, buf_host.data()); raft::copy(buf_dev.data(), buf_host.data(), total_elems, stream); } + // TODO: We might want to consider moving the centroid norm computation + // outside of the incremental add on the RAFT side. RaftIndexIVFFlat::addImpl_(index_ntotal, buf_dev.data(), nullptr); } } else { @@ -212,6 +187,7 @@ void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { raft_idx_params.n_lists = nlist; raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; raft_idx_params.add_data_on_build = false; + raft_idx_params.kmeans_n_iters = 100; raft_knn_index.emplace( raft::spatial::knn::ivf_flat::build(raft_handle, raft_idx_params, diff --git a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp index cefcf6654a..c56d59442f 100644 --- a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp @@ -84,6 +84,8 @@ void queryTest( faiss::IndexIVFFlat cpuIndex( quantizer, opt.dim, opt.numCentroids, metricType); cpuIndex.train(opt.numTrain, trainVecs.data()); + + std::cout << "numTrain: " << opt.numTrain << "numCentroids: " << opt.numCentroids << std::endl; cpuIndex.add(opt.numAdd, addVecs.data()); cpuIndex.nprobe = opt.nprobe; @@ -97,17 +99,17 @@ void queryTest( faiss::gpu::RaftIndexIVFFlat gpuIndex( &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); -// gpuIndex.copyFrom(&cpuIndex); + gpuIndex.copyFrom(&cpuIndex); raft::handle_t raft_handle; - rmm::device_uvector trainVecsDev(trainVecs.size(), raft_handle.get_stream()); - raft::copy(trainVecsDev.data(), trainVecs.data(), trainVecs.size(), raft_handle.get_stream()); - +// rmm::device_uvector trainVecsDev(trainVecs.size(), raft_handle.get_stream()); +// raft::copy(trainVecsDev.data(), trainVecs.data(), trainVecs.size(), raft_handle.get_stream()); +// rmm::device_uvector addVecsDev(addVecs.size(), raft_handle.get_stream()); 
raft::copy(addVecsDev.data(), addVecs.data(), addVecs.size(), raft_handle.get_stream()); - gpuIndex.train(opt.numTrain, trainVecsDev.data()); - gpuIndex.add(opt.numAdd, addVecsDev.data()); +// gpuIndex.train(opt.numTrain, trainVecsDev.data()); +// gpuIndex.add(opt.numAdd, addVecsDev.data()); gpuIndex.setNumProbes(opt.nprobe); bool compFloat16 = useFloat16CoarseQuantizer; From 8be77467cca175cb9c684f44da664591c2c5e804 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 9 Aug 2022 17:19:02 -0400 Subject: [PATCH 21/87] iUpdates to tests to compare against brute force as ground truth --- faiss/gpu/GpuIndexIVFFlat.cu | 4 + faiss/gpu/impl/FlatIndex.cuh | 5 + faiss/gpu/raft/RaftIndexIVFFlat.cu | 2 + faiss/gpu/test/TestRaftIndexIVFFlat.cpp | 151 ++++++++++++++++++------ 4 files changed, 127 insertions(+), 35 deletions(-) diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 3b32b83f84..27fe6a8d43 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -5,6 +5,7 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include #include #include #include @@ -12,6 +13,7 @@ #include #include #include +#include #include #include @@ -91,6 +93,8 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { ivfFlatConfig_.indicesOptions, config_.memorySpace)); + raft::print_device_vector("faiss centers", quantizer->getGpuData()->vectors(), 50, std::cout); + // Copy all of the IVF data index_->copyInvertedListsFrom(index->invlists); } diff --git a/faiss/gpu/impl/FlatIndex.cuh b/faiss/gpu/impl/FlatIndex.cuh index d701f78416..21c348d477 100644 --- a/faiss/gpu/impl/FlatIndex.cuh +++ b/faiss/gpu/impl/FlatIndex.cuh @@ -60,6 +60,11 @@ class FlatIndex { int num, cudaStream_t stream); + + float *vectors() { + return vectors_.data(); + } + void query( Tensor& vecs, int k, diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index 1b5feaafce..9bc8d64d8d 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -116,6 +116,8 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { raft::copy(raft_knn_index.value().centers().data_handle(), buf_host.data(), total_elems, stream); } + raft::print_device_vector("raft centers", raft_knn_index.value().centers().data_handle(), total_elems, std::cout); + // Add (reconstructed) vectors to index if needed if(index_ntotal > 0) { std::cout << "Adding " << index_ntotal << " vectors to index" << std::endl; diff --git a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp index c56d59442f..be9edec8db 100644 --- a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp @@ -11,6 +11,9 @@ #include #include #include + +#include +#include #include #include #include @@ -63,6 +66,50 @@ struct Options { faiss::gpu::IndicesOptions indicesOpt; }; +template +void train_index(const raft::handle_t &raft_handle, Options &opt, idx_type &index, std::vector &trainVecs, std::vector &addVecs) { + index.train(opt.numTrain, 
trainVecs.data()); + index.add(opt.numAdd, addVecs.data()); + index.setNumProbes(opt.nprobe); +} + + +void invoke_bfknn(const raft::handle_t &raft_handle, Options &opt, float *dists, faiss::Index::idx_t *inds, faiss::MetricType m, + std::vector &addVecs, std::vector &queryVecs) { + + + + faiss::gpu::RmmGpuResources gpu_res; + gpu_res.setDefaultStream(opt.device, raft_handle.get_stream()); + + rmm::device_uvector addVecsDev(addVecs.size(), raft_handle.get_stream()); + raft::copy(addVecsDev.data(), addVecs.data(), addVecs.size(), raft_handle.get_stream()); + + rmm::device_uvector queryVecsDev(queryVecs.size(), raft_handle.get_stream()); + raft::copy(queryVecsDev.data(), queryVecs.data(), queryVecs.size(), raft_handle.get_stream()); + + faiss::gpu::GpuDistanceParams args; + args.metric = m; + args.k = opt.k; + args.dims = opt.dim; + args.vectors = addVecs.data(); + args.vectorsRowMajor = true; + args.numVectors = opt.numAdd; + args.queries = queryVecs.data(); + args.queriesRowMajor = true; + args.numQueries = opt.numQuery; + args.outDistances = dists; + args.outIndices = inds; + args.outIndicesType = faiss::gpu::IndicesDataType::I64; + + /** + * @todo: Until FAISS supports pluggable allocation strategies, + * we will not reap the benefits of the pool allocator for + * avoiding device-wide synchronizations from cudaMalloc/cudaFree + */ + bfKnn(&gpu_res, args); +} + void queryTest( faiss::MetricType metricType, bool useFloat16CoarseQuantizer, @@ -75,19 +122,9 @@ void queryTest( faiss::gpu::randVecs(opt.numTrain, opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - faiss::IndexFlatL2 quantizerL2(opt.dim); - faiss::IndexFlatIP quantizerIP(opt.dim); - faiss::Index* quantizer = metricType == faiss::METRIC_L2 - ? 
(faiss::Index*)&quantizerL2 - : (faiss::Index*)&quantizerIP; - - faiss::IndexIVFFlat cpuIndex( - quantizer, opt.dim, opt.numCentroids, metricType); - cpuIndex.train(opt.numTrain, trainVecs.data()); + std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); std::cout << "numTrain: " << opt.numTrain << "numCentroids: " << opt.numCentroids << std::endl; - cpuIndex.add(opt.numAdd, addVecs.data()); - cpuIndex.nprobe = opt.nprobe; faiss::gpu::RmmGpuResources res; res.noTempMemory(); @@ -97,35 +134,79 @@ void queryTest( config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::RaftIndexIVFFlat gpuIndex( - &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); - gpuIndex.copyFrom(&cpuIndex); - + // TODO: Since we are modifying the centroids when adding new vectors, + // the neighbors are no longer going to match completely between CPU + // and the RAFT indexes. We will probably want to perform a bfknn as + // ground truth and then compare the recall for both the RAFT and FAISS + // indices. 
raft::handle_t raft_handle; -// rmm::device_uvector trainVecsDev(trainVecs.size(), raft_handle.get_stream()); -// raft::copy(trainVecsDev.data(), trainVecs.data(), trainVecs.size(), raft_handle.get_stream()); -// - rmm::device_uvector addVecsDev(addVecs.size(), raft_handle.get_stream()); - raft::copy(addVecsDev.data(), addVecs.data(), addVecs.size(), raft_handle.get_stream()); -// gpuIndex.train(opt.numTrain, trainVecsDev.data()); -// gpuIndex.add(opt.numAdd, addVecsDev.data()); - gpuIndex.setNumProbes(opt.nprobe); + faiss::gpu::RaftIndexIVFFlat raftIndex( + &res, opt.dim, opt.numCentroids, metricType, config); - bool compFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, opt.dim, opt.numCentroids, metricType, config); + + std::cout << "Training raft index" << std::endl; + train_index(raft_handle, opt, raftIndex, trainVecs, addVecs); + + std::cout << "Training gpu index" << std::endl; + train_index(raft_handle, opt, gpuIndex, trainVecs, addVecs); + + std::cout << "Computing ground truth" << std::endl; + rmm::device_uvector ref_inds(opt.numQuery * opt.k, raft_handle.get_stream()); + rmm::device_uvector ref_dists(opt.numQuery * opt.k, raft_handle.get_stream()); + + invoke_bfknn(raft_handle, opt, ref_dists.data(), ref_inds.data(), metricType, addVecs, queryVecs); + + std::cout << "Done." << std::endl; + raft::print_device_vector("ref_dists", ref_dists.data(), opt.k, std::cout); + raft::print_device_vector("ref_inds", ref_inds.data(), opt.k, std::cout); + + rmm::device_uvector raft_inds(opt.numQuery * opt.k, raft_handle.get_stream()); + rmm::device_uvector raft_dists(opt.numQuery * opt.k, raft_handle.get_stream()); + + raftIndex.search( opt.numQuery, - opt.dim, + queryVecs.data(), opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - // FIXME: the fp16 bounds are - // useless when math (the accumulator) is - // in fp16. 
Figure out another way to test - compFloat16 ? 0.70f : 0.1f, - compFloat16 ? 0.65f : 0.015f); + raft_dists.data(), + raft_inds.data()); + + rmm::device_uvector gpu_inds(opt.numQuery * opt.k, raft_handle.get_stream()); + rmm::device_uvector gpu_dists(opt.numQuery * opt.k, raft_handle.get_stream()); + + gpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + gpu_dists.data(), + gpu_inds.data()); + + + // TODO: Compare recall, perhaps by adding the indices/distances to a hashmap. + + raft::print_device_vector("raft_dists", raft_dists.data(), opt.k, std::cout); + raft::print_device_vector("raft_inds", raft_inds.data(), opt.k, std::cout); + + raft::print_device_vector("gpu_dists", gpu_dists.data(), opt.k, std::cout); + raft::print_device_vector("gpu_inds", gpu_inds.data(), opt.k, std::cout); + +// +// bool compFloat16 = useFloat16CoarseQuantizer; +// faiss::gpu::compareIndices( +// cpuIndex, +// gpuIndex, +// opt.numQuery, +// opt.dim, +// opt.k, +// opt.toString(), +// compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, +// // FIXME: the fp16 bounds are +// // useless when math (the accumulator) is +// // in fp16. Figure out another way to test +// compFloat16 ? 0.70f : 0.1f, +// compFloat16 ? 0.65f : 0.015f); } } From 933582a11d49139aa08896c77d1286bdb9aab6dc Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 24 Aug 2022 18:04:54 -0400 Subject: [PATCH 22/87] Starting to look at resulting runtimes in raft ivf flat tests --- build.sh | 7 +++---- faiss/gpu/test/TestRaftIndexIVFFlat.cpp | 17 ++++++++++++++--- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/build.sh b/build.sh index 7ff0577e29..c446b9bc0c 100755 --- a/build.sh +++ b/build.sh @@ -1,6 +1,6 @@ #!/bin/bash -BUILD_TYPE=Debug +BUILD_TYPE=Release RAFT_REPO_REL="../raft" RAFT_REPO_PATH="`readlink -f \"${RAFT_REPO_REL}\"`" @@ -29,13 +29,12 @@ cmake \ -DBUILD_SHARED_LIBS=OFF \ -DFAISS_ENABLE_RAFT=ON \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ - -DCPM_raft_SOURCE="${RAFT_REPO_PATH}" \ -DFAISS_OPT_LEVEL=avx2 \ - -DCMAKE_CUDA_ARCHITECTURES="86" \ + -DCMAKE_CUDA_ARCHITECTURES="NATIVE" \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \ -DCMAKE_C_COMPILER_LAUNCHER=ccache \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ -B build . -make -C build -j +make -C build -j12 diff --git a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp index be9edec8db..b9f6bc56e2 100644 --- a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp @@ -13,7 +13,7 @@ #include #include -#include +#include #include #include #include @@ -25,13 +25,13 @@ constexpr float kF32MaxRelErr = 0.03f; struct Options { Options() { - numAdd = 2 * faiss::gpu::randVal(2000, 5000); + numAdd = 2 * faiss::gpu::randVal(20000, 50000); dim = faiss::gpu::randVal(64, 200); numCentroids = std::sqrt((float)numAdd / 2); numTrain = numCentroids * 40; nprobe = faiss::gpu::randVal(std::min(10, numCentroids), numCentroids); - numQuery = faiss::gpu::randVal(32, 100); + numQuery = numAdd / 10;//faiss::gpu::randVal(32, 100); // Due to the approximate nature of the query and of floating point // differences between GPU and CPU, to stay within our error bounds, @@ -70,6 +70,7 @@ template void train_index(const raft::handle_t &raft_handle, Options &opt, idx_type &index, 
std::vector &trainVecs, std::vector &addVecs) { index.train(opt.numTrain, trainVecs.data()); index.add(opt.numAdd, addVecs.data()); +// index.train(opt.numTrain, trainVecs.data()); index.setNumProbes(opt.nprobe); } @@ -166,6 +167,7 @@ void queryTest( rmm::device_uvector raft_inds(opt.numQuery * opt.k, raft_handle.get_stream()); rmm::device_uvector raft_dists(opt.numQuery * opt.k, raft_handle.get_stream()); + uint32_t rstart = raft::curTimeMillis(); raftIndex.search( opt.numQuery, queryVecs.data(), @@ -173,9 +175,14 @@ void queryTest( raft_dists.data(), raft_inds.data()); + raft_handle.sync_stream(); + uint32_t rstop = raft::curTimeMillis(); + std::cout << "Raft time " << rstart << " " << rstop << " " << (rstop - rstart) << std::endl; + rmm::device_uvector gpu_inds(opt.numQuery * opt.k, raft_handle.get_stream()); rmm::device_uvector gpu_dists(opt.numQuery * opt.k, raft_handle.get_stream()); + uint32_t gstart = raft::curTimeMillis(); gpuIndex.search( opt.numQuery, queryVecs.data(), @@ -183,6 +190,10 @@ void queryTest( gpu_dists.data(), gpu_inds.data()); + raft_handle.sync_stream(); + uint32_t gstop = raft::curTimeMillis(); + + std::cout << "FAISS time " << gstart << " " << gstop << " " << (gstop - gstart) << std::endl; // TODO: Compare recall, perhaps by adding the indices/distances to a hashmap. From d2a65417a6b4163a46e7421888aad697f0b840d1 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 24 Aug 2022 18:56:51 -0400 Subject: [PATCH 23/87] Adding timing info to raft test --- faiss/gpu/test/TestRaftIndexIVFFlat.cpp | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp index b9f6bc56e2..f0b6ac72d5 100644 --- a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp @@ -25,7 +25,7 @@ constexpr float kF32MaxRelErr = 0.03f; struct Options { Options() { - numAdd = 2 * faiss::gpu::randVal(20000, 50000); + numAdd = 2 * faiss::gpu::randVal(50000, 70000); dim = faiss::gpu::randVal(64, 200); numCentroids = std::sqrt((float)numAdd / 2); @@ -68,10 +68,20 @@ struct Options { template void train_index(const raft::handle_t &raft_handle, Options &opt, idx_type &index, std::vector &trainVecs, std::vector &addVecs) { + + uint32_t train_start = raft::curTimeMillis(); index.train(opt.numTrain, trainVecs.data()); + raft_handle.sync_stream(); + uint32_t train_stop = raft::curTimeMillis(); + + uint32_t add_start = raft::curTimeMillis(); index.add(opt.numAdd, addVecs.data()); + raft_handle.sync_stream(); + uint32_t add_stop = raft::curTimeMillis(); // index.train(opt.numTrain, trainVecs.data()); index.setNumProbes(opt.nprobe); + + std::cout << "train=" << (train_stop - train_start) << ", add=" << (add_stop - add_start) << std::endl; } @@ -149,10 +159,18 @@ void queryTest( &res, opt.dim, opt.numCentroids, metricType, config); std::cout << "Training raft index" << std::endl; + uint32_t r_train_start = raft::curTimeMillis(); train_index(raft_handle, opt, raftIndex, trainVecs, addVecs); + raft_handle.sync_stream(); + uint32_t r_train_stop = raft::curTimeMillis(); + std::cout << "Raft train time " << r_train_start << " " << r_train_stop << " " << (r_train_stop - r_train_start) << std::endl; std::cout << "Training gpu index" << std::endl; + uint32_t g_train_start = raft::curTimeMillis(); train_index(raft_handle, opt, 
gpuIndex, trainVecs, addVecs); + raft_handle.sync_stream(); + uint32_t g_train_stop = raft::curTimeMillis(); + std::cout << "FAISS train time " << g_train_start << " " << g_train_stop << " " << (g_train_stop - g_train_start) << std::endl; std::cout << "Computing ground truth" << std::endl; rmm::device_uvector ref_inds(opt.numQuery * opt.k, raft_handle.get_stream()); @@ -177,7 +195,7 @@ void queryTest( raft_handle.sync_stream(); uint32_t rstop = raft::curTimeMillis(); - std::cout << "Raft time " << rstart << " " << rstop << " " << (rstop - rstart) << std::endl; + std::cout << "Raft query time " << rstart << " " << rstop << " " << (rstop - rstart) << std::endl; rmm::device_uvector gpu_inds(opt.numQuery * opt.k, raft_handle.get_stream()); rmm::device_uvector gpu_dists(opt.numQuery * opt.k, raft_handle.get_stream()); @@ -193,7 +211,7 @@ void queryTest( raft_handle.sync_stream(); uint32_t gstop = raft::curTimeMillis(); - std::cout << "FAISS time " << gstart << " " << gstop << " " << (gstop - gstart) << std::endl; + std::cout << "FAISS query time " << gstart << " " << gstop << " " << (gstop - gstart) << std::endl; // TODO: Compare recall, perhaps by adding the indices/distances to a hashmap. From 986407a23ed31b9695619d94fea968347b920955 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 25 Aug 2022 19:39:49 -0400 Subject: [PATCH 24/87] Updating for rapids-cmake updates and RAFT updates --- CMakeLists.txt | 40 ++++++++------- build.sh | 9 ++-- cmake/thirdparty/get_raft.cmake | 8 +-- faiss/gpu/raft/RaftIndexIVFFlat.cu | 17 ++++++- faiss/gpu/test/TestRaftIndexIVFFlat.cpp | 65 +++++++++++++------------ fetch_rapids.cmake | 17 +++++++ 6 files changed, 94 insertions(+), 62 deletions(-) create mode 100644 fetch_rapids.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 750cba414e..52da03120e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,17 +6,16 @@ cmake_minimum_required(VERSION 3.17 FATAL_ERROR) -file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.08/RAPIDS.cmake - ${CMAKE_BINARY_DIR}/RAPIDS.cmake) -include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) -include(rapids-cmake) -include(rapids-cpm) -include(rapids-cuda) -include(rapids-export) -include(rapids-find) - -rapids_cuda_init_architectures(faiss) +# Valid values are "generic", "avx2". +option(FAISS_OPT_LEVEL "" "generic") +option(FAISS_ENABLE_GPU "Enable support for GPU indexes." ON) +option(FAISS_ENABLE_RAFT "Enable RAFT for GPU indexes." OFF) +option(FAISS_ENABLE_PYTHON "Build Python extension." ON) +option(FAISS_ENABLE_C_API "Build C API." OFF) +if(FAISS_ENABLE_RAFT) + include(fetch_rapids.cmake) +endif() project(faiss VERSION 1.6.4 @@ -29,23 +28,22 @@ set(CMAKE_CXX_STANDARD 17) list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") -# Valid values are "generic", "avx2". -option(FAISS_OPT_LEVEL "" "generic") -option(FAISS_ENABLE_GPU "Enable support for GPU indexes." ON) -option(FAISS_ENABLE_RAFT "Enable RAFT for GPU indexes." OFF) -option(FAISS_ENABLE_PYTHON "Build Python extension." ON) -option(FAISS_ENABLE_C_API "Build C API." 
OFF) +if(FAISS_ENABLE_RAFT) + include(rapids-cmake) + include(rapids-cpm) + rapids_cpm_init() + include(rapids-cuda) + include(rapids-export) + include(rapids-find) + rapids_cuda_init_architectures(faiss) + include(cmake/thirdparty/get_raft.cmake) +endif() if(FAISS_ENABLE_GPU) set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) enable_language(CUDA) endif() -if(FAISS_ENABLE_RAFT) - rapids_cpm_init() - include(cmake/thirdparty/get_raft.cmake) -endif() - add_subdirectory(faiss) if(FAISS_ENABLE_GPU) diff --git a/build.sh b/build.sh index c446b9bc0c..25000112e5 100755 --- a/build.sh +++ b/build.sh @@ -2,7 +2,7 @@ BUILD_TYPE=Release -RAFT_REPO_REL="../raft" +RAFT_REPO_REL="/share/workspace/rapids_projects/raft" RAFT_REPO_PATH="`readlink -f \"${RAFT_REPO_REL}\"`" set -e @@ -22,19 +22,22 @@ if [ "$1" == "test-raft" ]; then exit 0 fi +mkdir -p build/ && cd build/ cmake \ -DFAISS_ENABLE_GPU=ON \ -DFAISS_ENABLE_PYTHON=OFF \ -DBUILD_TESTING=ON \ -DBUILD_SHARED_LIBS=OFF \ + -DCPM_raft_SOURCE=${RAFT_REPO_REL} \ -DFAISS_ENABLE_RAFT=ON \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ -DFAISS_OPT_LEVEL=avx2 \ + -DRAFT_NVTX=ON \ -DCMAKE_CUDA_ARCHITECTURES="NATIVE" \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \ -DCMAKE_C_COMPILER_LAUNCHER=ccache \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ - -B build . + ../ -make -C build -j12 +cmake --build . 
-j12 diff --git a/cmake/thirdparty/get_raft.cmake b/cmake/thirdparty/get_raft.cmake index 782b3d71dc..c16b4ad489 100644 --- a/cmake/thirdparty/get_raft.cmake +++ b/cmake/thirdparty/get_raft.cmake @@ -15,9 +15,9 @@ #============================================================================= -set(RAFT_VERSION "22.08") +set(RAFT_VERSION "22.10") set(RAFT_FORK "achirkin") -set(RAFT_PINNED_TAG "enh-knn-ivf-flat-hide-impl") +set(RAFT_PINNED_TAG "enh-knn-kmeans-more-gpu") function(find_and_configure_raft) set(oneValueArgs VERSION FORK PINNED_TAG) @@ -29,8 +29,8 @@ function(find_and_configure_raft) #----------------------------------------------------- rapids_cpm_find(raft ${PKG_VERSION} GLOBAL_TARGETS raft::raft - BUILD_EXPORT_SET projname-exports - INSTALL_EXPORT_SET projname-exports + BUILD_EXPORT_SET faiss-exports + INSTALL_EXPORT_SET faiss-exports CPM_ARGS GIT_REPOSITORY https://github.com/${PKG_FORK}/raft.git GIT_TAG ${PKG_PINNED_TAG} diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index 9bc8d64d8d..f80a59357b 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -106,7 +107,7 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { FAISS_THROW_MSG("Metric is not supported."); } - raft_knn_index.emplace(raft_handle, pams.metric, this->nlist, this->d); + raft_knn_index.emplace(raft_handle, pams.metric, (uint32_t)this->nlist, (uint32_t)this->d); // Copy (reconstructed) centroids over, rather than re-training rmm::device_uvector buf_dev(total_elems, stream); @@ -179,7 +180,13 @@ size_t RaftIndexIVFFlat::reclaimMemory() { void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { DeviceScope scope(config_.device); + + raft::common::nvtx::range fun_scope( + "RaftIndexIVFFlat::train (%ld)", n); + std::cout << "Calling train() with " << n << " rows" << std::endl; + + uint32_t start = raft::curTimeMillis(); if 
(this->is_trained) { FAISS_ASSERT(raft_knn_index.has_value()); return; @@ -197,6 +204,9 @@ void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { n, (faiss::Index::idx_t)d)); raft_handle.sync_stream(); + uint32_t stop = raft::curTimeMillis(); + + std::cout << "train took " << (stop - start) << "ms. " << std::endl; this->is_trained = true; } @@ -293,12 +303,15 @@ void RaftIndexIVFFlat::searchImpl_( int k, float* distances, Index::idx_t* labels) const { + + raft::common::nvtx::range fun_scope( + "RaftIndexIVFFlat::searchImpl_ (%ld)", n); + // Device is already set in GpuIndex::search FAISS_ASSERT(raft_knn_index.has_value()); FAISS_ASSERT(n > 0); FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= nlist); - std::cout << "Calling searchImpl_ with " << n << " rows" << std::endl; raft::spatial::knn::ivf_flat::search_params pams; pams.n_probes = nprobe; raft::spatial::knn::ivf_flat::search( diff --git a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp index f0b6ac72d5..d56246860f 100644 --- a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -29,7 +30,7 @@ struct Options { dim = faiss::gpu::randVal(64, 200); numCentroids = std::sqrt((float)numAdd / 2); - numTrain = numCentroids * 40; + numTrain = numCentroids * 50; nprobe = faiss::gpu::randVal(std::min(10, numCentroids), numCentroids); numQuery = numAdd / 10;//faiss::gpu::randVal(32, 100); @@ -155,8 +156,8 @@ void queryTest( faiss::gpu::RaftIndexIVFFlat raftIndex( &res, opt.dim, opt.numCentroids, metricType, config); - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, opt.dim, opt.numCentroids, metricType, config); +// faiss::gpu::GpuIndexIVFFlat gpuIndex( +// &res, opt.dim, opt.numCentroids, metricType, config); std::cout << "Training raft index" << std::endl; uint32_t r_train_start = raft::curTimeMillis(); @@ -165,22 +166,22 @@ void queryTest( uint32_t r_train_stop = 
raft::curTimeMillis(); std::cout << "Raft train time " << r_train_start << " " << r_train_stop << " " << (r_train_stop - r_train_start) << std::endl; - std::cout << "Training gpu index" << std::endl; - uint32_t g_train_start = raft::curTimeMillis(); - train_index(raft_handle, opt, gpuIndex, trainVecs, addVecs); - raft_handle.sync_stream(); - uint32_t g_train_stop = raft::curTimeMillis(); - std::cout << "FAISS train time " << g_train_start << " " << g_train_stop << " " << (g_train_stop - g_train_start) << std::endl; - - std::cout << "Computing ground truth" << std::endl; - rmm::device_uvector ref_inds(opt.numQuery * opt.k, raft_handle.get_stream()); - rmm::device_uvector ref_dists(opt.numQuery * opt.k, raft_handle.get_stream()); - - invoke_bfknn(raft_handle, opt, ref_dists.data(), ref_inds.data(), metricType, addVecs, queryVecs); +// std::cout << "Training gpu index" << std::endl; +// uint32_t g_train_start = raft::curTimeMillis(); +// train_index(raft_handle, opt, gpuIndex, trainVecs, addVecs); +// raft_handle.sync_stream(); +// uint32_t g_train_stop = raft::curTimeMillis(); +// std::cout << "FAISS train time " << g_train_start << " " << g_train_stop << " " << (g_train_stop - g_train_start) << std::endl; - std::cout << "Done." << std::endl; - raft::print_device_vector("ref_dists", ref_dists.data(), opt.k, std::cout); - raft::print_device_vector("ref_inds", ref_inds.data(), opt.k, std::cout); +// std::cout << "Computing ground truth" << std::endl; +// rmm::device_uvector ref_inds(opt.numQuery * opt.k, raft_handle.get_stream()); +// rmm::device_uvector ref_dists(opt.numQuery * opt.k, raft_handle.get_stream()); +// +// invoke_bfknn(raft_handle, opt, ref_dists.data(), ref_inds.data(), metricType, addVecs, queryVecs); +// +// std::cout << "Done." 
<< std::endl; +// raft::print_device_vector("ref_dists", ref_dists.data(), opt.k, std::cout); +// raft::print_device_vector("ref_inds", ref_inds.data(), opt.k, std::cout); rmm::device_uvector raft_inds(opt.numQuery * opt.k, raft_handle.get_stream()); rmm::device_uvector raft_dists(opt.numQuery * opt.k, raft_handle.get_stream()); @@ -200,26 +201,26 @@ void queryTest( rmm::device_uvector gpu_inds(opt.numQuery * opt.k, raft_handle.get_stream()); rmm::device_uvector gpu_dists(opt.numQuery * opt.k, raft_handle.get_stream()); - uint32_t gstart = raft::curTimeMillis(); - gpuIndex.search( - opt.numQuery, - queryVecs.data(), - opt.k, - gpu_dists.data(), - gpu_inds.data()); - - raft_handle.sync_stream(); - uint32_t gstop = raft::curTimeMillis(); - - std::cout << "FAISS query time " << gstart << " " << gstop << " " << (gstop - gstart) << std::endl; +// uint32_t gstart = raft::curTimeMillis(); +// gpuIndex.search( +// opt.numQuery, +// queryVecs.data(), +// opt.k, +// gpu_dists.data(), +// gpu_inds.data()); +// +// raft_handle.sync_stream(); +// uint32_t gstop = raft::curTimeMillis(); +// +// std::cout << "FAISS query time " << gstart << " " << gstop << " " << (gstop - gstart) << std::endl; // TODO: Compare recall, perhaps by adding the indices/distances to a hashmap. 
raft::print_device_vector("raft_dists", raft_dists.data(), opt.k, std::cout); raft::print_device_vector("raft_inds", raft_inds.data(), opt.k, std::cout); - raft::print_device_vector("gpu_dists", gpu_dists.data(), opt.k, std::cout); - raft::print_device_vector("gpu_inds", gpu_inds.data(), opt.k, std::cout); +// raft::print_device_vector("gpu_dists", gpu_dists.data(), opt.k, std::cout); +// raft::print_device_vector("gpu_inds", gpu_inds.data(), opt.k, std::cout); // // bool compFloat16 = useFloat16CoarseQuantizer; diff --git a/fetch_rapids.cmake b/fetch_rapids.cmake new file mode 100644 index 0000000000..0589dc9ddb --- /dev/null +++ b/fetch_rapids.cmake @@ -0,0 +1,17 @@ +# ============================================================================= +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= +file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.10/RAPIDS.cmake + ${CMAKE_BINARY_DIR}/RAPIDS.cmake + ) +include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) From ae4ed98c275300c3f8d46046f9ba0c81d05e704c Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Mon, 10 Oct 2022 13:59:27 -0400 Subject: [PATCH 25/87] Adding RaftIndexIVFPQ --- CMakeLists.txt | 48 +-- build.sh | 2 +- cmake/thirdparty/get_raft.cmake | 4 +- faiss/gpu/CMakeLists.txt | 4 +- faiss/gpu/GpuDistance.cu | 7 +- faiss/gpu/raft/RaftIndexIVFFlat.cu | 5 +- faiss/gpu/raft/RaftIndexIVFFlat.h | 3 - faiss/gpu/raft/RaftIndexIVFPQ.cu | 396 ++++++++++++++++++++++++ faiss/gpu/raft/RaftIndexIVFPQ.h | 152 +++++++++ faiss/gpu/test/TestRaftIndexIVFFlat.cpp | 68 ++-- 10 files changed, 626 insertions(+), 63 deletions(-) create mode 100644 faiss/gpu/raft/RaftIndexIVFPQ.cu create mode 100644 faiss/gpu/raft/RaftIndexIVFPQ.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 52da03120e..d5ab7c6421 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,44 +6,46 @@ cmake_minimum_required(VERSION 3.17 FATAL_ERROR) -# Valid values are "generic", "avx2". -option(FAISS_OPT_LEVEL "" "generic") -option(FAISS_ENABLE_GPU "Enable support for GPU indexes." ON) -option(FAISS_ENABLE_RAFT "Enable RAFT for GPU indexes." OFF) -option(FAISS_ENABLE_PYTHON "Build Python extension." ON) -option(FAISS_ENABLE_C_API "Build C API." OFF) +file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.10/RAPIDS.cmake + ${CMAKE_BINARY_DIR}/RAPIDS.cmake) +include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) +include(rapids-cmake) +include(rapids-cpm) +include(rapids-cuda) +include(rapids-export) +include(rapids-find) + +rapids_cuda_init_architectures(faiss) -if(FAISS_ENABLE_RAFT) - include(fetch_rapids.cmake) -endif() project(faiss - VERSION 1.6.4 - DESCRIPTION "A library for efficient similarity search and clustering of dense vectors." - HOMEPAGE_URL "https://github.com/facebookresearch/faiss" - LANGUAGES CXX) + VERSION 1.6.4 + DESCRIPTION "A library for efficient similarity search and clustering of dense vectors." 
+ HOMEPAGE_URL "https://github.com/facebookresearch/faiss" + LANGUAGES CXX) include(GNUInstallDirs) set(CMAKE_CXX_STANDARD 17) list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") -if(FAISS_ENABLE_RAFT) - include(rapids-cmake) - include(rapids-cpm) - rapids_cpm_init() - include(rapids-cuda) - include(rapids-export) - include(rapids-find) - rapids_cuda_init_architectures(faiss) - include(cmake/thirdparty/get_raft.cmake) -endif() +# Valid values are "generic", "avx2". +option(FAISS_OPT_LEVEL "" "generic") +option(FAISS_ENABLE_GPU "Enable support for GPU indexes." ON) +option(FAISS_ENABLE_RAFT "Enable RAFT for GPU indexes." OFF) +option(FAISS_ENABLE_PYTHON "Build Python extension." ON) +option(FAISS_ENABLE_C_API "Build C API." OFF) if(FAISS_ENABLE_GPU) set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) enable_language(CUDA) endif() +if(FAISS_ENABLE_RAFT) + rapids_cpm_init() + include(cmake/thirdparty/get_raft.cmake) +endif() + add_subdirectory(faiss) if(FAISS_ENABLE_GPU) diff --git a/build.sh b/build.sh index 25000112e5..a37468d665 100755 --- a/build.sh +++ b/build.sh @@ -32,7 +32,7 @@ cmake \ -DFAISS_ENABLE_RAFT=ON \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ -DFAISS_OPT_LEVEL=avx2 \ - -DRAFT_NVTX=ON \ + -DRAFT_NVTX=OFF \ -DCMAKE_CUDA_ARCHITECTURES="NATIVE" \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \ diff --git a/cmake/thirdparty/get_raft.cmake b/cmake/thirdparty/get_raft.cmake index c16b4ad489..5a06fa1ae7 100644 --- a/cmake/thirdparty/get_raft.cmake +++ b/cmake/thirdparty/get_raft.cmake @@ -16,8 +16,8 @@ set(RAFT_VERSION "22.10") -set(RAFT_FORK "achirkin") -set(RAFT_PINNED_TAG "enh-knn-kmeans-more-gpu") +set(RAFT_FORK "rapidsai") +set(RAFT_PINNED_TAG "branch-22.10") function(find_and_configure_raft) set(oneValueArgs VERSION FORK PINNED_TAG) diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt index f157e6e7ec..4cd8160f17 100644 --- a/faiss/gpu/CMakeLists.txt +++ b/faiss/gpu/CMakeLists.txt @@ -163,8 +163,8 @@ 
set(FAISS_GPU_HEADERS ) if(FAISS_ENABLE_RAFT) - list(APPEND FAISS_GPU_HEADERS raft/RaftIndexIVFFlat.h) - list(APPEND FAISS_GPU_SRC raft/RaftIndexIVFFlat.cu) + list(APPEND FAISS_GPU_HEADERS raft/RaftIndexIVFFlat.h raft/RaftIndexIVFPQ.h) + list(APPEND FAISS_GPU_SRC raft/RaftIndexIVFFlat.cu raft/RaftIndexIVFPQ.cu) endif() # Export FAISS_GPU_HEADERS variable to parent scope. diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu index f671b9e5ec..ba1056f04a 100644 --- a/faiss/gpu/GpuDistance.cu +++ b/faiss/gpu/GpuDistance.cu @@ -14,6 +14,11 @@ #include #include +#ifdef FAISS_ENABLE_RAFT +// TODO: Expose fused_l2_knn +#include +#endif + namespace faiss { namespace gpu { @@ -103,7 +108,7 @@ void bfKnnConvert(GpuResourcesProvider* prov, const GpuDistanceParams& args) { // Since we've guaranteed that all arguments are on device, call the // implementation -#if defined FAISS_ENABLE_RAFT +#ifdef FAISS_ENABLE_RAFT // TODO: When k <= 64, invoke bfknn from RAFT if (args.k <= 64) { diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index f80a59357b..fe77aa1d1e 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -40,7 +40,10 @@ RaftIndexIVFFlat::RaftIndexIVFFlat( faiss::MetricType metric, GpuIndexIVFFlatConfig config) : GpuIndexIVFFlat(provider, dims, nlist, metric, config), - raft_handle(resources_->getDefaultStream(config_.device)) {} + raft_handle(resources_->getDefaultStream(config_.device)) { + + std::cout << "In raft index constructor" << std::endl; +} RaftIndexIVFFlat::~RaftIndexIVFFlat() { RaftIndexIVFFlat::reset(); diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.h b/faiss/gpu/raft/RaftIndexIVFFlat.h index cd97f426df..d9b6e498ad 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.h +++ b/faiss/gpu/raft/RaftIndexIVFFlat.h @@ -22,9 +22,6 @@ struct IndexIVFFlat; namespace faiss { namespace gpu { -class RaftIVFFlat; -class GpuIndexFlat; - /// Wrapper around the GPU implementation that looks like /// 
faiss::gpu::GpuIndexIVFFlat class RaftIndexIVFFlat : public GpuIndexIVFFlat { diff --git a/faiss/gpu/raft/RaftIndexIVFPQ.cu b/faiss/gpu/raft/RaftIndexIVFPQ.cu new file mode 100644 index 0000000000..8620ec8e1f --- /dev/null +++ b/faiss/gpu/raft/RaftIndexIVFPQ.cu @@ -0,0 +1,396 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace faiss { +namespace gpu { +/** + * GpuIndexIVFPQ( + GpuResourcesProvider* provider, + int dims, + int nlist, + int subQuantizers, + int bitsPerCode, + faiss::MetricType metric, + GpuIndexIVFPQConfig config = GpuIndexIVFPQConfig()) + * @param provider + * @param index + * @param config + */ +RaftIndexIVFPQ::RaftIndexIVFPQ( + GpuResourcesProvider* provider, + const faiss::IndexIVFPQ* index, + GpuIndexIVFPQConfig config) + : GpuIndexIVFPQ(provider, index, config), + pq(index->pq), + ivfpqConfig_(config), + usePrecomputedTables_(config.usePrecomputedTables), + subQuantizers_(0), + bitsPerCode_(0), + reserveMemoryVecs_(0) { + copyFrom(index); +} + +RaftIndexIVFPQ::RaftIndexIVFPQ( + GpuResourcesProvider* provider, + int dims, + int nlist, + int subQuantizers, + int bitsPerCode, + faiss::MetricType metric, + GpuIndexIVFPQConfig config) + : GpuIndexIVFPQ(provider, dims, nlist, subQuantizers, bitsPerCode, metric, config), + pq(dims, subQuantizers, bitsPerCode), + ivfpqConfig_(config), + usePrecomputedTables_(config.usePrecomputedTables), + subQuantizers_(subQuantizers), + bitsPerCode_(bitsPerCode), + reserveMemoryVecs_(0) { + verifySettings_(); + + // We haven't trained ourselves, so don't construct the PQ index yet + this->is_trained = false; +} + +RaftIndexIVFPQ::~RaftIndexIVFPQ() {} + +void RaftIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) { +// DeviceScope 
scope(config_.device); +// +// GpuIndexIVF::copyFrom(index); +// +// // Clear out our old data +// index_.reset(); +// +// pq = index->pq; +// subQuantizers_ = index->pq.M; +// bitsPerCode_ = index->pq.nbits; +// +// // We only support this +// FAISS_THROW_IF_NOT_MSG( +// ivfpqConfig_.interleavedLayout || index->pq.nbits == 8, +// "GPU: only pq.nbits == 8 is supported"); +// FAISS_THROW_IF_NOT_MSG( +// index->by_residual, "GPU: only by_residual = true is supported"); +// FAISS_THROW_IF_NOT_MSG( +// index->polysemous_ht == 0, "GPU: polysemous codes not supported"); +// +// verifySettings_(); +// +// // The other index might not be trained +// if (!index->is_trained) { +// // copied in GpuIndex::copyFrom +// FAISS_ASSERT(!is_trained); +// return; +// } +// +// // Copy our lists as well +// // The product quantizer must have data in it +// FAISS_ASSERT(index->pq.centroids.size() > 0); +// index_.reset(new IVFPQ( +// resources_.get(), +// index->metric_type, +// index->metric_arg, +// quantizer->getGpuData(), +// subQuantizers_, +// bitsPerCode_, +// ivfpqConfig_.useFloat16LookupTables, +// ivfpqConfig_.useMMCodeDistance, +// ivfpqConfig_.interleavedLayout, +// (float*)index->pq.centroids.data(), +// ivfpqConfig_.indicesOptions, +// config_.memorySpace)); +// // Doesn't make sense to reserve memory here +// index_->setPrecomputedCodes(usePrecomputedTables_); +// +// // Copy all of the IVF data +// index_->copyInvertedListsFrom(index->invlists); +} + +void RaftIndexIVFPQ::copyTo(faiss::IndexIVFPQ* index) const { +// DeviceScope scope(config_.device); +// +// // We must have the indices in order to copy to ourselves +// FAISS_THROW_IF_NOT_MSG( +// ivfpqConfig_.indicesOptions != INDICES_IVF, +// "Cannot copy to CPU as GPU index doesn't retain " +// "indices (INDICES_IVF)"); +// +// GpuIndexIVF::copyTo(index); +// +// // +// // IndexIVFPQ information +// // +// index->by_residual = true; +// index->use_precomputed_table = 0; +// index->code_size = subQuantizers_; +// 
index->pq = faiss::ProductQuantizer(this->d, subQuantizers_, bitsPerCode_); +// +// index->do_polysemous_training = false; +// index->polysemous_training = nullptr; +// +// index->scan_table_threshold = 0; +// index->max_codes = 0; +// index->polysemous_ht = 0; +// index->precomputed_table.clear(); +// +// auto ivf = new ArrayInvertedLists(nlist, index->code_size); +// index->replace_invlists(ivf, true); +// +// if (index_) { +// // Copy IVF lists +// index_->copyInvertedListsTo(ivf); +// +// // Copy PQ centroids +// auto devPQCentroids = index_->getPQCentroids(); +// index->pq.centroids.resize(devPQCentroids.numElements()); +// +// fromDevice( +// devPQCentroids, +// index->pq.centroids.data(), +// resources_->getDefaultStream(config_.device)); +// +// if (usePrecomputedTables_) { +// index->precompute_table(); +// } +// } +} + +void RaftIndexIVFPQ::reserveMemory(size_t numVecs) { + reserveMemoryVecs_ = numVecs; + if (index_) { + DeviceScope scope(config_.device); + index_->reserveMemory(numVecs); + } +} + +void RaftIndexIVFPQ::setPrecomputedCodes(bool enable) { + usePrecomputedTables_ = enable; + if (index_) { + DeviceScope scope(config_.device); + index_->setPrecomputedCodes(enable); + } + + verifySettings_(); +} + +bool RaftIndexIVFPQ::getPrecomputedCodes() const { + return usePrecomputedTables_; +} + +int RaftIndexIVFPQ::getNumSubQuantizers() const { + return subQuantizers_; +} + +int RaftIndexIVFPQ::getBitsPerCode() const { + return bitsPerCode_; +} + +int RaftIndexIVFPQ::getCentroidsPerSubQuantizer() const { + return utils::pow2(bitsPerCode_); +} + +size_t RaftIndexIVFPQ::reclaimMemory() { + if (index_) { + DeviceScope scope(config_.device); + return index_->reclaimMemory(); + } + + return 0; +} + +void RaftIndexIVFPQ::reset() { + if (raft_knn_index.has_value()) { + raft_knn_index.reset(); + this->ntotal = 0; + } else { + FAISS_ASSERT(this->ntotal == 0); + } +} + +void RaftIndexIVFPQ::train(Index::idx_t n, const float* x) { + raft::common::nvtx::range 
fun_scope( + "RaftIndexIVFFlat::train (%ld)", n); + + std::cout << "Calling train() with " << n << " rows" << std::endl; + + uint32_t start = raft::curTimeMillis(); + if (this->is_trained) { + FAISS_ASSERT(raft_knn_index.has_value()); + return; + } + + raft::spatial::knn::ivf_pq::index_params raft_idx_params; + raft_idx_params.n_lists = nlist; + raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; + raft_idx_params.add_data_on_build = false; + raft_idx_params.kmeans_n_iters = 100; + + raft_knn_index.emplace( + raft::spatial::knn::ivf_pq::build(raft_handle, raft_idx_params, + const_cast(x), + n, (faiss::Index::idx_t)d)); + + raft_handle.sync_stream(); + uint32_t stop = raft::curTimeMillis(); + + std::cout << "train took " << (stop - start) << "ms. " << std::endl; + this->is_trained = true; +} + +void RaftIndexIVFPQ::addImpl_(int n, const float* x, const Index::idx_t* xids) { + // Device is already set in GpuIndex::add + FAISS_ASSERT(is_trained); + FAISS_ASSERT(n > 0); + + // but keep the ntotal based on the total number of vectors that we + // attempted to add + std::cout << "Calling addImpl_ with " << n << " vectors." 
<< std::endl; + + raft_knn_index.emplace(raft::spatial::knn::ivf_pq::extend( + raft_handle, raft_knn_index.value(), x, xids, (Index::idx_t)n)); + this->ntotal += n; + + ntotal += n; +} + +void RaftIndexIVFPQ::searchImpl_( + int n, + const float* x, + int k, + float* distances, + Index::idx_t* labels) const { + // Device is already set in GpuIndex::search + FAISS_ASSERT(index_); + FAISS_ASSERT(n > 0); + FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= nlist); + + raft::common::nvtx::range fun_scope( + "RaftIndexIVFFlat::searchImpl_ (%ld)", n); + + // Device is already set in GpuIndex::search + FAISS_ASSERT(raft_knn_index.has_value()); + FAISS_ASSERT(n > 0); + FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= nlist); + + raft::spatial::knn::ivf_pq::search_params pams; + pams.n_probes = nprobe; + raft::spatial::knn::ivf_pq::search( + raft_handle, + pams, + *raft_knn_index, + const_cast(x), + static_cast(n), + static_cast(k), + labels, + distances); + + raft_handle.sync_stream(); +} + +int RaftIndexIVFPQ::getListLength(int listId) const { + FAISS_ASSERT(index_); + DeviceScope scope(config_.device); + + return index_->getListLength(listId); +} + +std::vector RaftIndexIVFPQ::getListVectorData( + int listId, + bool gpuFormat) const { + FAISS_ASSERT(index_); + DeviceScope scope(config_.device); + + return index_->getListVectorData(listId, gpuFormat); +} + +std::vector RaftIndexIVFPQ::getListIndices(int listId) const { + FAISS_ASSERT(index_); + DeviceScope scope(config_.device); + + return index_->getListIndices(listId); +} + +void RaftIndexIVFPQ::verifySettings_() const { + // Our implementation has these restrictions: + + // Must have some number of lists + FAISS_THROW_IF_NOT_MSG(nlist > 0, "nlist must be >0"); + + // up to a single byte per code + if (ivfpqConfig_.interleavedLayout) { + FAISS_THROW_IF_NOT_FMT( + bitsPerCode_ == 4 || bitsPerCode_ == 5 || bitsPerCode_ == 6 || + bitsPerCode_ == 8, + "Bits per code must be between 4, 5, 6 or 8 (passed %d)", + bitsPerCode_); + + } else { 
+ FAISS_THROW_IF_NOT_FMT( + bitsPerCode_ == 8, + "Bits per code must be 8 (passed %d)", + bitsPerCode_); + } + + // Sub-quantizers must evenly divide dimensions available + FAISS_THROW_IF_NOT_FMT( + this->d % subQuantizers_ == 0, + "Number of sub-quantizers (%d) must be an " + "even divisor of the number of dimensions (%d)", + subQuantizers_, + this->d); + + // The number of bytes per encoded vector must be one we support + FAISS_THROW_IF_NOT_FMT( + ivfpqConfig_.interleavedLayout || + IVFPQ::isSupportedPQCodeLength(subQuantizers_), + "Number of bytes per encoded vector / sub-quantizers (%d) " + "is not supported", + subQuantizers_); + + // We must have enough shared memory on the current device to store + // our lookup distances + int lookupTableSize = sizeof(float); + if (ivfpqConfig_.useFloat16LookupTables) { + lookupTableSize = sizeof(half); + } + + // 64 bytes per code is only supported with usage of float16, at 2^8 + // codes per subquantizer + size_t requiredSmemSize = + lookupTableSize * subQuantizers_ * utils::pow2(bitsPerCode_); + size_t smemPerBlock = getMaxSharedMemPerBlock(config_.device); + + FAISS_THROW_IF_NOT_FMT( + requiredSmemSize <= getMaxSharedMemPerBlock(config_.device), + "Device %d has %zu bytes of shared memory, while " + "%d bits per code and %d sub-quantizers requires %zu " + "bytes. Consider useFloat16LookupTables and/or " + "reduce parameters", + config_.device, + smemPerBlock, + bitsPerCode_, + subQuantizers_, + requiredSmemSize); +} + +} // namespace gpu +} // namespace faiss diff --git a/faiss/gpu/raft/RaftIndexIVFPQ.h b/faiss/gpu/raft/RaftIndexIVFPQ.h new file mode 100644 index 0000000000..a121681c2c --- /dev/null +++ b/faiss/gpu/raft/RaftIndexIVFPQ.h @@ -0,0 +1,152 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include + +#include +#include + +#include +#include + +namespace faiss { +struct IndexIVFPQ; +} + +namespace faiss { +namespace gpu { + +class GpuIndexFlat; +class IVFPQ; + +/// RAFT IVFPQ index for the GPU +class RaftIndexIVFPQ : public GpuIndexIVFPQ { + public: + /// Construct from a pre-existing faiss::IndexIVFPQ instance, copying + /// data over to the given GPU, if the input index is trained. + RaftIndexIVFPQ( + GpuResourcesProvider* provider, + const faiss::IndexIVFPQ* index, + GpuIndexIVFPQConfig config = GpuIndexIVFPQConfig()); + + /// Construct an empty index + RaftIndexIVFPQ( + GpuResourcesProvider* provider, + int dims, + int nlist, + int subQuantizers, + int bitsPerCode, + faiss::MetricType metric, + GpuIndexIVFPQConfig config = GpuIndexIVFPQConfig()); + + ~RaftIndexIVFPQ() override; + + /// Reserve space on the GPU for the inverted lists for `num` + /// vectors, assumed equally distributed among + + /// Initialize ourselves from the given CPU index; will overwrite + /// all data in ourselves + void copyFrom(const faiss::IndexIVFPQ* index); + + /// Copy ourselves to the given CPU index; will overwrite all data + /// in the index instance + void copyTo(faiss::IndexIVFPQ* index) const; + + /// Reserve GPU memory in our inverted lists for this number of vectors + void reserveMemory(size_t numVecs); + + /// Enable or disable pre-computed codes + void setPrecomputedCodes(bool enable); + + /// Are pre-computed codes enabled? + bool getPrecomputedCodes() const; + + /// Return the number of sub-quantizers we are using + int getNumSubQuantizers() const; + + /// Return the number of bits per PQ code + int getBitsPerCode() const; + + /// Return the number of centroids per PQ code (2^bits per code) + int getCentroidsPerSubQuantizer() const; + + /// After adding vectors, one can call this to reclaim device memory + /// to exactly the amount needed. 
Returns space reclaimed in bytes + size_t reclaimMemory(); + + /// Clears out all inverted lists, but retains the coarse and + /// product centroid information + void reset() override; + + /// Trains the coarse and product quantizer based on the given vector data + void train(Index::idx_t n, const float* x) override; + + /// Returns the number of vectors present in a particular inverted list + int getListLength(int listId) const override; + + /// Return the encoded vector data contained in a particular inverted list, + /// for debugging purposes. + /// If gpuFormat is true, the data is returned as it is encoded in the + /// GPU-side representation. + /// Otherwise, it is converted to the CPU format. + /// compliant format, while the native GPU format may differ. + std::vector getListVectorData(int listId, bool gpuFormat = false) + const override; + + /// Return the vector indices contained in a particular inverted list, for + /// debugging purposes. + std::vector getListIndices(int listId) const override; + + public: + /// Like the CPU version, we expose a publically-visible ProductQuantizer + /// for manipulation + ProductQuantizer pq; + + protected: + /// Called from GpuIndex for add/add_with_ids + void addImpl_(int n, const float* x, const Index::idx_t* ids) override; + + /// Called from GpuIndex for search + void searchImpl_( + int n, + const float* x, + int k, + float* distances, + Index::idx_t* labels) const override; + + /// Throws errors if configuration settings are improper + void verifySettings_() const; + + protected: + /// Our configuration options that we were initialized with + const GpuIndexIVFPQConfig ivfpqConfig_; + + /// Runtime override: whether or not we use precomputed tables + bool usePrecomputedTables_; + + /// Number of sub-quantizers per encoded vector + int subQuantizers_; + + /// Bits per sub-quantizer code + int bitsPerCode_; + + /// Desired inverted list memory reservation + size_t reserveMemoryVecs_; + + /// The product quantizer 
instance that we own; contains the + /// inverted lists + std::unique_ptr index_; + + const raft::handle_t raft_handle; + std::optional> raft_knn_index{std::nullopt}; +}; + +} // namespace gpu +} // namespace faiss diff --git a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp index d56246860f..8d784c0593 100644 --- a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp @@ -138,6 +138,7 @@ void queryTest( std::cout << "numTrain: " << opt.numTrain << "numCentroids: " << opt.numCentroids << std::endl; + printf("Creating rmm resources\n"); faiss::gpu::RmmGpuResources res; res.noTempMemory(); @@ -151,13 +152,20 @@ void queryTest( // and the RAFT indexes. We will probably want to perform a bfknn as // ground truth and then compare the recall for both the RAFT and FAISS // indices. - raft::handle_t raft_handle; + printf("Building raft index\n"); faiss::gpu::RaftIndexIVFFlat raftIndex( &res, opt.dim, opt.numCentroids, metricType, config); -// faiss::gpu::GpuIndexIVFFlat gpuIndex( -// &res, opt.dim, opt.numCentroids, metricType, config); + printf("Done.\n"); + + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, opt.dim, opt.numCentroids, metricType, config); + + + printf("Creating raft handle\n"); + raft::handle_t raft_handle; + printf("Done\n"); std::cout << "Training raft index" << std::endl; uint32_t r_train_start = raft::curTimeMillis(); @@ -166,22 +174,22 @@ void queryTest( uint32_t r_train_stop = raft::curTimeMillis(); std::cout << "Raft train time " << r_train_start << " " << r_train_stop << " " << (r_train_stop - r_train_start) << std::endl; -// std::cout << "Training gpu index" << std::endl; -// uint32_t g_train_start = raft::curTimeMillis(); -// train_index(raft_handle, opt, gpuIndex, trainVecs, addVecs); -// raft_handle.sync_stream(); -// uint32_t g_train_stop = raft::curTimeMillis(); -// std::cout << "FAISS train time " << g_train_start << " " << g_train_stop << " " << (g_train_stop - g_train_start) 
<< std::endl; + std::cout << "Training gpu index" << std::endl; + uint32_t g_train_start = raft::curTimeMillis(); + train_index(raft_handle, opt, gpuIndex, trainVecs, addVecs); + raft_handle.sync_stream(); + uint32_t g_train_stop = raft::curTimeMillis(); + std::cout << "FAISS train time " << g_train_start << " " << g_train_stop << " " << (g_train_stop - g_train_start) << std::endl; -// std::cout << "Computing ground truth" << std::endl; -// rmm::device_uvector ref_inds(opt.numQuery * opt.k, raft_handle.get_stream()); -// rmm::device_uvector ref_dists(opt.numQuery * opt.k, raft_handle.get_stream()); -// -// invoke_bfknn(raft_handle, opt, ref_dists.data(), ref_inds.data(), metricType, addVecs, queryVecs); -// -// std::cout << "Done." << std::endl; -// raft::print_device_vector("ref_dists", ref_dists.data(), opt.k, std::cout); -// raft::print_device_vector("ref_inds", ref_inds.data(), opt.k, std::cout); + std::cout << "Computing ground truth" << std::endl; + rmm::device_uvector ref_inds(opt.numQuery * opt.k, raft_handle.get_stream()); + rmm::device_uvector ref_dists(opt.numQuery * opt.k, raft_handle.get_stream()); + + invoke_bfknn(raft_handle, opt, ref_dists.data(), ref_inds.data(), metricType, addVecs, queryVecs); + + std::cout << "Done." 
<< std::endl; + raft::print_device_vector("ref_dists", ref_dists.data(), opt.k, std::cout); + raft::print_device_vector("ref_inds", ref_inds.data(), opt.k, std::cout); rmm::device_uvector raft_inds(opt.numQuery * opt.k, raft_handle.get_stream()); rmm::device_uvector raft_dists(opt.numQuery * opt.k, raft_handle.get_stream()); @@ -201,18 +209,18 @@ void queryTest( rmm::device_uvector gpu_inds(opt.numQuery * opt.k, raft_handle.get_stream()); rmm::device_uvector gpu_dists(opt.numQuery * opt.k, raft_handle.get_stream()); -// uint32_t gstart = raft::curTimeMillis(); -// gpuIndex.search( -// opt.numQuery, -// queryVecs.data(), -// opt.k, -// gpu_dists.data(), -// gpu_inds.data()); -// -// raft_handle.sync_stream(); -// uint32_t gstop = raft::curTimeMillis(); -// -// std::cout << "FAISS query time " << gstart << " " << gstop << " " << (gstop - gstart) << std::endl; + uint32_t gstart = raft::curTimeMillis(); + gpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + gpu_dists.data(), + gpu_inds.data()); + + raft_handle.sync_stream(); + uint32_t gstop = raft::curTimeMillis(); + + std::cout << "FAISS query time " << gstart << " " << gstop << " " << (gstop - gstart) << std::endl; // TODO: Compare recall, perhaps by adding the indices/distances to a hashmap. From 410b2c654e19a2ade5d314c35cadd7c108a85336 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Mon, 10 Oct 2022 16:34:18 -0400 Subject: [PATCH 26/87] Updates --- CMakeLists.txt | 9 +- .../thirdparty/fetch_rapids.cmake | 12 +- cmake/thirdparty/get_raft.cmake | 4 +- faiss/gpu/test/TestRaftIndexIVFPQ.cpp | 704 ++++++++++++++++++ 4 files changed, 718 insertions(+), 11 deletions(-) rename fetch_rapids.cmake => cmake/thirdparty/fetch_rapids.cmake (69%) create mode 100644 faiss/gpu/test/TestRaftIndexIVFPQ.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 7776821c7a..ded2d8635a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,11 +4,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -cmake_minimum_required(VERSION 3.17 FATAL_ERROR) +cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) -file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.10/RAPIDS.cmake - ${CMAKE_BINARY_DIR}/RAPIDS.cmake) -include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) +if(FAISS_ENABLE_RAFT) +include(cmake/thirdparty/fetch_rapids.cmake) include(rapids-cmake) include(rapids-cpm) include(rapids-cuda) @@ -16,7 +15,7 @@ include(rapids-export) include(rapids-find) rapids_cuda_init_architectures(faiss) - +endif() project(faiss VERSION 1.7.2 diff --git a/fetch_rapids.cmake b/cmake/thirdparty/fetch_rapids.cmake similarity index 69% rename from fetch_rapids.cmake rename to cmake/thirdparty/fetch_rapids.cmake index 0589dc9ddb..0befc2fd5d 100644 --- a/fetch_rapids.cmake +++ b/cmake/thirdparty/fetch_rapids.cmake @@ -11,7 +11,11 @@ # or implied. See the License for the specific language governing permissions and limitations under # the License. 
# ============================================================================= -file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.10/RAPIDS.cmake - ${CMAKE_BINARY_DIR}/RAPIDS.cmake - ) -include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) + +set(RAPIDS_VERSION "22.10") + +if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/FAISS_RAPIDS.cmake) + file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake + ${CMAKE_CURRENT_BINARY_DIR}/FAISS_RAPIDS.cmake) +endif() +include(${CMAKE_CURRENT_BINARY_DIR}/FAISS_RAPIDS.cmake) diff --git a/cmake/thirdparty/get_raft.cmake b/cmake/thirdparty/get_raft.cmake index 5a06fa1ae7..91f53b0f4d 100644 --- a/cmake/thirdparty/get_raft.cmake +++ b/cmake/thirdparty/get_raft.cmake @@ -15,9 +15,9 @@ #============================================================================= -set(RAFT_VERSION "22.10") +set(RAFT_VERSION "${RAPIDS_VERSION}") set(RAFT_FORK "rapidsai") -set(RAFT_PINNED_TAG "branch-22.10") +set(RAFT_PINNED_TAG "branch-${RAPIDS_VERSION}") function(find_and_configure_raft) set(oneValueArgs VERSION FORK PINNED_TAG) diff --git a/faiss/gpu/test/TestRaftIndexIVFPQ.cpp b/faiss/gpu/test/TestRaftIndexIVFPQ.cpp new file mode 100644 index 0000000000..61a3c8870e --- /dev/null +++ b/faiss/gpu/test/TestRaftIndexIVFPQ.cpp @@ -0,0 +1,704 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +// FIXME: figure out a better way to test fp16 +constexpr float kF16MaxRelErr = 0.3f; +constexpr float kF32MaxRelErr = 0.03f; + +struct Options { + Options() { + numAdd = 2 * faiss::gpu::randVal(50000, 70000); + dim = faiss::gpu::randVal(64, 200); + + numCentroids = std::sqrt((float)numAdd / 2); + numTrain = numCentroids * 50; + nprobe = faiss::gpu::randVal(std::min(10, numCentroids), numCentroids); + numQuery = numAdd / 10;//faiss::gpu::randVal(32, 100); + + // Due to the approximate nature of the query and of floating point + // differences between GPU and CPU, to stay within our error bounds, + // only use a small k + k = std::min(faiss::gpu::randVal(10, 30), numAdd / 40); + indicesOpt = faiss::gpu::randSelect( + {faiss::gpu::INDICES_CPU, + faiss::gpu::INDICES_32_BIT, + faiss::gpu::INDICES_64_BIT}); + + device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + } + + std::string toString() const { + std::stringstream str; + str << "IVFFlat device " << device << " numVecs " << numAdd << " dim " + << dim << " numCentroids " << numCentroids << " nprobe " << nprobe + << " numQuery " << numQuery << " k " << k << " indicesOpt " + << indicesOpt; + + return str.str(); + } + + int numAdd; + int dim; + int numCentroids; + int numTrain; + int nprobe; + int numQuery; + int k; + int device; + faiss::gpu::IndicesOptions indicesOpt; +}; + +template +void train_index(const raft::handle_t &raft_handle, Options &opt, idx_type &index, std::vector &trainVecs, std::vector &addVecs) { + + uint32_t train_start = raft::curTimeMillis(); + index.train(opt.numTrain, trainVecs.data()); + raft_handle.sync_stream(); + uint32_t train_stop = raft::curTimeMillis(); + + uint32_t add_start = raft::curTimeMillis(); + index.add(opt.numAdd, addVecs.data()); + raft_handle.sync_stream(); + uint32_t add_stop = raft::curTimeMillis(); +// 
index.train(opt.numTrain, trainVecs.data()); + index.setNumProbes(opt.nprobe); + + std::cout << "train=" << (train_stop - train_start) << ", add=" << (add_stop - add_start) << std::endl; +} + + +void invoke_bfknn(const raft::handle_t &raft_handle, Options &opt, float *dists, faiss::Index::idx_t *inds, faiss::MetricType m, + std::vector &addVecs, std::vector &queryVecs) { + + + + faiss::gpu::RmmGpuResources gpu_res; + gpu_res.setDefaultStream(opt.device, raft_handle.get_stream()); + + rmm::device_uvector addVecsDev(addVecs.size(), raft_handle.get_stream()); + raft::copy(addVecsDev.data(), addVecs.data(), addVecs.size(), raft_handle.get_stream()); + + rmm::device_uvector queryVecsDev(queryVecs.size(), raft_handle.get_stream()); + raft::copy(queryVecsDev.data(), queryVecs.data(), queryVecs.size(), raft_handle.get_stream()); + + faiss::gpu::GpuDistanceParams args; + args.metric = m; + args.k = opt.k; + args.dims = opt.dim; + args.vectors = addVecs.data(); + args.vectorsRowMajor = true; + args.numVectors = opt.numAdd; + args.queries = queryVecs.data(); + args.queriesRowMajor = true; + args.numQueries = opt.numQuery; + args.outDistances = dists; + args.outIndices = inds; + args.outIndicesType = faiss::gpu::IndicesDataType::I64; + + /** + * @todo: Until FAISS supports pluggable allocation strategies, + * we will not reap the benefits of the pool allocator for + * avoiding device-wide synchronizations from cudaMalloc/cudaFree + */ + bfKnn(&gpu_res, args); +} + +void queryTest( + faiss::MetricType metricType, + bool useFloat16CoarseQuantizer, + int dimOverride = -1) { + for (int tries = 0; tries < 2; ++tries) { + Options opt; + opt.dim = dimOverride != -1 ? 
dimOverride : opt.dim; + + std::vector trainVecs = + faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + + std::cout << "numTrain: " << opt.numTrain << "numCentroids: " << opt.numCentroids << std::endl; + + printf("Creating rmm resources\n"); + faiss::gpu::RmmGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFPQConfig config; + config.device = opt.device; + config.indicesOptions = opt.indicesOpt; + config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + + // TODO: Since we are modifying the centroids when adding new vectors, + // the neighbors are no longer going to match completely between CPU + // and the RAFT indexes. We will probably want to perform a bfknn as + // ground truth and then compare the recall for both the RAFT and FAISS + // indices. + + printf("Building raft index\n"); + faiss::gpu::RaftIndexIVFPQ raftIndex( + &res, opt.dim, opt.numCentroids, metricType, config); + + printf("Done.\n"); + + faiss::gpu::GpuIndexIVFPQ gpuIndex( + &res, opt.dim, opt.numCentroids, metricType, config); + + + printf("Creating raft handle\n"); + raft::handle_t raft_handle; + printf("Done\n"); + + std::cout << "Training raft index" << std::endl; + uint32_t r_train_start = raft::curTimeMillis(); + train_index(raft_handle, opt, raftIndex, trainVecs, addVecs); + raft_handle.sync_stream(); + uint32_t r_train_stop = raft::curTimeMillis(); + std::cout << "Raft train time " << r_train_start << " " << r_train_stop << " " << (r_train_stop - r_train_start) << std::endl; + + std::cout << "Training gpu index" << std::endl; + uint32_t g_train_start = raft::curTimeMillis(); + train_index(raft_handle, opt, gpuIndex, trainVecs, addVecs); + raft_handle.sync_stream(); + uint32_t g_train_stop = raft::curTimeMillis(); + std::cout << "FAISS train time " << g_train_start << " " << g_train_stop << " " << (g_train_stop - g_train_start) << 
std::endl; + + std::cout << "Computing ground truth" << std::endl; + rmm::device_uvector ref_inds(opt.numQuery * opt.k, raft_handle.get_stream()); + rmm::device_uvector ref_dists(opt.numQuery * opt.k, raft_handle.get_stream()); + + invoke_bfknn(raft_handle, opt, ref_dists.data(), ref_inds.data(), metricType, addVecs, queryVecs); + + std::cout << "Done." << std::endl; + raft::print_device_vector("ref_dists", ref_dists.data(), opt.k, std::cout); + raft::print_device_vector("ref_inds", ref_inds.data(), opt.k, std::cout); + + rmm::device_uvector raft_inds(opt.numQuery * opt.k, raft_handle.get_stream()); + rmm::device_uvector raft_dists(opt.numQuery * opt.k, raft_handle.get_stream()); + + uint32_t rstart = raft::curTimeMillis(); + raftIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + raft_dists.data(), + raft_inds.data()); + + raft_handle.sync_stream(); + uint32_t rstop = raft::curTimeMillis(); + std::cout << "Raft query time " << rstart << " " << rstop << " " << (rstop - rstart) << std::endl; + + rmm::device_uvector gpu_inds(opt.numQuery * opt.k, raft_handle.get_stream()); + rmm::device_uvector gpu_dists(opt.numQuery * opt.k, raft_handle.get_stream()); + + uint32_t gstart = raft::curTimeMillis(); + gpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + gpu_dists.data(), + gpu_inds.data()); + + raft_handle.sync_stream(); + uint32_t gstop = raft::curTimeMillis(); + + std::cout << "FAISS query time " << gstart << " " << gstop << " " << (gstop - gstart) << std::endl; + + // TODO: Compare recall, perhaps by adding the indices/distances to a hashmap. 
+ + raft::print_device_vector("raft_dists", raft_dists.data(), opt.k, std::cout); + raft::print_device_vector("raft_inds", raft_inds.data(), opt.k, std::cout); + +// raft::print_device_vector("gpu_dists", gpu_dists.data(), opt.k, std::cout); +// raft::print_device_vector("gpu_inds", gpu_inds.data(), opt.k, std::cout); + +// +// bool compFloat16 = useFloat16CoarseQuantizer; +// faiss::gpu::compareIndices( +// cpuIndex, +// gpuIndex, +// opt.numQuery, +// opt.dim, +// opt.k, +// opt.toString(), +// compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, +// // FIXME: the fp16 bounds are +// // useless when math (the accumulator) is +// // in fp16. Figure out another way to test +// compFloat16 ? 0.70f : 0.1f, +// compFloat16 ? 0.65f : 0.015f); + } +} + +void addTest(faiss::MetricType metricType, bool useFloat16CoarseQuantizer) { + for (int tries = 0; tries < 2; ++tries) { + Options opt; + + std::vector trainVecs = + faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::IndexFlatL2 quantizerL2(opt.dim); + faiss::IndexFlatIP quantizerIP(opt.dim); + faiss::Index* quantizer = metricType == faiss::METRIC_L2 + ? 
(faiss::Index*)&quantizerL2 + : (faiss::Index*)&quantizerIP; + + faiss::IndexIVFFlat cpuIndex( + quantizer, opt.dim, opt.numCentroids, metricType); + cpuIndex.train(opt.numTrain, trainVecs.data()); + cpuIndex.nprobe = opt.nprobe; + + faiss::gpu::RmmGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFPQConfig config; + config.device = opt.device; + config.indicesOptions = opt.indicesOpt; + config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + + faiss::gpu::RaftIndexIVFPQ gpuIndex( + &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); + gpuIndex.copyFrom(&cpuIndex); + gpuIndex.setNumProbes(opt.nprobe); + + cpuIndex.add(opt.numAdd, addVecs.data()); + gpuIndex.add(opt.numAdd, addVecs.data()); + + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 0.30f : 0.015f); + } +} + +void copyToTest(bool useFloat16CoarseQuantizer) { + Options opt; + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::gpu::RmmGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFPQConfig config; + config.device = opt.device; + config.indicesOptions = opt.indicesOpt; + config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + + faiss::gpu::RaftIndexIVFPQ gpuIndex( + &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); + gpuIndex.train(opt.numTrain, trainVecs.data()); + gpuIndex.add(opt.numAdd, addVecs.data()); + gpuIndex.setNumProbes(opt.nprobe); + + // use garbage values to see if we overwrite then + faiss::IndexFlatL2 cpuQuantizer(1); + faiss::IndexIVFFlat cpuIndex(&cpuQuantizer, 1, 1, faiss::METRIC_L2); + cpuIndex.nprobe = 1; + + gpuIndex.copyTo(&cpuIndex); + + EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); + EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); + + 
EXPECT_EQ(cpuIndex.d, gpuIndex.d); + EXPECT_EQ(cpuIndex.quantizer->d, gpuIndex.quantizer->d); + EXPECT_EQ(cpuIndex.d, opt.dim); + EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); + EXPECT_EQ(cpuIndex.nprobe, gpuIndex.getNumProbes()); + + testIVFEquality(cpuIndex, gpuIndex); + + // Query both objects; results should be equivalent + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 0.30f : 0.015f); +} + +void copyFromTest(bool useFloat16CoarseQuantizer) { + Options opt; + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::IndexFlatL2 cpuQuantizer(opt.dim); + faiss::IndexIVFFlat cpuIndex( + &cpuQuantizer, opt.dim, opt.numCentroids, faiss::METRIC_L2); + cpuIndex.nprobe = opt.nprobe; + cpuIndex.train(opt.numTrain, trainVecs.data()); + cpuIndex.add(opt.numAdd, addVecs.data()); + + // use garbage values to see if we overwrite then + faiss::gpu::RmmGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFPQConfig config; + config.device = opt.device; + config.indicesOptions = opt.indicesOpt; + config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + + faiss::gpu::RaftIndexIVFPQ gpuIndex(&res, 1, 1, faiss::METRIC_L2, config); + gpuIndex.setNumProbes(1); + + gpuIndex.copyFrom(&cpuIndex); + + EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); + EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); + + EXPECT_EQ(cpuIndex.d, gpuIndex.d); + EXPECT_EQ(cpuIndex.d, opt.dim); + EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); + EXPECT_EQ(cpuIndex.nprobe, gpuIndex.getNumProbes()); + + testIVFEquality(cpuIndex, gpuIndex); + + // Query both objects; results should be equivalent + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, + 
opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 0.30f : 0.015f); +} + +//TEST(TestRaftIndexIVFPQ, Float32_32_Add_L2) { +// addTest(faiss::METRIC_L2, false); +// printf("Finished addTest(faiss::METRIC_L2, false)\n"); +//} +// +//TEST(TestRaftIndexIVFPQ, Float32_32_Add_IP) { +// addTest(faiss::METRIC_INNER_PRODUCT, false); +// printf("Finished addTest(faiss::METRIC_INNER_PRODUCT, false)\n"); +//} +// +//TEST(TestRaftIndexIVFPQ, Float16_32_Add_L2) { +// addTest(faiss::METRIC_L2, true); +// printf("Finished addTest(faiss::METRIC_L2, true)\n"); +//} +// +//TEST(TestRaftIndexIVFPQ, Float16_32_Add_IP) { +// addTest(faiss::METRIC_INNER_PRODUCT, true); +// printf("Finished addTest(faiss::METRIC_INNER_PRODUCT, true)\n"); +//} + +// +// General query tests +// + +TEST(TestRaftIndexIVFPQ, Float32_Query_L2) { +queryTest(faiss::METRIC_L2, false); +printf("Finished queryTest(faiss::METRIC_L2, false);\n"); +} + +//TEST(TestRaftIndexIVFPQ, Float32_Query_IP) { +// queryTest(faiss::METRIC_INNER_PRODUCT, false); +// printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false)\n"); +//} + +// float16 coarse quantizer + +TEST(TestRaftIndexIVFPQ, Float16_32_Query_L2) { +queryTest(faiss::METRIC_L2, true); +printf("Finished queryTest(faiss::METRIC_L2, true)\n"); +} + +//TEST(TestRaftIndexIVFPQ, Float16_32_Query_IP) { +// queryTest(faiss::METRIC_INNER_PRODUCT, true); +// printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, true)\n"); +//} + +// +// There are IVF list scanning specializations for 64-d and 128-d that we +// make sure we explicitly test here +// + +TEST(TestRaftIndexIVFPQ, Float32_Query_L2_64) { +queryTest(faiss::METRIC_L2, false, 64); +printf("Finished queryTest(faiss::METRIC_L2, false, 64)\n"); +} + +//TEST(TestRaftIndexIVFPQ, Float32_Query_IP_64) { +// queryTest(faiss::METRIC_INNER_PRODUCT, false, 64); +// printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false, 64)\n"); +//} + 
+TEST(TestRaftIndexIVFPQ, Float32_Query_L2_128) { +queryTest(faiss::METRIC_L2, false, 128); +printf("Finished queryTest(faiss::METRIC_L2, false, 128)\n"); +} + +//TEST(TestRaftIndexIVFPQ, Float32_Query_IP_128) { +// queryTest(faiss::METRIC_INNER_PRODUCT, false, 128); +// printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false, 128)\n"); +//} + +// +// Copy tests +// + +/** TODO: test crashes */ +// TEST(TestRaftIndexIVFPQ, Float32_32_CopyTo) { +// copyToTest(false); +// printf("Finished copyToTest(false)\n"); +// } + +//TEST(TestRaftIndexIVFPQ, Float32_32_CopyFrom) { +// copyFromTest(false); +// printf("Finished copyFromTest(false)\n"); +//} + +//TEST(TestRaftIndexIVFPQ, Float32_negative) { +// Options opt; +// +// auto trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); +// auto addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); +// +// // Put all vecs on negative side +// for (auto& f : trainVecs) { +// f = std::abs(f) * -1.0f; +// } +// +// for (auto& f : addVecs) { +// f *= std::abs(f) * -1.0f; +// } +// +// faiss::IndexFlatIP quantizerIP(opt.dim); +// faiss::Index* quantizer = (faiss::Index*)&quantizerIP; +// +// faiss::IndexIVFFlat cpuIndex( +// quantizer, opt.dim, opt.numCentroids, faiss::METRIC_INNER_PRODUCT); +// cpuIndex.train(opt.numTrain, trainVecs.data()); +// cpuIndex.add(opt.numAdd, addVecs.data()); +// cpuIndex.nprobe = opt.nprobe; +// +// faiss::gpu::RmmGpuResources res; +// res.noTempMemory(); +// +// faiss::gpu::GpuIndexIVFPQConfig config; +// config.device = opt.device; +// config.indicesOptions = opt.indicesOpt; +// +// faiss::gpu::RaftIndexIVFPQ gpuIndex( +// &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); +// gpuIndex.copyFrom(&cpuIndex); +// gpuIndex.setNumProbes(opt.nprobe); +// +// // Construct a positive test set +// auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); +// +// // Put all vecs on positive size +// for (auto& f : queryVecs) { +// f = std::abs(f); +// } +// +// bool compFloat16 = false; 
+// faiss::gpu::compareIndices( +// queryVecs, +// cpuIndex, +// gpuIndex, +// opt.numQuery, +// opt.dim, +// opt.k, +// opt.toString(), +// compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, +// // FIXME: the fp16 bounds are +// // useless when math (the accumulator) is +// // in fp16. Figure out another way to test +// compFloat16 ? 0.99f : 0.1f, +// compFloat16 ? 0.65f : 0.015f); +//} + +// +// NaN tests +// + +/** TODO: test crashes */ +// TEST(TestRaftIndexIVFPQ, QueryNaN) { +// Options opt; + +// std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, +// opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, +// opt.dim); + +// faiss::gpu::RmmGpuResources res; +// res.noTempMemory(); + +// faiss::gpu::GpuIndexIVFPQConfig config; +// config.device = opt.device; +// config.indicesOptions = opt.indicesOpt; +// config.flatConfig.useFloat16 = faiss::gpu::randBool(); + +// faiss::gpu::RaftIndexIVFPQ gpuIndex( +// &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); +// gpuIndex.setNumProbes(opt.nprobe); + +// gpuIndex.train(opt.numTrain, trainVecs.data()); +// gpuIndex.add(opt.numAdd, addVecs.data()); + +// int numQuery = 10; +// std::vector nans( +// numQuery * opt.dim, std::numeric_limits::quiet_NaN()); + +// std::vector distances(numQuery * opt.k, 0); +// std::vector indices(numQuery * opt.k, 0); + +// gpuIndex.search( +// numQuery, nans.data(), opt.k, distances.data(), indices.data()); + +// for (int q = 0; q < numQuery; ++q) { +// for (int k = 0; k < opt.k; ++k) { +// EXPECT_EQ(indices[q * opt.k + k], -1); +// EXPECT_EQ( +// distances[q * opt.k + k], +// std::numeric_limits::max()); +// } +// } +// } + +/** TODO: test crashes */ +// TEST(TestRaftIndexIVFPQ, AddNaN) { +// Options opt; + +// faiss::gpu::RmmGpuResources res; +// res.noTempMemory(); + +// faiss::gpu::GpuIndexIVFPQConfig config; +// config.device = opt.device; +// config.indicesOptions = opt.indicesOpt; +// config.flatConfig.useFloat16 = faiss::gpu::randBool(); + +// 
faiss::gpu::RaftIndexIVFPQ gpuIndex( +// &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); +// gpuIndex.setNumProbes(opt.nprobe); + +// int numNans = 10; +// std::vector nans( +// numNans * opt.dim, std::numeric_limits::quiet_NaN()); + +// // Make one vector valid (not the first vector, in order to test offset +// // issues), which should actually add +// for (int i = 0; i < opt.dim; ++i) { +// nans[opt.dim + i] = i; +// } + +// std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, +// opt.dim); gpuIndex.train(opt.numTrain, trainVecs.data()); + +// // should not crash +// EXPECT_EQ(gpuIndex.ntotal, 0); +// gpuIndex.add(numNans, nans.data()); + +// std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, +// opt.dim); std::vector distance(opt.numQuery * opt.k, 0); +// std::vector indices(opt.numQuery * opt.k, 0); + +// // should not crash +// gpuIndex.search( +// opt.numQuery, +// queryVecs.data(), +// opt.k, +// distance.data(), +// indices.data()); +// } + +//TEST(TestRaftIndexIVFPQ, UnifiedMemory) { +// // Construct on a random device to test multi-device, if we have +// // multiple devices +// int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); +// +// if (!faiss::gpu::getFullUnifiedMemSupport(device)) { +// return; +// } +// +// int dim = 128; +// +// int numCentroids = 256; +// // Unfortunately it would take forever to add 24 GB in IVFPQ data, +// // so just perform a small test with data allocated in the unified +// // memory address space +// size_t numAdd = 10000; +// size_t numTrain = numCentroids * 40; +// int numQuery = 10; +// int k = 10; +// int nprobe = 8; +// +// std::vector trainVecs = faiss::gpu::randVecs(numTrain, dim); +// std::vector addVecs = faiss::gpu::randVecs(numAdd, dim); +// +// faiss::IndexFlatL2 quantizer(dim); +// faiss::IndexIVFFlat cpuIndex( +// &quantizer, dim, numCentroids, faiss::METRIC_L2); +// +// cpuIndex.train(numTrain, trainVecs.data()); +// cpuIndex.add(numAdd, addVecs.data()); +// 
cpuIndex.nprobe = nprobe; +// +// faiss::gpu::RmmGpuResources res; +// res.noTempMemory(); +// +// faiss::gpu::GpuIndexIVFPQConfig config; +// config.device = device; +// config.memorySpace = faiss::gpu::MemorySpace::Unified; +// +// faiss::gpu::RaftIndexIVFPQ gpuIndex( +// &res, dim, numCentroids, faiss::METRIC_L2, config); +// gpuIndex.copyFrom(&cpuIndex); +// gpuIndex.setNumProbes(nprobe); +// +// faiss::gpu::compareIndices( +// cpuIndex, +// gpuIndex, +// numQuery, +// dim, +// k, +// "Unified Memory", +// kF32MaxRelErr, +// 0.1f, +// 0.015f); +//} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + + // just run with a fixed test seed + faiss::gpu::setTestSeed(100); + + return RUN_ALL_TESTS(); +} From d7ca6b48711ce53a5edd8fffc119122d520390c0 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 10 Oct 2022 16:50:31 -0400 Subject: [PATCH 27/87] Adding FAISS_ENABLE_RAFT option to INSTALL.md --- INSTALL.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/INSTALL.md b/INSTALL.md index 9d928a4ea4..e0b221a812 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -94,6 +94,9 @@ Several options can be passed to CMake, among which: values are `ON` and `OFF`), - `-DFAISS_ENABLE_PYTHON=OFF` in order to disable building python bindings (possible values are `ON` and `OFF`), + - `-DFAISS_ENABLE_RAFT=ON` in order to enable building the RAFT implementations + of the IVF-Flat and IVF-PQ GPU-accelerated indices (default is `OFF`, possible + values are `ON` and `OFF`) - `-DBUILD_TESTING=OFF` in order to disable building C++ tests, - `-DBUILD_SHARED_LIBS=ON` in order to build a shared library (possible values are `ON` and `OFF`), From 9875dad4071c99f91006e9d23a76030350d10728 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Mon, 10 Oct 2022 17:08:26 -0400 Subject: [PATCH 28/87] Making build.sh work for quick building of proposal --- CMakeLists.txt | 8 +++++++- build.sh | 19 +++++++++++++------ 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ded2d8635a..85a5a0e46b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,6 +6,12 @@ cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) +set(FAISS_LANGUAGES CXX) + +if(FAISS_ENABLE_GPU) + list(APPEND FAISS_LANGUAGES CUDA) +endif() + if(FAISS_ENABLE_RAFT) include(cmake/thirdparty/fetch_rapids.cmake) include(rapids-cmake) @@ -21,7 +27,7 @@ project(faiss VERSION 1.7.2 DESCRIPTION "A library for efficient similarity search and clustering of dense vectors." HOMEPAGE_URL "https://github.com/facebookresearch/faiss" - LANGUAGES CXX) + LANGUAGES ${FAISS_LANGUAGES}) include(GNUInstallDirs) set(CMAKE_CXX_STANDARD 17) diff --git a/build.sh b/build.sh index a37468d665..80341ebcfd 100755 --- a/build.sh +++ b/build.sh @@ -1,12 +1,17 @@ #!/bin/bash BUILD_TYPE=Release +BUILD_DIR=build/ -RAFT_REPO_REL="/share/workspace/rapids_projects/raft" -RAFT_REPO_PATH="`readlink -f \"${RAFT_REPO_REL}\"`" - +RAFT_REPO_REL="" +EXTRA_CMAKE_ARGS="" set -e +if [[ ${RAFT_REPO_REL} != "" ]]; then + RAFT_REPO_PATH="`readlink -f \"${RAFT_REPO_REL}\"`" + EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCPM_raft_SOURCE=${RAFT_REPO_PATH}" +fi + if [ "$1" == "clean" ]; then rm -rf build exit 0 @@ -22,14 +27,15 @@ if [ "$1" == "test-raft" ]; then exit 0 fi -mkdir -p build/ && cd build/ +mkdir -p $BUILD_DIR +cd $BUILD_DIR + cmake \ -DFAISS_ENABLE_GPU=ON \ + -DFAISS_ENABLE_RAFT=ON \ -DFAISS_ENABLE_PYTHON=OFF \ -DBUILD_TESTING=ON \ -DBUILD_SHARED_LIBS=OFF \ - -DCPM_raft_SOURCE=${RAFT_REPO_REL} \ - -DFAISS_ENABLE_RAFT=ON \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ -DFAISS_OPT_LEVEL=avx2 \ -DRAFT_NVTX=OFF \ @@ -38,6 +44,7 @@ cmake \ -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \ -DCMAKE_C_COMPILER_LAUNCHER=ccache \ 
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + ${EXTRA_CMAKE_ARGS} \ ../ cmake --build . -j12 From c09d09b7d2c00d61b395c9434c645f5c05dbd7bd Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 11 Oct 2022 12:52:30 -0400 Subject: [PATCH 29/87] Merging upstream --- faiss/gpu/GpuIndexIVF.h | 8 +++---- faiss/gpu/GpuIndexIVFFlat.cu | 2 -- faiss/gpu/raft/RaftIndexIVFFlat.cu | 37 ++++++++++++++++++++++-------- faiss/gpu/raft/RaftIndexIVFFlat.h | 15 +++++++++++- faiss/gpu/raft/RaftIndexIVFPQ.cu | 27 ++++++++++++++++++++-- faiss/gpu/raft/RaftIndexIVFPQ.h | 15 +++++++++++- 6 files changed, 84 insertions(+), 20 deletions(-) diff --git a/faiss/gpu/GpuIndexIVF.h b/faiss/gpu/GpuIndexIVF.h index 4a80cdcb06..a962ebf406 100644 --- a/faiss/gpu/GpuIndexIVF.h +++ b/faiss/gpu/GpuIndexIVF.h @@ -75,10 +75,10 @@ class GpuIndexIVF : public GpuIndex { virtual void updateQuantizer() = 0; /// Returns the number of inverted lists we're managing - int getNumLists() const; + virtual int getNumLists() const; /// Returns the number of vectors present in a particular inverted list - int getListLength(int listId) const; + virtual int getListLength(int listId) const; /// Return the encoded vector data contained in a particular inverted list, /// for debugging purposes. @@ -86,12 +86,12 @@ class GpuIndexIVF : public GpuIndex { /// GPU-side representation. /// Otherwise, it is converted to the CPU format. /// compliant format, while the native GPU format may differ. - std::vector getListVectorData(int listId, bool gpuFormat = false) + virtual std::vector getListVectorData(int listId, bool gpuFormat = false) const; /// Return the vector indices contained in a particular inverted list, for /// debugging purposes. 
- std::vector getListIndices(int listId) const; + virtual std::vector getListIndices(int listId) const; /// Sets the number of list probes per query void setNumProbes(int nprobe); diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 17537e7caa..f556241839 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -134,8 +134,6 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { baseIndex_ = std::static_pointer_cast(index_); updateQuantizer(); - raft::print_device_vector("faiss centers", quantizer->getGpuData()->vectors(), 50, std::cout); - // Copy all of the IVF data index_->copyInvertedListsFrom(index->invlists); } diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index fe77aa1d1e..a877e2419d 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -5,6 +5,7 @@ * LICENSE file in the root directory of this source tree. */ +#include // for SearchParametersIVF #include #include #include @@ -45,6 +46,21 @@ RaftIndexIVFFlat::RaftIndexIVFFlat( std::cout << "In raft index constructor" << std::endl; } + +RaftIndexIVFFlat::RaftIndexIVFFlat( + GpuResourcesProvider* provider, + Index *coarse_quantizer, + int dims, + int nlist, + faiss::MetricType metric, + GpuIndexIVFFlatConfig config) + : GpuIndexIVFFlat(provider, coarse_quantizer, dims, nlist, metric, config), + raft_handle(resources_->getDefaultStream(config_.device)) { + + std::cout << "In raft index constructor" << std::endl; +} + + RaftIndexIVFFlat::~RaftIndexIVFFlat() { RaftIndexIVFFlat::reset(); } @@ -53,15 +69,15 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { DeviceScope scope(config_.device); GpuIndex::copyFrom(index); FAISS_ASSERT(index->nlist > 0); - FAISS_THROW_IF_NOT_FMT( - index->nlist <= (Index::idx_t)std::numeric_limits::max(), - "GPU index only supports %zu inverted lists", - (size_t)std::numeric_limits::max()); - FAISS_THROW_IF_NOT_FMT( - 
index->nprobe > 0 && index->nprobe <= getMaxKSelection(), - "GPU index only supports nprobe <= %zu; passed %zu", - (size_t)getMaxKSelection(), - index->nprobe); +// FAISS_THROW_IF_NOT_FMT( +// index->nlist <= (Index::idx_t)std::numeric_limits::max(), +// "GPU index only supports %zu inverted lists", +// (size_t)std::numeric_limits::max()); +// FAISS_THROW_IF_NOT_FMT( +// index->nprobe > 0 && index->nprobe <= getMaxKSelection(), +// "GPU index only supports nprobe <= %zu; passed %zu", +// (size_t)getMaxKSelection(), +// index->nprobe); /** * TODO: Copy centers and center norms from quantizer @@ -305,7 +321,8 @@ void RaftIndexIVFFlat::searchImpl_( const float* x, int k, float* distances, - Index::idx_t* labels) const { + Index::idx_t* labels, + const SearchParameters *params) const { raft::common::nvtx::range fun_scope( "RaftIndexIVFFlat::searchImpl_ (%ld)", n); diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.h b/faiss/gpu/raft/RaftIndexIVFFlat.h index d9b6e498ad..eaeabafce6 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.h +++ b/faiss/gpu/raft/RaftIndexIVFFlat.h @@ -7,6 +7,7 @@ #pragma once +#include // for SearchParametersIVF #include #include @@ -33,6 +34,7 @@ class RaftIndexIVFFlat : public GpuIndexIVFFlat { const faiss::IndexIVFFlat* index, GpuIndexIVFFlatConfig config = GpuIndexIVFFlatConfig()); + /// Constructs a new instance with an empty flat quantizer; the user /// provides the number of lists desired. RaftIndexIVFFlat( @@ -42,6 +44,16 @@ class RaftIndexIVFFlat : public GpuIndexIVFFlat { faiss::MetricType metric, GpuIndexIVFFlatConfig config = GpuIndexIVFFlatConfig()); + /// Constructs a new instance with a provided CPU or GPU coarse quantizer; + /// the user provides the number of IVF lists desired. 
+ RaftIndexIVFFlat( + GpuResourcesProvider* provider, + Index* coarseQuantizer, + int dims, + int nlist, + faiss::MetricType metric = faiss::METRIC_L2, + GpuIndexIVFFlatConfig config = GpuIndexIVFFlatConfig()); + ~RaftIndexIVFFlat() override; /// Clears out all inverted lists, but retains the coarse centroid @@ -87,7 +99,8 @@ class RaftIndexIVFFlat : public GpuIndexIVFFlat { const float* x, int k, float* distances, - Index::idx_t* labels) const override; + Index::idx_t* labels, + const SearchParameters *params) const override; void rebuildRaftIndex(const float* x, Index::idx_t n_rows); diff --git a/faiss/gpu/raft/RaftIndexIVFPQ.cu b/faiss/gpu/raft/RaftIndexIVFPQ.cu index 8620ec8e1f..a7056ff5ae 100644 --- a/faiss/gpu/raft/RaftIndexIVFPQ.cu +++ b/faiss/gpu/raft/RaftIndexIVFPQ.cu @@ -70,6 +70,28 @@ RaftIndexIVFPQ::RaftIndexIVFPQ( this->is_trained = false; } +RaftIndexIVFPQ::RaftIndexIVFPQ( + GpuResourcesProvider* provider, + Index *coarse_quantizer, + int dims, + int nlist, + int subQuantizers, + int bitsPerCode, + faiss::MetricType metric, + GpuIndexIVFPQConfig config) + : GpuIndexIVFPQ(provider, coarse_quantizer, dims, nlist, subQuantizers, bitsPerCode, metric, config), + pq(dims, subQuantizers, bitsPerCode), + ivfpqConfig_(config), + usePrecomputedTables_(config.usePrecomputedTables), + subQuantizers_(subQuantizers), + bitsPerCode_(bitsPerCode), + reserveMemoryVecs_(0) { + verifySettings_(); + + // We haven't trained ourselves, so don't construct the PQ index yet + this->is_trained = false; +} + RaftIndexIVFPQ::~RaftIndexIVFPQ() {} void RaftIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) { @@ -186,7 +208,7 @@ void RaftIndexIVFPQ::setPrecomputedCodes(bool enable) { usePrecomputedTables_ = enable; if (index_) { DeviceScope scope(config_.device); - index_->setPrecomputedCodes(enable); + index_->setPrecomputedCodes(quantizer, enable); } verifySettings_(); @@ -277,7 +299,8 @@ void RaftIndexIVFPQ::searchImpl_( const float* x, int k, float* distances, - Index::idx_t* 
labels) const { + Index::idx_t* labels, + const SearchParameters *params) const { // Device is already set in GpuIndex::search FAISS_ASSERT(index_); FAISS_ASSERT(n > 0); diff --git a/faiss/gpu/raft/RaftIndexIVFPQ.h b/faiss/gpu/raft/RaftIndexIVFPQ.h index a121681c2c..e7f1b7515c 100644 --- a/faiss/gpu/raft/RaftIndexIVFPQ.h +++ b/faiss/gpu/raft/RaftIndexIVFPQ.h @@ -46,6 +46,18 @@ class RaftIndexIVFPQ : public GpuIndexIVFPQ { faiss::MetricType metric, GpuIndexIVFPQConfig config = GpuIndexIVFPQConfig()); + + /// Construct an empty index + RaftIndexIVFPQ( + GpuResourcesProvider* provider, + Index *coarse_quantizer, + int dims, + int nlist, + int subQuantizers, + int bitsPerCode, + faiss::MetricType metric, + GpuIndexIVFPQConfig config = GpuIndexIVFPQConfig()); + ~RaftIndexIVFPQ() override; /// Reserve space on the GPU for the inverted lists for `num` @@ -119,7 +131,8 @@ class RaftIndexIVFPQ : public GpuIndexIVFPQ { const float* x, int k, float* distances, - Index::idx_t* labels) const override; + Index::idx_t* labels, + const SearchParameters *params) const override; /// Throws errors if configuration settings are improper void verifySettings_() const; From 0081ed9ea7457b9b23107bb7ba18487320c4b528 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 19 Oct 2022 16:14:11 -0400 Subject: [PATCH 30/87] Integrating more deeply with `use_raft` option in the index config that branches out to IVFFlat or RaftIVFFlat depending on the setting --- CMakeLists.txt | 4 + faiss/gpu/CMakeLists.txt | 6 +- faiss/gpu/GpuIndex.h | 3 + faiss/gpu/GpuIndexIVF.cu | 27 ++- faiss/gpu/GpuIndexIVF.h | 2 +- faiss/gpu/GpuIndexIVFFlat.cu | 72 ++++--- faiss/gpu/GpuIndexIVFFlat.h | 14 ++ faiss/gpu/GpuResources.cpp | 4 + faiss/gpu/GpuResources.h | 8 + faiss/gpu/StandardGpuResources.cpp | 31 ++++ faiss/gpu/StandardGpuResources.h | 20 +- faiss/gpu/impl/IVFFlat.cu | 3 +- faiss/gpu/impl/IVFFlat.cuh | 13 +- faiss/gpu/impl/raft/RaftIVFFlat.cu | 175 ++++++++++++++++++ faiss/gpu/impl/raft/RaftIVFFlat.cuh | 86 +++++++++ faiss/gpu/{ => impl}/raft/RaftIndexIVFFlat.cu | 2 +- faiss/gpu/{ => impl}/raft/RaftIndexIVFFlat.h | 0 faiss/gpu/{ => impl}/raft/RaftIndexIVFPQ.cu | 2 +- faiss/gpu/{ => impl}/raft/RaftIndexIVFPQ.h | 0 faiss/gpu/{ => impl}/raft/RmmGpuResources.hpp | 20 ++ 20 files changed, 448 insertions(+), 44 deletions(-) create mode 100644 faiss/gpu/impl/raft/RaftIVFFlat.cu create mode 100644 faiss/gpu/impl/raft/RaftIVFFlat.cuh rename faiss/gpu/{ => impl}/raft/RaftIndexIVFFlat.cu (99%) rename faiss/gpu/{ => impl}/raft/RaftIndexIVFFlat.h (100%) rename faiss/gpu/{ => impl}/raft/RaftIndexIVFPQ.cu (99%) rename faiss/gpu/{ => impl}/raft/RaftIndexIVFPQ.h (100%) rename faiss/gpu/{ => impl}/raft/RmmGpuResources.hpp (97%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 85a5a0e46b..a0ff1eceb6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -30,7 +30,11 @@ project(faiss LANGUAGES ${FAISS_LANGUAGES}) include(GNUInstallDirs) +if(FAISS_ENABLE_RAFT) set(CMAKE_CXX_STANDARD 17) +else() +set(CMAKE_CXX_STANDARD 11) +endif() list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt index ce72786308..8b373aecb8 100644 --- a/faiss/gpu/CMakeLists.txt +++ b/faiss/gpu/CMakeLists.txt @@ 
-165,8 +165,10 @@ set(FAISS_GPU_HEADERS ) if(FAISS_ENABLE_RAFT) - list(APPEND FAISS_GPU_HEADERS raft/RaftIndexIVFFlat.h raft/RaftIndexIVFPQ.h) - list(APPEND FAISS_GPU_SRC raft/RaftIndexIVFFlat.cu raft/RaftIndexIVFPQ.cu) + list(APPEND FAISS_GPU_HEADERS impl/raft/RaftIndexIVFFlat.h impl/raft/RaftIndexIVFPQ.h + impl/raft/RaftIVFFlat.cuh) + list(APPEND FAISS_GPU_SRC impl/raft/RaftIndexIVFFlat.cu impl/raft/RaftIndexIVFPQ.cu + impl/raft/RaftIVFFlat.cu) endif() # Export FAISS_GPU_HEADERS variable to parent scope. diff --git a/faiss/gpu/GpuIndex.h b/faiss/gpu/GpuIndex.h index 0f50d491f0..f3b42d0f88 100644 --- a/faiss/gpu/GpuIndex.h +++ b/faiss/gpu/GpuIndex.h @@ -23,6 +23,9 @@ struct GpuIndexConfig { /// On Pascal and above (CC 6+) architectures, allows GPUs to use /// more memory than is available on the GPU. MemorySpace memorySpace; + + /// Should the index dispatch down to RAFT? + bool use_raft = false; }; class GpuIndex : public faiss::Index { diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu index 6813bdf0b8..ff4eb974b9 100644 --- a/faiss/gpu/GpuIndexIVF.cu +++ b/faiss/gpu/GpuIndexIVF.cu @@ -459,16 +459,29 @@ void GpuIndexIVF::trainQuantizer_(Index::idx_t n, const float* x) { printf("Training IVF quantizer on %ld vectors in %dD\n", n, d); } - // leverage the CPU-side k-means code, which works for the GPU - // flat index as well - quantizer->reset(); - Clustering clus(this->d, nlist, this->cp); - clus.verbose = verbose; - clus.train(n, x, *quantizer); - quantizer->is_trained = true; + if(config_.use_raft) { + /** + * TODO: Plug in clustering logic here. + * + * Essentially what we need here is to use `x` as the training data set + * to train the k-means centroids and add them to the quantizer + * implementation. 
+ */ + + + } else { + // leverage the CPU-side k-means code, which works for the GPU + // flat index as well + quantizer->reset(); + Clustering clus(this->d, nlist, this->cp); + clus.verbose = verbose; + clus.train(n, x, *quantizer); + quantizer->is_trained = true; + } + FAISS_ASSERT(quantizer->ntotal == nlist); } diff --git a/faiss/gpu/GpuIndexIVF.h b/faiss/gpu/GpuIndexIVF.h index a962ebf406..aaf5374314 100644 --- a/faiss/gpu/GpuIndexIVF.h +++ b/faiss/gpu/GpuIndexIVF.h @@ -128,7 +128,7 @@ class GpuIndexIVF : public GpuIndex { protected: void verifyIVFSettings_() const; bool addImplRequiresIDs_() const override; - void trainQuantizer_(Index::idx_t n, const float* x); + virtual void trainQuantizer_(Index::idx_t n, const float* x); /// Called from GpuIndex for add/add_with_ids void addImpl_(int n, const float* x, const Index::idx_t* ids) override; diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index f556241839..642058c4d4 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -6,6 +6,7 @@ */ #include +#include #include #include #include @@ -70,11 +71,10 @@ GpuIndexIVFFlat::GpuIndexIVFFlat( reserveMemoryVecs_(0) { // We could have been passed an already trained coarse quantizer. 
There is // no other quantizer that we need to train, so this is sufficient + if (this->is_trained) { FAISS_ASSERT(this->quantizer); - - index_.reset(new IVFFlat( - resources_.get(), + set_index_(resources_.get(), this->d, this->nlist, this->metric_type, @@ -83,7 +83,7 @@ GpuIndexIVFFlat::GpuIndexIVFFlat( nullptr, // no scalar quantizer ivfFlatConfig_.interleavedLayout, ivfFlatConfig_.indicesOptions, - config_.memorySpace)); + config_.memorySpace); baseIndex_ = std::static_pointer_cast(index_); updateQuantizer(); } @@ -91,6 +91,32 @@ GpuIndexIVFFlat::GpuIndexIVFFlat( GpuIndexIVFFlat::~GpuIndexIVFFlat() {} +void GpuIndexIVFFlat::set_index_(GpuResources* resources, + int dim, + int nlist, + faiss::MetricType metric, + float metricArg, + bool useResidual, + /// Optional ScalarQuantizer + faiss::ScalarQuantizer* scalarQ, + bool interleavedLayout, + IndicesOptions indicesOptions, + MemorySpace space) { + if(config_.use_raft) { + index_.reset(new RaftIVFFlat( + resources, dim, nlist, metric, metricArg, useResidual, + scalarQ, interleavedLayout, indicesOptions, space)); + } else { + index_.reset(new IVFFlat( + resources, dim, nlist, metric, metricArg, useResidual, + scalarQ, interleavedLayout, indicesOptions, space)); + } + + baseIndex_ = std::static_pointer_cast(index_); + updateQuantizer(); + +} + void GpuIndexIVFFlat::reserveMemory(size_t numVecs) { DeviceScope scope(config_.device); @@ -120,8 +146,7 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { FAISS_ASSERT(this->is_trained); // Copy our lists as well - index_.reset(new IVFFlat( - resources_.get(), + set_index_(resources_.get(), this->d, this->nlist, index->metric_type, @@ -130,9 +155,8 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { nullptr, // no scalar quantizer ivfFlatConfig_.interleavedLayout, ivfFlatConfig_.indicesOptions, - config_.memorySpace)); - baseIndex_ = std::static_pointer_cast(index_); - updateQuantizer(); + config_.memorySpace); + // Copy all of the IVF 
data index_->copyInvertedListsFrom(index->invlists); @@ -210,8 +234,7 @@ void GpuIndexIVFFlat::train(Index::idx_t n, const float* x) { FAISS_ASSERT(!index_); // FIXME: GPUize more of this - // First, make sure that the data is resident on the CPU, if it is not on - // the CPU, as we depend upon parts of the CPU code + // First, make sure that the data is resident on the CPU, if it is not on the CPU, as we depend upon parts of the CPU code auto hostData = toHost( (float*)x, resources_->getDefaultStream(config_.device), @@ -220,21 +243,18 @@ void GpuIndexIVFFlat::train(Index::idx_t n, const float* x) { trainQuantizer_(n, hostData.data()); // The quantizer is now trained; construct the IVF index - index_.reset(new IVFFlat( - resources_.get(), - this->d, - this->nlist, - this->metric_type, - this->metric_arg, - false, // no residual - nullptr, // no scalar quantizer - ivfFlatConfig_.interleavedLayout, - ivfFlatConfig_.indicesOptions, - config_.memorySpace)); - baseIndex_ = std::static_pointer_cast(index_); - updateQuantizer(); - - if (reserveMemoryVecs_) { + set_index_(resources_.get(), + this->d, + this->nlist, + this->metric_type, + this->metric_arg, + false, // no residual + nullptr, // no scalar quantizer + ivfFlatConfig_.interleavedLayout, + ivfFlatConfig_.indicesOptions, + config_.memorySpace); + + if (reserveMemoryVecs_) { index_->reserveMemory(reserveMemoryVecs_); } diff --git a/faiss/gpu/GpuIndexIVFFlat.h b/faiss/gpu/GpuIndexIVFFlat.h index 4ab6f88ef0..a519f11cc2 100644 --- a/faiss/gpu/GpuIndexIVFFlat.h +++ b/faiss/gpu/GpuIndexIVFFlat.h @@ -7,6 +7,7 @@ #pragma once +#include #include #include @@ -88,6 +89,19 @@ class GpuIndexIVFFlat : public GpuIndexIVF { void train(Index::idx_t n, const float* x) override; protected: + + void set_index_(GpuResources* resources, + int dim, + int nlist, + faiss::MetricType metric, + float metricArg, + bool useResidual, + /// Optional ScalarQuantizer + faiss::ScalarQuantizer* scalarQ, + bool interleavedLayout, + IndicesOptions 
indicesOptions, + MemorySpace space); + /// Our configuration options const GpuIndexIVFFlatConfig ivfFlatConfig_; diff --git a/faiss/gpu/GpuResources.cpp b/faiss/gpu/GpuResources.cpp index b3dca0895d..e964d13155 100644 --- a/faiss/gpu/GpuResources.cpp +++ b/faiss/gpu/GpuResources.cpp @@ -153,6 +153,10 @@ cudaStream_t GpuResources::getDefaultStreamCurrentDevice() { return getDefaultStream(getCurrentDevice()); } +raft::handle_t &GpuResources::getRaftHandleCurrentDevice() const { + return getRaftHandle(getCurrentDevice()); +} + std::vector GpuResources::getAlternateStreamsCurrentDevice() { return getAlternateStreams(getCurrentDevice()); } diff --git a/faiss/gpu/GpuResources.h b/faiss/gpu/GpuResources.h index 3ae2dfbe19..8824791f12 100644 --- a/faiss/gpu/GpuResources.h +++ b/faiss/gpu/GpuResources.h @@ -10,6 +10,8 @@ #include #include #include + +#include #include #include #include @@ -190,6 +192,12 @@ class GpuResources { /// given device virtual cudaStream_t getDefaultStream(int device) = 0; + /// Returns the raft handle for the given device which can be used to + /// make calls to other raft primitives. + virtual raft::handle_t &getRaftHandle(int device) const; + + raft::handle_t &getRaftHandleCurrentDevice() const; + /// Overrides the default stream for a device to the user-supplied stream. /// The resources object does not own this stream (i.e., it will not destroy /// it). diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp index 80146a2e59..6120fd0bc7 100644 --- a/faiss/gpu/StandardGpuResources.cpp +++ b/faiss/gpu/StandardGpuResources.cpp @@ -5,6 +5,10 @@ * LICENSE file in the root directory of this source tree. 
*/ +#ifdef FAISS_ENABLE_RAFT +#include +#endif + #include #include #include @@ -313,6 +317,11 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) { defaultStreams_[device] = defaultStream; +#ifdef FAISS_ENABLE_RAFT + raft::handle_t handle(defaultStream); + raftHandles_[device] = handle; +#endif + cudaStream_t asyncCopyStream = 0; CUDA_VERIFY( cudaStreamCreateWithFlags(&asyncCopyStream, cudaStreamNonBlocking)); @@ -375,6 +384,22 @@ cudaStream_t StandardGpuResourcesImpl::getDefaultStream(int device) { return defaultStreams_[device]; } +#ifdef FAISS_ENABLE_RAFT +raft::handle_t &StandardGpuResourcesImpl::getRaftHandle(int device) const { + initializeForDevice(device); + + auto it = raftHandles_.find(device); + if (it != raftHandles_.end()) { + // There is a user override stream set + return it->second; + } + + // Otherwise, our base default stream + return raftHandles_[device]; + +} +#endif + std::vector StandardGpuResourcesImpl::getAlternateStreams( int device) { initializeForDevice(device); @@ -600,6 +625,12 @@ cudaStream_t StandardGpuResources::getDefaultStream(int device) { return res_->getDefaultStream(device); } +#ifdef FAISS_ENABLE_RAFT + raft::handle_t &StandardGpuResources::getRaftHandle(int device) const { + return res_->getRaftHandle(device); +} +#endif + size_t StandardGpuResources::getTempMemoryAvailable(int device) const { return res_->getTempMemoryAvailable(device); } diff --git a/faiss/gpu/StandardGpuResources.h b/faiss/gpu/StandardGpuResources.h index d1edfb6673..408221e4b9 100644 --- a/faiss/gpu/StandardGpuResources.h +++ b/faiss/gpu/StandardGpuResources.h @@ -7,6 +7,8 @@ #pragma once +#include + #include #include #include @@ -58,6 +60,12 @@ class StandardGpuResourcesImpl : public GpuResources { /// this stream upon exit from an index or other Faiss GPU call. 
cudaStream_t getDefaultStream(int device) override; +#ifdef FAISS_ENABLE_RAFT + /// Returns the raft handle for the given device which can be used to + /// make calls to other raft primitives. + raft::handle_t &getRaftHandle(int device) const override; +#endif + /// Called to change the work ordering streams to the null stream /// for all devices void setDefaultNullStreamAllDevices(); @@ -124,6 +132,11 @@ class StandardGpuResourcesImpl : public GpuResources { /// cuBLAS handle for each device std::unordered_map blasHandles_; +#ifdef FAISS_ENABLE_RAFT + /// raft handle for each device + std::unordered_map raftHandles_; +#endif + /// Pinned memory allocation for use with this GPU void* pinnedMemAlloc_; size_t pinnedMemAllocSize_; @@ -183,10 +196,15 @@ class StandardGpuResources : public GpuResourcesProvider { /// Export a description of memory used for Python std::map>> getMemoryInfo() const; - /// Returns the current default stream cudaStream_t getDefaultStream(int device); +#ifdef FAISS_ENABLE_RAFT + /// Returns the raft handle for the given device which can be used to + /// make calls to other raft primitives. 
+ raft::handle_t &getRaftHandle(int device) const override; +#endif + /// Returns the current amount of temp memory available size_t getTempMemoryAvailable(int device) const; diff --git a/faiss/gpu/impl/IVFFlat.cu b/faiss/gpu/impl/IVFFlat.cu index 9d7637d3dd..dd8b1c86a2 100644 --- a/faiss/gpu/impl/IVFFlat.cu +++ b/faiss/gpu/impl/IVFFlat.cu @@ -112,6 +112,7 @@ std::vector IVFFlat::translateCodesFromGpu_( return packNonInterleaved(std::move(up), numVecs, dim_, bitsPerCode); } + void IVFFlat::appendVectors_( Tensor& vecs, Tensor& ivfCentroidResiduals, @@ -126,7 +127,6 @@ void IVFFlat::appendVectors_( // // Append the new encodings // - // Append indices to the IVF lists runIVFIndicesAppend( listIds, @@ -197,6 +197,7 @@ void IVFFlat::search( makeTempAlloc(AllocType::Other, stream), {queries.getSize(0), nprobe, dim_}); + searchCoarseQuantizer_( coarseQuantizer, nprobe, diff --git a/faiss/gpu/impl/IVFFlat.cuh b/faiss/gpu/impl/IVFFlat.cuh index 27e931be41..01c8f5cc6d 100644 --- a/faiss/gpu/impl/IVFFlat.cuh +++ b/faiss/gpu/impl/IVFFlat.cuh @@ -7,6 +7,11 @@ #pragma once +#ifdef FAISS_ENABLE_RAFT +#include +#include +#endif + #include #include @@ -60,17 +65,17 @@ class IVFFlat : public IVFBase { size_t getCpuVectorsEncodingSize_(int numVecs) const override; /// Translate to our preferred GPU encoding - std::vector translateCodesToGpu_( + virtual std::vector translateCodesToGpu_( std::vector codes, size_t numVecs) const override; /// Translate from our preferred GPU encoding - std::vector translateCodesFromGpu_( + virtual std::vector translateCodesFromGpu_( std::vector codes, size_t numVecs) const override; /// Encode the vectors that we're adding and append to our IVF lists - void appendVectors_( + virtual void appendVectors_( Tensor& vecs, Tensor& ivfCentroidResiduals, Tensor& indices, @@ -84,7 +89,7 @@ class IVFFlat : public IVFBase { /// Shared IVF search implementation, used by both search and /// searchPreassigned - void searchImpl_( + virtual void searchImpl_( Tensor& 
queries, Tensor& coarseDistances, Tensor& coarseIndices, diff --git a/faiss/gpu/impl/raft/RaftIVFFlat.cu b/faiss/gpu/impl/raft/RaftIVFFlat.cu new file mode 100644 index 0000000000..17c8581ea8 --- /dev/null +++ b/faiss/gpu/impl/raft/RaftIVFFlat.cu @@ -0,0 +1,175 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { +namespace gpu { + +RaftIVFFlat::RaftIVFFlat( + GpuResources* res, + int dim, + int nlist, + faiss::MetricType metric, + float metricArg, + bool useResidual, + faiss::ScalarQuantizer* scalarQ, + bool interleavedLayout, + IndicesOptions indicesOptions, + MemorySpace space) + : IVFFlat(res, + dim, + nlist, + metric, + metricArg, + useResidual, + scalarQ, + interleavedLayout, + indicesOptions, + space){} + +RaftIVFFlat::~RaftIVFFlat() {} + +size_t RaftIVFFlat::getGpuVectorsEncodingSize_(int numVecs) const { + if (interleavedLayout_) { + // bits per scalar code + int bits = scalarQ_ ? scalarQ_->bits : 32 /* float */; + + // bytes to encode a block of 32 vectors (single dimension) + int bytesPerDimBlock = bits * 32 / 8; + + // bytes to fully encode 32 vectors + int bytesPerBlock = bytesPerDimBlock * dim_; + + // number of blocks of 32 vectors we have + int numBlocks = utils::divUp(numVecs, 32); + + // total size to encode numVecs + return bytesPerBlock * numBlocks; + } else { + size_t sizePerVector = + (scalarQ_ ? scalarQ_->code_size : sizeof(float) * dim_); + + return (size_t)numVecs * sizePerVector; + } +} + +size_t RaftIVFFlat::getCpuVectorsEncodingSize_(int numVecs) const { + size_t sizePerVector = + (scalarQ_ ? 
scalarQ_->code_size : sizeof(float) * dim_); + + return (size_t)numVecs * sizePerVector; +} + +std::vector RaftIVFFlat::translateCodesToGpu_( + std::vector codes, + size_t numVecs) const { + if (!interleavedLayout_) { + // same format + return codes; + } + + int bitsPerCode = scalarQ_ ? scalarQ_->bits : 32; + + auto up = + unpackNonInterleaved(std::move(codes), numVecs, dim_, bitsPerCode); + return packInterleaved(std::move(up), numVecs, dim_, bitsPerCode); +} + +std::vector RaftIVFFlat::translateCodesFromGpu_( + std::vector codes, + size_t numVecs) const { + if (!interleavedLayout_) { + // same format + return codes; + } + + int bitsPerCode = scalarQ_ ? scalarQ_->bits : 32; + + auto up = unpackInterleaved(std::move(codes), numVecs, dim_, bitsPerCode); + return packNonInterleaved(std::move(up), numVecs, dim_, bitsPerCode); +} + + +void RaftIVFFlat::appendVectors_( + Tensor& vecs, + Tensor& ivfCentroidResiduals, + Tensor& indices, + Tensor& uniqueLists, + Tensor& vectorsByUniqueList, + Tensor& uniqueListVectorStart, + Tensor& uniqueListStartOffset, + Tensor& listIds, + Tensor& listOffset, + cudaStream_t stream) { + // + // Append the new encodings + // + + // TODO: Fill in this logic here +} + +void RaftIVFFlat::searchImpl_( + Tensor& queries, + Tensor& coarseDistances, + Tensor& coarseIndices, + Tensor& ivfCentroids, + int k, + Tensor& outDistances, + Tensor& outIndices, + bool storePairs) { + FAISS_ASSERT(storePairs == false); + + auto stream = resources_->getDefaultStreamCurrentDevice(); + + // TODO: Fill in this logic here. 
+ + // If the GPU isn't storing indices (they are on the CPU side), we + // need to perform the re-mapping here + // FIXME: we might ultimately be calling this function with inputs + // from the CPU, these are unnecessary copies + if (indicesOptions_ == INDICES_CPU) { + HostTensor hostOutIndices(outIndices, stream); + + ivfOffsetToUserIndex( + hostOutIndices.data(), + numLists_, + hostOutIndices.getSize(0), + hostOutIndices.getSize(1), + listOffsetToUserIndex_); + + // Copy back to GPU, since the input to this function is on the + // GPU + outIndices.copyFrom(hostOutIndices, stream); + } +} + +} // namespace gpu +} // namespace faiss diff --git a/faiss/gpu/impl/raft/RaftIVFFlat.cuh b/faiss/gpu/impl/raft/RaftIVFFlat.cuh new file mode 100644 index 0000000000..16754a7fe8 --- /dev/null +++ b/faiss/gpu/impl/raft/RaftIVFFlat.cuh @@ -0,0 +1,86 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include +#include + +#include + +namespace faiss { +namespace gpu { + +class RaftIVFFlat : public IVFFlat { + public: + RaftIVFFlat(GpuResources* resources, + int dim, + int nlist, + faiss::MetricType metric, + float metricArg, + bool useResidual, + /// Optional ScalarQuantizer + faiss::ScalarQuantizer* scalarQ, + bool interleavedLayout, + IndicesOptions indicesOptions, + MemorySpace space); + + ~RaftIVFFlat() override; + + protected: + /// Returns the number of bytes in which an IVF list containing numVecs + /// vectors is encoded on the device. 
Note that due to padding this is not + /// the same as the encoding size for a subset of vectors in an IVF list; + /// this is the size for an entire IVF list + size_t getGpuVectorsEncodingSize_(int numVecs) const override; + size_t getCpuVectorsEncodingSize_(int numVecs) const override; + + /// Translate to our preferred GPU encoding + std::vector translateCodesToGpu_( + std::vector codes, + size_t numVecs) const override; + + /// Translate from our preferred GPU encoding + std::vector translateCodesFromGpu_( + std::vector codes, + size_t numVecs) const override; + + /// Encode the vectors that we're adding and append to our IVF lists + void appendVectors_( + Tensor& vecs, + Tensor& ivfCentroidResiduals, + Tensor& indices, + Tensor& uniqueLists, + Tensor& vectorsByUniqueList, + Tensor& uniqueListVectorStart, + Tensor& uniqueListStartOffset, + Tensor& listIds, + Tensor& listOffset, + cudaStream_t stream) override; + + /// Shared IVF search implementation, used by both search and + /// searchPreassigned + void searchImpl_( + Tensor& queries, + Tensor& coarseDistances, + Tensor& coarseIndices, + Tensor& ivfCentroids, + int k, + Tensor& outDistances, + Tensor& outIndices, + bool storePairs); + + protected: + std::optional> raft_knn_index{std::nullopt}; + +}; + +} // namespace gpu +} // namespace faiss diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/impl/raft/RaftIndexIVFFlat.cu similarity index 99% rename from faiss/gpu/raft/RaftIndexIVFFlat.cu rename to faiss/gpu/impl/raft/RaftIndexIVFFlat.cu index a877e2419d..03df717c69 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/impl/raft/RaftIndexIVFFlat.cu @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.h b/faiss/gpu/impl/raft/RaftIndexIVFFlat.h similarity index 100% rename from faiss/gpu/raft/RaftIndexIVFFlat.h rename to faiss/gpu/impl/raft/RaftIndexIVFFlat.h diff --git a/faiss/gpu/raft/RaftIndexIVFPQ.cu 
b/faiss/gpu/impl/raft/RaftIndexIVFPQ.cu similarity index 99% rename from faiss/gpu/raft/RaftIndexIVFPQ.cu rename to faiss/gpu/impl/raft/RaftIndexIVFPQ.cu index a7056ff5ae..f30f34259f 100644 --- a/faiss/gpu/raft/RaftIndexIVFPQ.cu +++ b/faiss/gpu/impl/raft/RaftIndexIVFPQ.cu @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/faiss/gpu/raft/RaftIndexIVFPQ.h b/faiss/gpu/impl/raft/RaftIndexIVFPQ.h similarity index 100% rename from faiss/gpu/raft/RaftIndexIVFPQ.h rename to faiss/gpu/impl/raft/RaftIndexIVFPQ.h diff --git a/faiss/gpu/raft/RmmGpuResources.hpp b/faiss/gpu/impl/raft/RmmGpuResources.hpp similarity index 97% rename from faiss/gpu/raft/RmmGpuResources.hpp rename to faiss/gpu/impl/raft/RmmGpuResources.hpp index e3bc306729..d9e87fb0ad 100644 --- a/faiss/gpu/raft/RmmGpuResources.hpp +++ b/faiss/gpu/impl/raft/RmmGpuResources.hpp @@ -29,6 +29,7 @@ in this file : https://github.com/facebookresearch/faiss/issues/2097 #include #include +#include #include #include #include @@ -221,6 +222,10 @@ class RmmGpuResourcesImpl : public GpuResources { } userDefaultStreams_[device] = stream; + +#ifdef FAISS_ENABLE_RAFT + raftHandles_[device] = raft::handle_t(stream); +#endif }; /// Revert the default stream to the original stream managed by this resources @@ -242,6 +247,9 @@ class RmmGpuResourcesImpl : public GpuResources { } userDefaultStreams_.erase(device); +#ifdef FAISS_ENABLE_RAFT + raftHandles_.erase(device); +#endif }; /// Returns the stream for the given device on which all Faiss GPU work is @@ -326,6 +334,8 @@ class RmmGpuResourcesImpl : public GpuResources { alternateStreams_[device] = std::move(deviceStreams); // Create cuBLAS handle + + // TODO: We need to be able to use this cublas handle within the raft handle cublasHandle_t blasHandle = 0; auto blasStatus = cublasCreate(&blasHandle); FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS); @@ -492,6 +502,12 @@ class RmmGpuResourcesImpl : public GpuResources { return 
defaultStreams_.count(device) != 0; }; + std::unique_ptr getRaftHandle(int device) const { + auto it = raftHandles_.find(device); + FAISS_ASSERT(it != raftHandles_.end()); + return it->second; + } + /// Adjust the default temporary memory allocation based on the total GPU /// memory size static size_t getDefaultTempMemForGPU(int device, size_t requested) @@ -562,6 +578,10 @@ class RmmGpuResourcesImpl : public GpuResources { // pinned_memory_resource std::unique_ptr pmr; + + /// Our raft handle that maintains additional library resources, one per each device + std::unordered_map> raftHandles_; + }; /// Default implementation of GpuResources that allocates a cuBLAS From a7e0cddcab4c513f7d9d92ebe313dee89a017dd5 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 19 Oct 2022 17:23:29 -0400 Subject: [PATCH 31/87] IVF Flat --- faiss/gpu/test/TestRaftIndexIVFFlat.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp index 8d784c0593..21ac260887 100644 --- a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp @@ -7,8 +7,8 @@ #include #include -#include -#include +#include +#include #include #include From fbf7e3425448bdf9a72316dcd943769c6ec7c39e Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 19 Oct 2022 17:57:18 -0400 Subject: [PATCH 32/87] More updates --- faiss/gpu/GpuResources.h | 1 - faiss/gpu/StandardGpuResources.cpp | 8 -------- faiss/gpu/StandardGpuResources.h | 8 +------- faiss/gpu/impl/raft/RmmGpuResources.hpp | 7 ------- 4 files changed, 1 insertion(+), 23 deletions(-) diff --git a/faiss/gpu/GpuResources.h b/faiss/gpu/GpuResources.h index 8824791f12..5d627fcb09 100644 --- a/faiss/gpu/GpuResources.h +++ b/faiss/gpu/GpuResources.h @@ -195,7 +195,6 @@ class GpuResources { /// Returns the raft handle for the given device which can be used to /// make calls to other raft primitives. 
virtual raft::handle_t &getRaftHandle(int device) const; - raft::handle_t &getRaftHandleCurrentDevice() const; /// Overrides the default stream for a device to the user-supplied stream. diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp index 6120fd0bc7..fd99074eda 100644 --- a/faiss/gpu/StandardGpuResources.cpp +++ b/faiss/gpu/StandardGpuResources.cpp @@ -5,9 +5,7 @@ * LICENSE file in the root directory of this source tree. */ -#ifdef FAISS_ENABLE_RAFT #include -#endif #include #include @@ -317,10 +315,8 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) { defaultStreams_[device] = defaultStream; -#ifdef FAISS_ENABLE_RAFT raft::handle_t handle(defaultStream); raftHandles_[device] = handle; -#endif cudaStream_t asyncCopyStream = 0; CUDA_VERIFY( @@ -384,7 +380,6 @@ cudaStream_t StandardGpuResourcesImpl::getDefaultStream(int device) { return defaultStreams_[device]; } -#ifdef FAISS_ENABLE_RAFT raft::handle_t &StandardGpuResourcesImpl::getRaftHandle(int device) const { initializeForDevice(device); @@ -398,7 +393,6 @@ raft::handle_t &StandardGpuResourcesImpl::getRaftHandle(int device) const { return raftHandles_[device]; } -#endif std::vector StandardGpuResourcesImpl::getAlternateStreams( int device) { @@ -625,11 +619,9 @@ cudaStream_t StandardGpuResources::getDefaultStream(int device) { return res_->getDefaultStream(device); } -#ifdef FAISS_ENABLE_RAFT raft::handle_t &StandardGpuResources::getRaftHandle(int device) const { return res_->getRaftHandle(device); } -#endif size_t StandardGpuResources::getTempMemoryAvailable(int device) const { return res_->getTempMemoryAvailable(device); diff --git a/faiss/gpu/StandardGpuResources.h b/faiss/gpu/StandardGpuResources.h index 408221e4b9..115b34a2fd 100644 --- a/faiss/gpu/StandardGpuResources.h +++ b/faiss/gpu/StandardGpuResources.h @@ -60,11 +60,9 @@ class StandardGpuResourcesImpl : public GpuResources { /// this stream upon exit from an index or other Faiss GPU call. 
cudaStream_t getDefaultStream(int device) override; -#ifdef FAISS_ENABLE_RAFT /// Returns the raft handle for the given device which can be used to /// make calls to other raft primitives. raft::handle_t &getRaftHandle(int device) const override; -#endif /// Called to change the work ordering streams to the null stream /// for all devices @@ -132,10 +130,8 @@ class StandardGpuResourcesImpl : public GpuResources { /// cuBLAS handle for each device std::unordered_map blasHandles_; -#ifdef FAISS_ENABLE_RAFT /// raft handle for each device std::unordered_map raftHandles_; -#endif /// Pinned memory allocation for use with this GPU void* pinnedMemAlloc_; @@ -199,11 +195,9 @@ class StandardGpuResources : public GpuResourcesProvider { /// Returns the current default stream cudaStream_t getDefaultStream(int device); -#ifdef FAISS_ENABLE_RAFT /// Returns the raft handle for the given device which can be used to /// make calls to other raft primitives. - raft::handle_t &getRaftHandle(int device) const override; -#endif + raft::handle_t &getRaftHandle(int device) const; /// Returns the current amount of temp memory available size_t getTempMemoryAvailable(int device) const; diff --git a/faiss/gpu/impl/raft/RmmGpuResources.hpp b/faiss/gpu/impl/raft/RmmGpuResources.hpp index d9e87fb0ad..409e33f402 100644 --- a/faiss/gpu/impl/raft/RmmGpuResources.hpp +++ b/faiss/gpu/impl/raft/RmmGpuResources.hpp @@ -222,10 +222,6 @@ class RmmGpuResourcesImpl : public GpuResources { } userDefaultStreams_[device] = stream; - -#ifdef FAISS_ENABLE_RAFT - raftHandles_[device] = raft::handle_t(stream); -#endif }; /// Revert the default stream to the original stream managed by this resources @@ -247,9 +243,6 @@ class RmmGpuResourcesImpl : public GpuResources { } userDefaultStreams_.erase(device); -#ifdef FAISS_ENABLE_RAFT - raftHandles_.erase(device); -#endif }; /// Returns the stream for the given device on which all Faiss GPU work is From a9b69638c05a0b175e16983f58284f4ae9fc2e2c Mon Sep 17 00:00:00 
2001 From: "Corey J. Nolet" Date: Thu, 20 Oct 2022 18:50:03 -0400 Subject: [PATCH 33/87] Getting things building again. Adding raft handle to gpu resources. --- faiss/gpu/GpuResources.cpp | 2 +- faiss/gpu/GpuResources.h | 4 ++-- faiss/gpu/StandardGpuResources.cpp | 11 +++++------ faiss/gpu/StandardGpuResources.h | 4 ++-- faiss/gpu/impl/raft/RaftIVFFlat.cu | 20 ++++++++++++++++++++ faiss/gpu/impl/raft/RmmGpuResources.hpp | 15 +++++++++++---- 6 files changed, 41 insertions(+), 15 deletions(-) diff --git a/faiss/gpu/GpuResources.cpp b/faiss/gpu/GpuResources.cpp index e964d13155..0129ddafd4 100644 --- a/faiss/gpu/GpuResources.cpp +++ b/faiss/gpu/GpuResources.cpp @@ -153,7 +153,7 @@ cudaStream_t GpuResources::getDefaultStreamCurrentDevice() { return getDefaultStream(getCurrentDevice()); } -raft::handle_t &GpuResources::getRaftHandleCurrentDevice() const { +raft::handle_t &GpuResources::getRaftHandleCurrentDevice() { return getRaftHandle(getCurrentDevice()); } diff --git a/faiss/gpu/GpuResources.h b/faiss/gpu/GpuResources.h index 5d627fcb09..c286fbae82 100644 --- a/faiss/gpu/GpuResources.h +++ b/faiss/gpu/GpuResources.h @@ -194,8 +194,8 @@ class GpuResources { /// Returns the raft handle for the given device which can be used to /// make calls to other raft primitives. - virtual raft::handle_t &getRaftHandle(int device) const; - raft::handle_t &getRaftHandleCurrentDevice() const; + virtual raft::handle_t &getRaftHandle(int device) = 0; + raft::handle_t &getRaftHandleCurrentDevice(); /// Overrides the default stream for a device to the user-supplied stream. 
/// The resources object does not own this stream (i.e., it will not destroy diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp index fd99074eda..c593264ab0 100644 --- a/faiss/gpu/StandardGpuResources.cpp +++ b/faiss/gpu/StandardGpuResources.cpp @@ -315,8 +315,7 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) { defaultStreams_[device] = defaultStream; - raft::handle_t handle(defaultStream); - raftHandles_[device] = handle; + raftHandles_.emplace(std::make_pair(device, defaultStream)); cudaStream_t asyncCopyStream = 0; CUDA_VERIFY( @@ -380,16 +379,16 @@ cudaStream_t StandardGpuResourcesImpl::getDefaultStream(int device) { return defaultStreams_[device]; } -raft::handle_t &StandardGpuResourcesImpl::getRaftHandle(int device) const { +raft::handle_t &StandardGpuResourcesImpl::getRaftHandle(int device) { initializeForDevice(device); auto it = raftHandles_.find(device); if (it != raftHandles_.end()) { - // There is a user override stream set + // There is a user override handle set return it->second; } - // Otherwise, our base default stream + // Otherwise, our base default handle return raftHandles_[device]; } @@ -619,7 +618,7 @@ cudaStream_t StandardGpuResources::getDefaultStream(int device) { return res_->getDefaultStream(device); } - raft::handle_t &StandardGpuResources::getRaftHandle(int device) const { + raft::handle_t &StandardGpuResources::getRaftHandle(int device) { return res_->getRaftHandle(device); } diff --git a/faiss/gpu/StandardGpuResources.h b/faiss/gpu/StandardGpuResources.h index 115b34a2fd..e28d89a492 100644 --- a/faiss/gpu/StandardGpuResources.h +++ b/faiss/gpu/StandardGpuResources.h @@ -62,7 +62,7 @@ class StandardGpuResourcesImpl : public GpuResources { /// Returns the raft handle for the given device which can be used to /// make calls to other raft primitives. 
- raft::handle_t &getRaftHandle(int device) const override; + raft::handle_t &getRaftHandle(int device) override; /// Called to change the work ordering streams to the null stream /// for all devices @@ -197,7 +197,7 @@ class StandardGpuResources : public GpuResourcesProvider { /// Returns the raft handle for the given device which can be used to /// make calls to other raft primitives. - raft::handle_t &getRaftHandle(int device) const; + raft::handle_t &getRaftHandle(int device); /// Returns the current amount of temp memory available size_t getTempMemoryAvailable(int device) const; diff --git a/faiss/gpu/impl/raft/RaftIVFFlat.cu b/faiss/gpu/impl/raft/RaftIVFFlat.cu index 17c8581ea8..9f24a244f6 100644 --- a/faiss/gpu/impl/raft/RaftIVFFlat.cu +++ b/faiss/gpu/impl/raft/RaftIVFFlat.cu @@ -151,6 +151,26 @@ void RaftIVFFlat::searchImpl_( // TODO: Fill in this logic here. +// // Device is already set in GpuIndex::search +// FAISS_ASSERT(raft_knn_index.has_value()); +// FAISS_ASSERT(n > 0); +// FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= nlist); +// +// raft::spatial::knn::ivf_flat::search_params pams; +// pams.n_probes = nprobe; +// raft::spatial::knn::ivf_flat::search( +// raft_handle, +// pams, +// *raft_knn_index, +// const_cast(x), +// static_cast(n), +// static_cast(k), +// labels, +// distances); +// +// raft_handle.sync_stream(); + + // If the GPU isn't storing indices (they are on the CPU side), we // need to perform the re-mapping here // FIXME: we might ultimately be calling this function with inputs diff --git a/faiss/gpu/impl/raft/RmmGpuResources.hpp b/faiss/gpu/impl/raft/RmmGpuResources.hpp index 409e33f402..c22c722a35 100644 --- a/faiss/gpu/impl/raft/RmmGpuResources.hpp +++ b/faiss/gpu/impl/raft/RmmGpuResources.hpp @@ -495,10 +495,17 @@ class RmmGpuResourcesImpl : public GpuResources { return defaultStreams_.count(device) != 0; }; - std::unique_ptr getRaftHandle(int device) const { + raft::handle_t &getRaftHandle(int device) { + initializeForDevice(device); 
+ auto it = raftHandles_.find(device); - FAISS_ASSERT(it != raftHandles_.end()); - return it->second; + if (it != raftHandles_.end()) { + // There is a user override handle set + return it->second; + } + + // Otherwise, our base default handle + return raftHandles_[device]; } /// Adjust the default temporary memory allocation based on the total GPU @@ -573,7 +580,7 @@ class RmmGpuResourcesImpl : public GpuResources { std::unique_ptr pmr; /// Our raft handle that maintains additional library resources, one per each device - std::unordered_map> raftHandles_; + std::unordered_map raftHandles_; }; From b640ba8c74b46f1c12b290908311a1f29eade33e Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 26 Oct 2022 17:10:22 -0400 Subject: [PATCH 34/87] Getting FAISS building again w/ RaftIVFFlat --- CMakeLists.txt | 2 +- cmake/thirdparty/get_raft.cmake | 4 +- faiss/gpu/GpuIndexIVFFlat.h | 1 + faiss/gpu/impl/IVFFlat.cu | 8 ++ faiss/gpu/impl/raft/RaftIVFFlat.cu | 122 ++++++++++++++++++---------- faiss/gpu/impl/raft/RaftIVFFlat.cuh | 34 +++++--- 6 files changed, 116 insertions(+), 55 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a0ff1eceb6..5273da200a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,7 +33,7 @@ include(GNUInstallDirs) if(FAISS_ENABLE_RAFT) set(CMAKE_CXX_STANDARD 17) else() -set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD 17) endif() list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") diff --git a/cmake/thirdparty/get_raft.cmake b/cmake/thirdparty/get_raft.cmake index 91f53b0f4d..2b7825d193 100644 --- a/cmake/thirdparty/get_raft.cmake +++ b/cmake/thirdparty/get_raft.cmake @@ -16,8 +16,8 @@ set(RAFT_VERSION "${RAPIDS_VERSION}") -set(RAFT_FORK "rapidsai") -set(RAFT_PINNED_TAG "branch-${RAPIDS_VERSION}") +set(RAFT_FORK "cjnolet") +set(RAFT_PINNED_TAG "bug-2212-ivf_flat_apis") function(find_and_configure_raft) set(oneValueArgs VERSION FORK PINNED_TAG) diff --git a/faiss/gpu/GpuIndexIVFFlat.h b/faiss/gpu/GpuIndexIVFFlat.h index 
a519f11cc2..c8ab0068da 100644 --- a/faiss/gpu/GpuIndexIVFFlat.h +++ b/faiss/gpu/GpuIndexIVFFlat.h @@ -9,6 +9,7 @@ #include #include + #include namespace faiss { diff --git a/faiss/gpu/impl/IVFFlat.cu b/faiss/gpu/impl/IVFFlat.cu index dd8b1c86a2..a42e06cde3 100644 --- a/faiss/gpu/impl/IVFFlat.cu +++ b/faiss/gpu/impl/IVFFlat.cu @@ -286,8 +286,16 @@ void IVFFlat::searchPreassigned( void IVFFlat::searchImpl_( Tensor& queries, + + /** + * + */ Tensor& coarseDistances, Tensor& coarseIndices, + + /** + * This is raft::neighbors::ivf_flat::index::centers_ + */ Tensor& ivfCentroids, int k, Tensor& outDistances, diff --git a/faiss/gpu/impl/raft/RaftIVFFlat.cu b/faiss/gpu/impl/raft/RaftIVFFlat.cu index 9f24a244f6..1f01f74242 100644 --- a/faiss/gpu/impl/raft/RaftIVFFlat.cu +++ b/faiss/gpu/impl/raft/RaftIVFFlat.cu @@ -5,8 +5,9 @@ * LICENSE file in the root directory of this source tree. */ - +#include #include +#include #include #include @@ -57,6 +58,45 @@ RaftIVFFlat::RaftIVFFlat( RaftIVFFlat::~RaftIVFFlat() {} + +/// Find the approximate k nearest neigbors for `queries` against +/// our database +void RaftIVFFlat::search( + Index* coarseQuantizer, + Tensor& queries, + int nprobe, + int k, + Tensor& outDistances, + Tensor& outIndices) { + + // TODO: We probably don't want to ignore the coarse quantizer here... 
+ + std::uint32_t n = queries.getSize(0); + std::uint32_t cols = queries.getSize(1); + std::uint32_t k_ = k; + + // Device is already set in GpuIndex::search + FAISS_ASSERT(raft_knn_index.has_value()); + FAISS_ASSERT(n > 0); + FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= numLists_); + + const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); + raft::spatial::knn::ivf_flat::search_params pams; + pams.n_probes = nprobe; + + // TODO: + + auto queries_view = raft::make_device_matrix_view(queries.data(), n, cols); + auto out_inds_view = raft::make_device_matrix_view(outIndices.data(), n, k_); + auto out_dists_view = raft::make_device_matrix_view(outDistances.data(), n, k_); + raft::spatial::knn::ivf_flat::search( + raft_handle, *raft_knn_index, queries_view, + out_inds_view, out_dists_view, pams, k_); + + raft_handle.sync_stream(); +} + + size_t RaftIVFFlat::getGpuVectorsEncodingSize_(int numVecs) const { if (interleavedLayout_) { // bits per scalar code @@ -136,26 +176,26 @@ void RaftIVFFlat::appendVectors_( // TODO: Fill in this logic here } -void RaftIVFFlat::searchImpl_( - Tensor& queries, - Tensor& coarseDistances, - Tensor& coarseIndices, - Tensor& ivfCentroids, - int k, - Tensor& outDistances, - Tensor& outIndices, - bool storePairs) { - FAISS_ASSERT(storePairs == false); - - auto stream = resources_->getDefaultStreamCurrentDevice(); - - // TODO: Fill in this logic here. - -// // Device is already set in GpuIndex::search -// FAISS_ASSERT(raft_knn_index.has_value()); -// FAISS_ASSERT(n > 0); -// FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= nlist); +//void RaftIVFFlat::searchImpl_( +// Tensor& queries, +// Tensor& coarseDistances, +// Tensor& coarseIndices, +// Tensor& ivfCentroids, +// int k, +// Tensor& outDistances, +// Tensor& outIndices, +// bool storePairs) { +// FAISS_ASSERT(storePairs == false); // +// auto stream = resources_->getDefaultStreamCurrentDevice(); +// +// // TODO: Fill in this logic here. 
+// +//// // Device is already set in GpuIndex::search +//// FAISS_ASSERT(raft_knn_index.has_value()); +//// FAISS_ASSERT(n > 0); +//// FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= nlist); +//// // raft::spatial::knn::ivf_flat::search_params pams; // pams.n_probes = nprobe; // raft::spatial::knn::ivf_flat::search( @@ -169,27 +209,27 @@ void RaftIVFFlat::searchImpl_( // distances); // // raft_handle.sync_stream(); - - - // If the GPU isn't storing indices (they are on the CPU side), we - // need to perform the re-mapping here - // FIXME: we might ultimately be calling this function with inputs - // from the CPU, these are unnecessary copies - if (indicesOptions_ == INDICES_CPU) { - HostTensor hostOutIndices(outIndices, stream); - - ivfOffsetToUserIndex( - hostOutIndices.data(), - numLists_, - hostOutIndices.getSize(0), - hostOutIndices.getSize(1), - listOffsetToUserIndex_); - - // Copy back to GPU, since the input to this function is on the - // GPU - outIndices.copyFrom(hostOutIndices, stream); - } -} +// +// +// // If the GPU isn't storing indices (they are on the CPU side), we +// // need to perform the re-mapping here +// // FIXME: we might ultimately be calling this function with inputs +// // from the CPU, these are unnecessary copies +// if (indicesOptions_ == INDICES_CPU) { +// HostTensor hostOutIndices(outIndices, stream); +// +// ivfOffsetToUserIndex( +// hostOutIndices.data(), +// numLists_, +// hostOutIndices.getSize(0), +// hostOutIndices.getSize(1), +// listOffsetToUserIndex_); +// +// // Copy back to GPU, since the input to this function is on the +// // GPU +// outIndices.copyFrom(hostOutIndices, stream); +// } +//} } // namespace gpu } // namespace faiss diff --git a/faiss/gpu/impl/raft/RaftIVFFlat.cuh b/faiss/gpu/impl/raft/RaftIVFFlat.cuh index 16754a7fe8..05ca705588 100644 --- a/faiss/gpu/impl/raft/RaftIVFFlat.cuh +++ b/faiss/gpu/impl/raft/RaftIVFFlat.cuh @@ -34,6 +34,18 @@ class RaftIVFFlat : public IVFFlat { ~RaftIVFFlat() override; + + /// Find 
the approximate k nearest neigbors for `queries` against + /// our database + void search( + Index* coarseQuantizer, + Tensor& queries, + int nprobe, + int k, + Tensor& outDistances, + Tensor& outIndices) override; + + protected: /// Returns the number of bytes in which an IVF list containing numVecs /// vectors is encoded on the device. Note that due to padding this is not @@ -65,17 +77,17 @@ class RaftIVFFlat : public IVFFlat { Tensor& listOffset, cudaStream_t stream) override; - /// Shared IVF search implementation, used by both search and - /// searchPreassigned - void searchImpl_( - Tensor& queries, - Tensor& coarseDistances, - Tensor& coarseIndices, - Tensor& ivfCentroids, - int k, - Tensor& outDistances, - Tensor& outIndices, - bool storePairs); +// /// Shared IVF search implementation, used by both search and +// /// searchPreassigned +// void searchImpl_( +// Tensor& queries, +// Tensor& coarseDistances, +// Tensor& coarseIndices, +// Tensor& ivfCentroids, +// int k, +// Tensor& outDistances, +// Tensor& outIndices, +// bool storePairs); protected: std::optional> raft_knn_index{std::nullopt}; From af6d1e9c998883eb60d7d0efba90a739255fde9f Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 26 Oct 2022 18:46:33 -0400 Subject: [PATCH 35/87] Adding the append vectors to raft index IVF flat. --- faiss/gpu/GpuIndexIVF.cu | 28 ++++-- faiss/gpu/impl/IVFBase.cu | 1 + faiss/gpu/impl/raft/RaftIVFFlat.cu | 148 ++++------------------------ faiss/gpu/impl/raft/RaftIVFFlat.cuh | 52 ++-------- 4 files changed, 47 insertions(+), 182 deletions(-) diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu index ff4eb974b9..bfd5f16c8d 100644 --- a/faiss/gpu/GpuIndexIVF.cu +++ b/faiss/gpu/GpuIndexIVF.cu @@ -5,6 +5,11 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include +#include +#include +#include + #include #include #include @@ -460,17 +465,24 @@ void GpuIndexIVF::trainQuantizer_(Index::idx_t n, const float* x) { } if(config_.use_raft) { - /** - * TODO: Plug in clustering logic here. - * - * Essentially what we need here is to use `x` as the training data set - * to train the k-means centroids and add them to the quantizer - * implementation. - */ + const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); + + raft::neighbors::ivf_flat::index_params raft_idx_params; + raft_idx_params.n_lists = nlist; + raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; + raft_idx_params.add_data_on_build = false; + raft_idx_params.kmeans_n_iters = 100; + auto raft_index = raft::neighbors::ivf_flat::build( + raft_handle, raft_idx_params, x, n, (Index::idx_t)d); + raft_handle.sync_stream(); + // TODO: Validate this is all we need to do + quantizer->reset(); + quantizer->train(nlist, raft_index.centers().data_handle()); + quantizer->add(nlist, raft_index.centers().data_handle()); } else { // leverage the CPU-side k-means code, which works for the GPU @@ -479,9 +491,9 @@ void GpuIndexIVF::trainQuantizer_(Index::idx_t n, const float* x) { Clustering clus(this->d, nlist, this->cp); clus.verbose = verbose; clus.train(n, x, *quantizer); - quantizer->is_trained = true; } + quantizer->is_trained = true; FAISS_ASSERT(quantizer->ntotal == nlist); } diff --git a/faiss/gpu/impl/IVFBase.cu b/faiss/gpu/impl/IVFBase.cu index b3995e106d..0e6cbc85f9 100644 --- a/faiss/gpu/impl/IVFBase.cu +++ b/faiss/gpu/impl/IVFBase.cu @@ -582,6 +582,7 @@ void IVFBase::searchCoarseQuantizer_( } } +// TODO: Is it best to plug in here? 
int IVFBase::addVectors( Index* coarseQuantizer, Tensor& vecs, diff --git a/faiss/gpu/impl/raft/RaftIVFFlat.cu b/faiss/gpu/impl/raft/RaftIVFFlat.cu index 1f01f74242..f12d7a7c7d 100644 --- a/faiss/gpu/impl/raft/RaftIVFFlat.cu +++ b/faiss/gpu/impl/raft/RaftIVFFlat.cu @@ -84,8 +84,6 @@ void RaftIVFFlat::search( raft::spatial::knn::ivf_flat::search_params pams; pams.n_probes = nprobe; - // TODO: - auto queries_view = raft::make_device_matrix_view(queries.data(), n, cols); auto out_inds_view = raft::make_device_matrix_view(outIndices.data(), n, k_); auto out_dists_view = raft::make_device_matrix_view(outDistances.data(), n, k_); @@ -96,140 +94,28 @@ void RaftIVFFlat::search( raft_handle.sync_stream(); } +/// Classify and encode/add vectors to our IVF lists. +/// The input data must be on our current device. +/// Returns the number of vectors successfully added. Vectors may +/// not be able to be added because they contain NaNs. +int RaftIVFFlat::addVectors( + Index* coarseQuantizer, + Tensor& vecs, + Tensor& indices) { -size_t RaftIVFFlat::getGpuVectorsEncodingSize_(int numVecs) const { - if (interleavedLayout_) { - // bits per scalar code - int bits = scalarQ_ ? scalarQ_->bits : 32 /* float */; - - // bytes to encode a block of 32 vectors (single dimension) - int bytesPerDimBlock = bits * 32 / 8; - - // bytes to fully encode 32 vectors - int bytesPerBlock = bytesPerDimBlock * dim_; - - // number of blocks of 32 vectors we have - int numBlocks = utils::divUp(numVecs, 32); - - // total size to encode numVecs - return bytesPerBlock * numBlocks; - } else { - size_t sizePerVector = - (scalarQ_ ? scalarQ_->code_size : sizeof(float) * dim_); - - return (size_t)numVecs * sizePerVector; - } -} - -size_t RaftIVFFlat::getCpuVectorsEncodingSize_(int numVecs) const { - size_t sizePerVector = - (scalarQ_ ? 
scalarQ_->code_size : sizeof(float) * dim_); - - return (size_t)numVecs * sizePerVector; -} - -std::vector RaftIVFFlat::translateCodesToGpu_( - std::vector codes, - size_t numVecs) const { - if (!interleavedLayout_) { - // same format - return codes; - } - - int bitsPerCode = scalarQ_ ? scalarQ_->bits : 32; - - auto up = - unpackNonInterleaved(std::move(codes), numVecs, dim_, bitsPerCode); - return packInterleaved(std::move(up), numVecs, dim_, bitsPerCode); -} - -std::vector RaftIVFFlat::translateCodesFromGpu_( - std::vector codes, - size_t numVecs) const { - if (!interleavedLayout_) { - // same format - return codes; - } - - int bitsPerCode = scalarQ_ ? scalarQ_->bits : 32; - - auto up = unpackInterleaved(std::move(codes), numVecs, dim_, bitsPerCode); - return packNonInterleaved(std::move(up), numVecs, dim_, bitsPerCode); -} + auto vecs_view = raft::make_device_matrix_view(vecs.data(), vecs.getSize(0), dim_); + auto inds_view = raft::make_device_vector_view(indices.data(), (Index::idx_t )indices.getSize(0)); + const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); -void RaftIVFFlat::appendVectors_( - Tensor& vecs, - Tensor& ivfCentroidResiduals, - Tensor& indices, - Tensor& uniqueLists, - Tensor& vectorsByUniqueList, - Tensor& uniqueListVectorStart, - Tensor& uniqueListStartOffset, - Tensor& listIds, - Tensor& listOffset, - cudaStream_t stream) { - // - // Append the new encodings - // - - // TODO: Fill in this logic here + // TODO: We probably don't want to ignore the coarse quantizer here + raft_knn_index.emplace(raft::neighbors::ivf_flat::extend( + raft_handle, + raft_knn_index.value(), + vecs_view, + std::make_optional>(inds_view))); } -//void RaftIVFFlat::searchImpl_( -// Tensor& queries, -// Tensor& coarseDistances, -// Tensor& coarseIndices, -// Tensor& ivfCentroids, -// int k, -// Tensor& outDistances, -// Tensor& outIndices, -// bool storePairs) { -// FAISS_ASSERT(storePairs == false); -// -// auto stream = 
resources_->getDefaultStreamCurrentDevice(); -// -// // TODO: Fill in this logic here. -// -//// // Device is already set in GpuIndex::search -//// FAISS_ASSERT(raft_knn_index.has_value()); -//// FAISS_ASSERT(n > 0); -//// FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= nlist); -//// -// raft::spatial::knn::ivf_flat::search_params pams; -// pams.n_probes = nprobe; -// raft::spatial::knn::ivf_flat::search( -// raft_handle, -// pams, -// *raft_knn_index, -// const_cast(x), -// static_cast(n), -// static_cast(k), -// labels, -// distances); -// -// raft_handle.sync_stream(); -// -// -// // If the GPU isn't storing indices (they are on the CPU side), we -// // need to perform the re-mapping here -// // FIXME: we might ultimately be calling this function with inputs -// // from the CPU, these are unnecessary copies -// if (indicesOptions_ == INDICES_CPU) { -// HostTensor hostOutIndices(outIndices, stream); -// -// ivfOffsetToUserIndex( -// hostOutIndices.data(), -// numLists_, -// hostOutIndices.getSize(0), -// hostOutIndices.getSize(1), -// listOffsetToUserIndex_); -// -// // Copy back to GPU, since the input to this function is on the -// // GPU -// outIndices.copyFrom(hostOutIndices, stream); -// } -//} } // namespace gpu } // namespace faiss diff --git a/faiss/gpu/impl/raft/RaftIVFFlat.cuh b/faiss/gpu/impl/raft/RaftIVFFlat.cuh index 05ca705588..c2556c448f 100644 --- a/faiss/gpu/impl/raft/RaftIVFFlat.cuh +++ b/faiss/gpu/impl/raft/RaftIVFFlat.cuh @@ -8,7 +8,7 @@ #pragma once #include -#include +#include #include #include @@ -45,52 +45,18 @@ class RaftIVFFlat : public IVFFlat { Tensor& outDistances, Tensor& outIndices) override; - - protected: - /// Returns the number of bytes in which an IVF list containing numVecs - /// vectors is encoded on the device. 
Note that due to padding this is not - /// the same as the encoding size for a subset of vectors in an IVF list; - /// this is the size for an entire IVF list - size_t getGpuVectorsEncodingSize_(int numVecs) const override; - size_t getCpuVectorsEncodingSize_(int numVecs) const override; - - /// Translate to our preferred GPU encoding - std::vector translateCodesToGpu_( - std::vector codes, - size_t numVecs) const override; - - /// Translate from our preferred GPU encoding - std::vector translateCodesFromGpu_( - std::vector codes, - size_t numVecs) const override; - - /// Encode the vectors that we're adding and append to our IVF lists - void appendVectors_( + /// Classify and encode/add vectors to our IVF lists. + /// The input data must be on our current device. + /// Returns the number of vectors successfully added. Vectors may + /// not be able to be added because they contain NaNs. + int addVectors( + Index* coarseQuantizer, Tensor& vecs, - Tensor& ivfCentroidResiduals, - Tensor& indices, - Tensor& uniqueLists, - Tensor& vectorsByUniqueList, - Tensor& uniqueListVectorStart, - Tensor& uniqueListStartOffset, - Tensor& listIds, - Tensor& listOffset, - cudaStream_t stream) override; + Tensor& indices); -// /// Shared IVF search implementation, used by both search and -// /// searchPreassigned -// void searchImpl_( -// Tensor& queries, -// Tensor& coarseDistances, -// Tensor& coarseIndices, -// Tensor& ivfCentroids, -// int k, -// Tensor& outDistances, -// Tensor& outIndices, -// bool storePairs); protected: - std::optional> raft_knn_index{std::nullopt}; + std::optional> raft_knn_index{std::nullopt}; }; From 545b3d22f712921a3b43756cee35754a7883cbda Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 27 Oct 2022 11:56:59 -0400 Subject: [PATCH 36/87] Adding flatindex for the fused l2 knn --- faiss/gpu/CMakeLists.txt | 12 +- faiss/gpu/GpuIndexFlat.cu | 38 +- faiss/gpu/GpuIndexFlat.h | 3 + faiss/gpu/GpuIndexIVFFlat.cu | 2 +- faiss/gpu/StandardGpuResources.h | 4 +- faiss/gpu/impl/FlatIndex.cuh | 2 +- faiss/gpu/impl/IVFBase.cuh | 14 +- faiss/gpu/impl/RaftFlatIndex.cu | 70 ++ faiss/gpu/impl/RaftFlatIndex.cuh | 42 ++ faiss/gpu/impl/RaftIVFFlat.cu | 232 ++++++ faiss/gpu/impl/{raft => }/RaftIVFFlat.cuh | 33 +- faiss/gpu/impl/{raft => }/RaftIndexIVFFlat.cu | 2 +- faiss/gpu/impl/{raft => }/RaftIndexIVFFlat.h | 0 faiss/gpu/impl/raft/RaftIVFFlat.cu | 121 --- faiss/gpu/impl/raft/RaftIndexIVFPQ.cu | 419 ----------- faiss/gpu/impl/raft/RaftIndexIVFPQ.h | 165 ---- faiss/gpu/impl/raft/RmmGpuResources.hpp | 656 ---------------- faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 1 + faiss/gpu/test/TestRaftIndexIVFFlat.cpp | 22 +- faiss/gpu/test/TestRaftIndexIVFPQ.cpp | 704 ------------------ 20 files changed, 434 insertions(+), 2108 deletions(-) create mode 100644 faiss/gpu/impl/RaftFlatIndex.cu create mode 100644 faiss/gpu/impl/RaftFlatIndex.cuh create mode 100644 faiss/gpu/impl/RaftIVFFlat.cu rename faiss/gpu/impl/{raft => }/RaftIVFFlat.cuh (55%) rename faiss/gpu/impl/{raft => }/RaftIndexIVFFlat.cu (99%) rename faiss/gpu/impl/{raft => }/RaftIndexIVFFlat.h (100%) delete mode 100644 faiss/gpu/impl/raft/RaftIVFFlat.cu delete mode 100644 faiss/gpu/impl/raft/RaftIndexIVFPQ.cu delete mode 100644 faiss/gpu/impl/raft/RaftIndexIVFPQ.h delete mode 100644 faiss/gpu/impl/raft/RmmGpuResources.hpp delete mode 100644 faiss/gpu/test/TestRaftIndexIVFPQ.cpp diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt index 8b373aecb8..0e82af813c 100644 --- a/faiss/gpu/CMakeLists.txt +++ b/faiss/gpu/CMakeLists.txt @@ -165,10 +165,14 @@ set(FAISS_GPU_HEADERS ) if(FAISS_ENABLE_RAFT) - list(APPEND FAISS_GPU_HEADERS impl/raft/RaftIndexIVFFlat.h impl/raft/RaftIndexIVFPQ.h -
impl/raft/RaftIVFFlat.cuh) - list(APPEND FAISS_GPU_SRC impl/raft/RaftIndexIVFFlat.cu impl/raft/RaftIndexIVFPQ.cu - impl/raft/RaftIVFFlat.cu) + list(APPEND FAISS_GPU_HEADERS + impl/RaftIndexIVFFlat.h + impl/RaftFlatIndex.cuh + impl/RaftIVFFlat.cuh) + list(APPEND FAISS_GPU_SRC + impl/RaftIndexIVFFlat.cu + impl/RaftFlatIndex.cu + impl/RaftIVFFlat.cu) endif() # Export FAISS_GPU_HEADERS variable to parent scope. diff --git a/faiss/gpu/GpuIndexFlat.cu b/faiss/gpu/GpuIndexFlat.cu index b005f0eaf4..429eb64db7 100644 --- a/faiss/gpu/GpuIndexFlat.cu +++ b/faiss/gpu/GpuIndexFlat.cu @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -67,11 +68,7 @@ GpuIndexFlat::GpuIndexFlat( this->is_trained = true; // Construct index - data_.reset(new FlatIndex( - resources_.get(), - dims, - flatConfig_.useFloat16, - config_.memorySpace)); + resetIndex_(dims); } GpuIndexFlat::GpuIndexFlat( @@ -86,15 +83,30 @@ GpuIndexFlat::GpuIndexFlat( this->is_trained = true; // Construct index - data_.reset(new FlatIndex( - resources_.get(), - dims, - flatConfig_.useFloat16, - config_.memorySpace)); + resetIndex_(dims); } GpuIndexFlat::~GpuIndexFlat() {} +void GpuIndexFlat::resetIndex_(int dims) { + + if(config_.use_raft) { + data_.reset(new RaftFlatIndex( + resources_.get(), + dims, + flatConfig_.useFloat16, + config_.memorySpace)); + + } else { + data_.reset(new FlatIndex( + resources_.get(), + dims, + flatConfig_.useFloat16, + config_.memorySpace)); + } +} + + void GpuIndexFlat::copyFrom(const faiss::IndexFlat* index) { DeviceScope scope(config_.device); @@ -109,11 +121,7 @@ void GpuIndexFlat::copyFrom(const faiss::IndexFlat* index) { (size_t)index->ntotal); data_.reset(); - data_.reset(new FlatIndex( - resources_.get(), - this->d, - flatConfig_.useFloat16, - config_.memorySpace)); + resetIndex_(this->d); // The index could be empty if (index->ntotal > 0) { diff --git a/faiss/gpu/GpuIndexFlat.h b/faiss/gpu/GpuIndexFlat.h index 3cc56bc905..4982646159 100644 --- 
a/faiss/gpu/GpuIndexFlat.h +++ b/faiss/gpu/GpuIndexFlat.h @@ -116,6 +116,9 @@ class GpuIndexFlat : public GpuIndex { } protected: + + void resetIndex_(int dims); + /// Flat index does not require IDs as there is no storage available for /// them bool addImplRequiresIDs_() const override; diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 642058c4d4..033c7189c9 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -6,7 +6,7 @@ */ #include -#include +#include #include #include #include diff --git a/faiss/gpu/StandardGpuResources.h b/faiss/gpu/StandardGpuResources.h index e28d89a492..672f1b8339 100644 --- a/faiss/gpu/StandardGpuResources.h +++ b/faiss/gpu/StandardGpuResources.h @@ -98,7 +98,7 @@ class StandardGpuResourcesImpl : public GpuResources { cudaStream_t getAsyncCopyStream(int device) override; - private: + protected: /// Have GPU resources been initialized for this device yet? bool isInitialized(int device) const; @@ -106,7 +106,7 @@ class StandardGpuResourcesImpl : public GpuResources { /// memory size static size_t getDefaultTempMemForGPU(int device, size_t requested); - private: + protected: /// Set of currently outstanding memory allocations per device /// device -> (alloc request, allocated ptr) std::unordered_map> allocs_; diff --git a/faiss/gpu/impl/FlatIndex.cuh b/faiss/gpu/impl/FlatIndex.cuh index bb66cd4b2b..56fbe609b9 100644 --- a/faiss/gpu/impl/FlatIndex.cuh +++ b/faiss/gpu/impl/FlatIndex.cuh @@ -80,7 +80,7 @@ class FlatIndex { /// Free all storage void reset(); - private: + protected: /// Collection of GPU resources that we use GpuResources* resources_; diff --git a/faiss/gpu/impl/IVFBase.cuh b/faiss/gpu/impl/IVFBase.cuh index 5fadfe7f70..e15c4c958d 100644 --- a/faiss/gpu/impl/IVFBase.cuh +++ b/faiss/gpu/impl/IVFBase.cuh @@ -45,7 +45,7 @@ class IVFBase { /// Clear out all inverted lists, but retain the coarse quantizer /// and the product quantizer info - void reset(); + virtual void reset(); 
/// Return the number of dimensions we are indexing int getDim() const; @@ -59,19 +59,19 @@ class IVFBase { /// For debugging purposes, return the list length of a particular /// list - int getListLength(int listId) const; + virtual int getListLength(int listId) const; /// Return the list indices of a particular list back to the CPU - std::vector getListIndices(int listId) const; + virtual std::vector getListIndices(int listId) const; /// Return the encoded vectors of a particular list back to the CPU - std::vector getListVectorData(int listId, bool gpuFormat) const; + virtual std::vector getListVectorData(int listId, bool gpuFormat) const; /// Copy all inverted lists from a CPU representation to ourselves - void copyInvertedListsFrom(const InvertedLists* ivf); + virtual void copyInvertedListsFrom(const InvertedLists* ivf); /// Copy all inverted lists from ourselves to a CPU representation - void copyInvertedListsTo(InvertedLists* ivf); + virtual void copyInvertedListsTo(InvertedLists* ivf); /// Update our coarse quantizer with this quantizer instance; may be a CPU /// or GPU quantizer @@ -81,7 +81,7 @@ class IVFBase { /// The input data must be on our current device. /// Returns the number of vectors successfully added. Vectors may /// not be able to be added because they contain NaNs. - int addVectors( + virtual int addVectors( Index* coarseQuantizer, Tensor& vecs, Tensor& indices); diff --git a/faiss/gpu/impl/RaftFlatIndex.cu b/faiss/gpu/impl/RaftFlatIndex.cu new file mode 100644 index 0000000000..1a8369a1c3 --- /dev/null +++ b/faiss/gpu/impl/RaftFlatIndex.cu @@ -0,0 +1,70 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +#include +#include +//#include +#include + +namespace faiss { +namespace gpu { + +RaftFlatIndex::RaftFlatIndex( + GpuResources* res, + int dim, + bool useFloat16, + MemorySpace space) + : FlatIndex(res, dim, useFloat16, space) {} + +void RaftFlatIndex::query( + Tensor& input, + int k, + faiss::MetricType metric, + float metricArg, + Tensor& outDistances, + Tensor& outIndices, + bool exactDistance) { + + // For now, use RAFT's fused KNN when k <= 64 and L2 metric is used + if(k <= 64 && metric == MetricType::METRIC_L2 && + input.getStride(0) == 0 && vectors_.getStride(0) == 0) { + raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); + + auto distance = exactDistance ? raft::distance::DistanceType::L2Unexpanded : + raft::distance::DistanceType::L2Expanded; + + auto index = raft::make_device_matrix_view(vectors_.data(), vectors_.getSize(0), vectors_.getSize(1)); + auto search = raft::make_device_matrix_view(input.data(), input.getSize(0), input.getSize(1)); + auto inds = raft::make_device_matrix_view(outIndices.data(), outIndices.getSize(0), outIndices.getSize(1)); + auto dists = raft::make_device_matrix_view(outDistances.data(), outDistances.getSize(0), outDistances.getSize(1)); + +// raft::neighbors::brute_force::knn(raft_handle, index, search, inds, dists, k, distance); + + // TODO: Expose the fused L2KNN through RAFT's public APIs + raft::spatial::knn::detail::fusedL2Knn(dim_, + inds.data_handle(), + dists.data_handle(), + index.data_handle(), + search.data_handle(), + index.extent(0), + search.extent(0), + k, + true, + true, + raft_handle.get_stream(), + distance); + + } else { + FlatIndex::query(input, k, metric, metricArg, outDistances, outIndices, exactDistance); + } +} +} // namespace gpu +} // namespace faiss diff --git a/faiss/gpu/impl/RaftFlatIndex.cuh b/faiss/gpu/impl/RaftFlatIndex.cuh new file mode 100644 index 0000000000..8a18053449 --- /dev/null +++ b/faiss/gpu/impl/RaftFlatIndex.cuh @@ -0,0 
+1,42 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace faiss { +namespace gpu { + +class GpuResources; + +/// Holder of GPU resources for a particular flat index +/// Can be in either float16 or float32 mode. If float32, we only store +/// the vectors in float32. +/// If float16, we store the vectors in both float16 and float32, where float32 +/// data is possibly needed for certain residual operations +class RaftFlatIndex : public FlatIndex { + public: + RaftFlatIndex(GpuResources* res, int dim, bool useFloat16, MemorySpace space); + + void query( + Tensor& vecs, + int k, + faiss::MetricType metric, + float metricArg, + Tensor& outDistances, + Tensor& outIndices, + bool exactDistance); + +}; + +} // namespace gpu +} // namespace faiss diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu new file mode 100644 index 0000000000..5563ca2eae --- /dev/null +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -0,0 +1,232 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { +namespace gpu { + +RaftIVFFlat::RaftIVFFlat( + GpuResources* res, + int dim, + int nlist, + faiss::MetricType metric, + float metricArg, + bool useResidual, + faiss::ScalarQuantizer* scalarQ, + bool interleavedLayout, + IndicesOptions indicesOptions, + MemorySpace space) + : IVFFlat(res, + dim, + nlist, + metric, + metricArg, + useResidual, + scalarQ, + interleavedLayout, + indicesOptions, + space){} + +RaftIVFFlat::~RaftIVFFlat() {} + + +/// Find the approximate k nearest neighbors for `queries` against +/// our database +void RaftIVFFlat::search( + Index* coarseQuantizer, + Tensor& queries, + int nprobe, + int k, + Tensor& outDistances, + Tensor& outIndices) { + printf("Inside RaftIVFFlat search()\n"); + + // TODO: We probably don't want to ignore the coarse quantizer here... + + std::uint32_t n = queries.getSize(0); + std::uint32_t cols = queries.getSize(1); + std::uint32_t k_ = k; + + // Device is already set in GpuIndex::search + FAISS_ASSERT(raft_knn_index.has_value()); + FAISS_ASSERT(n > 0); + FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= numLists_); + + const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); + raft::neighbors::ivf_flat::search_params pams; + pams.n_probes = nprobe; + + auto queries_view = raft::make_device_matrix_view(queries.data(), n, cols); + auto out_inds_view = raft::make_device_matrix_view(outIndices.data(), n, k_); + auto out_dists_view = raft::make_device_matrix_view(outDistances.data(), n, k_); + raft::neighbors::ivf_flat::search( + raft_handle, *raft_knn_index, queries_view, + out_inds_view, out_dists_view, pams, k_); + + raft_handle.sync_stream(); +} + +/// Classify and encode/add vectors to our IVF lists. 
+/// The input data must be on our current device. +/// Returns the number of vectors successfully added. Vectors may +/// not be able to be added because they contain NaNs. +int RaftIVFFlat::addVectors( + Index* coarseQuantizer, + Tensor& vecs, + Tensor& indices) { + printf("Inside RaftIVFFlat addVectors()\n"); + + auto vecs_view = raft::make_device_matrix_view(vecs.data(), vecs.getSize(0), dim_); + auto inds_view = raft::make_device_vector_view(indices.data(), (Index::idx_t )indices.getSize(0)); + + const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); + + // TODO: We probably don't want to ignore the coarse quantizer here + raft_knn_index.emplace(raft::neighbors::ivf_flat::extend( + raft_handle, + raft_knn_index.value(), + vecs_view, + std::make_optional>(inds_view))); + return vecs.getSize(0); +} + +void RaftIVFFlat::reset() { + printf("Inside RaftIVFFlat reset()\n"); + raft_knn_index.reset(); +} + +int RaftIVFFlat::getListLength(int listId) const { + printf("Inside RaftIVFFlat getListLength\n"); + + FAISS_ASSERT(raft_knn_index.has_value()); + const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); + + uint32_t size; + raft::copy(&size, raft_knn_index.value().list_sizes().data_handle() + listId, + 1, raft_handle.get_stream()); + raft_handle.sync_stream(); + return int(size); +} + +/// Return the list indices of a par +/// ticular list back to the CPU +std::vector RaftIVFFlat::getListIndices(int listId) const { + + printf("Inside RaftIVFFlat getListIndices\n"); + + FAISS_ASSERT(raft_knn_index.has_value()); + const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); + + Index::idx_t offset; + uint32_t size; + + raft::copy(&offset, raft_knn_index.value().list_offsets().data_handle() + listId, 1, raft_handle.get_stream()); + raft::copy(&size, raft_knn_index.value().list_sizes().data_handle() + listId, 1, raft_handle.get_stream()); + raft_handle.sync_stream(); + + std::vector vec(size); + raft::copy( 
+ vec.data(), + raft_knn_index.value().indices().data_handle() + offset, + size, + raft_handle.get_stream()); + return vec; +} + +/// Return the encoded vectors of a particular list back to the CPU +std::vector RaftIVFFlat::getListVectorData(int listId, bool gpuFormat) const { + + printf("Inside RaftIVFFlat getListVectorData\n"); + + FAISS_ASSERT(raft_knn_index.has_value()); + const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); + + std::cout << "Calling getListVectorData for " << listId << std::endl; + + using elem_t = decltype(raft_knn_index.value().data())::element_type; + size_t dim = raft_knn_index.value().dim(); + Index::idx_t offsets[2]; + raft::copy(offsets, raft_knn_index.value().list_offsets().data_handle() + listId, 2, raft_handle.get_stream()); + + raft_handle.sync_stream(); + size_t byte_offset = offsets[0] * sizeof(elem_t) * dim; + // the interleaved block can be slightly larger than the list size (it's + // rounded up) + size_t byte_size = size_t(offsets[1]) * + sizeof(elem_t) * dim - + byte_offset; + std::vector vec(byte_size); + raft::copy( + vec.data(), + reinterpret_cast(raft_knn_index.value().data().data_handle()) + + byte_offset, + byte_size, + raft_handle.get_stream()); + return vec; +} + +/// Performs search when we are already given the IVF cells to look at +/// (GpuIndexIVF::search_preassigned implementation) +void RaftIVFFlat::searchPreassigned( + Index* coarseQuantizer, + Tensor& vecs, + Tensor& ivfDistances, + Tensor& ivfAssignments, + int k, + Tensor& outDistances, + Tensor& outIndices, + bool storePairs) { + printf("Inside RaftIVFFlat searchPreassigned\n"); + + // TODO: Fill this in! +} + +/// Copy all inverted lists from a CPU representation to ourselves +void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { + printf("Inside RaftIVFFlat copyInvertedListsFrom\n"); + + ivf->print_stats(); + + // TODO: Need to replicate copyInvertedListsFrom() in IVFBase.cu + // but populate a RAFT index. 
+} + +/// Copy all inverted lists from ourselves to a CPU representation +void RaftIVFFlat::copyInvertedListsTo(InvertedLists* ivf) { + printf("Inside RaftIVFFlat copyInvertedListsTo\n"); + + // TODO: Need to replicate copyInvertedListsTo() in IVFBase.cu +} + + +} // namespace gpu +} // namespace faiss diff --git a/faiss/gpu/impl/raft/RaftIVFFlat.cuh b/faiss/gpu/impl/RaftIVFFlat.cuh similarity index 55% rename from faiss/gpu/impl/raft/RaftIVFFlat.cuh rename to faiss/gpu/impl/RaftIVFFlat.cuh index c2556c448f..1078204270 100644 --- a/faiss/gpu/impl/raft/RaftIVFFlat.cuh +++ b/faiss/gpu/impl/RaftIVFFlat.cuh @@ -45,6 +45,18 @@ class RaftIVFFlat : public IVFFlat { Tensor& outDistances, Tensor& outIndices) override; + /// Performs search when we are already given the IVF cells to look at + /// (GpuIndexIVF::search_preassigned implementation) + void searchPreassigned( + Index* coarseQuantizer, + Tensor& vecs, + Tensor& ivfDistances, + Tensor& ivfAssignments, + int k, + Tensor& outDistances, + Tensor& outIndices, + bool storePairs) override; + /// Classify and encode/add vectors to our IVF lists. /// The input data must be on our current device. /// Returns the number of vectors successfully added. 
Vectors may @@ -52,8 +64,27 @@ class RaftIVFFlat : public IVFFlat { int addVectors( Index* coarseQuantizer, Tensor& vecs, - Tensor& indices); + Tensor& indices) override; + + /// Clear out all inverted lists, but retain the coarse quantizer + /// and the product quantizer info + void reset() override; + + /// For debugging purposes, return the list length of a particular + /// list + int getListLength(int listId) const override; + + /// Return the list indices of a particular list back to the CPU + std::vector getListIndices(int listId) const override; + + /// Return the encoded vectors of a particular list back to the CPU + std::vector getListVectorData(int listId, bool gpuFormat) const override; + + /// Copy all inverted lists from a CPU representation to ourselves + void copyInvertedListsFrom(const InvertedLists* ivf) override; + /// Copy all inverted lists from ourselves to a CPU representation + void copyInvertedListsTo(InvertedLists* ivf) override; protected: std::optional> raft_knn_index{std::nullopt}; diff --git a/faiss/gpu/impl/raft/RaftIndexIVFFlat.cu b/faiss/gpu/impl/RaftIndexIVFFlat.cu similarity index 99% rename from faiss/gpu/impl/raft/RaftIndexIVFFlat.cu rename to faiss/gpu/impl/RaftIndexIVFFlat.cu index 03df717c69..88c5629e71 100644 --- a/faiss/gpu/impl/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/impl/RaftIndexIVFFlat.cu @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/faiss/gpu/impl/raft/RaftIndexIVFFlat.h b/faiss/gpu/impl/RaftIndexIVFFlat.h similarity index 100% rename from faiss/gpu/impl/raft/RaftIndexIVFFlat.h rename to faiss/gpu/impl/RaftIndexIVFFlat.h diff --git a/faiss/gpu/impl/raft/RaftIVFFlat.cu b/faiss/gpu/impl/raft/RaftIVFFlat.cu deleted file mode 100644 index f12d7a7c7d..0000000000 --- a/faiss/gpu/impl/raft/RaftIVFFlat.cu +++ /dev/null @@ -1,121 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. 
- * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace faiss { -namespace gpu { - -RaftIVFFlat::RaftIVFFlat( - GpuResources* res, - int dim, - int nlist, - faiss::MetricType metric, - float metricArg, - bool useResidual, - faiss::ScalarQuantizer* scalarQ, - bool interleavedLayout, - IndicesOptions indicesOptions, - MemorySpace space) - : IVFFlat(res, - dim, - nlist, - metric, - metricArg, - useResidual, - scalarQ, - interleavedLayout, - indicesOptions, - space){} - -RaftIVFFlat::~RaftIVFFlat() {} - - -/// Find the approximate k nearest neigbors for `queries` against -/// our database -void RaftIVFFlat::search( - Index* coarseQuantizer, - Tensor& queries, - int nprobe, - int k, - Tensor& outDistances, - Tensor& outIndices) { - - // TODO: We probably don't want to ignore the coarse quantizer here... 
- - std::uint32_t n = queries.getSize(0); - std::uint32_t cols = queries.getSize(1); - std::uint32_t k_ = k; - - // Device is already set in GpuIndex::search - FAISS_ASSERT(raft_knn_index.has_value()); - FAISS_ASSERT(n > 0); - FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= numLists_); - - const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); - raft::spatial::knn::ivf_flat::search_params pams; - pams.n_probes = nprobe; - - auto queries_view = raft::make_device_matrix_view(queries.data(), n, cols); - auto out_inds_view = raft::make_device_matrix_view(outIndices.data(), n, k_); - auto out_dists_view = raft::make_device_matrix_view(outDistances.data(), n, k_); - raft::spatial::knn::ivf_flat::search( - raft_handle, *raft_knn_index, queries_view, - out_inds_view, out_dists_view, pams, k_); - - raft_handle.sync_stream(); -} - -/// Classify and encode/add vectors to our IVF lists. -/// The input data must be on our current device. -/// Returns the number of vectors successfully added. Vectors may -/// not be able to be added because they contain NaNs. -int RaftIVFFlat::addVectors( - Index* coarseQuantizer, - Tensor& vecs, - Tensor& indices) { - - auto vecs_view = raft::make_device_matrix_view(vecs.data(), vecs.getSize(0), dim_); - auto inds_view = raft::make_device_vector_view(indices.data(), (Index::idx_t )indices.getSize(0)); - - const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); - - // TODO: We probably don't want to ignore the coarse quantizer here - raft_knn_index.emplace(raft::neighbors::ivf_flat::extend( - raft_handle, - raft_knn_index.value(), - vecs_view, - std::make_optional>(inds_view))); -} - - -} // namespace gpu -} // namespace faiss diff --git a/faiss/gpu/impl/raft/RaftIndexIVFPQ.cu b/faiss/gpu/impl/raft/RaftIndexIVFPQ.cu deleted file mode 100644 index f30f34259f..0000000000 --- a/faiss/gpu/impl/raft/RaftIndexIVFPQ.cu +++ /dev/null @@ -1,419 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. 
- * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -namespace faiss { -namespace gpu { -/** - * GpuIndexIVFPQ( - GpuResourcesProvider* provider, - int dims, - int nlist, - int subQuantizers, - int bitsPerCode, - faiss::MetricType metric, - GpuIndexIVFPQConfig config = GpuIndexIVFPQConfig()) - * @param provider - * @param index - * @param config - */ -RaftIndexIVFPQ::RaftIndexIVFPQ( - GpuResourcesProvider* provider, - const faiss::IndexIVFPQ* index, - GpuIndexIVFPQConfig config) - : GpuIndexIVFPQ(provider, index, config), - pq(index->pq), - ivfpqConfig_(config), - usePrecomputedTables_(config.usePrecomputedTables), - subQuantizers_(0), - bitsPerCode_(0), - reserveMemoryVecs_(0) { - copyFrom(index); -} - -RaftIndexIVFPQ::RaftIndexIVFPQ( - GpuResourcesProvider* provider, - int dims, - int nlist, - int subQuantizers, - int bitsPerCode, - faiss::MetricType metric, - GpuIndexIVFPQConfig config) - : GpuIndexIVFPQ(provider, dims, nlist, subQuantizers, bitsPerCode, metric, config), - pq(dims, subQuantizers, bitsPerCode), - ivfpqConfig_(config), - usePrecomputedTables_(config.usePrecomputedTables), - subQuantizers_(subQuantizers), - bitsPerCode_(bitsPerCode), - reserveMemoryVecs_(0) { - verifySettings_(); - - // We haven't trained ourselves, so don't construct the PQ index yet - this->is_trained = false; -} - -RaftIndexIVFPQ::RaftIndexIVFPQ( - GpuResourcesProvider* provider, - Index *coarse_quantizer, - int dims, - int nlist, - int subQuantizers, - int bitsPerCode, - faiss::MetricType metric, - GpuIndexIVFPQConfig config) - : GpuIndexIVFPQ(provider, coarse_quantizer, dims, nlist, subQuantizers, bitsPerCode, metric, config), - pq(dims, subQuantizers, bitsPerCode), - ivfpqConfig_(config), - usePrecomputedTables_(config.usePrecomputedTables), - 
subQuantizers_(subQuantizers), - bitsPerCode_(bitsPerCode), - reserveMemoryVecs_(0) { - verifySettings_(); - - // We haven't trained ourselves, so don't construct the PQ index yet - this->is_trained = false; -} - -RaftIndexIVFPQ::~RaftIndexIVFPQ() {} - -void RaftIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) { -// DeviceScope scope(config_.device); -// -// GpuIndexIVF::copyFrom(index); -// -// // Clear out our old data -// index_.reset(); -// -// pq = index->pq; -// subQuantizers_ = index->pq.M; -// bitsPerCode_ = index->pq.nbits; -// -// // We only support this -// FAISS_THROW_IF_NOT_MSG( -// ivfpqConfig_.interleavedLayout || index->pq.nbits == 8, -// "GPU: only pq.nbits == 8 is supported"); -// FAISS_THROW_IF_NOT_MSG( -// index->by_residual, "GPU: only by_residual = true is supported"); -// FAISS_THROW_IF_NOT_MSG( -// index->polysemous_ht == 0, "GPU: polysemous codes not supported"); -// -// verifySettings_(); -// -// // The other index might not be trained -// if (!index->is_trained) { -// // copied in GpuIndex::copyFrom -// FAISS_ASSERT(!is_trained); -// return; -// } -// -// // Copy our lists as well -// // The product quantizer must have data in it -// FAISS_ASSERT(index->pq.centroids.size() > 0); -// index_.reset(new IVFPQ( -// resources_.get(), -// index->metric_type, -// index->metric_arg, -// quantizer->getGpuData(), -// subQuantizers_, -// bitsPerCode_, -// ivfpqConfig_.useFloat16LookupTables, -// ivfpqConfig_.useMMCodeDistance, -// ivfpqConfig_.interleavedLayout, -// (float*)index->pq.centroids.data(), -// ivfpqConfig_.indicesOptions, -// config_.memorySpace)); -// // Doesn't make sense to reserve memory here -// index_->setPrecomputedCodes(usePrecomputedTables_); -// -// // Copy all of the IVF data -// index_->copyInvertedListsFrom(index->invlists); -} - -void RaftIndexIVFPQ::copyTo(faiss::IndexIVFPQ* index) const { -// DeviceScope scope(config_.device); -// -// // We must have the indices in order to copy to ourselves -// FAISS_THROW_IF_NOT_MSG( 
-// ivfpqConfig_.indicesOptions != INDICES_IVF, -// "Cannot copy to CPU as GPU index doesn't retain " -// "indices (INDICES_IVF)"); -// -// GpuIndexIVF::copyTo(index); -// -// // -// // IndexIVFPQ information -// // -// index->by_residual = true; -// index->use_precomputed_table = 0; -// index->code_size = subQuantizers_; -// index->pq = faiss::ProductQuantizer(this->d, subQuantizers_, bitsPerCode_); -// -// index->do_polysemous_training = false; -// index->polysemous_training = nullptr; -// -// index->scan_table_threshold = 0; -// index->max_codes = 0; -// index->polysemous_ht = 0; -// index->precomputed_table.clear(); -// -// auto ivf = new ArrayInvertedLists(nlist, index->code_size); -// index->replace_invlists(ivf, true); -// -// if (index_) { -// // Copy IVF lists -// index_->copyInvertedListsTo(ivf); -// -// // Copy PQ centroids -// auto devPQCentroids = index_->getPQCentroids(); -// index->pq.centroids.resize(devPQCentroids.numElements()); -// -// fromDevice( -// devPQCentroids, -// index->pq.centroids.data(), -// resources_->getDefaultStream(config_.device)); -// -// if (usePrecomputedTables_) { -// index->precompute_table(); -// } -// } -} - -void RaftIndexIVFPQ::reserveMemory(size_t numVecs) { - reserveMemoryVecs_ = numVecs; - if (index_) { - DeviceScope scope(config_.device); - index_->reserveMemory(numVecs); - } -} - -void RaftIndexIVFPQ::setPrecomputedCodes(bool enable) { - usePrecomputedTables_ = enable; - if (index_) { - DeviceScope scope(config_.device); - index_->setPrecomputedCodes(quantizer, enable); - } - - verifySettings_(); -} - -bool RaftIndexIVFPQ::getPrecomputedCodes() const { - return usePrecomputedTables_; -} - -int RaftIndexIVFPQ::getNumSubQuantizers() const { - return subQuantizers_; -} - -int RaftIndexIVFPQ::getBitsPerCode() const { - return bitsPerCode_; -} - -int RaftIndexIVFPQ::getCentroidsPerSubQuantizer() const { - return utils::pow2(bitsPerCode_); -} - -size_t RaftIndexIVFPQ::reclaimMemory() { - if (index_) { - DeviceScope 
scope(config_.device); - return index_->reclaimMemory(); - } - - return 0; -} - -void RaftIndexIVFPQ::reset() { - if (raft_knn_index.has_value()) { - raft_knn_index.reset(); - this->ntotal = 0; - } else { - FAISS_ASSERT(this->ntotal == 0); - } -} - -void RaftIndexIVFPQ::train(Index::idx_t n, const float* x) { - raft::common::nvtx::range fun_scope( - "RaftIndexIVFFlat::train (%ld)", n); - - std::cout << "Calling train() with " << n << " rows" << std::endl; - - uint32_t start = raft::curTimeMillis(); - if (this->is_trained) { - FAISS_ASSERT(raft_knn_index.has_value()); - return; - } - - raft::spatial::knn::ivf_pq::index_params raft_idx_params; - raft_idx_params.n_lists = nlist; - raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; - raft_idx_params.add_data_on_build = false; - raft_idx_params.kmeans_n_iters = 100; - - raft_knn_index.emplace( - raft::spatial::knn::ivf_pq::build(raft_handle, raft_idx_params, - const_cast(x), - n, (faiss::Index::idx_t)d)); - - raft_handle.sync_stream(); - uint32_t stop = raft::curTimeMillis(); - - std::cout << "train took " << (stop - start) << "ms. " << std::endl; - this->is_trained = true; -} - -void RaftIndexIVFPQ::addImpl_(int n, const float* x, const Index::idx_t* xids) { - // Device is already set in GpuIndex::add - FAISS_ASSERT(is_trained); - FAISS_ASSERT(n > 0); - - // but keep the ntotal based on the total number of vectors that we - // attempted to add - std::cout << "Calling addImpl_ with " << n << " vectors." 
<< std::endl; - - raft_knn_index.emplace(raft::spatial::knn::ivf_pq::extend( - raft_handle, raft_knn_index.value(), x, xids, (Index::idx_t)n)); - this->ntotal += n; - - ntotal += n; -} - -void RaftIndexIVFPQ::searchImpl_( - int n, - const float* x, - int k, - float* distances, - Index::idx_t* labels, - const SearchParameters *params) const { - // Device is already set in GpuIndex::search - FAISS_ASSERT(index_); - FAISS_ASSERT(n > 0); - FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= nlist); - - raft::common::nvtx::range fun_scope( - "RaftIndexIVFFlat::searchImpl_ (%ld)", n); - - // Device is already set in GpuIndex::search - FAISS_ASSERT(raft_knn_index.has_value()); - FAISS_ASSERT(n > 0); - FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= nlist); - - raft::spatial::knn::ivf_pq::search_params pams; - pams.n_probes = nprobe; - raft::spatial::knn::ivf_pq::search( - raft_handle, - pams, - *raft_knn_index, - const_cast(x), - static_cast(n), - static_cast(k), - labels, - distances); - - raft_handle.sync_stream(); -} - -int RaftIndexIVFPQ::getListLength(int listId) const { - FAISS_ASSERT(index_); - DeviceScope scope(config_.device); - - return index_->getListLength(listId); -} - -std::vector RaftIndexIVFPQ::getListVectorData( - int listId, - bool gpuFormat) const { - FAISS_ASSERT(index_); - DeviceScope scope(config_.device); - - return index_->getListVectorData(listId, gpuFormat); -} - -std::vector RaftIndexIVFPQ::getListIndices(int listId) const { - FAISS_ASSERT(index_); - DeviceScope scope(config_.device); - - return index_->getListIndices(listId); -} - -void RaftIndexIVFPQ::verifySettings_() const { - // Our implementation has these restrictions: - - // Must have some number of lists - FAISS_THROW_IF_NOT_MSG(nlist > 0, "nlist must be >0"); - - // up to a single byte per code - if (ivfpqConfig_.interleavedLayout) { - FAISS_THROW_IF_NOT_FMT( - bitsPerCode_ == 4 || bitsPerCode_ == 5 || bitsPerCode_ == 6 || - bitsPerCode_ == 8, - "Bits per code must be between 4, 5, 6 or 8 (passed 
%d)", - bitsPerCode_); - - } else { - FAISS_THROW_IF_NOT_FMT( - bitsPerCode_ == 8, - "Bits per code must be 8 (passed %d)", - bitsPerCode_); - } - - // Sub-quantizers must evenly divide dimensions available - FAISS_THROW_IF_NOT_FMT( - this->d % subQuantizers_ == 0, - "Number of sub-quantizers (%d) must be an " - "even divisor of the number of dimensions (%d)", - subQuantizers_, - this->d); - - // The number of bytes per encoded vector must be one we support - FAISS_THROW_IF_NOT_FMT( - ivfpqConfig_.interleavedLayout || - IVFPQ::isSupportedPQCodeLength(subQuantizers_), - "Number of bytes per encoded vector / sub-quantizers (%d) " - "is not supported", - subQuantizers_); - - // We must have enough shared memory on the current device to store - // our lookup distances - int lookupTableSize = sizeof(float); - if (ivfpqConfig_.useFloat16LookupTables) { - lookupTableSize = sizeof(half); - } - - // 64 bytes per code is only supported with usage of float16, at 2^8 - // codes per subquantizer - size_t requiredSmemSize = - lookupTableSize * subQuantizers_ * utils::pow2(bitsPerCode_); - size_t smemPerBlock = getMaxSharedMemPerBlock(config_.device); - - FAISS_THROW_IF_NOT_FMT( - requiredSmemSize <= getMaxSharedMemPerBlock(config_.device), - "Device %d has %zu bytes of shared memory, while " - "%d bits per code and %d sub-quantizers requires %zu " - "bytes. Consider useFloat16LookupTables and/or " - "reduce parameters", - config_.device, - smemPerBlock, - bitsPerCode_, - subQuantizers_, - requiredSmemSize); -} - -} // namespace gpu -} // namespace faiss diff --git a/faiss/gpu/impl/raft/RaftIndexIVFPQ.h b/faiss/gpu/impl/raft/RaftIndexIVFPQ.h deleted file mode 100644 index e7f1b7515c..0000000000 --- a/faiss/gpu/impl/raft/RaftIndexIVFPQ.h +++ /dev/null @@ -1,165 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -#include -#include - -#include -#include - -#include -#include - -namespace faiss { -struct IndexIVFPQ; -} - -namespace faiss { -namespace gpu { - -class GpuIndexFlat; -class IVFPQ; - -/// RAFT IVFPQ index for the GPU -class RaftIndexIVFPQ : public GpuIndexIVFPQ { - public: - /// Construct from a pre-existing faiss::IndexIVFPQ instance, copying - /// data over to the given GPU, if the input index is trained. - RaftIndexIVFPQ( - GpuResourcesProvider* provider, - const faiss::IndexIVFPQ* index, - GpuIndexIVFPQConfig config = GpuIndexIVFPQConfig()); - - /// Construct an empty index - RaftIndexIVFPQ( - GpuResourcesProvider* provider, - int dims, - int nlist, - int subQuantizers, - int bitsPerCode, - faiss::MetricType metric, - GpuIndexIVFPQConfig config = GpuIndexIVFPQConfig()); - - - /// Construct an empty index - RaftIndexIVFPQ( - GpuResourcesProvider* provider, - Index *coarse_quantizer, - int dims, - int nlist, - int subQuantizers, - int bitsPerCode, - faiss::MetricType metric, - GpuIndexIVFPQConfig config = GpuIndexIVFPQConfig()); - - ~RaftIndexIVFPQ() override; - - /// Reserve space on the GPU for the inverted lists for `num` - /// vectors, assumed equally distributed among - - /// Initialize ourselves from the given CPU index; will overwrite - /// all data in ourselves - void copyFrom(const faiss::IndexIVFPQ* index); - - /// Copy ourselves to the given CPU index; will overwrite all data - /// in the index instance - void copyTo(faiss::IndexIVFPQ* index) const; - - /// Reserve GPU memory in our inverted lists for this number of vectors - void reserveMemory(size_t numVecs); - - /// Enable or disable pre-computed codes - void setPrecomputedCodes(bool enable); - - /// Are pre-computed codes enabled? 
- bool getPrecomputedCodes() const; - - /// Return the number of sub-quantizers we are using - int getNumSubQuantizers() const; - - /// Return the number of bits per PQ code - int getBitsPerCode() const; - - /// Return the number of centroids per PQ code (2^bits per code) - int getCentroidsPerSubQuantizer() const; - - /// After adding vectors, one can call this to reclaim device memory - /// to exactly the amount needed. Returns space reclaimed in bytes - size_t reclaimMemory(); - - /// Clears out all inverted lists, but retains the coarse and - /// product centroid information - void reset() override; - - /// Trains the coarse and product quantizer based on the given vector data - void train(Index::idx_t n, const float* x) override; - - /// Returns the number of vectors present in a particular inverted list - int getListLength(int listId) const override; - - /// Return the encoded vector data contained in a particular inverted list, - /// for debugging purposes. - /// If gpuFormat is true, the data is returned as it is encoded in the - /// GPU-side representation. - /// Otherwise, it is converted to the CPU format. - /// compliant format, while the native GPU format may differ. - std::vector getListVectorData(int listId, bool gpuFormat = false) - const override; - - /// Return the vector indices contained in a particular inverted list, for - /// debugging purposes. 
- std::vector getListIndices(int listId) const override; - - public: - /// Like the CPU version, we expose a publically-visible ProductQuantizer - /// for manipulation - ProductQuantizer pq; - - protected: - /// Called from GpuIndex for add/add_with_ids - void addImpl_(int n, const float* x, const Index::idx_t* ids) override; - - /// Called from GpuIndex for search - void searchImpl_( - int n, - const float* x, - int k, - float* distances, - Index::idx_t* labels, - const SearchParameters *params) const override; - - /// Throws errors if configuration settings are improper - void verifySettings_() const; - - protected: - /// Our configuration options that we were initialized with - const GpuIndexIVFPQConfig ivfpqConfig_; - - /// Runtime override: whether or not we use precomputed tables - bool usePrecomputedTables_; - - /// Number of sub-quantizers per encoded vector - int subQuantizers_; - - /// Bits per sub-quantizer code - int bitsPerCode_; - - /// Desired inverted list memory reservation - size_t reserveMemoryVecs_; - - /// The product quantizer instance that we own; contains the - /// inverted lists - std::unique_ptr index_; - - const raft::handle_t raft_handle; - std::optional> raft_knn_index{std::nullopt}; -}; - -} // namespace gpu -} // namespace faiss diff --git a/faiss/gpu/impl/raft/RmmGpuResources.hpp b/faiss/gpu/impl/raft/RmmGpuResources.hpp deleted file mode 100644 index c22c722a35..0000000000 --- a/faiss/gpu/impl/raft/RmmGpuResources.hpp +++ /dev/null @@ -1,656 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* -This code contains unnecessary code duplication. These could be deleted -once the relevant changes would be made on the FAISS side. Indeed most of -the logic in the below code is similar to FAISS's standard implementation -and should thus be inherited instead of duplicated. 
This FAISS's issue -once solved should allow the removal of the unnecessary duplicates -in this file : https://github.com/facebookresearch/faiss/issues/2097 -*/ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -namespace faiss { -namespace gpu { - -namespace { - -// How many streams per device we allocate by default (for multi-streaming) -constexpr int kNumStreams = 2; - -// Use 256 MiB of pinned memory for async CPU <-> GPU copies by default -constexpr size_t kDefaultPinnedMemoryAllocation = (size_t)256 * 1024 * 1024; - -// Default temporary memory allocation for <= 4 GiB memory GPUs -constexpr size_t k4GiBTempMem = (size_t)512 * 1024 * 1024; - -// Default temporary memory allocation for <= 8 GiB memory GPUs -constexpr size_t k8GiBTempMem = (size_t)1024 * 1024 * 1024; - -// Maximum temporary memory allocation for all GPUs -constexpr size_t kMaxTempMem = (size_t)1536 * 1024 * 1024; - -std::string allocsToString(const std::unordered_map& map) -{ - // Produce a sorted list of all outstanding allocations by type - std::unordered_map> stats; - - for (auto& entry : map) { - auto& a = entry.second; - - auto it = stats.find(a.type); - if (it != stats.end()) { - stats[a.type].first++; - stats[a.type].second += a.size; - } else { - stats[a.type] = std::make_pair(1, a.size); - } - } - - std::stringstream ss; - for (auto& entry : stats) { - ss << "Alloc type " << allocTypeToString(entry.first) << ": " << entry.second.first - << " allocations, " << entry.second.second << " bytes\n"; - } - - return ss.str(); -} - -} // namespace - -/// RMM implementation of the GpuResources object that provides for a -/// temporary memory manager -class RmmGpuResourcesImpl : public GpuResources { - public: - RmmGpuResourcesImpl() - : pinnedMemAlloc_(nullptr), - pinnedMemAllocSize_(0), - // let the adjustment function determine the memory size for us by passing - 
// in a huge value that will then be adjusted - tempMemSize_(getDefaultTempMemForGPU(-1, std::numeric_limits::max())), - pinnedMemSize_(kDefaultPinnedMemoryAllocation), - allocLogging_(false), - cmr(new rmm::mr::cuda_memory_resource), - mmr(new rmm::mr::managed_memory_resource), - pmr(new rmm::mr::pinned_memory_resource){}; - - ~RmmGpuResourcesImpl() - { - // The temporary memory allocator has allocated memory through us, so clean - // that up before we finish fully de-initializing ourselves - tempMemory_.clear(); - - // Make sure all allocations have been freed - bool allocError = false; - - for (auto& entry : allocs_) { - auto& map = entry.second; - - if (!map.empty()) { - std::cerr << "RmmGpuResources destroyed with allocations outstanding:\n" - << "Device " << entry.first << " outstanding allocations:\n"; - std::cerr << allocsToString(map); - allocError = true; - } - } - - FAISS_ASSERT_MSG(!allocError, "GPU memory allocations not properly cleaned up"); - - for (auto& entry : defaultStreams_) { - DeviceScope scope(entry.first); - - // We created these streams, so are responsible for destroying them - CUDA_VERIFY(cudaStreamDestroy(entry.second)); - } - - for (auto& entry : alternateStreams_) { - DeviceScope scope(entry.first); - - for (auto stream : entry.second) { - CUDA_VERIFY(cudaStreamDestroy(stream)); - } - } - - for (auto& entry : asyncCopyStreams_) { - DeviceScope scope(entry.first); - - CUDA_VERIFY(cudaStreamDestroy(entry.second)); - } - - for (auto& entry : blasHandles_) { - DeviceScope scope(entry.first); - - auto blasStatus = cublasDestroy(entry.second); - FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS); - } - - if (pinnedMemAlloc_) { pmr->deallocate(pinnedMemAlloc_, pinnedMemAllocSize_); } - }; - - /// Disable allocation of temporary memory; all temporary memory - /// requests will call cudaMalloc / cudaFree at the point of use - void noTempMemory() { setTempMemory(0); }; - - /// Specify that we wish to use a certain fixed size of memory on - /// 
all devices as temporary memory. This is the upper bound for the GPU - /// memory that we will reserve. We will never go above 1.5 GiB on any GPU; - /// smaller GPUs (with <= 4 GiB or <= 8 GiB) will use less memory than that. - /// To avoid any temporary memory allocation, pass 0. - void setTempMemory(size_t size) - { - if (tempMemSize_ != size) { - // adjust based on general limits - tempMemSize_ = getDefaultTempMemForGPU(-1, size); - - // We need to re-initialize memory resources for all current devices that - // have been initialized. - // This should be safe to do, even if we are currently running work, because - // the cudaFree call that this implies will force-synchronize all GPUs with - // the CPU - for (auto& p : tempMemory_) { - int device = p.first; - // Free the existing memory first - p.second.reset(); - - // Allocate new - p.second = std::unique_ptr( - new StackDeviceMemory(this, - p.first, - // adjust for this specific device - getDefaultTempMemForGPU(device, tempMemSize_))); - } - } - }; - - /// Set amount of pinned memory to allocate, for async GPU <-> CPU - /// transfers - void setPinnedMemory(size_t size) - { - // Should not call this after devices have been initialized - FAISS_ASSERT(defaultStreams_.size() == 0); - FAISS_ASSERT(!pinnedMemAlloc_); - - pinnedMemSize_ = size; - }; - - /// Called to change the stream for work ordering. We do not own `stream`; - /// i.e., it will not be destroyed when the GpuResources object gets cleaned - /// up. - /// We are guaranteed that all Faiss GPU work is ordered with respect to - /// this stream upon exit from an index or other Faiss GPU call. - void setDefaultStream(int device, cudaStream_t stream) - { - if (isInitialized(device)) { - // A new series of calls may not be ordered with what was the previous - // stream, so if the stream being specified is different, then we need to - // ensure ordering between the two (new stream waits on old). 
- auto it = userDefaultStreams_.find(device); - cudaStream_t prevStream = nullptr; - - if (it != userDefaultStreams_.end()) { - prevStream = it->second; - } else { - FAISS_ASSERT(defaultStreams_.count(device)); - prevStream = defaultStreams_[device]; - } - - if (prevStream != stream) { streamWait({stream}, {prevStream}); } - } - - userDefaultStreams_[device] = stream; - }; - - /// Revert the default stream to the original stream managed by this resources - /// object, in case someone called `setDefaultStream`. - void revertDefaultStream(int device) - { - if (isInitialized(device)) { - auto it = userDefaultStreams_.find(device); - - if (it != userDefaultStreams_.end()) { - // There was a user stream set that we need to synchronize against - cudaStream_t prevStream = userDefaultStreams_[device]; - - FAISS_ASSERT(defaultStreams_.count(device)); - cudaStream_t newStream = defaultStreams_[device]; - - streamWait({newStream}, {prevStream}); - } - } - - userDefaultStreams_.erase(device); - }; - - /// Returns the stream for the given device on which all Faiss GPU work is - /// ordered. - /// We are guaranteed that all Faiss GPU work is ordered with respect to - /// this stream upon exit from an index or other Faiss GPU call. 
- cudaStream_t getDefaultStream(int device) - { - initializeForDevice(device); - - auto it = userDefaultStreams_.find(device); - if (it != userDefaultStreams_.end()) { - // There is a user override stream set - return it->second; - } - - // Otherwise, our base default stream - return defaultStreams_[device]; - }; - - /// Called to change the work ordering streams to the null stream - /// for all devices - void setDefaultNullStreamAllDevices() - { - for (int dev = 0; dev < getNumDevices(); ++dev) { - setDefaultStream(dev, nullptr); - } - }; - - /// If enabled, will print every GPU memory allocation and deallocation to - /// standard output - void setLogMemoryAllocations(bool enable) { allocLogging_ = enable; }; - - public: - /// Internal system calls - - /// Initialize resources for this device - void initializeForDevice(int device) - { - if (isInitialized(device)) { return; } - - // If this is the first device that we're initializing, create our - // pinned memory allocation - if (defaultStreams_.empty() && pinnedMemSize_ > 0) { - pinnedMemAlloc_ = pmr->allocate(pinnedMemSize_); - pinnedMemAllocSize_ = pinnedMemSize_; - } - - FAISS_ASSERT(device < getNumDevices()); - DeviceScope scope(device); - - // Make sure that device properties for all devices are cached - auto& prop = getDeviceProperties(device); - - // Also check to make sure we meet our minimum compute capability (3.0) - FAISS_ASSERT_FMT(prop.major >= 3, - "Device id %d with CC %d.%d not supported, " - "need 3.0+ compute capability", - device, - prop.major, - prop.minor); - - // Create streams - cudaStream_t defaultStream = 0; - CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream, cudaStreamNonBlocking)); - - defaultStreams_[device] = defaultStream; - - cudaStream_t asyncCopyStream = 0; - CUDA_VERIFY(cudaStreamCreateWithFlags(&asyncCopyStream, cudaStreamNonBlocking)); - - asyncCopyStreams_[device] = asyncCopyStream; - - std::vector deviceStreams; - for (int j = 0; j < kNumStreams; ++j) { - cudaStream_t 
stream = 0; - CUDA_VERIFY(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - - deviceStreams.push_back(stream); - } - - alternateStreams_[device] = std::move(deviceStreams); - - // Create cuBLAS handle - - // TODO: We need to be able to use this cublas handle within the raft handle - cublasHandle_t blasHandle = 0; - auto blasStatus = cublasCreate(&blasHandle); - FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS); - blasHandles_[device] = blasHandle; - - // For CUDA 10 on V100, enabling tensor core usage would enable automatic - // rounding down of inputs to f16 (though accumulate in f32) which results in - // unacceptable loss of precision in general. - // For CUDA 11 / A100, only enable tensor core support if it doesn't result in - // a loss of precision. -#if CUDA_VERSION >= 11000 - cublasSetMathMode(blasHandle, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); -#endif - - FAISS_ASSERT(allocs_.count(device) == 0); - allocs_[device] = std::unordered_map(); - - FAISS_ASSERT(tempMemory_.count(device) == 0); - auto mem = std::unique_ptr( - new StackDeviceMemory(this, - device, - // adjust for this specific device - getDefaultTempMemForGPU(device, tempMemSize_))); - - tempMemory_.emplace(device, std::move(mem)); - }; - - cublasHandle_t getBlasHandle(int device) - { - initializeForDevice(device); - return blasHandles_[device]; - }; - - std::vector getAlternateStreams(int device) - { - initializeForDevice(device); - return alternateStreams_[device]; - }; - - /// Allocate non-temporary GPU memory - void* allocMemory(const AllocRequest& req) - { - initializeForDevice(req.device); - - // We don't allocate a placeholder for zero-sized allocations - if (req.size == 0) { return nullptr; } - - // Make sure that the allocation is a multiple of 16 bytes for alignment - // purposes - auto adjReq = req; - adjReq.size = utils::roundUp(adjReq.size, (size_t)16); - - void* p = nullptr; - - if (allocLogging_) { std::cout << "RmmGpuResources: alloc " << adjReq.toString() << 
"\n"; } - - if (adjReq.space == MemorySpace::Temporary) { - // If we don't have enough space in our temporary memory manager, we need - // to allocate this request separately - auto& tempMem = tempMemory_[adjReq.device]; - - if (adjReq.size > tempMem->getSizeAvailable()) { - // We need to allocate this ourselves - AllocRequest newReq = adjReq; - newReq.space = MemorySpace::Device; - newReq.type = AllocType::TemporaryMemoryOverflow; - - return allocMemory(newReq); - } - - // Otherwise, we can handle this locally - p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size); - - } else if (adjReq.space == MemorySpace::Device) { - p = cmr->allocate(adjReq.size, adjReq.stream); - } else if (adjReq.space == MemorySpace::Unified) { - p = mmr->allocate(adjReq.size, adjReq.stream); - } else { - FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)adjReq.space); - } - - allocs_[adjReq.device][p] = adjReq; - - return p; - }; - - /// Returns a previous allocation - void deallocMemory(int device, void* p) - { - FAISS_ASSERT(isInitialized(device)); - - if (!p) { return; } - - auto& a = allocs_[device]; - auto it = a.find(p); - FAISS_ASSERT(it != a.end()); - - auto& req = it->second; - - if (allocLogging_) { std::cout << "RmmGpuResources: dealloc " << req.toString() << "\n"; } - - if (req.space == MemorySpace::Temporary) { - tempMemory_[device]->deallocMemory(device, req.stream, req.size, p); - } else if (req.space == MemorySpace::Device) { - cmr->deallocate(p, req.size, req.stream); - } else if (req.space == MemorySpace::Unified) { - mmr->deallocate(p, req.size, req.stream); - } else { - FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)req.space); - } - - a.erase(it); - }; - - size_t getTempMemoryAvailable(int device) const - { - FAISS_ASSERT(isInitialized(device)); - - auto it = tempMemory_.find(device); - FAISS_ASSERT(it != tempMemory_.end()); - - return it->second->getSizeAvailable(); - }; - - /// Export a description of memory used for Python - 
std::map>> getMemoryInfo() const - { - using AT = std::map>; - - std::map out; - - for (auto& entry : allocs_) { - AT outDevice; - - for (auto& a : entry.second) { - auto& v = outDevice[allocTypeToString(a.second.type)]; - v.first++; - v.second += a.second.size; - } - - out[entry.first] = std::move(outDevice); - } - - return out; - }; - - std::pair getPinnedMemory() - { - return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_); - }; - - cudaStream_t getAsyncCopyStream(int device) - { - initializeForDevice(device); - return asyncCopyStreams_[device]; - }; - - private: - /// Have GPU resources been initialized for this device yet? - bool isInitialized(int device) const - { - // Use default streams as a marker for whether or not a certain - // device has been initialized - return defaultStreams_.count(device) != 0; - }; - - raft::handle_t &getRaftHandle(int device) { - initializeForDevice(device); - - auto it = raftHandles_.find(device); - if (it != raftHandles_.end()) { - // There is a user override handle set - return it->second; - } - - // Otherwise, our base default handle - return raftHandles_[device]; - } - - /// Adjust the default temporary memory allocation based on the total GPU - /// memory size - static size_t getDefaultTempMemForGPU(int device, size_t requested) - { - auto totalMem = device != -1 ? 
getDeviceProperties(device).totalGlobalMem - : std::numeric_limits::max(); - - if (totalMem <= (size_t)4 * 1024 * 1024 * 1024) { - // If the GPU has <= 4 GiB of memory, reserve 512 MiB - - if (requested > k4GiBTempMem) { return k4GiBTempMem; } - } else if (totalMem <= (size_t)8 * 1024 * 1024 * 1024) { - // If the GPU has <= 8 GiB of memory, reserve 1 GiB - - if (requested > k8GiBTempMem) { return k8GiBTempMem; } - } else { - // Never use more than 1.5 GiB - if (requested > kMaxTempMem) { return kMaxTempMem; } - } - - // use whatever lower limit the user requested - return requested; - }; - - private: - /// Set of currently outstanding memory allocations per device - /// device -> (alloc request, allocated ptr) - std::unordered_map> allocs_; - - /// Temporary memory provider, per each device - std::unordered_map> tempMemory_; - - /// Our default stream that work is ordered on, one per each device - std::unordered_map defaultStreams_; - - /// This contains particular streams as set by the user for - /// ordering, if any - std::unordered_map userDefaultStreams_; - - /// Other streams we can use, per each device - std::unordered_map> alternateStreams_; - - /// Async copy stream to use for GPU <-> CPU pinned memory copies - std::unordered_map asyncCopyStreams_; - - /// cuBLAS handle for each device - std::unordered_map blasHandles_; - - /// Pinned memory allocation for use with this GPU - void* pinnedMemAlloc_; - size_t pinnedMemAllocSize_; - - /// Another option is to use a specified amount of memory on all - /// devices - size_t tempMemSize_; - - /// Amount of pinned memory we should allocate - size_t pinnedMemSize_; - - /// Whether or not we log every GPU memory allocation and deallocation - bool allocLogging_; - - // cuda_memory_resource - std::unique_ptr cmr; - - // managed_memory_resource - std::unique_ptr mmr; - - // pinned_memory_resource - std::unique_ptr pmr; - - /// Our raft handle that maintains additional library resources, one per each device - 
std::unordered_map raftHandles_; - -}; - -/// Default implementation of GpuResources that allocates a cuBLAS -/// stream and 2 streams for use, as well as temporary memory. -/// Internally, the Faiss GPU code uses the instance managed by getResources, -/// but this is the user-facing object that is internally reference counted. -class RmmGpuResources : public GpuResourcesProvider { - public: - RmmGpuResources() : res_(new RmmGpuResourcesImpl){}; - - ~RmmGpuResources(){}; - - std::shared_ptr getResources() { return res_; }; - - /// Disable allocation of temporary memory; all temporary memory - /// requests will call cudaMalloc / cudaFree at the point of use - void noTempMemory() { res_->noTempMemory(); }; - - /// Specify that we wish to use a certain fixed size of memory on - /// all devices as temporary memory. This is the upper bound for the GPU - /// memory that we will reserve. We will never go above 1.5 GiB on any GPU; - /// smaller GPUs (with <= 4 GiB or <= 8 GiB) will use less memory than that. - /// To avoid any temporary memory allocation, pass 0. - void setTempMemory(size_t size) { res_->setTempMemory(size); }; - - /// Set amount of pinned memory to allocate, for async GPU <-> CPU - /// transfers - void setPinnedMemory(size_t size) { res_->setPinnedMemory(size); }; - - /// Called to change the stream for work ordering. We do not own `stream`; - /// i.e., it will not be destroyed when the GpuResources object gets cleaned - /// up. - /// We are guaranteed that all Faiss GPU work is ordered with respect to - /// this stream upon exit from an index or other Faiss GPU call. - void setDefaultStream(int device, cudaStream_t stream) - { - res_->setDefaultStream(device, stream); - }; - - /// Revert the default stream to the original stream managed by this resources - /// object, in case someone called `setDefaultStream`. 
- void revertDefaultStream(int device) { res_->revertDefaultStream(device); }; - - /// Called to change the work ordering streams to the null stream - /// for all devices - void setDefaultNullStreamAllDevices() { res_->setDefaultNullStreamAllDevices(); }; - - /// Export a description of memory used for Python - std::map>> getMemoryInfo() const - { - return res_->getMemoryInfo(); - }; - - /// Returns the current default stream - cudaStream_t getDefaultStream(int device) { return res_->getDefaultStream(device); }; - - /// Returns the current amount of temp memory available - size_t getTempMemoryAvailable(int device) const { return res_->getTempMemoryAvailable(device); }; - - /// Synchronize our default stream with the CPU - void syncDefaultStreamCurrentDevice() { res_->syncDefaultStreamCurrentDevice(); }; - - /// If enabled, will print every GPU memory allocation and deallocation to - /// standard output - void setLogMemoryAllocations(bool enable) { res_->setLogMemoryAllocations(enable); }; - - private: - std::shared_ptr res_; -}; - -} // namespace gpu -} // namespace faiss \ No newline at end of file diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index f67f037082..a1ea05d64d 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -143,6 +143,7 @@ void addTest(faiss::MetricType metricType, bool useFloat16CoarseQuantizer) { config.device = opt.device; config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + config.use_raft = true; faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); diff --git a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp index 21ac260887..615ac01fe4 100644 --- a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp @@ -7,8 +7,8 @@ #include #include -#include -#include +#include +#include #include 
#include @@ -91,7 +91,7 @@ void invoke_bfknn(const raft::handle_t &raft_handle, Options &opt, float *dists, - faiss::gpu::RmmGpuResources gpu_res; + faiss::gpu::StandardGpuResources gpu_res; gpu_res.setDefaultStream(opt.device, raft_handle.get_stream()); rmm::device_uvector addVecsDev(addVecs.size(), raft_handle.get_stream()); @@ -139,7 +139,7 @@ void queryTest( std::cout << "numTrain: " << opt.numTrain << "numCentroids: " << opt.numCentroids << std::endl; printf("Creating rmm resources\n"); - faiss::gpu::RmmGpuResources res; + faiss::gpu::StandardGpuResources res; res.noTempMemory(); faiss::gpu::GpuIndexIVFFlatConfig config; @@ -267,7 +267,7 @@ void addTest(faiss::MetricType metricType, bool useFloat16CoarseQuantizer) { cpuIndex.train(opt.numTrain, trainVecs.data()); cpuIndex.nprobe = opt.nprobe; - faiss::gpu::RmmGpuResources res; + faiss::gpu::StandardGpuResources res; res.noTempMemory(); faiss::gpu::GpuIndexIVFFlatConfig config; @@ -302,7 +302,7 @@ void copyToTest(bool useFloat16CoarseQuantizer) { std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - faiss::gpu::RmmGpuResources res; + faiss::gpu::StandardGpuResources res; res.noTempMemory(); faiss::gpu::GpuIndexIVFFlatConfig config; @@ -361,7 +361,7 @@ void copyFromTest(bool useFloat16CoarseQuantizer) { cpuIndex.add(opt.numAdd, addVecs.data()); // use garbage values to see if we overwrite then - faiss::gpu::RmmGpuResources res; + faiss::gpu::StandardGpuResources res; res.noTempMemory(); faiss::gpu::GpuIndexIVFFlatConfig config; @@ -508,7 +508,7 @@ TEST(TestRaftIndexIVFFlat, Float32_Query_L2_128) { // cpuIndex.add(opt.numAdd, addVecs.data()); // cpuIndex.nprobe = opt.nprobe; // -// faiss::gpu::RmmGpuResources res; +// faiss::gpu::StandardGpuResources res; // res.noTempMemory(); // // faiss::gpu::GpuIndexIVFFlatConfig config; @@ -557,7 +557,7 @@ TEST(TestRaftIndexIVFFlat, Float32_Query_L2_128) { // opt.dim); std::vector addVecs = 
faiss::gpu::randVecs(opt.numAdd, // opt.dim); -// faiss::gpu::RmmGpuResources res; +// faiss::gpu::StandardGpuResources res; // res.noTempMemory(); // faiss::gpu::GpuIndexIVFFlatConfig config; @@ -596,7 +596,7 @@ TEST(TestRaftIndexIVFFlat, Float32_Query_L2_128) { // TEST(TestRaftIndexIVFFlat, AddNaN) { // Options opt; -// faiss::gpu::RmmGpuResources res; +// faiss::gpu::StandardGpuResources res; // res.noTempMemory(); // faiss::gpu::GpuIndexIVFFlatConfig config; @@ -670,7 +670,7 @@ TEST(TestRaftIndexIVFFlat, Float32_Query_L2_128) { // cpuIndex.add(numAdd, addVecs.data()); // cpuIndex.nprobe = nprobe; // -// faiss::gpu::RmmGpuResources res; +// faiss::gpu::StandardGpuResources res; // res.noTempMemory(); // // faiss::gpu::GpuIndexIVFFlatConfig config; diff --git a/faiss/gpu/test/TestRaftIndexIVFPQ.cpp b/faiss/gpu/test/TestRaftIndexIVFPQ.cpp deleted file mode 100644 index 61a3c8870e..0000000000 --- a/faiss/gpu/test/TestRaftIndexIVFPQ.cpp +++ /dev/null @@ -1,704 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -// FIXME: figure out a better way to test fp16 -constexpr float kF16MaxRelErr = 0.3f; -constexpr float kF32MaxRelErr = 0.03f; - -struct Options { - Options() { - numAdd = 2 * faiss::gpu::randVal(50000, 70000); - dim = faiss::gpu::randVal(64, 200); - - numCentroids = std::sqrt((float)numAdd / 2); - numTrain = numCentroids * 50; - nprobe = faiss::gpu::randVal(std::min(10, numCentroids), numCentroids); - numQuery = numAdd / 10;//faiss::gpu::randVal(32, 100); - - // Due to the approximate nature of the query and of floating point - // differences between GPU and CPU, to stay within our error bounds, - // only use a small k - k = std::min(faiss::gpu::randVal(10, 30), numAdd / 40); - indicesOpt = faiss::gpu::randSelect( - {faiss::gpu::INDICES_CPU, - faiss::gpu::INDICES_32_BIT, - faiss::gpu::INDICES_64_BIT}); - - device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - } - - std::string toString() const { - std::stringstream str; - str << "IVFFlat device " << device << " numVecs " << numAdd << " dim " - << dim << " numCentroids " << numCentroids << " nprobe " << nprobe - << " numQuery " << numQuery << " k " << k << " indicesOpt " - << indicesOpt; - - return str.str(); - } - - int numAdd; - int dim; - int numCentroids; - int numTrain; - int nprobe; - int numQuery; - int k; - int device; - faiss::gpu::IndicesOptions indicesOpt; -}; - -template -void train_index(const raft::handle_t &raft_handle, Options &opt, idx_type &index, std::vector &trainVecs, std::vector &addVecs) { - - uint32_t train_start = raft::curTimeMillis(); - index.train(opt.numTrain, trainVecs.data()); - raft_handle.sync_stream(); - uint32_t train_stop = raft::curTimeMillis(); - - uint32_t add_start = raft::curTimeMillis(); - index.add(opt.numAdd, addVecs.data()); - raft_handle.sync_stream(); - uint32_t add_stop = raft::curTimeMillis(); -// 
index.train(opt.numTrain, trainVecs.data()); - index.setNumProbes(opt.nprobe); - - std::cout << "train=" << (train_stop - train_start) << ", add=" << (add_stop - add_start) << std::endl; -} - - -void invoke_bfknn(const raft::handle_t &raft_handle, Options &opt, float *dists, faiss::Index::idx_t *inds, faiss::MetricType m, - std::vector &addVecs, std::vector &queryVecs) { - - - - faiss::gpu::RmmGpuResources gpu_res; - gpu_res.setDefaultStream(opt.device, raft_handle.get_stream()); - - rmm::device_uvector addVecsDev(addVecs.size(), raft_handle.get_stream()); - raft::copy(addVecsDev.data(), addVecs.data(), addVecs.size(), raft_handle.get_stream()); - - rmm::device_uvector queryVecsDev(queryVecs.size(), raft_handle.get_stream()); - raft::copy(queryVecsDev.data(), queryVecs.data(), queryVecs.size(), raft_handle.get_stream()); - - faiss::gpu::GpuDistanceParams args; - args.metric = m; - args.k = opt.k; - args.dims = opt.dim; - args.vectors = addVecs.data(); - args.vectorsRowMajor = true; - args.numVectors = opt.numAdd; - args.queries = queryVecs.data(); - args.queriesRowMajor = true; - args.numQueries = opt.numQuery; - args.outDistances = dists; - args.outIndices = inds; - args.outIndicesType = faiss::gpu::IndicesDataType::I64; - - /** - * @todo: Until FAISS supports pluggable allocation strategies, - * we will not reap the benefits of the pool allocator for - * avoiding device-wide synchronizations from cudaMalloc/cudaFree - */ - bfKnn(&gpu_res, args); -} - -void queryTest( - faiss::MetricType metricType, - bool useFloat16CoarseQuantizer, - int dimOverride = -1) { - for (int tries = 0; tries < 2; ++tries) { - Options opt; - opt.dim = dimOverride != -1 ? 
dimOverride : opt.dim; - - std::vector trainVecs = - faiss::gpu::randVecs(opt.numTrain, opt.dim); - std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - - std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); - - std::cout << "numTrain: " << opt.numTrain << "numCentroids: " << opt.numCentroids << std::endl; - - printf("Creating rmm resources\n"); - faiss::gpu::RmmGpuResources res; - res.noTempMemory(); - - faiss::gpu::GpuIndexIVFPQConfig config; - config.device = opt.device; - config.indicesOptions = opt.indicesOpt; - config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - - // TODO: Since we are modifying the centroids when adding new vectors, - // the neighbors are no longer going to match completely between CPU - // and the RAFT indexes. We will probably want to perform a bfknn as - // ground truth and then compare the recall for both the RAFT and FAISS - // indices. - - printf("Building raft index\n"); - faiss::gpu::RaftIndexIVFPQ raftIndex( - &res, opt.dim, opt.numCentroids, metricType, config); - - printf("Done.\n"); - - faiss::gpu::GpuIndexIVFPQ gpuIndex( - &res, opt.dim, opt.numCentroids, metricType, config); - - - printf("Creating raft handle\n"); - raft::handle_t raft_handle; - printf("Done\n"); - - std::cout << "Training raft index" << std::endl; - uint32_t r_train_start = raft::curTimeMillis(); - train_index(raft_handle, opt, raftIndex, trainVecs, addVecs); - raft_handle.sync_stream(); - uint32_t r_train_stop = raft::curTimeMillis(); - std::cout << "Raft train time " << r_train_start << " " << r_train_stop << " " << (r_train_stop - r_train_start) << std::endl; - - std::cout << "Training gpu index" << std::endl; - uint32_t g_train_start = raft::curTimeMillis(); - train_index(raft_handle, opt, gpuIndex, trainVecs, addVecs); - raft_handle.sync_stream(); - uint32_t g_train_stop = raft::curTimeMillis(); - std::cout << "FAISS train time " << g_train_start << " " << g_train_stop << " " << (g_train_stop - g_train_start) << 
std::endl; - - std::cout << "Computing ground truth" << std::endl; - rmm::device_uvector ref_inds(opt.numQuery * opt.k, raft_handle.get_stream()); - rmm::device_uvector ref_dists(opt.numQuery * opt.k, raft_handle.get_stream()); - - invoke_bfknn(raft_handle, opt, ref_dists.data(), ref_inds.data(), metricType, addVecs, queryVecs); - - std::cout << "Done." << std::endl; - raft::print_device_vector("ref_dists", ref_dists.data(), opt.k, std::cout); - raft::print_device_vector("ref_inds", ref_inds.data(), opt.k, std::cout); - - rmm::device_uvector raft_inds(opt.numQuery * opt.k, raft_handle.get_stream()); - rmm::device_uvector raft_dists(opt.numQuery * opt.k, raft_handle.get_stream()); - - uint32_t rstart = raft::curTimeMillis(); - raftIndex.search( - opt.numQuery, - queryVecs.data(), - opt.k, - raft_dists.data(), - raft_inds.data()); - - raft_handle.sync_stream(); - uint32_t rstop = raft::curTimeMillis(); - std::cout << "Raft query time " << rstart << " " << rstop << " " << (rstop - rstart) << std::endl; - - rmm::device_uvector gpu_inds(opt.numQuery * opt.k, raft_handle.get_stream()); - rmm::device_uvector gpu_dists(opt.numQuery * opt.k, raft_handle.get_stream()); - - uint32_t gstart = raft::curTimeMillis(); - gpuIndex.search( - opt.numQuery, - queryVecs.data(), - opt.k, - gpu_dists.data(), - gpu_inds.data()); - - raft_handle.sync_stream(); - uint32_t gstop = raft::curTimeMillis(); - - std::cout << "FAISS query time " << gstart << " " << gstop << " " << (gstop - gstart) << std::endl; - - // TODO: Compare recall, perhaps by adding the indices/distances to a hashmap. 
- - raft::print_device_vector("raft_dists", raft_dists.data(), opt.k, std::cout); - raft::print_device_vector("raft_inds", raft_inds.data(), opt.k, std::cout); - -// raft::print_device_vector("gpu_dists", gpu_dists.data(), opt.k, std::cout); -// raft::print_device_vector("gpu_inds", gpu_inds.data(), opt.k, std::cout); - -// -// bool compFloat16 = useFloat16CoarseQuantizer; -// faiss::gpu::compareIndices( -// cpuIndex, -// gpuIndex, -// opt.numQuery, -// opt.dim, -// opt.k, -// opt.toString(), -// compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, -// // FIXME: the fp16 bounds are -// // useless when math (the accumulator) is -// // in fp16. Figure out another way to test -// compFloat16 ? 0.70f : 0.1f, -// compFloat16 ? 0.65f : 0.015f); - } -} - -void addTest(faiss::MetricType metricType, bool useFloat16CoarseQuantizer) { - for (int tries = 0; tries < 2; ++tries) { - Options opt; - - std::vector trainVecs = - faiss::gpu::randVecs(opt.numTrain, opt.dim); - std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - - faiss::IndexFlatL2 quantizerL2(opt.dim); - faiss::IndexFlatIP quantizerIP(opt.dim); - faiss::Index* quantizer = metricType == faiss::METRIC_L2 - ? 
(faiss::Index*)&quantizerL2 - : (faiss::Index*)&quantizerIP; - - faiss::IndexIVFFlat cpuIndex( - quantizer, opt.dim, opt.numCentroids, metricType); - cpuIndex.train(opt.numTrain, trainVecs.data()); - cpuIndex.nprobe = opt.nprobe; - - faiss::gpu::RmmGpuResources res; - res.noTempMemory(); - - faiss::gpu::GpuIndexIVFPQConfig config; - config.device = opt.device; - config.indicesOptions = opt.indicesOpt; - config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - - faiss::gpu::RaftIndexIVFPQ gpuIndex( - &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); - gpuIndex.copyFrom(&cpuIndex); - gpuIndex.setNumProbes(opt.nprobe); - - cpuIndex.add(opt.numAdd, addVecs.data()); - gpuIndex.add(opt.numAdd, addVecs.data()); - - bool compFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - compFloat16 ? 0.70f : 0.1f, - compFloat16 ? 0.30f : 0.015f); - } -} - -void copyToTest(bool useFloat16CoarseQuantizer) { - Options opt; - std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); - std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - - faiss::gpu::RmmGpuResources res; - res.noTempMemory(); - - faiss::gpu::GpuIndexIVFPQConfig config; - config.device = opt.device; - config.indicesOptions = opt.indicesOpt; - config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - - faiss::gpu::RaftIndexIVFPQ gpuIndex( - &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); - gpuIndex.train(opt.numTrain, trainVecs.data()); - gpuIndex.add(opt.numAdd, addVecs.data()); - gpuIndex.setNumProbes(opt.nprobe); - - // use garbage values to see if we overwrite then - faiss::IndexFlatL2 cpuQuantizer(1); - faiss::IndexIVFFlat cpuIndex(&cpuQuantizer, 1, 1, faiss::METRIC_L2); - cpuIndex.nprobe = 1; - - gpuIndex.copyTo(&cpuIndex); - - EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); - EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); - - 
EXPECT_EQ(cpuIndex.d, gpuIndex.d); - EXPECT_EQ(cpuIndex.quantizer->d, gpuIndex.quantizer->d); - EXPECT_EQ(cpuIndex.d, opt.dim); - EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); - EXPECT_EQ(cpuIndex.nprobe, gpuIndex.getNumProbes()); - - testIVFEquality(cpuIndex, gpuIndex); - - // Query both objects; results should be equivalent - bool compFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - compFloat16 ? 0.70f : 0.1f, - compFloat16 ? 0.30f : 0.015f); -} - -void copyFromTest(bool useFloat16CoarseQuantizer) { - Options opt; - std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); - std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - - faiss::IndexFlatL2 cpuQuantizer(opt.dim); - faiss::IndexIVFFlat cpuIndex( - &cpuQuantizer, opt.dim, opt.numCentroids, faiss::METRIC_L2); - cpuIndex.nprobe = opt.nprobe; - cpuIndex.train(opt.numTrain, trainVecs.data()); - cpuIndex.add(opt.numAdd, addVecs.data()); - - // use garbage values to see if we overwrite then - faiss::gpu::RmmGpuResources res; - res.noTempMemory(); - - faiss::gpu::GpuIndexIVFPQConfig config; - config.device = opt.device; - config.indicesOptions = opt.indicesOpt; - config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - - faiss::gpu::RaftIndexIVFPQ gpuIndex(&res, 1, 1, faiss::METRIC_L2, config); - gpuIndex.setNumProbes(1); - - gpuIndex.copyFrom(&cpuIndex); - - EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); - EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); - - EXPECT_EQ(cpuIndex.d, gpuIndex.d); - EXPECT_EQ(cpuIndex.d, opt.dim); - EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); - EXPECT_EQ(cpuIndex.nprobe, gpuIndex.getNumProbes()); - - testIVFEquality(cpuIndex, gpuIndex); - - // Query both objects; results should be equivalent - bool compFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - 
opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - compFloat16 ? 0.70f : 0.1f, - compFloat16 ? 0.30f : 0.015f); -} - -//TEST(TestRaftIndexIVFPQ, Float32_32_Add_L2) { -// addTest(faiss::METRIC_L2, false); -// printf("Finished addTest(faiss::METRIC_L2, false)\n"); -//} -// -//TEST(TestRaftIndexIVFPQ, Float32_32_Add_IP) { -// addTest(faiss::METRIC_INNER_PRODUCT, false); -// printf("Finished addTest(faiss::METRIC_INNER_PRODUCT, false)\n"); -//} -// -//TEST(TestRaftIndexIVFPQ, Float16_32_Add_L2) { -// addTest(faiss::METRIC_L2, true); -// printf("Finished addTest(faiss::METRIC_L2, true)\n"); -//} -// -//TEST(TestRaftIndexIVFPQ, Float16_32_Add_IP) { -// addTest(faiss::METRIC_INNER_PRODUCT, true); -// printf("Finished addTest(faiss::METRIC_INNER_PRODUCT, true)\n"); -//} - -// -// General query tests -// - -TEST(TestRaftIndexIVFPQ, Float32_Query_L2) { -queryTest(faiss::METRIC_L2, false); -printf("Finished queryTest(faiss::METRIC_L2, false);\n"); -} - -//TEST(TestRaftIndexIVFPQ, Float32_Query_IP) { -// queryTest(faiss::METRIC_INNER_PRODUCT, false); -// printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false)\n"); -//} - -// float16 coarse quantizer - -TEST(TestRaftIndexIVFPQ, Float16_32_Query_L2) { -queryTest(faiss::METRIC_L2, true); -printf("Finished queryTest(faiss::METRIC_L2, true)\n"); -} - -//TEST(TestRaftIndexIVFPQ, Float16_32_Query_IP) { -// queryTest(faiss::METRIC_INNER_PRODUCT, true); -// printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, true)\n"); -//} - -// -// There are IVF list scanning specializations for 64-d and 128-d that we -// make sure we explicitly test here -// - -TEST(TestRaftIndexIVFPQ, Float32_Query_L2_64) { -queryTest(faiss::METRIC_L2, false, 64); -printf("Finished queryTest(faiss::METRIC_L2, false, 64)\n"); -} - -//TEST(TestRaftIndexIVFPQ, Float32_Query_IP_64) { -// queryTest(faiss::METRIC_INNER_PRODUCT, false, 64); -// printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false, 64)\n"); -//} - 
-TEST(TestRaftIndexIVFPQ, Float32_Query_L2_128) { -queryTest(faiss::METRIC_L2, false, 128); -printf("Finished queryTest(faiss::METRIC_L2, false, 128)\n"); -} - -//TEST(TestRaftIndexIVFPQ, Float32_Query_IP_128) { -// queryTest(faiss::METRIC_INNER_PRODUCT, false, 128); -// printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false, 128)\n"); -//} - -// -// Copy tests -// - -/** TODO: test crashes */ -// TEST(TestRaftIndexIVFPQ, Float32_32_CopyTo) { -// copyToTest(false); -// printf("Finished copyToTest(false)\n"); -// } - -//TEST(TestRaftIndexIVFPQ, Float32_32_CopyFrom) { -// copyFromTest(false); -// printf("Finished copyFromTest(false)\n"); -//} - -//TEST(TestRaftIndexIVFPQ, Float32_negative) { -// Options opt; -// -// auto trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); -// auto addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); -// -// // Put all vecs on negative side -// for (auto& f : trainVecs) { -// f = std::abs(f) * -1.0f; -// } -// -// for (auto& f : addVecs) { -// f *= std::abs(f) * -1.0f; -// } -// -// faiss::IndexFlatIP quantizerIP(opt.dim); -// faiss::Index* quantizer = (faiss::Index*)&quantizerIP; -// -// faiss::IndexIVFFlat cpuIndex( -// quantizer, opt.dim, opt.numCentroids, faiss::METRIC_INNER_PRODUCT); -// cpuIndex.train(opt.numTrain, trainVecs.data()); -// cpuIndex.add(opt.numAdd, addVecs.data()); -// cpuIndex.nprobe = opt.nprobe; -// -// faiss::gpu::RmmGpuResources res; -// res.noTempMemory(); -// -// faiss::gpu::GpuIndexIVFPQConfig config; -// config.device = opt.device; -// config.indicesOptions = opt.indicesOpt; -// -// faiss::gpu::RaftIndexIVFPQ gpuIndex( -// &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); -// gpuIndex.copyFrom(&cpuIndex); -// gpuIndex.setNumProbes(opt.nprobe); -// -// // Construct a positive test set -// auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); -// -// // Put all vecs on positive size -// for (auto& f : queryVecs) { -// f = std::abs(f); -// } -// -// bool compFloat16 = false; 
-// faiss::gpu::compareIndices( -// queryVecs, -// cpuIndex, -// gpuIndex, -// opt.numQuery, -// opt.dim, -// opt.k, -// opt.toString(), -// compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, -// // FIXME: the fp16 bounds are -// // useless when math (the accumulator) is -// // in fp16. Figure out another way to test -// compFloat16 ? 0.99f : 0.1f, -// compFloat16 ? 0.65f : 0.015f); -//} - -// -// NaN tests -// - -/** TODO: test crashes */ -// TEST(TestRaftIndexIVFPQ, QueryNaN) { -// Options opt; - -// std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, -// opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, -// opt.dim); - -// faiss::gpu::RmmGpuResources res; -// res.noTempMemory(); - -// faiss::gpu::GpuIndexIVFPQConfig config; -// config.device = opt.device; -// config.indicesOptions = opt.indicesOpt; -// config.flatConfig.useFloat16 = faiss::gpu::randBool(); - -// faiss::gpu::RaftIndexIVFPQ gpuIndex( -// &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); -// gpuIndex.setNumProbes(opt.nprobe); - -// gpuIndex.train(opt.numTrain, trainVecs.data()); -// gpuIndex.add(opt.numAdd, addVecs.data()); - -// int numQuery = 10; -// std::vector nans( -// numQuery * opt.dim, std::numeric_limits::quiet_NaN()); - -// std::vector distances(numQuery * opt.k, 0); -// std::vector indices(numQuery * opt.k, 0); - -// gpuIndex.search( -// numQuery, nans.data(), opt.k, distances.data(), indices.data()); - -// for (int q = 0; q < numQuery; ++q) { -// for (int k = 0; k < opt.k; ++k) { -// EXPECT_EQ(indices[q * opt.k + k], -1); -// EXPECT_EQ( -// distances[q * opt.k + k], -// std::numeric_limits::max()); -// } -// } -// } - -/** TODO: test crashes */ -// TEST(TestRaftIndexIVFPQ, AddNaN) { -// Options opt; - -// faiss::gpu::RmmGpuResources res; -// res.noTempMemory(); - -// faiss::gpu::GpuIndexIVFPQConfig config; -// config.device = opt.device; -// config.indicesOptions = opt.indicesOpt; -// config.flatConfig.useFloat16 = faiss::gpu::randBool(); - -// 
faiss::gpu::RaftIndexIVFPQ gpuIndex( -// &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); -// gpuIndex.setNumProbes(opt.nprobe); - -// int numNans = 10; -// std::vector nans( -// numNans * opt.dim, std::numeric_limits::quiet_NaN()); - -// // Make one vector valid (not the first vector, in order to test offset -// // issues), which should actually add -// for (int i = 0; i < opt.dim; ++i) { -// nans[opt.dim + i] = i; -// } - -// std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, -// opt.dim); gpuIndex.train(opt.numTrain, trainVecs.data()); - -// // should not crash -// EXPECT_EQ(gpuIndex.ntotal, 0); -// gpuIndex.add(numNans, nans.data()); - -// std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, -// opt.dim); std::vector distance(opt.numQuery * opt.k, 0); -// std::vector indices(opt.numQuery * opt.k, 0); - -// // should not crash -// gpuIndex.search( -// opt.numQuery, -// queryVecs.data(), -// opt.k, -// distance.data(), -// indices.data()); -// } - -//TEST(TestRaftIndexIVFPQ, UnifiedMemory) { -// // Construct on a random device to test multi-device, if we have -// // multiple devices -// int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); -// -// if (!faiss::gpu::getFullUnifiedMemSupport(device)) { -// return; -// } -// -// int dim = 128; -// -// int numCentroids = 256; -// // Unfortunately it would take forever to add 24 GB in IVFPQ data, -// // so just perform a small test with data allocated in the unified -// // memory address space -// size_t numAdd = 10000; -// size_t numTrain = numCentroids * 40; -// int numQuery = 10; -// int k = 10; -// int nprobe = 8; -// -// std::vector trainVecs = faiss::gpu::randVecs(numTrain, dim); -// std::vector addVecs = faiss::gpu::randVecs(numAdd, dim); -// -// faiss::IndexFlatL2 quantizer(dim); -// faiss::IndexIVFFlat cpuIndex( -// &quantizer, dim, numCentroids, faiss::METRIC_L2); -// -// cpuIndex.train(numTrain, trainVecs.data()); -// cpuIndex.add(numAdd, addVecs.data()); -// 
cpuIndex.nprobe = nprobe; -// -// faiss::gpu::RmmGpuResources res; -// res.noTempMemory(); -// -// faiss::gpu::GpuIndexIVFPQConfig config; -// config.device = device; -// config.memorySpace = faiss::gpu::MemorySpace::Unified; -// -// faiss::gpu::RaftIndexIVFPQ gpuIndex( -// &res, dim, numCentroids, faiss::METRIC_L2, config); -// gpuIndex.copyFrom(&cpuIndex); -// gpuIndex.setNumProbes(nprobe); -// -// faiss::gpu::compareIndices( -// cpuIndex, -// gpuIndex, -// numQuery, -// dim, -// k, -// "Unified Memory", -// kF32MaxRelErr, -// 0.1f, -// 0.015f); -//} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - - // just run with a fixed test seed - faiss::gpu::setTestSeed(100); - - return RUN_ALL_TESTS(); -} From 2ac5a5b565c4120b008b66e6559d269007c3634a Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 27 Oct 2022 20:34:39 -0400 Subject: [PATCH 37/87] Validating dispatch of flatindex --- faiss/gpu/GpuIndexFlat.cu | 2 ++ faiss/gpu/impl/FlatIndex.cuh | 2 +- faiss/gpu/impl/RaftFlatIndex.cu | 3 +++ faiss/gpu/impl/RaftFlatIndex.cuh | 2 +- faiss/gpu/test/TestGpuIndexFlat.cpp | 6 ++++++ 5 files changed, 13 insertions(+), 2 deletions(-) diff --git a/faiss/gpu/GpuIndexFlat.cu b/faiss/gpu/GpuIndexFlat.cu index 429eb64db7..3f2f1b2960 100644 --- a/faiss/gpu/GpuIndexFlat.cu +++ b/faiss/gpu/GpuIndexFlat.cu @@ -91,6 +91,7 @@ GpuIndexFlat::~GpuIndexFlat() {} void GpuIndexFlat::resetIndex_(int dims) { if(config_.use_raft) { + printf("Should use raft!\n"); data_.reset(new RaftFlatIndex( resources_.get(), dims, @@ -98,6 +99,7 @@ void GpuIndexFlat::resetIndex_(int dims) { config_.memorySpace)); } else { + printf("Not using raft :-(\n"); data_.reset(new FlatIndex( resources_.get(), dims, diff --git a/faiss/gpu/impl/FlatIndex.cuh b/faiss/gpu/impl/FlatIndex.cuh index 56fbe609b9..5e0979d07a 100644 --- a/faiss/gpu/impl/FlatIndex.cuh +++ b/faiss/gpu/impl/FlatIndex.cuh @@ -44,7 +44,7 @@ class FlatIndex { /// Returns a reference to our vectors currently in use (if 
useFloat16 mode) Tensor& getVectorsFloat16Ref(); - void query( + virtual void query( Tensor& vecs, int k, faiss::MetricType metric, diff --git a/faiss/gpu/impl/RaftFlatIndex.cu b/faiss/gpu/impl/RaftFlatIndex.cu index 1a8369a1c3..93fecdbed9 100644 --- a/faiss/gpu/impl/RaftFlatIndex.cu +++ b/faiss/gpu/impl/RaftFlatIndex.cu @@ -48,6 +48,7 @@ void RaftFlatIndex::query( // raft::neighbors::brute_force::knn(raft_handle, index, search, inds, dists, k, distance); + printf("Using RAFT for FLAT!!!!\n"); // TODO: Expose the fused L2KNN through RAFT's public APIs raft::spatial::knn::detail::fusedL2Knn(dim_, inds.data_handle(), @@ -63,6 +64,8 @@ void RaftFlatIndex::query( distance); } else { + + printf("Dispathing to FAISS for FLAT!!!!\n"); FlatIndex::query(input, k, metric, metricArg, outDistances, outIndices, exactDistance); } } diff --git a/faiss/gpu/impl/RaftFlatIndex.cuh b/faiss/gpu/impl/RaftFlatIndex.cuh index 8a18053449..ad48102254 100644 --- a/faiss/gpu/impl/RaftFlatIndex.cuh +++ b/faiss/gpu/impl/RaftFlatIndex.cuh @@ -34,7 +34,7 @@ class RaftFlatIndex : public FlatIndex { float metricArg, Tensor& outDistances, Tensor& outIndices, - bool exactDistance); + bool exactDistance) override; }; diff --git a/faiss/gpu/test/TestGpuIndexFlat.cpp b/faiss/gpu/test/TestGpuIndexFlat.cpp index 50a445092c..18c9c81b80 100644 --- a/faiss/gpu/test/TestGpuIndexFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexFlat.cpp @@ -71,6 +71,7 @@ void testFlat(const TestFlatOptions& opt) { faiss::gpu::GpuIndexFlatConfig config; config.device = device; + config.use_raft = true; config.useFloat16 = opt.useFloat16; faiss::gpu::GpuIndexFlat gpuIndex(&res, dim, opt.metric, config); @@ -207,6 +208,7 @@ TEST(TestGpuIndexFlat, QueryEmpty) { faiss::gpu::GpuIndexFlatConfig config; config.device = 0; + config.use_raft = true; config.useFloat16 = false; int dim = 128; @@ -249,6 +251,7 @@ TEST(TestGpuIndexFlat, CopyFrom) { faiss::gpu::GpuIndexFlatConfig config; config.device = device; + config.use_raft = true; 
config.useFloat16 = false; faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, 2000, config); @@ -282,6 +285,7 @@ TEST(TestGpuIndexFlat, CopyTo) { faiss::gpu::GpuIndexFlatConfig config; config.device = device; + config.use_raft = true; config.useFloat16 = false; faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config); @@ -335,6 +339,7 @@ TEST(TestGpuIndexFlat, UnifiedMemory) { faiss::gpu::GpuIndexFlatConfig config; config.device = device; + config.use_raft = true; config.memorySpace = faiss::gpu::MemorySpace::Unified; faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config); @@ -376,6 +381,7 @@ TEST(TestGpuIndexFlat, Residual) { faiss::gpu::GpuIndexFlatConfig config; config.device = device; + config.use_raft = true; int dim = 32; faiss::IndexFlat cpuIndex(dim, faiss::MetricType::METRIC_L2); From 68944a5c25611337925ba7ea07c1d7d5270785cb Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 15 Nov 2022 12:25:53 -0500 Subject: [PATCH 38/87] 1. Verified FlatIndex tests are passing (and using RAFT for k<=64 L2 distance) 2. 
Verified IVF-flat addTests are returning expected results (though failing assertion as a result of the centroids being modified) Todo: Need to fill in addEncodedVectorsToList_ in RaftIVFFlat.cu in order to verify the remaining gtests --- faiss/gpu/GpuIndexIVFFlat.cu | 2 + faiss/gpu/impl/IVFBase.cuh | 2 +- faiss/gpu/impl/IVFFlat.cu | 4 +- faiss/gpu/impl/RaftFlatIndex.cu | 5 +- faiss/gpu/impl/RaftIVFFlat.cu | 102 +++++++++++++++++++++++-- faiss/gpu/impl/RaftIVFFlat.cuh | 12 +++ faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 5 ++ 7 files changed, 121 insertions(+), 11 deletions(-) diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 033c7189c9..59f6b58330 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -127,6 +127,8 @@ void GpuIndexIVFFlat::reserveMemory(size_t numVecs) { } void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { + + printf("Inside copyFrom\n"); DeviceScope scope(config_.device); // This will copy GpuIndexIVF data such as the coarse quantizer diff --git a/faiss/gpu/impl/IVFBase.cuh b/faiss/gpu/impl/IVFBase.cuh index e15c4c958d..a7d58178fb 100644 --- a/faiss/gpu/impl/IVFBase.cuh +++ b/faiss/gpu/impl/IVFBase.cuh @@ -111,7 +111,7 @@ class IVFBase { protected: /// Adds a set of codes and indices to a list, with the representation /// coming from the CPU equivalent - void addEncodedVectorsToList_( + virtual void addEncodedVectorsToList_( int listId, // resident on the host const void* codes, diff --git a/faiss/gpu/impl/IVFFlat.cu b/faiss/gpu/impl/IVFFlat.cu index a42e06cde3..b32047dc75 100644 --- a/faiss/gpu/impl/IVFFlat.cu +++ b/faiss/gpu/impl/IVFFlat.cu @@ -58,7 +58,7 @@ size_t IVFFlat::getGpuVectorsEncodingSize_(int numVecs) const { int bits = scalarQ_ ? 
scalarQ_->bits : 32 /* float */; // bytes to encode a block of 32 vectors (single dimension) - int bytesPerDimBlock = bits * 32 / 8; + int bytesPerDimBlock = bits * 32 / 8; // = 128 if bits == 32 // bytes to fully encode 32 vectors int bytesPerBlock = bytesPerDimBlock * dim_; @@ -91,7 +91,9 @@ std::vector IVFFlat::translateCodesToGpu_( return codes; } + bool sc = scalarQ_ ? true : false; int bitsPerCode = scalarQ_ ? scalarQ_->bits : 32; + std::cout << "dim_=" << dim_ << ", scalarQ_=" << sc << ", bitsPerCode=" << bitsPerCode << ", interleavedLayout_=" << interleavedLayout_ << std::endl; auto up = unpackNonInterleaved(std::move(codes), numVecs, dim_, bitsPerCode); diff --git a/faiss/gpu/impl/RaftFlatIndex.cu b/faiss/gpu/impl/RaftFlatIndex.cu index 93fecdbed9..f0283e2a00 100644 --- a/faiss/gpu/impl/RaftFlatIndex.cu +++ b/faiss/gpu/impl/RaftFlatIndex.cu @@ -34,8 +34,7 @@ void RaftFlatIndex::query( bool exactDistance) { // For now, use RAFT's fused KNN when k <= 64 and L2 metric is used - if(k <= 64 && metric == MetricType::METRIC_L2 && - input.getStride(0) == 0 && vectors_.getStride(0) == 0) { + if(k <= 64 && metric == MetricType::METRIC_L2 && vectors_.getSize(0) > 0) { raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); auto distance = exactDistance ? 
raft::distance::DistanceType::L2Unexpanded : @@ -48,7 +47,6 @@ void RaftFlatIndex::query( // raft::neighbors::brute_force::knn(raft_handle, index, search, inds, dists, k, distance); - printf("Using RAFT for FLAT!!!!\n"); // TODO: Expose the fused L2KNN through RAFT's public APIs raft::spatial::knn::detail::fusedL2Knn(dim_, inds.data_handle(), @@ -65,7 +63,6 @@ void RaftFlatIndex::query( } else { - printf("Dispathing to FAISS for FLAT!!!!\n"); FlatIndex::query(input, k, metric, metricArg, outDistances, outIndices, exactDistance); } } diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 5563ca2eae..f619ca7f45 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -210,16 +210,108 @@ void RaftIVFFlat::searchPreassigned( // TODO: Fill this in! } -/// Copy all inverted lists from a CPU representation to ourselves + + void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { - printf("Inside RaftIVFFlat copyInvertedListsFrom\n"); + size_t nlist = ivf ? ivf->nlist : 0; + size_t ntotal = ivf ? 
ivf->compute_ntotal() : 0; + + printf("Inside RAFT copyInvertedListsFrom\n"); + raft::handle_t &handle = resources_->getRaftHandleCurrentDevice(); + // We need to allocate the IVF + printf("nlist=%ld, ntotal=%ld\n", nlist, ntotal); + + std::vector list_sizes_(nlist); + std::vector list_offsets_(nlist+1); + std::vector indices_(ntotal); + + raft::neighbors::ivf_flat::index_params raft_idx_params; + raft_idx_params.n_lists = nlist; + raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; + raft_idx_params.add_data_on_build = false; + raft_idx_params.kmeans_n_iters = 100; + + raft_knn_index.emplace(handle, raft_idx_params, dim_); + raft_knn_index.value().allocate(handle, ntotal, true); - ivf->print_stats(); + for (size_t i = 0; i < nlist; ++i) { + size_t listSize = ivf->list_size(i); + + // GPU index can only support max int entries per list + FAISS_THROW_IF_NOT_FMT( + listSize <= (size_t)std::numeric_limits::max(), + "GPU inverted list can only support " + "%zu entries; %zu found", + (size_t)std::numeric_limits::max(), + listSize); + + addEncodedVectorsToList_( + i, ivf->get_codes(i), ivf->get_ids(i), listSize); + } + + raft::update_device(raft_knn_index.value().list_sizes().data_handle(), list_sizes_.data(), nlist, handle.get_stream()); + raft::update_device(raft_knn_index.value().list_offsets().data_handle(), list_offsets_.data(), nlist+1, handle.get_stream()); - // TODO: Need to replicate copyInvertedListsFrom() in IVFBase.cu - // but populate a RAFT index. 
} +void RaftIVFFlat::addEncodedVectorsToList_( + int listId, + const void* codes, + const Index::idx_t* indices, + size_t numVecs) { + auto stream = resources_->getDefaultStreamCurrentDevice(); + + // This list must already exist +// FAISS_ASSERT(listId < deviceListData_.size()); + + // This list must currently be empty +// auto& listCodes = deviceListData_[listId]; +// FAISS_ASSERT(listCodes->data.size() == 0); +// FAISS_ASSERT(listCodes->numVecs == 0); + + // If there's nothing to add, then there's nothing we have to do + if (numVecs == 0) { + return; + } + + // The GPU might have a different layout of the memory + auto gpuListSizeInBytes = getGpuVectorsEncodingSize_(numVecs); + auto cpuListSizeInBytes = getCpuVectorsEncodingSize_(numVecs); + + // We only have int32 length representations on the GPU per each + // list; the length is in sizeof(char) + FAISS_ASSERT(gpuListSizeInBytes <= (size_t)std::numeric_limits::max()); + + // Translate the codes as needed to our preferred form + std::vector codesV(cpuListSizeInBytes); + std::memcpy(codesV.data(), codes, cpuListSizeInBytes); + auto translatedCodes = translateCodesToGpu_(std::move(codesV), numVecs); + + std::cout << "numVecs=" << numVecs << "gpuListSizeInBytes=" << gpuListSizeInBytes << std::endl; + +// RAFT_CUDA_TRY(cudaMemcpyAsync(raft_knn_index.value().data().data_handle()+(), translatedCodes.data(), )) + +// listCodes->data.append( +// translatedCodes.data(), +// gpuListSizeInBytes, +// stream, +// true /* exact reserved size */); +// listCodes->numVecs = numVecs; +// +// // Handle the indices as well +// addIndicesFromCpu_(listId, indices, numVecs); +// + + // We should problay consider using this... 
+// deviceListDataPointers_.setAt( +// listId, (void*)listCodes->data.data(), stream); +// deviceListLengths_.setAt(listId, (int)numVecs, stream); +// +// // We update this as well, since the multi-pass algorithm uses it +// maxListLength_ = std::max(maxListLength_, (int)numVecs); +} + + /// Copy all inverted lists from ourselves to a CPU representation void RaftIVFFlat::copyInvertedListsTo(InvertedLists* ivf) { printf("Inside RaftIVFFlat copyInvertedListsTo\n"); diff --git a/faiss/gpu/impl/RaftIVFFlat.cuh b/faiss/gpu/impl/RaftIVFFlat.cuh index 1078204270..0bee282d26 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cuh +++ b/faiss/gpu/impl/RaftIVFFlat.cuh @@ -87,6 +87,18 @@ class RaftIVFFlat : public IVFFlat { void copyInvertedListsTo(InvertedLists* ivf) override; protected: + + /// Adds a set of codes and indices to a list, with the representation + /// coming from the CPU equivalent + void addEncodedVectorsToList_( + int listId, + // resident on the host + const void* codes, + // resident on the host + const Index::idx_t* indices, + size_t numVecs) override; + + std::optional> raft_knn_index{std::nullopt}; }; diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index a1ea05d64d..503a655edd 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -93,11 +93,14 @@ void queryTest( faiss::gpu::GpuIndexIVFFlatConfig config; config.device = opt.device; config.indicesOptions = opt.indicesOpt; + config.use_raft = true; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); gpuIndex.copyFrom(&cpuIndex); + + gpuIndex.setNumProbes(opt.nprobe); bool compFloat16 = useFloat16CoarseQuantizer; @@ -180,6 +183,7 @@ void copyToTest(bool useFloat16CoarseQuantizer) { config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::GpuIndexIVFFlat gpuIndex( 
&res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); gpuIndex.train(opt.numTrain, trainVecs.data()); @@ -238,6 +242,7 @@ void copyFromTest(bool useFloat16CoarseQuantizer) { config.device = opt.device; config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + config.use_raft = true; faiss::gpu::GpuIndexIVFFlat gpuIndex(&res, 1, 1, faiss::METRIC_L2, config); gpuIndex.setNumProbes(1); From 3a37031d2415573a544991ac0797efc3865360de Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 15 Nov 2022 13:58:47 -0500 Subject: [PATCH 39/87] Calling train() on copyFrom() with reconstructed vectors and filling in logic for updateQuantizer() --- faiss/gpu/GpuIndexIVF.cu | 1 + faiss/gpu/GpuIndexIVFFlat.cu | 29 +++- faiss/gpu/impl/IVFBase.cuh | 2 +- faiss/gpu/impl/RaftIVFFlat.cu | 241 +++++++++++++++++++-------------- faiss/gpu/impl/RaftIVFFlat.cuh | 29 ++-- 5 files changed, 187 insertions(+), 115 deletions(-) diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu index bfd5f16c8d..a5706c7954 100644 --- a/faiss/gpu/GpuIndexIVF.cu +++ b/faiss/gpu/GpuIndexIVF.cu @@ -466,6 +466,7 @@ void GpuIndexIVF::trainQuantizer_(Index::idx_t n, const float* x) { if(config_.use_raft) { + printf("Using raft to train quantizer for %d vectors\n", n); const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); raft::neighbors::ivf_flat::index_params raft_idx_params; diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 59f6b58330..676d4e376d 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -160,8 +160,33 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { config_.memorySpace); - // Copy all of the IVF data - index_->copyInvertedListsFrom(index->invlists); + if(config_.use_raft) { + + if(index->quantizer->ntotal > 0) { + auto stream = resources_->getRaftHandleCurrentDevice().get_stream(); + auto total_elems = size_t(index->quantizer->ntotal) * 
size_t(index->quantizer->d); + +// raft_knn_index.emplace(raft_handle, pams.metric, (uint32_t)this->nlist, (uint32_t)this->d); + + // Copy (reconstructed) centroids over, rather than re-training + std::vector buf_host(total_elems); + rmm::device_uvector buf_device(total_elems, stream); + index->quantizer->reconstruct_n(0, index->quantizer->ntotal, buf_host.data()); + raft::copy(buf_device.data(), buf_host.data(), total_elems, stream); + + printf("Calling train!\n"); + train(total_elems, buf_device.data()); + } + + if(index->ntotal > 0) { + std::vector buf_host(index->ntotal); + index->reconstruct_n(0, index->ntotal, buf_host.data()); + printf("Done reconstructing... %d\n", index->ntotal); + } + } else { + // Copy all of the IVF data + index_->copyInvertedListsFrom(index->invlists); + } } void GpuIndexIVFFlat::copyTo(faiss::IndexIVFFlat* index) const { diff --git a/faiss/gpu/impl/IVFBase.cuh b/faiss/gpu/impl/IVFBase.cuh index a7d58178fb..60c69c1f8d 100644 --- a/faiss/gpu/impl/IVFBase.cuh +++ b/faiss/gpu/impl/IVFBase.cuh @@ -75,7 +75,7 @@ class IVFBase { /// Update our coarse quantizer with this quantizer instance; may be a CPU /// or GPU quantizer - void updateQuantizer(Index* quantizer); + virtual void updateQuantizer(Index* quantizer); /// Classify and encode/add vectors to our IVF lists. /// The input data must be on our current device. 
diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index f619ca7f45..103f56bc4c 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -110,12 +110,20 @@ int RaftIVFFlat::addVectors( const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); + printf("About to call extend on index\n"); // TODO: We probably don't want to ignore the coarse quantizer here - raft_knn_index.emplace(raft::neighbors::ivf_flat::extend( - raft_handle, - raft_knn_index.value(), - vecs_view, - std::make_optional>(inds_view))); + + if(raft_knn_index.has_value()) { + raft_knn_index.emplace(raft::neighbors::ivf_flat::extend( + raft_handle, + raft_knn_index.value(), + vecs_view, + std::make_optional>(inds_view))); + + } else { + printf("Index has not been trained!\n"); + } + printf("Done.\n"); return vecs.getSize(0); } @@ -210,114 +218,149 @@ void RaftIVFFlat::searchPreassigned( // TODO: Fill this in! } +void RaftIVFFlat::updateQuantizer(Index* quantizer) { + Index::idx_t quantizer_ntotal = quantizer->ntotal; + std::cout << "Calling updateQuantizer with trained index with " << quantizer_ntotal << " items" << std::endl; + auto stream = resources_->getRaftHandleCurrentDevice().get_stream(); -void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { - size_t nlist = ivf ? ivf->nlist : 0; - size_t ntotal = ivf ? 
ivf->compute_ntotal() : 0; - - printf("Inside RAFT copyInvertedListsFrom\n"); - raft::handle_t &handle = resources_->getRaftHandleCurrentDevice(); - // We need to allocate the IVF - printf("nlist=%ld, ntotal=%ld\n", nlist, ntotal); - - std::vector list_sizes_(nlist); - std::vector list_offsets_(nlist+1); - std::vector indices_(ntotal); - - raft::neighbors::ivf_flat::index_params raft_idx_params; - raft_idx_params.n_lists = nlist; - raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; - raft_idx_params.add_data_on_build = false; - raft_idx_params.kmeans_n_iters = 100; + auto total_elems = size_t(quantizer_ntotal) * size_t(quantizer->d); - raft_knn_index.emplace(handle, raft_idx_params, dim_); - raft_knn_index.value().allocate(handle, ntotal, true); + raft::spatial::knn::ivf_flat::index_params pams; - for (size_t i = 0; i < nlist; ++i) { - size_t listSize = ivf->list_size(i); - - // GPU index can only support max int entries per list - FAISS_THROW_IF_NOT_FMT( - listSize <= (size_t)std::numeric_limits::max(), - "GPU inverted list can only support " - "%zu entries; %zu found", - (size_t)std::numeric_limits::max(), - listSize); - - addEncodedVectorsToList_( - i, ivf->get_codes(i), ivf->get_ids(i), listSize); + switch (this->metric_) { + case faiss::METRIC_L2: + pams.metric = raft::distance::DistanceType::L2Expanded; + break; + case faiss::METRIC_INNER_PRODUCT: + pams.metric = raft::distance::DistanceType::InnerProduct; + break; + default: + FAISS_THROW_MSG("Metric is not supported."); } - raft::update_device(raft_knn_index.value().list_sizes().data_handle(), list_sizes_.data(), nlist, handle.get_stream()); - raft::update_device(raft_knn_index.value().list_offsets().data_handle(), list_offsets_.data(), nlist+1, handle.get_stream()); - -} - -void RaftIVFFlat::addEncodedVectorsToList_( - int listId, - const void* codes, - const Index::idx_t* indices, - size_t numVecs) { - auto stream = resources_->getDefaultStreamCurrentDevice(); + 
raft_knn_index.emplace(resources_->getRaftHandleCurrentDevice(), pams.metric, (uint32_t)this->numLists_, (uint32_t)this->dim_); - // This list must already exist -// FAISS_ASSERT(listId < deviceListData_.size()); - - // This list must currently be empty -// auto& listCodes = deviceListData_[listId]; -// FAISS_ASSERT(listCodes->data.size() == 0); -// FAISS_ASSERT(listCodes->numVecs == 0); - - // If there's nothing to add, then there's nothing we have to do - if (numVecs == 0) { - return; + // Copy (reconstructed) centroids over, rather than re-training + rmm::device_uvector buf_dev(total_elems, stream); + { + std::vector buf_host(total_elems); + quantizer->reconstruct_n(0, quantizer_ntotal, buf_host.data()); + raft::copy(raft_knn_index.value().centers().data_handle(), buf_host.data(), total_elems, stream); } - // The GPU might have a different layout of the memory - auto gpuListSizeInBytes = getGpuVectorsEncodingSize_(numVecs); - auto cpuListSizeInBytes = getCpuVectorsEncodingSize_(numVecs); - - // We only have int32 length representations on the GPU per each - // list; the length is in sizeof(char) - FAISS_ASSERT(gpuListSizeInBytes <= (size_t)std::numeric_limits::max()); - - // Translate the codes as needed to our preferred form - std::vector codesV(cpuListSizeInBytes); - std::memcpy(codesV.data(), codes, cpuListSizeInBytes); - auto translatedCodes = translateCodesToGpu_(std::move(codesV), numVecs); - - std::cout << "numVecs=" << numVecs << "gpuListSizeInBytes=" << gpuListSizeInBytes << std::endl; + raft::print_device_vector("raft centers", raft_knn_index.value().centers().data_handle(), total_elems, std::cout); +} -// RAFT_CUDA_TRY(cudaMemcpyAsync(raft_knn_index.value().data().data_handle()+(), translatedCodes.data(), )) -// listCodes->data.append( -// translatedCodes.data(), -// gpuListSizeInBytes, -// stream, -// true /* exact reserved size */); -// listCodes->numVecs = numVecs; // -// // Handle the indices as well -// addIndicesFromCpu_(listId, indices, 
numVecs); // - - // We should problay consider using this... -// deviceListDataPointers_.setAt( -// listId, (void*)listCodes->data.data(), stream); -// deviceListLengths_.setAt(listId, (int)numVecs, stream); +//void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { +// size_t nlist = ivf ? ivf->nlist : 0; +// size_t ntotal = ivf ? ivf->compute_ntotal() : 0; // -// // We update this as well, since the multi-pass algorithm uses it -// maxListLength_ = std::max(maxListLength_, (int)numVecs); -} - - -/// Copy all inverted lists from ourselves to a CPU representation -void RaftIVFFlat::copyInvertedListsTo(InvertedLists* ivf) { - printf("Inside RaftIVFFlat copyInvertedListsTo\n"); - - // TODO: Need to replicate copyInvertedListsTo() in IVFBase.cu -} +// printf("Inside RAFT copyInvertedListsFrom\n"); +// raft::handle_t &handle = resources_->getRaftHandleCurrentDevice(); +// // We need to allocate the IVF +// printf("nlist=%ld, ntotal=%ld\n", nlist, ntotal); +// +// std::vector list_sizes_(nlist); +// std::vector list_offsets_(nlist+1); +// std::vector indices_(ntotal); +// +// raft::neighbors::ivf_flat::index_params raft_idx_params; +// raft_idx_params.n_lists = nlist; +// raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; +// raft_idx_params.add_data_on_build = false; +// raft_idx_params.kmeans_n_iters = 100; +// +// raft_knn_index.emplace(handle, raft_idx_params, dim_); +// raft_knn_index.value().allocate(handle, ntotal, true); +// +// for (size_t i = 0; i < nlist; ++i) { +// size_t listSize = ivf->list_size(i); +// +// // GPU index can only support max int entries per list +// FAISS_THROW_IF_NOT_FMT( +// listSize <= (size_t)std::numeric_limits::max(), +// "GPU inverted list can only support " +// "%zu entries; %zu found", +// (size_t)std::numeric_limits::max(), +// listSize); +// +// addEncodedVectorsToList_( +// i, ivf->get_codes(i), ivf->get_ids(i), listSize); +// } +// +// 
raft::update_device(raft_knn_index.value().list_sizes().data_handle(), list_sizes_.data(), nlist, handle.get_stream()); +// raft::update_device(raft_knn_index.value().list_offsets().data_handle(), list_offsets_.data(), nlist+1, handle.get_stream()); +// +//} + +//void RaftIVFFlat::addEncodedVectorsToList_( +// int listId, +// const void* codes, +// const Index::idx_t* indices, +// size_t numVecs) { +// auto stream = resources_->getDefaultStreamCurrentDevice(); +// +// // This list must already exist +//// FAISS_ASSERT(listId < deviceListData_.size()); +// +// // This list must currently be empty +//// auto& listCodes = deviceListData_[listId]; +//// FAISS_ASSERT(listCodes->data.size() == 0); +//// FAISS_ASSERT(listCodes->numVecs == 0); +// +// // If there's nothing to add, then there's nothing we have to do +// if (numVecs == 0) { +// return; +// } +// +// // The GPU might have a different layout of the memory +// auto gpuListSizeInBytes = getGpuVectorsEncodingSize_(numVecs); +// auto cpuListSizeInBytes = getCpuVectorsEncodingSize_(numVecs); +// +// // We only have int32 length representations on the GPU per each +// // list; the length is in sizeof(char) +// FAISS_ASSERT(gpuListSizeInBytes <= (size_t)std::numeric_limits::max()); +// +// // Translate the codes as needed to our preferred form +// std::vector codesV(cpuListSizeInBytes); +// std::memcpy(codesV.data(), codes, cpuListSizeInBytes); +// auto translatedCodes = translateCodesToGpu_(std::move(codesV), numVecs); +// +// std::cout << "numVecs=" << numVecs << "gpuListSizeInBytes=" << gpuListSizeInBytes << std::endl; +// +//// RAFT_CUDA_TRY(cudaMemcpyAsync(raft_knn_index.value().data().data_handle()+(), translatedCodes.data(), )) +// +//// listCodes->data.append( +//// translatedCodes.data(), +//// gpuListSizeInBytes, +//// stream, +//// true /* exact reserved size */); +//// listCodes->numVecs = numVecs; +//// +//// // Handle the indices as well +//// addIndicesFromCpu_(listId, indices, numVecs); +//// +// +// 
// We should problay consider using this... +//// deviceListDataPointers_.setAt( +//// listId, (void*)listCodes->data.data(), stream); +//// deviceListLengths_.setAt(listId, (int)numVecs, stream); +//// +//// // We update this as well, since the multi-pass algorithm uses it +//// maxListLength_ = std::max(maxListLength_, (int)numVecs); +//} + + +///// Copy all inverted lists from ourselves to a CPU representation +//void RaftIVFFlat::copyInvertedListsTo(InvertedLists* ivf) { +// printf("Inside RaftIVFFlat copyInvertedListsTo\n"); +// +// // TODO: Need to replicate copyInvertedListsTo() in IVFBase.cu +//} } // namespace gpu diff --git a/faiss/gpu/impl/RaftIVFFlat.cuh b/faiss/gpu/impl/RaftIVFFlat.cuh index 0bee282d26..298a9370c9 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cuh +++ b/faiss/gpu/impl/RaftIVFFlat.cuh @@ -80,23 +80,26 @@ class RaftIVFFlat : public IVFFlat { /// Return the encoded vectors of a particular list back to the CPU std::vector getListVectorData(int listId, bool gpuFormat) const override; - /// Copy all inverted lists from a CPU representation to ourselves - void copyInvertedListsFrom(const InvertedLists* ivf) override; + void updateQuantizer(Index* quantizer) override; - /// Copy all inverted lists from ourselves to a CPU representation - void copyInvertedListsTo(InvertedLists* ivf) override; +// +// /// Copy all inverted lists from a CPU representation to ourselves +// void copyInvertedListsFrom(const InvertedLists* ivf) override; +// +// /// Copy all inverted lists from ourselves to a CPU representation +// void copyInvertedListsTo(InvertedLists* ivf) override; protected: - /// Adds a set of codes and indices to a list, with the representation - /// coming from the CPU equivalent - void addEncodedVectorsToList_( - int listId, - // resident on the host - const void* codes, - // resident on the host - const Index::idx_t* indices, - size_t numVecs) override; +// /// Adds a set of codes and indices to a list, with the representation +// /// coming from 
the CPU equivalent +// void addEncodedVectorsToList_( +// int listId, +// // resident on the host +// const void* codes, +// // resident on the host +// const Index::idx_t* indices, +// size_t numVecs) override; std::optional> raft_knn_index{std::nullopt}; From 3f51425926c866bcb16c45c2ffa0faf500027b4b Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 15 Nov 2022 14:11:17 -0500 Subject: [PATCH 40/87] IVFFlat gtests run through to completion without crash. Distances look very similar. It's possible indexes are being assigned incorrectly. --- faiss/gpu/GpuIndexIVFFlat.cu | 33 ++++++++++++++++----------------- faiss/gpu/impl/RaftIVFFlat.cu | 10 ++++------ 2 files changed, 20 insertions(+), 23 deletions(-) diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 676d4e376d..7d65ef5827 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -114,7 +114,6 @@ void GpuIndexIVFFlat::set_index_(GpuResources* resources, baseIndex_ = std::static_pointer_cast(index_); updateQuantizer(); - } void GpuIndexIVFFlat::reserveMemory(size_t numVecs) { @@ -162,26 +161,26 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { if(config_.use_raft) { - if(index->quantizer->ntotal > 0) { - auto stream = resources_->getRaftHandleCurrentDevice().get_stream(); - auto total_elems = size_t(index->quantizer->ntotal) * size_t(index->quantizer->d); - -// raft_knn_index.emplace(raft_handle, pams.metric, (uint32_t)this->nlist, (uint32_t)this->d); - - // Copy (reconstructed) centroids over, rather than re-training - std::vector buf_host(total_elems); - rmm::device_uvector buf_device(total_elems, stream); - index->quantizer->reconstruct_n(0, index->quantizer->ntotal, buf_host.data()); - raft::copy(buf_device.data(), buf_host.data(), total_elems, stream); - - printf("Calling train!\n"); - train(total_elems, buf_device.data()); - } +// if(index->quantizer->ntotal > 0) { +// auto stream = 
resources_->getRaftHandleCurrentDevice().get_stream(); +// auto total_elems = size_t(index->quantizer->ntotal) * size_t(index->quantizer->d); +// +// // Copy (reconstructed) centroids over, rather than re-training +// std::vector buf_host(total_elems); +// rmm::device_uvector buf_device(total_elems, stream); +// index->quantizer->reconstruct_n(0, index->quantizer->ntotal, buf_host.data()); +// raft::copy(buf_device.data(), buf_host.data(), total_elems, stream); +// +// printf("Calling train!\n"); +// train(total_elems, buf_device.data()); +// } if(index->ntotal > 0) { - std::vector buf_host(index->ntotal); + printf("Reconstructing %d original vectors and adding to GPU index\n", index->ntotal); + std::vector buf_host(index->ntotal * index->d); index->reconstruct_n(0, index->ntotal, buf_host.data()); printf("Done reconstructing... %d\n", index->ntotal); + add(index->ntotal, buf_host.data()); } } else { // Copy all of the IVF data diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 103f56bc4c..02450ea441 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -243,13 +243,11 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { // Copy (reconstructed) centroids over, rather than re-training rmm::device_uvector buf_dev(total_elems, stream); - { - std::vector buf_host(total_elems); - quantizer->reconstruct_n(0, quantizer_ntotal, buf_host.data()); - raft::copy(raft_knn_index.value().centers().data_handle(), buf_host.data(), total_elems, stream); - } + std::vector buf_host(total_elems); + quantizer->reconstruct_n(0, quantizer_ntotal, buf_host.data()); + raft::copy(raft_knn_index.value().centers().data_handle(), buf_host.data(), total_elems, stream); - raft::print_device_vector("raft centers", raft_knn_index.value().centers().data_handle(), total_elems, std::cout); + raft::print_device_vector("raft centers", raft_knn_index.value().centers().data_handle(), this->dim_, std::cout); } From 
db1801e50f995d90ce49eb636d453c5a78818262 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 15 Nov 2022 17:09:02 -0500 Subject: [PATCH 41/87] Some of the IVFFlat tests are passing. --- cmake/thirdparty/get_raft.cmake | 4 ++-- faiss/gpu/GpuIndexIVFFlat.cu | 16 +--------------- faiss/gpu/impl/RaftIVFFlat.cu | 8 ++++++-- faiss/gpu/impl/RaftIndexIVFFlat.cu | 2 +- 4 files changed, 10 insertions(+), 20 deletions(-) diff --git a/cmake/thirdparty/get_raft.cmake b/cmake/thirdparty/get_raft.cmake index 2b7825d193..a7ef8410da 100644 --- a/cmake/thirdparty/get_raft.cmake +++ b/cmake/thirdparty/get_raft.cmake @@ -16,8 +16,8 @@ set(RAFT_VERSION "${RAPIDS_VERSION}") -set(RAFT_FORK "cjnolet") -set(RAFT_PINNED_TAG "bug-2212-ivf_flat_apis") +set(RAFT_FORK "achirkin") +set(RAFT_PINNED_TAG "fea-ivf-flat-optional-adaptive-centers") function(find_and_configure_raft) set(oneValueArgs VERSION FORK PINNED_TAG) diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 7d65ef5827..fb9849edb9 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -161,25 +161,11 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { if(config_.use_raft) { -// if(index->quantizer->ntotal > 0) { -// auto stream = resources_->getRaftHandleCurrentDevice().get_stream(); -// auto total_elems = size_t(index->quantizer->ntotal) * size_t(index->quantizer->d); -// -// // Copy (reconstructed) centroids over, rather than re-training -// std::vector buf_host(total_elems); -// rmm::device_uvector buf_device(total_elems, stream); -// index->quantizer->reconstruct_n(0, index->quantizer->ntotal, buf_host.data()); -// raft::copy(buf_device.data(), buf_host.data(), total_elems, stream); -// -// printf("Calling train!\n"); -// train(total_elems, buf_device.data()); -// } - + // Quantizer should already have been updated above. 
Add reconstructed vectors to raft index if(index->ntotal > 0) { printf("Reconstructing %d original vectors and adding to GPU index\n", index->ntotal); std::vector buf_host(index->ntotal * index->d); index->reconstruct_n(0, index->ntotal, buf_host.data()); - printf("Done reconstructing... %d\n", index->ntotal); add(index->ntotal, buf_host.data()); } } else { diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 02450ea441..be40a65ca6 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -239,13 +239,17 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { FAISS_THROW_MSG("Metric is not supported."); } - raft_knn_index.emplace(resources_->getRaftHandleCurrentDevice(), pams.metric, (uint32_t)this->numLists_, (uint32_t)this->dim_); + raft_knn_index.emplace(resources_->getRaftHandleCurrentDevice(), pams.metric, (uint32_t)this->numLists_, false, (uint32_t)this->dim_); + printf("Reconstructing\n"); // Copy (reconstructed) centroids over, rather than re-training rmm::device_uvector buf_dev(total_elems, stream); std::vector buf_host(total_elems); quantizer->reconstruct_n(0, quantizer_ntotal, buf_host.data()); - raft::copy(raft_knn_index.value().centers().data_handle(), buf_host.data(), total_elems, stream); + + printf("Copying...\n"); + + raft::update_device(raft_knn_index.value().centers().data_handle(), buf_host.data(), total_elems, stream); raft::print_device_vector("raft centers", raft_knn_index.value().centers().data_handle(), this->dim_, std::cout); } diff --git a/faiss/gpu/impl/RaftIndexIVFFlat.cu b/faiss/gpu/impl/RaftIndexIVFFlat.cu index 88c5629e71..58d10fcc63 100644 --- a/faiss/gpu/impl/RaftIndexIVFFlat.cu +++ b/faiss/gpu/impl/RaftIndexIVFFlat.cu @@ -126,7 +126,7 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { FAISS_THROW_MSG("Metric is not supported."); } - raft_knn_index.emplace(raft_handle, pams.metric, (uint32_t)this->nlist, (uint32_t)this->d); + 
raft_knn_index.emplace(raft_handle, pams.metric, false, (uint32_t)this->nlist, (uint32_t)this->d); // Copy (reconstructed) centroids over, rather than re-training rmm::device_uvector buf_dev(total_elems, stream); From f0bbd41fc25d09a8298a397c360cf1c1808bf6ae Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 16 Nov 2022 10:39:11 -0500 Subject: [PATCH 42/87] CLeaning up the diff a bit --- faiss/gpu/CMakeLists.txt | 2 - faiss/gpu/GpuDistance.cu | 85 ++- faiss/gpu/test/CMakeLists.txt | 5 - faiss/gpu/test/TestRaftIndexIVFFlat.cpp | 704 ------------------------ 4 files changed, 32 insertions(+), 764 deletions(-) delete mode 100644 faiss/gpu/test/TestRaftIndexIVFFlat.cpp diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt index 0e82af813c..1d81848317 100644 --- a/faiss/gpu/CMakeLists.txt +++ b/faiss/gpu/CMakeLists.txt @@ -166,11 +166,9 @@ set(FAISS_GPU_HEADERS if(FAISS_ENABLE_RAFT) list(APPEND FAISS_GPU_HEADERS - impl/RaftIndexIVFFlat.h impl/RaftFlatIndex.cuh impl/RaftIVFFlat.cuh) list(APPEND FAISS_GPU_SRC - impl/RaftIndexIVFFlat.cu impl/RaftFlatIndex.cu impl/RaftIVFFlat.cu) endif() diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu index ba1056f04a..8dca4043a6 100644 --- a/faiss/gpu/GpuDistance.cu +++ b/faiss/gpu/GpuDistance.cu @@ -14,11 +14,6 @@ #include #include -#ifdef FAISS_ENABLE_RAFT -// TODO: Expose fused_l2_knn -#include -#endif - namespace faiss { namespace gpu { @@ -107,31 +102,21 @@ void bfKnnConvert(GpuResourcesProvider* prov, const GpuDistanceParams& args) { // Since we've guaranteed that all arguments are on device, call the // implementation - -#ifdef FAISS_ENABLE_RAFT - // TODO: When k <= 64, invoke bfknn from RAFT - if (args.k <= 64) { - - } else -#endif - - { - bfKnnOnDevice( - res, - device, - stream, - tVectors, - args.vectorsRowMajor, - args.vectorNorms ? 
&tVectorNorms : nullptr, - tQueries, - args.queriesRowMajor, - args.k, - args.metric, - args.metricArg, - tOutDistances, - tOutIntIndices, - args.ignoreOutDistances); - } + bfKnnOnDevice( + res, + device, + stream, + tVectors, + args.vectorsRowMajor, + args.vectorNorms ? &tVectorNorms : nullptr, + tQueries, + args.queriesRowMajor, + args.k, + args.metric, + args.metricArg, + tOutDistances, + tOutIntIndices, + args.ignoreOutDistances); // Convert and copy int indices out auto tOutIndices = toDeviceTemporary( res, @@ -160,29 +145,23 @@ void bfKnnConvert(GpuResourcesProvider* prov, const GpuDistanceParams& args) { stream, {args.numQueries, args.k}); -#if defined FAISS_ENABLE_RAFT - if (args.k <= 64) { - } else -#endif - { - // Since we've guaranteed that all arguments are on device, call the - // implementation - bfKnnOnDevice( - res, - device, - stream, - tVectors, - args.vectorsRowMajor, - args.vectorNorms ? &tVectorNorms : nullptr, - tQueries, - args.queriesRowMajor, - args.k, - args.metric, - args.metricArg, - tOutDistances, - tOutIntIndices, - args.ignoreOutDistances); - } + // Since we've guaranteed that all arguments are on device, call the + // implementation + bfKnnOnDevice( + res, + device, + stream, + tVectors, + args.vectorsRowMajor, + args.vectorNorms ? 
&tVectorNorms : nullptr, + tQueries, + args.queriesRowMajor, + args.k, + args.metric, + args.metricArg, + tOutDistances, + tOutIntIndices, + args.ignoreOutDistances); // Copy back if necessary fromDevice(tOutIntIndices, (int*)args.outIndices, stream); diff --git a/faiss/gpu/test/CMakeLists.txt b/faiss/gpu/test/CMakeLists.txt index 3eb454c95f..251c501bea 100644 --- a/faiss/gpu/test/CMakeLists.txt +++ b/faiss/gpu/test/CMakeLists.txt @@ -29,11 +29,6 @@ faiss_gpu_test(TestGpuIndexIVFScalarQuantizer.cpp) faiss_gpu_test(TestGpuDistance.cu) faiss_gpu_test(TestGpuSelect.cu) - -if(FAISS_ENABLE_RAFT) - faiss_gpu_test(TestRaftIndexIVFFlat.cpp) -endif() - add_executable(demo_ivfpq_indexing_gpu EXCLUDE_FROM_ALL demo_ivfpq_indexing_gpu.cpp) diff --git a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp deleted file mode 100644 index 615ac01fe4..0000000000 --- a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp +++ /dev/null @@ -1,704 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -// FIXME: figure out a better way to test fp16 -constexpr float kF16MaxRelErr = 0.3f; -constexpr float kF32MaxRelErr = 0.03f; - -struct Options { - Options() { - numAdd = 2 * faiss::gpu::randVal(50000, 70000); - dim = faiss::gpu::randVal(64, 200); - - numCentroids = std::sqrt((float)numAdd / 2); - numTrain = numCentroids * 50; - nprobe = faiss::gpu::randVal(std::min(10, numCentroids), numCentroids); - numQuery = numAdd / 10;//faiss::gpu::randVal(32, 100); - - // Due to the approximate nature of the query and of floating point - // differences between GPU and CPU, to stay within our error bounds, - // only use a small k - k = std::min(faiss::gpu::randVal(10, 30), numAdd / 40); - indicesOpt = faiss::gpu::randSelect( - {faiss::gpu::INDICES_CPU, - faiss::gpu::INDICES_32_BIT, - faiss::gpu::INDICES_64_BIT}); - - device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - } - - std::string toString() const { - std::stringstream str; - str << "IVFFlat device " << device << " numVecs " << numAdd << " dim " - << dim << " numCentroids " << numCentroids << " nprobe " << nprobe - << " numQuery " << numQuery << " k " << k << " indicesOpt " - << indicesOpt; - - return str.str(); - } - - int numAdd; - int dim; - int numCentroids; - int numTrain; - int nprobe; - int numQuery; - int k; - int device; - faiss::gpu::IndicesOptions indicesOpt; -}; - -template -void train_index(const raft::handle_t &raft_handle, Options &opt, idx_type &index, std::vector &trainVecs, std::vector &addVecs) { - - uint32_t train_start = raft::curTimeMillis(); - index.train(opt.numTrain, trainVecs.data()); - raft_handle.sync_stream(); - uint32_t train_stop = raft::curTimeMillis(); - - uint32_t add_start = raft::curTimeMillis(); - index.add(opt.numAdd, addVecs.data()); - raft_handle.sync_stream(); - uint32_t add_stop = raft::curTimeMillis(); -// 
index.train(opt.numTrain, trainVecs.data()); - index.setNumProbes(opt.nprobe); - - std::cout << "train=" << (train_stop - train_start) << ", add=" << (add_stop - add_start) << std::endl; -} - - -void invoke_bfknn(const raft::handle_t &raft_handle, Options &opt, float *dists, faiss::Index::idx_t *inds, faiss::MetricType m, - std::vector &addVecs, std::vector &queryVecs) { - - - - faiss::gpu::StandardGpuResources gpu_res; - gpu_res.setDefaultStream(opt.device, raft_handle.get_stream()); - - rmm::device_uvector addVecsDev(addVecs.size(), raft_handle.get_stream()); - raft::copy(addVecsDev.data(), addVecs.data(), addVecs.size(), raft_handle.get_stream()); - - rmm::device_uvector queryVecsDev(queryVecs.size(), raft_handle.get_stream()); - raft::copy(queryVecsDev.data(), queryVecs.data(), queryVecs.size(), raft_handle.get_stream()); - - faiss::gpu::GpuDistanceParams args; - args.metric = m; - args.k = opt.k; - args.dims = opt.dim; - args.vectors = addVecs.data(); - args.vectorsRowMajor = true; - args.numVectors = opt.numAdd; - args.queries = queryVecs.data(); - args.queriesRowMajor = true; - args.numQueries = opt.numQuery; - args.outDistances = dists; - args.outIndices = inds; - args.outIndicesType = faiss::gpu::IndicesDataType::I64; - - /** - * @todo: Until FAISS supports pluggable allocation strategies, - * we will not reap the benefits of the pool allocator for - * avoiding device-wide synchronizations from cudaMalloc/cudaFree - */ - bfKnn(&gpu_res, args); -} - -void queryTest( - faiss::MetricType metricType, - bool useFloat16CoarseQuantizer, - int dimOverride = -1) { - for (int tries = 0; tries < 2; ++tries) { - Options opt; - opt.dim = dimOverride != -1 ? 
dimOverride : opt.dim; - - std::vector trainVecs = - faiss::gpu::randVecs(opt.numTrain, opt.dim); - std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - - std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); - - std::cout << "numTrain: " << opt.numTrain << "numCentroids: " << opt.numCentroids << std::endl; - - printf("Creating rmm resources\n"); - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); - - faiss::gpu::GpuIndexIVFFlatConfig config; - config.device = opt.device; - config.indicesOptions = opt.indicesOpt; - config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - - // TODO: Since we are modifying the centroids when adding new vectors, - // the neighbors are no longer going to match completely between CPU - // and the RAFT indexes. We will probably want to perform a bfknn as - // ground truth and then compare the recall for both the RAFT and FAISS - // indices. - - printf("Building raft index\n"); - faiss::gpu::RaftIndexIVFFlat raftIndex( - &res, opt.dim, opt.numCentroids, metricType, config); - - printf("Done.\n"); - - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, opt.dim, opt.numCentroids, metricType, config); - - - printf("Creating raft handle\n"); - raft::handle_t raft_handle; - printf("Done\n"); - - std::cout << "Training raft index" << std::endl; - uint32_t r_train_start = raft::curTimeMillis(); - train_index(raft_handle, opt, raftIndex, trainVecs, addVecs); - raft_handle.sync_stream(); - uint32_t r_train_stop = raft::curTimeMillis(); - std::cout << "Raft train time " << r_train_start << " " << r_train_stop << " " << (r_train_stop - r_train_start) << std::endl; - - std::cout << "Training gpu index" << std::endl; - uint32_t g_train_start = raft::curTimeMillis(); - train_index(raft_handle, opt, gpuIndex, trainVecs, addVecs); - raft_handle.sync_stream(); - uint32_t g_train_stop = raft::curTimeMillis(); - std::cout << "FAISS train time " << g_train_start << " " << g_train_stop << " " << (g_train_stop - 
g_train_start) << std::endl; - - std::cout << "Computing ground truth" << std::endl; - rmm::device_uvector ref_inds(opt.numQuery * opt.k, raft_handle.get_stream()); - rmm::device_uvector ref_dists(opt.numQuery * opt.k, raft_handle.get_stream()); - - invoke_bfknn(raft_handle, opt, ref_dists.data(), ref_inds.data(), metricType, addVecs, queryVecs); - - std::cout << "Done." << std::endl; - raft::print_device_vector("ref_dists", ref_dists.data(), opt.k, std::cout); - raft::print_device_vector("ref_inds", ref_inds.data(), opt.k, std::cout); - - rmm::device_uvector raft_inds(opt.numQuery * opt.k, raft_handle.get_stream()); - rmm::device_uvector raft_dists(opt.numQuery * opt.k, raft_handle.get_stream()); - - uint32_t rstart = raft::curTimeMillis(); - raftIndex.search( - opt.numQuery, - queryVecs.data(), - opt.k, - raft_dists.data(), - raft_inds.data()); - - raft_handle.sync_stream(); - uint32_t rstop = raft::curTimeMillis(); - std::cout << "Raft query time " << rstart << " " << rstop << " " << (rstop - rstart) << std::endl; - - rmm::device_uvector gpu_inds(opt.numQuery * opt.k, raft_handle.get_stream()); - rmm::device_uvector gpu_dists(opt.numQuery * opt.k, raft_handle.get_stream()); - - uint32_t gstart = raft::curTimeMillis(); - gpuIndex.search( - opt.numQuery, - queryVecs.data(), - opt.k, - gpu_dists.data(), - gpu_inds.data()); - - raft_handle.sync_stream(); - uint32_t gstop = raft::curTimeMillis(); - - std::cout << "FAISS query time " << gstart << " " << gstop << " " << (gstop - gstart) << std::endl; - - // TODO: Compare recall, perhaps by adding the indices/distances to a hashmap. 
- - raft::print_device_vector("raft_dists", raft_dists.data(), opt.k, std::cout); - raft::print_device_vector("raft_inds", raft_inds.data(), opt.k, std::cout); - -// raft::print_device_vector("gpu_dists", gpu_dists.data(), opt.k, std::cout); -// raft::print_device_vector("gpu_inds", gpu_inds.data(), opt.k, std::cout); - -// -// bool compFloat16 = useFloat16CoarseQuantizer; -// faiss::gpu::compareIndices( -// cpuIndex, -// gpuIndex, -// opt.numQuery, -// opt.dim, -// opt.k, -// opt.toString(), -// compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, -// // FIXME: the fp16 bounds are -// // useless when math (the accumulator) is -// // in fp16. Figure out another way to test -// compFloat16 ? 0.70f : 0.1f, -// compFloat16 ? 0.65f : 0.015f); - } -} - -void addTest(faiss::MetricType metricType, bool useFloat16CoarseQuantizer) { - for (int tries = 0; tries < 2; ++tries) { - Options opt; - - std::vector trainVecs = - faiss::gpu::randVecs(opt.numTrain, opt.dim); - std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - - faiss::IndexFlatL2 quantizerL2(opt.dim); - faiss::IndexFlatIP quantizerIP(opt.dim); - faiss::Index* quantizer = metricType == faiss::METRIC_L2 - ? 
(faiss::Index*)&quantizerL2 - : (faiss::Index*)&quantizerIP; - - faiss::IndexIVFFlat cpuIndex( - quantizer, opt.dim, opt.numCentroids, metricType); - cpuIndex.train(opt.numTrain, trainVecs.data()); - cpuIndex.nprobe = opt.nprobe; - - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); - - faiss::gpu::GpuIndexIVFFlatConfig config; - config.device = opt.device; - config.indicesOptions = opt.indicesOpt; - config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - - faiss::gpu::RaftIndexIVFFlat gpuIndex( - &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); - gpuIndex.copyFrom(&cpuIndex); - gpuIndex.setNumProbes(opt.nprobe); - - cpuIndex.add(opt.numAdd, addVecs.data()); - gpuIndex.add(opt.numAdd, addVecs.data()); - - bool compFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - compFloat16 ? 0.70f : 0.1f, - compFloat16 ? 0.30f : 0.015f); - } -} - -void copyToTest(bool useFloat16CoarseQuantizer) { - Options opt; - std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); - std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); - - faiss::gpu::GpuIndexIVFFlatConfig config; - config.device = opt.device; - config.indicesOptions = opt.indicesOpt; - config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - - faiss::gpu::RaftIndexIVFFlat gpuIndex( - &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); - gpuIndex.train(opt.numTrain, trainVecs.data()); - gpuIndex.add(opt.numAdd, addVecs.data()); - gpuIndex.setNumProbes(opt.nprobe); - - // use garbage values to see if we overwrite then - faiss::IndexFlatL2 cpuQuantizer(1); - faiss::IndexIVFFlat cpuIndex(&cpuQuantizer, 1, 1, faiss::METRIC_L2); - cpuIndex.nprobe = 1; - - gpuIndex.copyTo(&cpuIndex); - - EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); - EXPECT_EQ(gpuIndex.ntotal, 
opt.numAdd); - - EXPECT_EQ(cpuIndex.d, gpuIndex.d); - EXPECT_EQ(cpuIndex.quantizer->d, gpuIndex.quantizer->d); - EXPECT_EQ(cpuIndex.d, opt.dim); - EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); - EXPECT_EQ(cpuIndex.nprobe, gpuIndex.getNumProbes()); - - testIVFEquality(cpuIndex, gpuIndex); - - // Query both objects; results should be equivalent - bool compFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - compFloat16 ? 0.70f : 0.1f, - compFloat16 ? 0.30f : 0.015f); -} - -void copyFromTest(bool useFloat16CoarseQuantizer) { - Options opt; - std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); - std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - - faiss::IndexFlatL2 cpuQuantizer(opt.dim); - faiss::IndexIVFFlat cpuIndex( - &cpuQuantizer, opt.dim, opt.numCentroids, faiss::METRIC_L2); - cpuIndex.nprobe = opt.nprobe; - cpuIndex.train(opt.numTrain, trainVecs.data()); - cpuIndex.add(opt.numAdd, addVecs.data()); - - // use garbage values to see if we overwrite then - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); - - faiss::gpu::GpuIndexIVFFlatConfig config; - config.device = opt.device; - config.indicesOptions = opt.indicesOpt; - config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - - faiss::gpu::RaftIndexIVFFlat gpuIndex(&res, 1, 1, faiss::METRIC_L2, config); - gpuIndex.setNumProbes(1); - - gpuIndex.copyFrom(&cpuIndex); - - EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); - EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); - - EXPECT_EQ(cpuIndex.d, gpuIndex.d); - EXPECT_EQ(cpuIndex.d, opt.dim); - EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); - EXPECT_EQ(cpuIndex.nprobe, gpuIndex.getNumProbes()); - - testIVFEquality(cpuIndex, gpuIndex); - - // Query both objects; results should be equivalent - bool compFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::compareIndices( - cpuIndex, - 
gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - compFloat16 ? 0.70f : 0.1f, - compFloat16 ? 0.30f : 0.015f); -} - -//TEST(TestRaftIndexIVFFlat, Float32_32_Add_L2) { -// addTest(faiss::METRIC_L2, false); -// printf("Finished addTest(faiss::METRIC_L2, false)\n"); -//} -// -//TEST(TestRaftIndexIVFFlat, Float32_32_Add_IP) { -// addTest(faiss::METRIC_INNER_PRODUCT, false); -// printf("Finished addTest(faiss::METRIC_INNER_PRODUCT, false)\n"); -//} -// -//TEST(TestRaftIndexIVFFlat, Float16_32_Add_L2) { -// addTest(faiss::METRIC_L2, true); -// printf("Finished addTest(faiss::METRIC_L2, true)\n"); -//} -// -//TEST(TestRaftIndexIVFFlat, Float16_32_Add_IP) { -// addTest(faiss::METRIC_INNER_PRODUCT, true); -// printf("Finished addTest(faiss::METRIC_INNER_PRODUCT, true)\n"); -//} - -// -// General query tests -// - -TEST(TestRaftIndexIVFFlat, Float32_Query_L2) { - queryTest(faiss::METRIC_L2, false); - printf("Finished queryTest(faiss::METRIC_L2, false);\n"); -} - -//TEST(TestRaftIndexIVFFlat, Float32_Query_IP) { -// queryTest(faiss::METRIC_INNER_PRODUCT, false); -// printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false)\n"); -//} - -// float16 coarse quantizer - -TEST(TestRaftIndexIVFFlat, Float16_32_Query_L2) { - queryTest(faiss::METRIC_L2, true); - printf("Finished queryTest(faiss::METRIC_L2, true)\n"); -} - -//TEST(TestRaftIndexIVFFlat, Float16_32_Query_IP) { -// queryTest(faiss::METRIC_INNER_PRODUCT, true); -// printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, true)\n"); -//} - -// -// There are IVF list scanning specializations for 64-d and 128-d that we -// make sure we explicitly test here -// - -TEST(TestRaftIndexIVFFlat, Float32_Query_L2_64) { - queryTest(faiss::METRIC_L2, false, 64); - printf("Finished queryTest(faiss::METRIC_L2, false, 64)\n"); -} - -//TEST(TestRaftIndexIVFFlat, Float32_Query_IP_64) { -// queryTest(faiss::METRIC_INNER_PRODUCT, false, 64); -// printf("Finished 
queryTest(faiss::METRIC_INNER_PRODUCT, false, 64)\n"); -//} - -TEST(TestRaftIndexIVFFlat, Float32_Query_L2_128) { - queryTest(faiss::METRIC_L2, false, 128); - printf("Finished queryTest(faiss::METRIC_L2, false, 128)\n"); -} - -//TEST(TestRaftIndexIVFFlat, Float32_Query_IP_128) { -// queryTest(faiss::METRIC_INNER_PRODUCT, false, 128); -// printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false, 128)\n"); -//} - -// -// Copy tests -// - -/** TODO: test crashes */ -// TEST(TestRaftIndexIVFFlat, Float32_32_CopyTo) { -// copyToTest(false); -// printf("Finished copyToTest(false)\n"); -// } - -//TEST(TestRaftIndexIVFFlat, Float32_32_CopyFrom) { -// copyFromTest(false); -// printf("Finished copyFromTest(false)\n"); -//} - -//TEST(TestRaftIndexIVFFlat, Float32_negative) { -// Options opt; -// -// auto trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); -// auto addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); -// -// // Put all vecs on negative side -// for (auto& f : trainVecs) { -// f = std::abs(f) * -1.0f; -// } -// -// for (auto& f : addVecs) { -// f *= std::abs(f) * -1.0f; -// } -// -// faiss::IndexFlatIP quantizerIP(opt.dim); -// faiss::Index* quantizer = (faiss::Index*)&quantizerIP; -// -// faiss::IndexIVFFlat cpuIndex( -// quantizer, opt.dim, opt.numCentroids, faiss::METRIC_INNER_PRODUCT); -// cpuIndex.train(opt.numTrain, trainVecs.data()); -// cpuIndex.add(opt.numAdd, addVecs.data()); -// cpuIndex.nprobe = opt.nprobe; -// -// faiss::gpu::StandardGpuResources res; -// res.noTempMemory(); -// -// faiss::gpu::GpuIndexIVFFlatConfig config; -// config.device = opt.device; -// config.indicesOptions = opt.indicesOpt; -// -// faiss::gpu::RaftIndexIVFFlat gpuIndex( -// &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); -// gpuIndex.copyFrom(&cpuIndex); -// gpuIndex.setNumProbes(opt.nprobe); -// -// // Construct a positive test set -// auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); -// -// // Put all vecs on positive size -// for 
(auto& f : queryVecs) { -// f = std::abs(f); -// } -// -// bool compFloat16 = false; -// faiss::gpu::compareIndices( -// queryVecs, -// cpuIndex, -// gpuIndex, -// opt.numQuery, -// opt.dim, -// opt.k, -// opt.toString(), -// compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, -// // FIXME: the fp16 bounds are -// // useless when math (the accumulator) is -// // in fp16. Figure out another way to test -// compFloat16 ? 0.99f : 0.1f, -// compFloat16 ? 0.65f : 0.015f); -//} - -// -// NaN tests -// - -/** TODO: test crashes */ -// TEST(TestRaftIndexIVFFlat, QueryNaN) { -// Options opt; - -// std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, -// opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, -// opt.dim); - -// faiss::gpu::StandardGpuResources res; -// res.noTempMemory(); - -// faiss::gpu::GpuIndexIVFFlatConfig config; -// config.device = opt.device; -// config.indicesOptions = opt.indicesOpt; -// config.flatConfig.useFloat16 = faiss::gpu::randBool(); - -// faiss::gpu::RaftIndexIVFFlat gpuIndex( -// &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); -// gpuIndex.setNumProbes(opt.nprobe); - -// gpuIndex.train(opt.numTrain, trainVecs.data()); -// gpuIndex.add(opt.numAdd, addVecs.data()); - -// int numQuery = 10; -// std::vector nans( -// numQuery * opt.dim, std::numeric_limits::quiet_NaN()); - -// std::vector distances(numQuery * opt.k, 0); -// std::vector indices(numQuery * opt.k, 0); - -// gpuIndex.search( -// numQuery, nans.data(), opt.k, distances.data(), indices.data()); - -// for (int q = 0; q < numQuery; ++q) { -// for (int k = 0; k < opt.k; ++k) { -// EXPECT_EQ(indices[q * opt.k + k], -1); -// EXPECT_EQ( -// distances[q * opt.k + k], -// std::numeric_limits::max()); -// } -// } -// } - -/** TODO: test crashes */ -// TEST(TestRaftIndexIVFFlat, AddNaN) { -// Options opt; - -// faiss::gpu::StandardGpuResources res; -// res.noTempMemory(); - -// faiss::gpu::GpuIndexIVFFlatConfig config; -// config.device = opt.device; -// 
config.indicesOptions = opt.indicesOpt; -// config.flatConfig.useFloat16 = faiss::gpu::randBool(); - -// faiss::gpu::RaftIndexIVFFlat gpuIndex( -// &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); -// gpuIndex.setNumProbes(opt.nprobe); - -// int numNans = 10; -// std::vector nans( -// numNans * opt.dim, std::numeric_limits::quiet_NaN()); - -// // Make one vector valid (not the first vector, in order to test offset -// // issues), which should actually add -// for (int i = 0; i < opt.dim; ++i) { -// nans[opt.dim + i] = i; -// } - -// std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, -// opt.dim); gpuIndex.train(opt.numTrain, trainVecs.data()); - -// // should not crash -// EXPECT_EQ(gpuIndex.ntotal, 0); -// gpuIndex.add(numNans, nans.data()); - -// std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, -// opt.dim); std::vector distance(opt.numQuery * opt.k, 0); -// std::vector indices(opt.numQuery * opt.k, 0); - -// // should not crash -// gpuIndex.search( -// opt.numQuery, -// queryVecs.data(), -// opt.k, -// distance.data(), -// indices.data()); -// } - -//TEST(TestRaftIndexIVFFlat, UnifiedMemory) { -// // Construct on a random device to test multi-device, if we have -// // multiple devices -// int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); -// -// if (!faiss::gpu::getFullUnifiedMemSupport(device)) { -// return; -// } -// -// int dim = 128; -// -// int numCentroids = 256; -// // Unfortunately it would take forever to add 24 GB in IVFPQ data, -// // so just perform a small test with data allocated in the unified -// // memory address space -// size_t numAdd = 10000; -// size_t numTrain = numCentroids * 40; -// int numQuery = 10; -// int k = 10; -// int nprobe = 8; -// -// std::vector trainVecs = faiss::gpu::randVecs(numTrain, dim); -// std::vector addVecs = faiss::gpu::randVecs(numAdd, dim); -// -// faiss::IndexFlatL2 quantizer(dim); -// faiss::IndexIVFFlat cpuIndex( -// &quantizer, dim, numCentroids, 
faiss::METRIC_L2); -// -// cpuIndex.train(numTrain, trainVecs.data()); -// cpuIndex.add(numAdd, addVecs.data()); -// cpuIndex.nprobe = nprobe; -// -// faiss::gpu::StandardGpuResources res; -// res.noTempMemory(); -// -// faiss::gpu::GpuIndexIVFFlatConfig config; -// config.device = device; -// config.memorySpace = faiss::gpu::MemorySpace::Unified; -// -// faiss::gpu::RaftIndexIVFFlat gpuIndex( -// &res, dim, numCentroids, faiss::METRIC_L2, config); -// gpuIndex.copyFrom(&cpuIndex); -// gpuIndex.setNumProbes(nprobe); -// -// faiss::gpu::compareIndices( -// cpuIndex, -// gpuIndex, -// numQuery, -// dim, -// k, -// "Unified Memory", -// kF32MaxRelErr, -// 0.1f, -// 0.015f); -//} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - - // just run with a fixed test seed - faiss::gpu::setTestSeed(100); - - return RUN_ALL_TESTS(); -} From f7da008bda856d50537472ee8a35223e0644d873 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 16 Nov 2022 12:27:26 -0500 Subject: [PATCH 43/87] Removing the RaftIndex* files. --- faiss/gpu/impl/RaftIndexIVFFlat.cu | 378 ----------------------------- faiss/gpu/impl/RaftIndexIVFFlat.h | 112 --------- 2 files changed, 490 deletions(-) delete mode 100644 faiss/gpu/impl/RaftIndexIVFFlat.cu delete mode 100644 faiss/gpu/impl/RaftIndexIVFFlat.h diff --git a/faiss/gpu/impl/RaftIndexIVFFlat.cu b/faiss/gpu/impl/RaftIndexIVFFlat.cu deleted file mode 100644 index 58d10fcc63..0000000000 --- a/faiss/gpu/impl/RaftIndexIVFFlat.cu +++ /dev/null @@ -1,378 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include // for SearchParametersIVF -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -namespace faiss { -namespace gpu { - -RaftIndexIVFFlat::RaftIndexIVFFlat( - GpuResourcesProvider* provider, - const faiss::IndexIVFFlat* index, - GpuIndexIVFFlatConfig config) - : GpuIndexIVFFlat(provider, index, config), - raft_handle(resources_->getDefaultStream(config_.device)) { - copyFrom(index); -} - -RaftIndexIVFFlat::RaftIndexIVFFlat( - GpuResourcesProvider* provider, - int dims, - int nlist, - faiss::MetricType metric, - GpuIndexIVFFlatConfig config) - : GpuIndexIVFFlat(provider, dims, nlist, metric, config), - raft_handle(resources_->getDefaultStream(config_.device)) { - - std::cout << "In raft index constructor" << std::endl; -} - - -RaftIndexIVFFlat::RaftIndexIVFFlat( - GpuResourcesProvider* provider, - Index *coarse_quantizer, - int dims, - int nlist, - faiss::MetricType metric, - GpuIndexIVFFlatConfig config) - : GpuIndexIVFFlat(provider, coarse_quantizer, dims, nlist, metric, config), - raft_handle(resources_->getDefaultStream(config_.device)) { - - std::cout << "In raft index constructor" << std::endl; -} - - -RaftIndexIVFFlat::~RaftIndexIVFFlat() { - RaftIndexIVFFlat::reset(); -} - -void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { - DeviceScope scope(config_.device); - GpuIndex::copyFrom(index); - FAISS_ASSERT(index->nlist > 0); -// FAISS_THROW_IF_NOT_FMT( -// index->nlist <= (Index::idx_t)std::numeric_limits::max(), -// "GPU index only supports %zu inverted lists", -// (size_t)std::numeric_limits::max()); -// FAISS_THROW_IF_NOT_FMT( -// index->nprobe > 0 && index->nprobe <= getMaxKSelection(), -// "GPU index only supports nprobe <= %zu; passed %zu", -// (size_t)getMaxKSelection(), -// index->nprobe); - - /** - * TODO: Copy centers and center norms from quantizer - * Things to do: - * 1. Copy index_->quantizer->vectors_ to raft_index->centers - * 2. 
Copy index_->quantizer->norms_ to raft_index->center_norms - */ - /** - * TODO: Copy IVF data, indices, list_sizes, list_offsets from index->invlists - * - * Things to do: - * 1. index->ivflists->data() is going to need to be translated over to our format - * (even the interleaved format is a little different) - * - * The GpuIndexIVFFlat has a function translateCodesToGpu_() for this - * - * 2. We will need to copy list_sizes, indices, and list_offsets - */ - if (index->is_trained) { - // TODO: A proper copy of the index without retraining - // For now, just get all the data from the index, and train our index - // anew. - FAISS_ASSERT(index->d == this->d); - FAISS_ASSERT(index->metric_arg == this->metric_arg); - FAISS_ASSERT(index->metric_type == this->metric_type); - FAISS_ASSERT(index->nlist == this->nlist); - - Index::idx_t quantizer_ntotal = index->quantizer->ntotal; - Index::idx_t index_ntotal = index->ntotal; - - std::cout << "Calling copyFrom with trained index with " << quantizer_ntotal << " items" << std::endl; - auto stream = raft_handle.get_stream(); - - auto total_elems = size_t(quantizer_ntotal) * size_t(index->quantizer->d); - - raft::spatial::knn::ivf_flat::index_params pams; - - switch (this->metric_type) { - case faiss::METRIC_L2: - pams.metric = raft::distance::DistanceType::L2Expanded; - break; - case faiss::METRIC_INNER_PRODUCT: - pams.metric = raft::distance::DistanceType::InnerProduct; - break; - default: - FAISS_THROW_MSG("Metric is not supported."); - } - - raft_knn_index.emplace(raft_handle, pams.metric, false, (uint32_t)this->nlist, (uint32_t)this->d); - - // Copy (reconstructed) centroids over, rather than re-training - rmm::device_uvector buf_dev(total_elems, stream); - { - std::vector buf_host(total_elems); - index->quantizer->reconstruct_n(0, quantizer_ntotal, buf_host.data()); - raft::copy(raft_knn_index.value().centers().data_handle(), buf_host.data(), total_elems, stream); - } - - raft::print_device_vector("raft centers", 
raft_knn_index.value().centers().data_handle(), total_elems, std::cout); - - // Add (reconstructed) vectors to index if needed - if(index_ntotal > 0) { - std::cout << "Adding " << index_ntotal << " vectors to index" << std::endl; - total_elems = size_t(index_ntotal) * size_t(index->d); - buf_dev.resize(total_elems, stream); - { - std::vector buf_host(total_elems); - index->reconstruct_n(0, index_ntotal, buf_host.data()); - raft::copy(buf_dev.data(), buf_host.data(), total_elems, stream); - } - - // TODO: We might want to consider moving the centroid norm computation - // outside of the incremental add on the RAFT side. - RaftIndexIVFFlat::addImpl_(index_ntotal, buf_dev.data(), nullptr); - } - } else { - // index is not trained, so we can remove ours as well (if there was - // any) - std::cout << "Calling copyFrom with index that hasn't been trained" << std::endl; - raft_knn_index.reset(); - } - this->is_trained = index->is_trained; -} - -void RaftIndexIVFFlat::reserveMemory(size_t numVecs) { - std::cout << "Reserving memory for " << numVecs << " vectors." << std::endl; - reserveMemoryVecs_ = numVecs; - if (raft_knn_index.has_value()) { - DeviceScope scope(config_.device); - - // TODO: Need to figure out if this is absolutely necessary. 
- - /** - * For example: - * raft::spatial::knn::ivf_flat::allocate_ivf_lists( - * raft_handle, *raft_knn_index, numVecs); - * - * raft::spatial::knn::ivf_flat::populate( - * raft_handle, *raft_knn_index, - * n_centroids, centroids, - * n_vectors, ivf); - * - */ - } -} - -size_t RaftIndexIVFFlat::reclaimMemory() { - std::cout << "Reclaiming memory" << std::endl; - - // TODO: Need to figure out if this is absolutely necessary - /** - * For example: - * raft::spatial::knn::ivf_flat::reclaim_ivf_lists( - * raft_handle, *raft_knn_index, numVecs); - */ - return 0; -} - -void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { - DeviceScope scope(config_.device); - - - raft::common::nvtx::range fun_scope( - "RaftIndexIVFFlat::train (%ld)", n); - - std::cout << "Calling train() with " << n << " rows" << std::endl; - - uint32_t start = raft::curTimeMillis(); - if (this->is_trained) { - FAISS_ASSERT(raft_knn_index.has_value()); - return; - } - - raft::spatial::knn::ivf_flat::index_params raft_idx_params; - raft_idx_params.n_lists = nlist; - raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; - raft_idx_params.add_data_on_build = false; - raft_idx_params.kmeans_n_iters = 100; - - raft_knn_index.emplace( - raft::spatial::knn::ivf_flat::build(raft_handle, raft_idx_params, - const_cast(x), - n, (faiss::Index::idx_t)d)); - - raft_handle.sync_stream(); - uint32_t stop = raft::curTimeMillis(); - - std::cout << "train took " << (stop - start) << "ms. 
" << std::endl; - this->is_trained = true; -} - -int RaftIndexIVFFlat::getListLength(int listId) const { - FAISS_ASSERT(raft_knn_index.has_value()); - DeviceScope scope(config_.device); - - uint32_t size; - raft::copy(&size, raft_knn_index.value().list_sizes().data_handle() + listId, - 1, raft_handle.get_stream()); - raft_handle.sync_stream(); - return int(size); -} - -std::vector RaftIndexIVFFlat::getListVectorData( - int listId, - bool gpuFormat) const { - FAISS_ASSERT(raft_knn_index.has_value()); - DeviceScope scope(config_.device); - - std::cout << "Calling getListVectorData for " << listId << std::endl; - - using elem_t = decltype(raft_knn_index.value().data())::element_type; - size_t dim = raft_knn_index.value().dim(); - Index::idx_t offsets[2]; - raft::copy(offsets, raft_knn_index.value().list_offsets().data_handle() + listId, 2, raft_handle.get_stream()); - - raft_handle.sync_stream(); - size_t byte_offset = offsets[0] * sizeof(elem_t) * dim; - // the interleaved block can be slightly larger than the list size (it's - // rounded up) - size_t byte_size = size_t(offsets[1]) * - sizeof(elem_t) * dim - - byte_offset; - std::vector vec(byte_size); - raft::copy( - vec.data(), - reinterpret_cast(raft_knn_index.value().data().data_handle()) + - byte_offset, - byte_size, - raft_handle.get_stream()); - return vec; -} - -void RaftIndexIVFFlat::reset() { - raft_knn_index.reset(); - this->ntotal = 0; -} - -std::vector RaftIndexIVFFlat::getListIndices(int listId) const { - FAISS_ASSERT(raft_knn_index.has_value()); - DeviceScope scope(config_.device); - - Index::idx_t offset; - uint32_t size; - - raft::copy(&offset, raft_knn_index.value().list_offsets().data_handle() + listId, 1, raft_handle.get_stream()); - raft::copy(&size, raft_knn_index.value().list_sizes().data_handle() + listId, 1, raft_handle.get_stream()); - raft_handle.sync_stream(); - - std::vector vec(size); - raft::copy( - vec.data(), - raft_knn_index.value().indices().data_handle() + offset, - size, - 
raft_handle.get_stream()); - return vec; -} - -void RaftIndexIVFFlat::addImpl_( - int n, - const float* x, - const Index::idx_t* xids) { - // Device is already set in GpuIndex::add - FAISS_ASSERT(is_trained); - FAISS_ASSERT(n > 0); - -// // Not all vectors may be able to be added (some may contain NaNs etc) -// index_->addVectors(data, labels); - - // but keep the ntotal based on the total number of vectors that we - // attempted to add - - std::cout << "Calling addImpl_ with " << n << " vectors." << std::endl; - - raft_knn_index.emplace(raft::spatial::knn::ivf_flat::extend( - raft_handle, raft_knn_index.value(), x, xids, (Index::idx_t)n)); - this->ntotal += n; -} - -void RaftIndexIVFFlat::searchImpl_( - int n, - const float* x, - int k, - float* distances, - Index::idx_t* labels, - const SearchParameters *params) const { - - raft::common::nvtx::range fun_scope( - "RaftIndexIVFFlat::searchImpl_ (%ld)", n); - - // Device is already set in GpuIndex::search - FAISS_ASSERT(raft_knn_index.has_value()); - FAISS_ASSERT(n > 0); - FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= nlist); - - raft::spatial::knn::ivf_flat::search_params pams; - pams.n_probes = nprobe; - raft::spatial::knn::ivf_flat::search( - raft_handle, - pams, - *raft_knn_index, - const_cast(x), - static_cast(n), - static_cast(k), - labels, - distances); - - raft_handle.sync_stream(); -} - -void RaftIndexIVFFlat::rebuildRaftIndex(const float* x, Index::idx_t n_rows) { - raft::spatial::knn::ivf_flat::index_params pams; - - std::cout << "Calling rebuildRaftIndex with " << n_rows << " rows" << std::endl; - pams.n_lists = this->nlist; - switch (this->metric_type) { - case faiss::METRIC_L2: - pams.metric = raft::distance::DistanceType::L2Expanded; - break; - case faiss::METRIC_INNER_PRODUCT: - pams.metric = raft::distance::DistanceType::InnerProduct; - break; - default: - FAISS_THROW_MSG("Metric is not supported."); - } - pams.metric_arg = this->metric_arg; - pams.kmeans_trainset_fraction = 1.0; - 
pams.add_data_on_build = false; - - raft_knn_index.emplace(raft::spatial::knn::ivf_flat::build( - this->raft_handle, pams, x, n_rows, uint32_t(this->d))); - - this->raft_handle.sync_stream(); - this->is_trained = true; - this->ntotal = n_rows; -} - -} // namespace gpu -} // namespace faiss diff --git a/faiss/gpu/impl/RaftIndexIVFFlat.h b/faiss/gpu/impl/RaftIndexIVFFlat.h deleted file mode 100644 index eaeabafce6..0000000000 --- a/faiss/gpu/impl/RaftIndexIVFFlat.h +++ /dev/null @@ -1,112 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include // for SearchParametersIVF -#include -#include - -#include -#include - -#include - -namespace faiss { -struct IndexIVFFlat; -} - -namespace faiss { -namespace gpu { - -/// Wrapper around the GPU implementation that looks like -/// faiss::gpu::GpuIndexIVFFlat -class RaftIndexIVFFlat : public GpuIndexIVFFlat { - public: - /// Construct from a pre-existing faiss::IndexIVFFlat instance, copying - /// data over to the given GPU, if the input index is trained. - RaftIndexIVFFlat( - GpuResourcesProvider* provider, - const faiss::IndexIVFFlat* index, - GpuIndexIVFFlatConfig config = GpuIndexIVFFlatConfig()); - - - /// Constructs a new instance with an empty flat quantizer; the user - /// provides the number of lists desired. - RaftIndexIVFFlat( - GpuResourcesProvider* provider, - int dims, - int nlist, - faiss::MetricType metric, - GpuIndexIVFFlatConfig config = GpuIndexIVFFlatConfig()); - - /// Constructs a new instance with a provided CPU or GPU coarse quantizer; - /// the user provides the number of IVF lists desired. 
- RaftIndexIVFFlat( - GpuResourcesProvider* provider, - Index* coarseQuantizer, - int dims, - int nlist, - faiss::MetricType metric = faiss::METRIC_L2, - GpuIndexIVFFlatConfig config = GpuIndexIVFFlatConfig()); - - ~RaftIndexIVFFlat() override; - - /// Clears out all inverted lists, but retains the coarse centroid - /// information - void reset() override; - - /// Trains the coarse quantizer based on the given vector data - void train(Index::idx_t n, const float* x) override; - - /// Returns the number of vectors present in a particular inverted list - int getListLength(int listId) const override; - - /// Reserve GPU memory in our inverted lists for this number of vectors - void reserveMemory(size_t numVecs); - - /// After adding vectors, one can call this to reclaim device memory - /// to exactly the amount needed. Returns space reclaimed in bytes - size_t reclaimMemory(); - - void copyFrom(const faiss::IndexIVFFlat* index); - - /// Return the encoded vector data contained in a particular inverted list, - /// for debugging purposes. - /// If gpuFormat is true, the data is returned as it is encoded in the - /// GPU-side representation. - /// Otherwise, it is converted to the CPU format. - /// compliant format, while the native GPU format may differ. - std::vector getListVectorData(int listId, bool gpuFormat = false) - const override; - - /// Return the vector indices contained in a particular inverted list, for - /// debugging purposes. 
- std::vector getListIndices(int listId) const override; - - protected: - /// Called from GpuIndex for add/add_with_ids - void addImpl_(int n, const float* x, const Index::idx_t* ids) override; - - - /// Called from GpuIndex for search - void searchImpl_( - int n, - const float* x, - int k, - float* distances, - Index::idx_t* labels, - const SearchParameters *params) const override; - - void rebuildRaftIndex(const float* x, Index::idx_t n_rows); - - const raft::handle_t raft_handle; - std::optional> raft_knn_index{std::nullopt}; -}; - -} // namespace gpu -} // namespace faiss From 5ab762bfa81dec4b908b8d772580bae522903bf5 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 17 Nov 2022 16:26:51 -0500 Subject: [PATCH 44/87] Using current raft 22.12 --- cmake/thirdparty/fetch_rapids.cmake | 2 +- cmake/thirdparty/get_raft.cmake | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/thirdparty/fetch_rapids.cmake b/cmake/thirdparty/fetch_rapids.cmake index 0befc2fd5d..69460abe4a 100644 --- a/cmake/thirdparty/fetch_rapids.cmake +++ b/cmake/thirdparty/fetch_rapids.cmake @@ -12,7 +12,7 @@ # the License. 
# ============================================================================= -set(RAPIDS_VERSION "22.10") +set(RAPIDS_VERSION "22.12") if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/FAISS_RAPIDS.cmake) file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake diff --git a/cmake/thirdparty/get_raft.cmake b/cmake/thirdparty/get_raft.cmake index a7ef8410da..91f53b0f4d 100644 --- a/cmake/thirdparty/get_raft.cmake +++ b/cmake/thirdparty/get_raft.cmake @@ -16,8 +16,8 @@ set(RAFT_VERSION "${RAPIDS_VERSION}") -set(RAFT_FORK "achirkin") -set(RAFT_PINNED_TAG "fea-ivf-flat-optional-adaptive-centers") +set(RAFT_FORK "rapidsai") +set(RAFT_PINNED_TAG "branch-${RAPIDS_VERSION}") function(find_and_configure_raft) set(oneValueArgs VERSION FORK PINNED_TAG) From 3684cd327d33b11a40281b46acb89dd8328ff40e Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 30 Nov 2022 16:12:01 -0500 Subject: [PATCH 45/87] Checking in a little cleanup --- faiss/gpu/impl/RaftIVFFlat.cu | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index be40a65ca6..f20600dc73 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -145,8 +145,7 @@ int RaftIVFFlat::getListLength(int listId) const { return int(size); } -/// Return the list indices of a par -/// ticular list back to the CPU +/// Return the list indices of a particular list back to the CPU std::vector RaftIVFFlat::getListIndices(int listId) const { printf("Inside RaftIVFFlat getListIndices\n"); @@ -221,8 +220,9 @@ void RaftIVFFlat::searchPreassigned( void RaftIVFFlat::updateQuantizer(Index* quantizer) { Index::idx_t quantizer_ntotal = quantizer->ntotal; - std::cout << "Calling updateQuantizer with trained index with " << quantizer_ntotal << " items" << std::endl; - auto stream = resources_->getRaftHandleCurrentDevice().get_stream(); + std::cout << "Calling RAFT updateQuantizer 
with trained index with " << quantizer_ntotal << " items" << std::endl; + const raft::handle_t &handle = resources->getRaftHandleCurrentDevice(); + auto stream = handle.get_stream(); auto total_elems = size_t(quantizer_ntotal) * size_t(quantizer->d); @@ -230,16 +230,18 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { switch (this->metric_) { case faiss::METRIC_L2: + printf("Using L2!\n"); pams.metric = raft::distance::DistanceType::L2Expanded; break; case faiss::METRIC_INNER_PRODUCT: + printf("Using Inner product!\n"); pams.metric = raft::distance::DistanceType::InnerProduct; break; default: FAISS_THROW_MSG("Metric is not supported."); } - raft_knn_index.emplace(resources_->getRaftHandleCurrentDevice(), pams.metric, (uint32_t)this->numLists_, false, (uint32_t)this->dim_); + raft_knn_index.emplace(handle, pams.metric, (uint32_t)this->numLists_, false, (uint32_t)this->dim_); printf("Reconstructing\n"); // Copy (reconstructed) centroids over, rather than re-training @@ -249,9 +251,11 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { printf("Copying...\n"); - raft::update_device(raft_knn_index.value().centers().data_handle(), buf_host.data(), total_elems, stream); + auto knn_index = raft_knn_index.value(); - raft::print_device_vector("raft centers", raft_knn_index.value().centers().data_handle(), this->dim_, std::cout); + raft::update_device(knn_index.centers().data_handle(), buf_host.data(), total_elems, stream); + + raft::print_device_vector("raft centers", knn_index.centers().data_handle(), this->dim_, std::cout); } From 35a46b29b7bf29653a2862e0b95dbc53e168e388 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 9 Jan 2023 15:59:22 -0500 Subject: [PATCH 46/87] Disabling raft from pulling in nn dependencies (e.g. 
faiss) --- cmake/thirdparty/get_raft.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/thirdparty/get_raft.cmake b/cmake/thirdparty/get_raft.cmake index 91f53b0f4d..1286aee10f 100644 --- a/cmake/thirdparty/get_raft.cmake +++ b/cmake/thirdparty/get_raft.cmake @@ -39,6 +39,7 @@ function(find_and_configure_raft) "BUILD_TESTS OFF" "BUILD_BENCH OFF" "RAFT_COMPILE_LIBRARIES OFF" + "RAFT_ENABLE_NN_DEPENDENCIES OFF" ) endfunction() From e7bf2e5119496045868c31254c6d6124326d4a8c Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 11 Jan 2023 18:44:13 -0500 Subject: [PATCH 47/87] Updating raft for 23.02. Still working on failing tests. --- cmake/thirdparty/fetch_rapids.cmake | 2 +- faiss/gpu/GpuIndexIVF.cu | 2 +- faiss/gpu/impl/RaftIVFFlat.cu | 36 ++++++++++++++--------------- faiss/gpu/impl/RaftIVFFlat.cuh | 12 +++++----- 4 files changed, 25 insertions(+), 27 deletions(-) diff --git a/cmake/thirdparty/fetch_rapids.cmake b/cmake/thirdparty/fetch_rapids.cmake index 69460abe4a..2e14ceac5b 100644 --- a/cmake/thirdparty/fetch_rapids.cmake +++ b/cmake/thirdparty/fetch_rapids.cmake @@ -12,7 +12,7 @@ # the License. 
# ============================================================================= -set(RAPIDS_VERSION "22.12") +set(RAPIDS_VERSION "23.02") if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/FAISS_RAPIDS.cmake) file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu index 116a6844ef..c35667e4a9 100644 --- a/faiss/gpu/GpuIndexIVF.cu +++ b/faiss/gpu/GpuIndexIVF.cu @@ -475,7 +475,7 @@ void GpuIndexIVF::trainQuantizer_(idx_t n, const float* x) { raft_idx_params.kmeans_n_iters = 100; auto raft_index = raft::neighbors::ivf_flat::build( - raft_handle, raft_idx_params, x, n, (Index::idx_t)d); + raft_handle, raft_idx_params, x, n, (idx_t)d); raft_handle.sync_stream(); diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index f20600dc73..483af26521 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -67,7 +67,7 @@ void RaftIVFFlat::search( int nprobe, int k, Tensor& outDistances, - Tensor& outIndices) { + Tensor& outIndices) { printf("Inside RaftIVFFlat search()\n"); // TODO: We probably don't want to ignore the coarse quantizer here... 
@@ -86,9 +86,9 @@ void RaftIVFFlat::search( pams.n_probes = nprobe; auto queries_view = raft::make_device_matrix_view(queries.data(), n, cols); - auto out_inds_view = raft::make_device_matrix_view(outIndices.data(), n, k_); + auto out_inds_view = raft::make_device_matrix_view(outIndices.data(), n, k_); auto out_dists_view = raft::make_device_matrix_view(outDistances.data(), n, k_); - raft::neighbors::ivf_flat::search( + raft::neighbors::ivf_flat::search( raft_handle, *raft_knn_index, queries_view, out_inds_view, out_dists_view, pams, k_); @@ -102,11 +102,11 @@ void RaftIVFFlat::search( int RaftIVFFlat::addVectors( Index* coarseQuantizer, Tensor& vecs, - Tensor& indices) { + Tensor& indices) { printf("Inside RaftIVFFlat addVectors()\n"); - auto vecs_view = raft::make_device_matrix_view(vecs.data(), vecs.getSize(0), dim_); - auto inds_view = raft::make_device_vector_view(indices.data(), (Index::idx_t )indices.getSize(0)); + auto vecs_view = raft::make_device_matrix_view(vecs.data(), vecs.getSize(0), dim_); + auto inds_view = raft::make_device_vector_view(indices.data(), (idx_t )indices.getSize(0)); const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); @@ -118,7 +118,7 @@ int RaftIVFFlat::addVectors( raft_handle, raft_knn_index.value(), vecs_view, - std::make_optional>(inds_view))); + std::make_optional>(inds_view))); } else { printf("Index has not been trained!\n"); @@ -146,21 +146,21 @@ int RaftIVFFlat::getListLength(int listId) const { } /// Return the list indices of a particular list back to the CPU -std::vector RaftIVFFlat::getListIndices(int listId) const { +std::vector RaftIVFFlat::getListIndices(int listId) const { printf("Inside RaftIVFFlat getListIndices\n"); FAISS_ASSERT(raft_knn_index.has_value()); const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); - Index::idx_t offset; + idx_t offset; uint32_t size; raft::copy(&offset, raft_knn_index.value().list_offsets().data_handle() + listId, 1, 
raft_handle.get_stream()); raft::copy(&size, raft_knn_index.value().list_sizes().data_handle() + listId, 1, raft_handle.get_stream()); raft_handle.sync_stream(); - std::vector vec(size); + std::vector vec(size); raft::copy( vec.data(), raft_knn_index.value().indices().data_handle() + offset, @@ -181,7 +181,7 @@ std::vector RaftIVFFlat::getListVectorData(int listId, bool gpuFormat) using elem_t = decltype(raft_knn_index.value().data())::element_type; size_t dim = raft_knn_index.value().dim(); - Index::idx_t offsets[2]; + idx_t offsets[2]; raft::copy(offsets, raft_knn_index.value().list_offsets().data_handle() + listId, 2, raft_handle.get_stream()); raft_handle.sync_stream(); @@ -207,10 +207,10 @@ void RaftIVFFlat::searchPreassigned( Index* coarseQuantizer, Tensor& vecs, Tensor& ivfDistances, - Tensor& ivfAssignments, + Tensor& ivfAssignments, int k, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, bool storePairs) { printf("Inside RaftIVFFlat searchPreassigned\n"); @@ -218,10 +218,10 @@ void RaftIVFFlat::searchPreassigned( } void RaftIVFFlat::updateQuantizer(Index* quantizer) { - Index::idx_t quantizer_ntotal = quantizer->ntotal; + idx_t quantizer_ntotal = quantizer->ntotal; std::cout << "Calling RAFT updateQuantizer with trained index with " << quantizer_ntotal << " items" << std::endl; - const raft::handle_t &handle = resources->getRaftHandleCurrentDevice(); + const raft::handle_t &handle = resources_->getRaftHandleCurrentDevice(); auto stream = handle.get_stream(); auto total_elems = size_t(quantizer_ntotal) * size_t(quantizer->d); @@ -251,11 +251,9 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { printf("Copying...\n"); - auto knn_index = raft_knn_index.value(); + raft::update_device(raft_knn_index.value().centers().data_handle(), buf_host.data(), total_elems, stream); - raft::update_device(knn_index.centers().data_handle(), buf_host.data(), total_elems, stream); - - raft::print_device_vector("raft centers", 
knn_index.centers().data_handle(), this->dim_, std::cout); + raft::print_device_vector("raft centers", raft_knn_index.value().centers().data_handle(), this->dim_, std::cout); } diff --git a/faiss/gpu/impl/RaftIVFFlat.cuh b/faiss/gpu/impl/RaftIVFFlat.cuh index 298a9370c9..968f8fd727 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cuh +++ b/faiss/gpu/impl/RaftIVFFlat.cuh @@ -43,7 +43,7 @@ class RaftIVFFlat : public IVFFlat { int nprobe, int k, Tensor& outDistances, - Tensor& outIndices) override; + Tensor& outIndices) override; /// Performs search when we are already given the IVF cells to look at /// (GpuIndexIVF::search_preassigned implementation) @@ -51,10 +51,10 @@ class RaftIVFFlat : public IVFFlat { Index* coarseQuantizer, Tensor& vecs, Tensor& ivfDistances, - Tensor& ivfAssignments, + Tensor& ivfAssignments, int k, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, bool storePairs) override; /// Classify and encode/add vectors to our IVF lists. @@ -64,7 +64,7 @@ class RaftIVFFlat : public IVFFlat { int addVectors( Index* coarseQuantizer, Tensor& vecs, - Tensor& indices) override; + Tensor& indices) override; /// Clear out all inverted lists, but retain the coarse quantizer /// and the product quantizer info @@ -75,7 +75,7 @@ class RaftIVFFlat : public IVFFlat { int getListLength(int listId) const override; /// Return the list indices of a particular list back to the CPU - std::vector getListIndices(int listId) const override; + std::vector getListIndices(int listId) const override; /// Return the encoded vectors of a particular list back to the CPU std::vector getListVectorData(int listId, bool gpuFormat) const override; @@ -102,7 +102,7 @@ class RaftIVFFlat : public IVFFlat { // size_t numVecs) override; - std::optional> raft_knn_index{std::nullopt}; + std::optional> raft_knn_index{std::nullopt}; }; From a8e2ad06a3ba54da32972c09793c0353b6550743 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 11 Jan 2023 20:34:05 -0500 Subject: [PATCH 48/87] Isolating differences in results- it looks like it's related to the selection of the probes --- faiss/gpu/GpuIndexIVFFlat.cu | 7 +++++++ faiss/gpu/impl/RaftIVFFlat.cu | 7 +++++++ faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 11 ++++++++--- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index b99377cd7e..620a409660 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -166,6 +166,13 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { printf("Reconstructing %d original vectors and adding to GPU index\n", index->ntotal); std::vector buf_host(index->ntotal * index->d); index->reconstruct_n(0, index->ntotal, buf_host.data()); + + printf("reconstructed vectors: ["); + for(int i = 0; i < 50; ++i) { + printf("%f, ", buf_host[i]); + } + printf("]\n"); + add(index->ntotal, buf_host.data()); } } else { diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 483af26521..0b94b6e5db 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -31,6 +31,8 @@ #include #include +#include + namespace faiss { namespace gpu { @@ -105,6 +107,8 @@ int RaftIVFFlat::addVectors( Tensor& indices) { printf("Inside RaftIVFFlat addVectors()\n"); + raft::print_device_vector("add_vectors", vecs.data(), 50, std::cout); + auto vecs_view = raft::make_device_matrix_view(vecs.data(), vecs.getSize(0), dim_); auto inds_view = raft::make_device_vector_view(indices.data(), (idx_t )indices.getSize(0)); @@ -226,7 +230,10 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { auto total_elems = size_t(quantizer_ntotal) * size_t(quantizer->d); + raft::logger::get().set_level(RAFT_LEVEL_TRACE); + raft::spatial::knn::ivf_flat::index_params pams; + pams.add_data_on_build = false; switch (this->metric_) { case faiss::METRIC_L2: diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp 
b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index e026abd85a..6ffd2cceca 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -27,7 +27,7 @@ struct Options { numCentroids = std::sqrt((float)numAdd / 2); numTrain = numCentroids * 40; - nprobe = faiss::gpu::randVal(std::min(10, numCentroids), numCentroids); + nprobe = std::min(numCentroids, int(numCentroids / 2) + 10);//faiss::gpu::randVal(std::min(50, numCentroids), numCentroids); numQuery = faiss::gpu::randVal(32, 100); // Due to the approximate nature of the query and of floating point @@ -68,6 +68,7 @@ void queryTest( faiss::MetricType metricType, bool useFloat16CoarseQuantizer) { for (int tries = 0; tries < 2; ++tries) { + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); @@ -96,8 +97,6 @@ void queryTest( faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); gpuIndex.copyFrom(&cpuIndex); - - gpuIndex.setNumProbes(opt.nprobe); bool compFloat16 = useFloat16CoarseQuantizer; @@ -125,6 +124,12 @@ void addTest(faiss::MetricType metricType, bool useFloat16CoarseQuantizer) { faiss::gpu::randVecs(opt.numTrain, opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + printf("original add vectors: ["); + for(int i = 0; i < 50; ++i) { + printf("%f, ", addVecs[i]); + } + printf("]\n"); + faiss::IndexFlatL2 quantizerL2(opt.dim); faiss::IndexFlatIP quantizerIP(opt.dim); faiss::Index* quantizer = metricType == faiss::METRIC_L2 From f19fd00cc7700ebf4027c01851c0b6e469841149 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 19 Jan 2023 12:24:12 -0500 Subject: [PATCH 49/87] Add and query results appear to match well. LargeBatch tests are failing but RAFT seems to have better recall. Still investigating. 
--- faiss/gpu/GpuIndexIVFFlat.cu | 32 ++++++++++++-------------- faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 2 ++ faiss/gpu/test/TestUtils.cpp | 4 +--- 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 620a409660..45019bb5aa 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -103,6 +103,7 @@ void GpuIndexIVFFlat::set_index_(GpuResources* resources, IndicesOptions indicesOptions, MemorySpace space) { if(config_.use_raft) { + printf("Setting RaftIVFFlat index\n"); index_.reset(new RaftIVFFlat( resources, dim, nlist, metric, metricArg, useResidual, scalarQ, interleavedLayout, indicesOptions, space)); @@ -139,17 +140,17 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { // The other index might not be trained if (!index->is_trained) { - FAISS_ASSERT(!this->is_trained); + FAISS_ASSERT(!is_trained); return; } // Otherwise, we can populate ourselves from the other index - FAISS_ASSERT(this->is_trained); + FAISS_ASSERT(is_trained); // Copy our lists as well set_index_(resources_.get(), - this->d, - this->nlist, + d, + nlist, index->metric_type, index->metric_arg, false, // no residual @@ -158,25 +159,22 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { ivfFlatConfig_.indicesOptions, config_.memorySpace); - if(config_.use_raft) { + printf("Reconstructing %d original vectors and adding to GPU index\n", ntotal); + // Quantizer should already have been updated above. 
Add reconstructed vectors to raft index - if(index->ntotal > 0) { - printf("Reconstructing %d original vectors and adding to GPU index\n", index->ntotal); - std::vector buf_host(index->ntotal * index->d); - index->reconstruct_n(0, index->ntotal, buf_host.data()); - - printf("reconstructed vectors: ["); - for(int i = 0; i < 50; ++i) { - printf("%f, ", buf_host[i]); - } - printf("]\n"); - - add(index->ntotal, buf_host.data()); + if(ntotal > 0) { + std::vector buf_host(ntotal * d); + std::vector ids(ntotal); + std::iota(ids.begin(), ids.end(), 0); + index->reconstruct_n(0, ntotal, buf_host.data()); + add_with_ids(ntotal, buf_host.data(), ids.data()); } } else { + // Copy all of the IVF data + printf("Copying inverted lists from cpu index to FAISS gpu index flat\n"); index_->copyInvertedListsFrom(index->invlists); } } diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index 6ffd2cceca..a7d1e8f18f 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -100,6 +100,8 @@ void queryTest( gpuIndex.setNumProbes(opt.nprobe); bool compFloat16 = useFloat16CoarseQuantizer; + + printf("Use float16: %d\n", compFloat16); faiss::gpu::compareIndices( cpuIndex, gpuIndex, diff --git a/faiss/gpu/test/TestUtils.cpp b/faiss/gpu/test/TestUtils.cpp index 65e36dcc31..a02618c4e4 100644 --- a/faiss/gpu/test/TestUtils.cpp +++ b/faiss/gpu/test/TestUtils.cpp @@ -114,9 +114,7 @@ void compareIndices( testDistance.data(), testIndices.data()); - int idx = 4; - - int start_idx = idx * numQuery; + int start_idx = 17 * k; int stop_idx = start_idx + k; printf("ref inds: ["); for(int i = start_idx; i < stop_idx; i++) { From 6269ed1c63a782a4fe657b473331dd9c9d8be423 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 24 Jan 2023 14:58:05 -0500 Subject: [PATCH 50/87] Using facebook for licenses in cmake files --- cmake/thirdparty/fetch_rapids.cmake | 15 +++------------ cmake/thirdparty/get_raft.cmake | 18 +++--------------- 2 files changed, 6 insertions(+), 27 deletions(-) diff --git a/cmake/thirdparty/fetch_rapids.cmake b/cmake/thirdparty/fetch_rapids.cmake index 2e14ceac5b..f9405b9f22 100644 --- a/cmake/thirdparty/fetch_rapids.cmake +++ b/cmake/thirdparty/fetch_rapids.cmake @@ -1,16 +1,7 @@ -# ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) Facebook, Inc. and its affiliates. # -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. set(RAPIDS_VERSION "23.02") diff --git a/cmake/thirdparty/get_raft.cmake b/cmake/thirdparty/get_raft.cmake index 1286aee10f..0f12db58ac 100644 --- a/cmake/thirdparty/get_raft.cmake +++ b/cmake/thirdparty/get_raft.cmake @@ -1,19 +1,7 @@ -#============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) Facebook, Inc. and its affiliates. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= - +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. set(RAFT_VERSION "${RAPIDS_VERSION}") set(RAFT_FORK "rapidsai") From b13593af5aa1fa38cc5089b2de7205ebc0c2ba71 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 24 Jan 2023 14:59:36 -0500 Subject: [PATCH 51/87] Adding small note to build.sh that the file is temporary. --- build.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/build.sh b/build.sh index 80341ebcfd..5a0c3c58da 100755 --- a/build.sh +++ b/build.sh @@ -1,5 +1,7 @@ #!/bin/bash +# NOTE: This file is temporary for the proof-of-concept branch and will be removed before this PR is merged + BUILD_TYPE=Release BUILD_DIR=build/ From bc8885dd33405901e2abb3739e874e3820d9d78b Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 31 Jan 2023 12:56:46 -0500 Subject: [PATCH 52/87] Fixing style --- faiss/gpu/GpuIndexFlat.cu | 4 +- faiss/gpu/GpuIndexIVF.h | 5 +- faiss/gpu/GpuIndexIVFFlat.h | 26 ++--- faiss/gpu/GpuResources.cpp | 2 +- faiss/gpu/GpuResources.h | 4 +- faiss/gpu/StandardGpuResources.h | 4 +- faiss/gpu/impl/RaftFlatIndex.cu | 66 +++++++----- faiss/gpu/impl/RaftFlatIndex.cuh | 7 +- faiss/gpu/impl/RaftIVFFlat.cu | 144 ++++++++++++++++--------- faiss/gpu/impl/RaftIVFFlat.cuh | 45 ++++---- faiss/gpu/test/TestGpuIndexFlat.cpp | 4 +- faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 9 +- faiss/gpu/test/TestUtils.cpp | 9 +- 13 files changed, 196 insertions(+), 133 deletions(-) diff --git a/faiss/gpu/GpuIndexFlat.cu b/faiss/gpu/GpuIndexFlat.cu index 44ffbe6fce..174005d7e1 100644 --- a/faiss/gpu/GpuIndexFlat.cu +++ b/faiss/gpu/GpuIndexFlat.cu @@ -89,8 +89,7 @@ GpuIndexFlat::GpuIndexFlat( GpuIndexFlat::~GpuIndexFlat() {} void GpuIndexFlat::resetIndex_(int dims) { - - if(config_.use_raft) { + if (config_.use_raft) { printf("Should use raft!\n"); data_.reset(new RaftFlatIndex( resources_.get(), @@ -108,7 +107,6 @@ void GpuIndexFlat::resetIndex_(int dims) { } } - void GpuIndexFlat::copyFrom(const faiss::IndexFlat* index) { DeviceScope scope(config_.device); diff --git a/faiss/gpu/GpuIndexIVF.h b/faiss/gpu/GpuIndexIVF.h index 6c6efbb888..fa4e2e0845 100644 --- a/faiss/gpu/GpuIndexIVF.h +++ b/faiss/gpu/GpuIndexIVF.h @@ -86,8 +86,9 @@ class GpuIndexIVF : public GpuIndex { /// GPU-side representation. /// Otherwise, it is converted to the CPU format. /// compliant format, while the native GPU format may differ. - virtual std::vector getListVectorData(int listId, bool gpuFormat = false) - const; + virtual std::vector getListVectorData( + int listId, + bool gpuFormat = false) const; /// Return the vector indices contained in a particular inverted list, for /// debugging purposes. 
diff --git a/faiss/gpu/GpuIndexIVFFlat.h b/faiss/gpu/GpuIndexIVFFlat.h index a0ca24ec97..5bfdf17a83 100644 --- a/faiss/gpu/GpuIndexIVFFlat.h +++ b/faiss/gpu/GpuIndexIVFFlat.h @@ -7,8 +7,8 @@ #pragma once -#include #include +#include #include @@ -90,18 +90,18 @@ class GpuIndexIVFFlat : public GpuIndexIVF { void train(idx_t n, const float* x) override; protected: - - void set_index_(GpuResources* resources, - int dim, - int nlist, - faiss::MetricType metric, - float metricArg, - bool useResidual, - /// Optional ScalarQuantizer - faiss::ScalarQuantizer* scalarQ, - bool interleavedLayout, - IndicesOptions indicesOptions, - MemorySpace space); + void set_index_( + GpuResources* resources, + int dim, + int nlist, + faiss::MetricType metric, + float metricArg, + bool useResidual, + /// Optional ScalarQuantizer + faiss::ScalarQuantizer* scalarQ, + bool interleavedLayout, + IndicesOptions indicesOptions, + MemorySpace space); /// Our configuration options const GpuIndexIVFFlatConfig ivfFlatConfig_; diff --git a/faiss/gpu/GpuResources.cpp b/faiss/gpu/GpuResources.cpp index 0129ddafd4..970407e828 100644 --- a/faiss/gpu/GpuResources.cpp +++ b/faiss/gpu/GpuResources.cpp @@ -153,7 +153,7 @@ cudaStream_t GpuResources::getDefaultStreamCurrentDevice() { return getDefaultStream(getCurrentDevice()); } -raft::handle_t &GpuResources::getRaftHandleCurrentDevice() { +raft::handle_t& GpuResources::getRaftHandleCurrentDevice() { return getRaftHandle(getCurrentDevice()); } diff --git a/faiss/gpu/GpuResources.h b/faiss/gpu/GpuResources.h index c286fbae82..d5a4939136 100644 --- a/faiss/gpu/GpuResources.h +++ b/faiss/gpu/GpuResources.h @@ -194,8 +194,8 @@ class GpuResources { /// Returns the raft handle for the given device which can be used to /// make calls to other raft primitives. 
- virtual raft::handle_t &getRaftHandle(int device) = 0; - raft::handle_t &getRaftHandleCurrentDevice(); + virtual raft::handle_t& getRaftHandle(int device) = 0; + raft::handle_t& getRaftHandleCurrentDevice(); /// Overrides the default stream for a device to the user-supplied stream. /// The resources object does not own this stream (i.e., it will not destroy diff --git a/faiss/gpu/StandardGpuResources.h b/faiss/gpu/StandardGpuResources.h index 672f1b8339..ad2b371476 100644 --- a/faiss/gpu/StandardGpuResources.h +++ b/faiss/gpu/StandardGpuResources.h @@ -62,7 +62,7 @@ class StandardGpuResourcesImpl : public GpuResources { /// Returns the raft handle for the given device which can be used to /// make calls to other raft primitives. - raft::handle_t &getRaftHandle(int device) override; + raft::handle_t& getRaftHandle(int device) override; /// Called to change the work ordering streams to the null stream /// for all devices @@ -197,7 +197,7 @@ class StandardGpuResources : public GpuResourcesProvider { /// Returns the raft handle for the given device which can be used to /// make calls to other raft primitives. 
- raft::handle_t &getRaftHandle(int device); + raft::handle_t& getRaftHandle(int device); /// Returns the current amount of temp memory available size_t getTempMemoryAvailable(int device) const; diff --git a/faiss/gpu/impl/RaftFlatIndex.cu b/faiss/gpu/impl/RaftFlatIndex.cu index f0283e2a00..b1254a96a3 100644 --- a/faiss/gpu/impl/RaftFlatIndex.cu +++ b/faiss/gpu/impl/RaftFlatIndex.cu @@ -32,38 +32,54 @@ void RaftFlatIndex::query( Tensor& outDistances, Tensor& outIndices, bool exactDistance) { - // For now, use RAFT's fused KNN when k <= 64 and L2 metric is used - if(k <= 64 && metric == MetricType::METRIC_L2 && vectors_.getSize(0) > 0) { - raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); + if (k <= 64 && metric == MetricType::METRIC_L2 && vectors_.getSize(0) > 0) { + raft::handle_t& raft_handle = resources_->getRaftHandleCurrentDevice(); - auto distance = exactDistance ? raft::distance::DistanceType::L2Unexpanded : - raft::distance::DistanceType::L2Expanded; + auto distance = exactDistance + ? 
raft::distance::DistanceType::L2Unexpanded + : raft::distance::DistanceType::L2Expanded; - auto index = raft::make_device_matrix_view(vectors_.data(), vectors_.getSize(0), vectors_.getSize(1)); - auto search = raft::make_device_matrix_view(input.data(), input.getSize(0), input.getSize(1)); - auto inds = raft::make_device_matrix_view(outIndices.data(), outIndices.getSize(0), outIndices.getSize(1)); - auto dists = raft::make_device_matrix_view(outDistances.data(), outDistances.getSize(0), outDistances.getSize(1)); + auto index = raft::make_device_matrix_view( + vectors_.data(), vectors_.getSize(0), vectors_.getSize(1)); + auto search = raft::make_device_matrix_view( + input.data(), input.getSize(0), input.getSize(1)); + auto inds = raft::make_device_matrix_view( + outIndices.data(), + outIndices.getSize(0), + outIndices.getSize(1)); + auto dists = raft::make_device_matrix_view( + outDistances.data(), + outDistances.getSize(0), + outDistances.getSize(1)); -// raft::neighbors::brute_force::knn(raft_handle, index, search, inds, dists, k, distance); + // raft::neighbors::brute_force::knn(raft_handle, index, search, + // inds, dists, k, distance); // TODO: Expose the fused L2KNN through RAFT's public APIs - raft::spatial::knn::detail::fusedL2Knn(dim_, - inds.data_handle(), - dists.data_handle(), - index.data_handle(), - search.data_handle(), - index.extent(0), - search.extent(0), - k, - true, - true, - raft_handle.get_stream(), - distance); - - } else { + raft::spatial::knn::detail::fusedL2Knn( + dim_, + inds.data_handle(), + dists.data_handle(), + index.data_handle(), + search.data_handle(), + index.extent(0), + search.extent(0), + k, + true, + true, + raft_handle.get_stream(), + distance); - FlatIndex::query(input, k, metric, metricArg, outDistances, outIndices, exactDistance); + } else { + FlatIndex::query( + input, + k, + metric, + metricArg, + outDistances, + outIndices, + exactDistance); } } } // namespace gpu diff --git a/faiss/gpu/impl/RaftFlatIndex.cuh 
b/faiss/gpu/impl/RaftFlatIndex.cuh index ad48102254..ed4f2572e0 100644 --- a/faiss/gpu/impl/RaftFlatIndex.cuh +++ b/faiss/gpu/impl/RaftFlatIndex.cuh @@ -25,7 +25,11 @@ class GpuResources; /// data is possibly needed for certain residual operations class RaftFlatIndex : public FlatIndex { public: - RaftFlatIndex(GpuResources* res, int dim, bool useFloat16, MemorySpace space); + RaftFlatIndex( + GpuResources* res, + int dim, + bool useFloat16, + MemorySpace space); void query( Tensor& vecs, @@ -35,7 +39,6 @@ class RaftFlatIndex : public FlatIndex { Tensor& outDistances, Tensor& outIndices, bool exactDistance) override; - }; } // namespace gpu diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 0b94b6e5db..cbe3e58a7c 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -5,9 +5,9 @@ * LICENSE file in the root directory of this source tree. */ -#include -#include #include +#include +#include #include #include @@ -16,12 +16,12 @@ #include #include #include -#include #include #include #include #include #include +#include #include #include #include @@ -56,11 +56,10 @@ RaftIVFFlat::RaftIVFFlat( scalarQ, interleavedLayout, indicesOptions, - space){} + space) {} RaftIVFFlat::~RaftIVFFlat() {} - /// Find the approximate k nearest neighbors for `queries` against /// our database void RaftIVFFlat::search( @@ -83,16 +82,25 @@ void RaftIVFFlat::search( FAISS_ASSERT(n > 0); FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= numLists_); - const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); + const raft::handle_t& raft_handle = + resources_->getRaftHandleCurrentDevice(); raft::neighbors::ivf_flat::search_params pams; pams.n_probes = nprobe; - auto queries_view = raft::make_device_matrix_view(queries.data(), n, cols); - auto out_inds_view = raft::make_device_matrix_view(outIndices.data(), n, k_); - auto out_dists_view = raft::make_device_matrix_view(outDistances.data(), n, k_); + auto queries_view = + 
raft::make_device_matrix_view(queries.data(), n, cols); + auto out_inds_view = + raft::make_device_matrix_view(outIndices.data(), n, k_); + auto out_dists_view = + raft::make_device_matrix_view(outDistances.data(), n, k_); raft::neighbors::ivf_flat::search( - raft_handle, *raft_knn_index, queries_view, - out_inds_view, out_dists_view, pams, k_); + raft_handle, + *raft_knn_index, + queries_view, + out_inds_view, + out_dists_view, + pams, + k_); raft_handle.sync_stream(); } @@ -109,20 +117,25 @@ int RaftIVFFlat::addVectors( raft::print_device_vector("add_vectors", vecs.data(), 50, std::cout); - auto vecs_view = raft::make_device_matrix_view(vecs.data(), vecs.getSize(0), dim_); - auto inds_view = raft::make_device_vector_view(indices.data(), (idx_t )indices.getSize(0)); + auto vecs_view = raft::make_device_matrix_view( + vecs.data(), vecs.getSize(0), dim_); + auto inds_view = raft::make_device_vector_view( + indices.data(), (idx_t)indices.getSize(0)); - const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); + const raft::handle_t& raft_handle = + resources_->getRaftHandleCurrentDevice(); printf("About to call extend on index\n"); // TODO: We probably don't want to ignore the coarse quantizer here - if(raft_knn_index.has_value()) { + if (raft_knn_index.has_value()) { raft_knn_index.emplace(raft::neighbors::ivf_flat::extend( raft_handle, raft_knn_index.value(), vecs_view, - std::make_optional>(inds_view))); + std::make_optional< + raft::device_vector_view>( + inds_view))); } else { printf("Index has not been trained!\n"); @@ -140,28 +153,40 @@ int RaftIVFFlat::getListLength(int listId) const { printf("Inside RaftIVFFlat getListLength\n"); FAISS_ASSERT(raft_knn_index.has_value()); - const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); + const raft::handle_t& raft_handle = + resources_->getRaftHandleCurrentDevice(); uint32_t size; - raft::copy(&size, raft_knn_index.value().list_sizes().data_handle() + listId, - 1, 
raft_handle.get_stream()); + raft::copy( + &size, + raft_knn_index.value().list_sizes().data_handle() + listId, + 1, + raft_handle.get_stream()); raft_handle.sync_stream(); return int(size); } /// Return the list indices of a particular list back to the CPU std::vector RaftIVFFlat::getListIndices(int listId) const { - printf("Inside RaftIVFFlat getListIndices\n"); FAISS_ASSERT(raft_knn_index.has_value()); - const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); + const raft::handle_t& raft_handle = + resources_->getRaftHandleCurrentDevice(); idx_t offset; uint32_t size; - raft::copy(&offset, raft_knn_index.value().list_offsets().data_handle() + listId, 1, raft_handle.get_stream()); - raft::copy(&size, raft_knn_index.value().list_sizes().data_handle() + listId, 1, raft_handle.get_stream()); + raft::copy( + &offset, + raft_knn_index.value().list_offsets().data_handle() + listId, + 1, + raft_handle.get_stream()); + raft::copy( + &size, + raft_knn_index.value().list_sizes().data_handle() + listId, + 1, + raft_handle.get_stream()); raft_handle.sync_stream(); std::vector vec(size); @@ -174,32 +199,36 @@ std::vector RaftIVFFlat::getListIndices(int listId) const { } /// Return the encoded vectors of a particular list back to the CPU -std::vector RaftIVFFlat::getListVectorData(int listId, bool gpuFormat) const { - +std::vector RaftIVFFlat::getListVectorData(int listId, bool gpuFormat) + const { printf("Inside RaftIVFFlat getListVectorData\n"); FAISS_ASSERT(raft_knn_index.has_value()); - const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); + const raft::handle_t& raft_handle = + resources_->getRaftHandleCurrentDevice(); std::cout << "Calling getListVectorData for " << listId << std::endl; using elem_t = decltype(raft_knn_index.value().data())::element_type; size_t dim = raft_knn_index.value().dim(); idx_t offsets[2]; - raft::copy(offsets, raft_knn_index.value().list_offsets().data_handle() + listId, 2, 
raft_handle.get_stream()); + raft::copy( + offsets, + raft_knn_index.value().list_offsets().data_handle() + listId, + 2, + raft_handle.get_stream()); raft_handle.sync_stream(); size_t byte_offset = offsets[0] * sizeof(elem_t) * dim; // the interleaved block can be slightly larger than the list size (it's // rounded up) - size_t byte_size = size_t(offsets[1]) * - sizeof(elem_t) * dim - - byte_offset; + size_t byte_size = size_t(offsets[1]) * sizeof(elem_t) * dim - byte_offset; std::vector vec(byte_size); raft::copy( vec.data(), - reinterpret_cast(raft_knn_index.value().data().data_handle()) + - byte_offset, + reinterpret_cast( + raft_knn_index.value().data().data_handle()) + + byte_offset, byte_size, raft_handle.get_stream()); return vec; @@ -224,8 +253,9 @@ void RaftIVFFlat::searchPreassigned( void RaftIVFFlat::updateQuantizer(Index* quantizer) { idx_t quantizer_ntotal = quantizer->ntotal; - std::cout << "Calling RAFT updateQuantizer with trained index with " << quantizer_ntotal << " items" << std::endl; - const raft::handle_t &handle = resources_->getRaftHandleCurrentDevice(); + std::cout << "Calling RAFT updateQuantizer with trained index with " + << quantizer_ntotal << " items" << std::endl; + const raft::handle_t& handle = resources_->getRaftHandleCurrentDevice(); auto stream = handle.get_stream(); auto total_elems = size_t(quantizer_ntotal) * size_t(quantizer->d); @@ -248,7 +278,12 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { FAISS_THROW_MSG("Metric is not supported."); } - raft_knn_index.emplace(handle, pams.metric, (uint32_t)this->numLists_, false, (uint32_t)this->dim_); + raft_knn_index.emplace( + handle, + pams.metric, + (uint32_t)this->numLists_, + false, + (uint32_t)this->dim_); printf("Reconstructing\n"); // Copy (reconstructed) centroids over, rather than re-training @@ -258,15 +293,22 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { printf("Copying...\n"); - raft::update_device(raft_knn_index.value().centers().data_handle(), 
buf_host.data(), total_elems, stream); - - raft::print_device_vector("raft centers", raft_knn_index.value().centers().data_handle(), this->dim_, std::cout); + raft::update_device( + raft_knn_index.value().centers().data_handle(), + buf_host.data(), + total_elems, + stream); + + raft::print_device_vector( + "raft centers", + raft_knn_index.value().centers().data_handle(), + this->dim_, + std::cout); } - // // -//void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { +// void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { // size_t nlist = ivf ? ivf->nlist : 0; // size_t ntotal = ivf ? ivf->compute_ntotal() : 0; // @@ -303,12 +345,14 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { // i, ivf->get_codes(i), ivf->get_ids(i), listSize); // } // -// raft::update_device(raft_knn_index.value().list_sizes().data_handle(), list_sizes_.data(), nlist, handle.get_stream()); -// raft::update_device(raft_knn_index.value().list_offsets().data_handle(), list_offsets_.data(), nlist+1, handle.get_stream()); +// raft::update_device(raft_knn_index.value().list_sizes().data_handle(), +// list_sizes_.data(), nlist, handle.get_stream()); +// raft::update_device(raft_knn_index.value().list_offsets().data_handle(), +// list_offsets_.data(), nlist+1, handle.get_stream()); // //} -//void RaftIVFFlat::addEncodedVectorsToList_( +// void RaftIVFFlat::addEncodedVectorsToList_( // int listId, // const void* codes, // const Index::idx_t* indices, @@ -334,16 +378,20 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { // // // We only have int32 length representations on the GPU per each // // list; the length is in sizeof(char) -// FAISS_ASSERT(gpuListSizeInBytes <= (size_t)std::numeric_limits::max()); +// FAISS_ASSERT(gpuListSizeInBytes <= +// (size_t)std::numeric_limits::max()); // // // Translate the codes as needed to our preferred form // std::vector codesV(cpuListSizeInBytes); // std::memcpy(codesV.data(), codes, cpuListSizeInBytes); // auto 
translatedCodes = translateCodesToGpu_(std::move(codesV), numVecs); // -// std::cout << "numVecs=" << numVecs << "gpuListSizeInBytes=" << gpuListSizeInBytes << std::endl; +// std::cout << "numVecs=" << numVecs << "gpuListSizeInBytes=" << +// gpuListSizeInBytes << std::endl; // -//// RAFT_CUDA_TRY(cudaMemcpyAsync(raft_knn_index.value().data().data_handle()+(), translatedCodes.data(), )) +//// +///RAFT_CUDA_TRY(cudaMemcpyAsync(raft_knn_index.value().data().data_handle()+(), +///translatedCodes.data(), )) // //// listCodes->data.append( //// translatedCodes.data(), @@ -365,14 +413,12 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { //// maxListLength_ = std::max(maxListLength_, (int)numVecs); //} - ///// Copy all inverted lists from ourselves to a CPU representation -//void RaftIVFFlat::copyInvertedListsTo(InvertedLists* ivf) { +// void RaftIVFFlat::copyInvertedListsTo(InvertedLists* ivf) { // printf("Inside RaftIVFFlat copyInvertedListsTo\n"); // // // TODO: Need to replicate copyInvertedListsTo() in IVFBase.cu //} - } // namespace gpu } // namespace faiss diff --git a/faiss/gpu/impl/RaftIVFFlat.cuh b/faiss/gpu/impl/RaftIVFFlat.cuh index 968f8fd727..13bd0544e7 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cuh +++ b/faiss/gpu/impl/RaftIVFFlat.cuh @@ -20,7 +20,8 @@ namespace gpu { class RaftIVFFlat : public IVFFlat { public: - RaftIVFFlat(GpuResources* resources, + RaftIVFFlat( + GpuResources* resources, int dim, int nlist, faiss::MetricType metric, @@ -34,7 +35,6 @@ class RaftIVFFlat : public IVFFlat { ~RaftIVFFlat() override; - /// Find the approximate k nearest neigbors for `queries` against /// our database void search( @@ -78,32 +78,31 @@ class RaftIVFFlat : public IVFFlat { std::vector getListIndices(int listId) const override; /// Return the encoded vectors of a particular list back to the CPU - std::vector getListVectorData(int listId, bool gpuFormat) const override; + std::vector getListVectorData(int listId, bool gpuFormat) + const override; void 
updateQuantizer(Index* quantizer) override; -// -// /// Copy all inverted lists from a CPU representation to ourselves -// void copyInvertedListsFrom(const InvertedLists* ivf) override; -// -// /// Copy all inverted lists from ourselves to a CPU representation -// void copyInvertedListsTo(InvertedLists* ivf) override; + // + // /// Copy all inverted lists from a CPU representation to ourselves + // void copyInvertedListsFrom(const InvertedLists* ivf) override; + // + // /// Copy all inverted lists from ourselves to a CPU representation + // void copyInvertedListsTo(InvertedLists* ivf) override; protected: - -// /// Adds a set of codes and indices to a list, with the representation -// /// coming from the CPU equivalent -// void addEncodedVectorsToList_( -// int listId, -// // resident on the host -// const void* codes, -// // resident on the host -// const Index::idx_t* indices, -// size_t numVecs) override; - - - std::optional> raft_knn_index{std::nullopt}; - + // /// Adds a set of codes and indices to a list, with the representation + // /// coming from the CPU equivalent + // void addEncodedVectorsToList_( + // int listId, + // // resident on the host + // const void* codes, + // // resident on the host + // const Index::idx_t* indices, + // size_t numVecs) override; + + std::optional> + raft_knn_index{std::nullopt}; }; } // namespace gpu diff --git a/faiss/gpu/test/TestGpuIndexFlat.cpp b/faiss/gpu/test/TestGpuIndexFlat.cpp index 6c806c7881..7e760551e7 100644 --- a/faiss/gpu/test/TestGpuIndexFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexFlat.cpp @@ -266,7 +266,7 @@ TEST(TestGpuIndexFlat, CopyFrom) { for (bool useFloat16 : {false, true}) { faiss::gpu::GpuIndexFlatConfig config; config.device = device; - config.use_raft = true; + config.use_raft = true; config.useFloat16 = useFloat16; // Fill with garbage values @@ -309,7 +309,7 @@ TEST(TestGpuIndexFlat, CopyTo) { for (bool useFloat16 : {false, true}) { faiss::gpu::GpuIndexFlatConfig config; config.device = device; - 
config.use_raft = true; + config.use_raft = true; config.useFloat16 = useFloat16; faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config); diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index a7d1e8f18f..6b98743d90 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -27,7 +27,10 @@ struct Options { numCentroids = std::sqrt((float)numAdd / 2); numTrain = numCentroids * 40; - nprobe = std::min(numCentroids, int(numCentroids / 2) + 10);//faiss::gpu::randVal(std::min(50, numCentroids), numCentroids); + nprobe = std::min( + numCentroids, + int(numCentroids / 2) + 10); // faiss::gpu::randVal(std::min(50, + // numCentroids), numCentroids); numQuery = faiss::gpu::randVal(32, 100); // Due to the approximate nature of the query and of floating point @@ -68,7 +71,6 @@ void queryTest( faiss::MetricType metricType, bool useFloat16CoarseQuantizer) { for (int tries = 0; tries < 2; ++tries) { - std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); @@ -127,7 +129,7 @@ void addTest(faiss::MetricType metricType, bool useFloat16CoarseQuantizer) { std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); printf("original add vectors: ["); - for(int i = 0; i < 50; ++i) { + for (int i = 0; i < 50; ++i) { printf("%f, ", addVecs[i]); } printf("]\n"); @@ -187,7 +189,6 @@ void copyToTest(bool useFloat16CoarseQuantizer) { config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); gpuIndex.train(opt.numTrain, trainVecs.data()); diff --git a/faiss/gpu/test/TestUtils.cpp b/faiss/gpu/test/TestUtils.cpp index a02618c4e4..04f136782c 100644 --- a/faiss/gpu/test/TestUtils.cpp +++ b/faiss/gpu/test/TestUtils.cpp @@ -117,30 +117,29 @@ void compareIndices( int start_idx = 17 * k; int 
stop_idx = start_idx + k; printf("ref inds: ["); - for(int i = start_idx; i < stop_idx; i++) { + for (int i = start_idx; i < stop_idx; i++) { printf("%d, ", int(refIndices[i])); } printf("]\n"); printf("test inds: ["); - for(int i = start_idx; i < stop_idx; i++) { + for (int i = start_idx; i < stop_idx; i++) { printf("%d, ", int(testIndices[i])); } printf("]\n"); printf("ref dists: ["); - for(int i = start_idx; i < stop_idx; i++) { + for (int i = start_idx; i < stop_idx; i++) { printf("%f, ", float(refDistance[i])); } printf("]\n"); printf("test dists: ["); - for(int i = start_idx; i < stop_idx; i++) { + for (int i = start_idx; i < stop_idx; i++) { printf("%f, ", float(testDistance[i])); } printf("]\n"); - faiss::gpu::compareLists( refDistance.data(), refIndices.data(), From 19f38d4557b264a3f7856eed5b97d486afe5c746 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 31 Jan 2023 13:04:29 -0500 Subject: [PATCH 53/87] Second pass of fixing formatting --- faiss/gpu/GpuIndexFlat.h | 1 - faiss/gpu/GpuIndexIVF.cu | 8 +-- faiss/gpu/GpuIndexIVFFlat.cu | 102 +++++++++++++++++------------ faiss/gpu/StandardGpuResources.cpp | 5 +- faiss/gpu/impl/IVFBase.cuh | 3 +- faiss/gpu/impl/RaftIVFFlat.cu | 4 +- 6 files changed, 71 insertions(+), 52 deletions(-) diff --git a/faiss/gpu/GpuIndexFlat.h b/faiss/gpu/GpuIndexFlat.h index 084bacd4b6..a3c177040b 100644 --- a/faiss/gpu/GpuIndexFlat.h +++ b/faiss/gpu/GpuIndexFlat.h @@ -115,7 +115,6 @@ class GpuIndexFlat : public GpuIndex { } protected: - void resetIndex_(int dims); /// Flat index does not require IDs as there is no storage available for diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu index c35667e4a9..6181ba5401 100644 --- a/faiss/gpu/GpuIndexIVF.cu +++ b/faiss/gpu/GpuIndexIVF.cu @@ -463,10 +463,10 @@ void GpuIndexIVF::trainQuantizer_(idx_t n, const float* x) { printf("Training IVF quantizer on %ld vectors in %dD\n", n, d); } - if(config_.use_raft) { - + if (config_.use_raft) { printf("Using raft to train 
quantizer for %d vectors\n", n); - const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); + const raft::handle_t& raft_handle = + resources_->getRaftHandleCurrentDevice(); raft::neighbors::ivf_flat::index_params raft_idx_params; raft_idx_params.n_lists = nlist; @@ -475,7 +475,7 @@ void GpuIndexIVF::trainQuantizer_(idx_t n, const float* x) { raft_idx_params.kmeans_n_iters = 100; auto raft_index = raft::neighbors::ivf_flat::build( - raft_handle, raft_idx_params, x, n, (idx_t)d); + raft_handle, raft_idx_params, x, n, (idx_t)d); raft_handle.sync_stream(); diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 45019bb5aa..7c152cc420 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -5,16 +5,16 @@ * LICENSE file in the root directory of this source tree. */ -#include -#include #include #include #include #include #include #include -#include +#include #include +#include +#include #include #include @@ -74,7 +74,8 @@ GpuIndexIVFFlat::GpuIndexIVFFlat( if (this->is_trained) { FAISS_ASSERT(this->quantizer); - set_index_(resources_.get(), + set_index_( + resources_.get(), this->d, this->nlist, this->metric_type, @@ -91,26 +92,43 @@ GpuIndexIVFFlat::GpuIndexIVFFlat( GpuIndexIVFFlat::~GpuIndexIVFFlat() {} -void GpuIndexIVFFlat::set_index_(GpuResources* resources, - int dim, - int nlist, - faiss::MetricType metric, - float metricArg, - bool useResidual, - /// Optional ScalarQuantizer - faiss::ScalarQuantizer* scalarQ, - bool interleavedLayout, - IndicesOptions indicesOptions, - MemorySpace space) { - if(config_.use_raft) { +void GpuIndexIVFFlat::set_index_( + GpuResources* resources, + int dim, + int nlist, + faiss::MetricType metric, + float metricArg, + bool useResidual, + /// Optional ScalarQuantizer + faiss::ScalarQuantizer* scalarQ, + bool interleavedLayout, + IndicesOptions indicesOptions, + MemorySpace space) { + if (config_.use_raft) { printf("Setting RaftIVFFlat index\n"); index_.reset(new 
RaftIVFFlat( - resources, dim, nlist, metric, metricArg, useResidual, - scalarQ, interleavedLayout, indicesOptions, space)); + resources, + dim, + nlist, + metric, + metricArg, + useResidual, + scalarQ, + interleavedLayout, + indicesOptions, + space)); } else { index_.reset(new IVFFlat( - resources, dim, nlist, metric, metricArg, useResidual, - scalarQ, interleavedLayout, indicesOptions, space)); + resources, + dim, + nlist, + metric, + metricArg, + useResidual, + scalarQ, + interleavedLayout, + indicesOptions, + space)); } baseIndex_ = std::static_pointer_cast(index_); @@ -127,7 +145,6 @@ void GpuIndexIVFFlat::reserveMemory(size_t numVecs) { } void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { - printf("Inside copyFrom\n"); DeviceScope scope(config_.device); @@ -148,7 +165,8 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { FAISS_ASSERT(is_trained); // Copy our lists as well - set_index_(resources_.get(), + set_index_( + resources_.get(), d, nlist, index->metric_type, @@ -159,12 +177,13 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { ivfFlatConfig_.indicesOptions, config_.memorySpace); - if(config_.use_raft) { + if (config_.use_raft) { + printf("Reconstructing %d original vectors and adding to GPU index\n", + ntotal); - printf("Reconstructing %d original vectors and adding to GPU index\n", ntotal); - - // Quantizer should already have been updated above. Add reconstructed vectors to raft index - if(ntotal > 0) { + // Quantizer should already have been updated above. 
Add reconstructed + // vectors to raft index + if (ntotal > 0) { std::vector buf_host(ntotal * d); std::vector ids(ntotal); std::iota(ids.begin(), ids.end(), 0); @@ -172,7 +191,6 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { add_with_ids(ntotal, buf_host.data(), ids.data()); } } else { - // Copy all of the IVF data printf("Copying inverted lists from cpu index to FAISS gpu index flat\n"); index_->copyInvertedListsFrom(index->invlists); @@ -251,7 +269,8 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) { FAISS_ASSERT(!index_); // FIXME: GPUize more of this - // First, make sure that the data is resident on the CPU, if it is not on the CPU, as we depend upon parts of the CPU code + // First, make sure that the data is resident on the CPU, if it is not on + // the CPU, as we depend upon parts of the CPU code auto hostData = toHost( (float*)x, resources_->getDefaultStream(config_.device), @@ -260,18 +279,19 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) { trainQuantizer_(n, hostData.data()); // The quantizer is now trained; construct the IVF index - set_index_(resources_.get(), - this->d, - this->nlist, - this->metric_type, - this->metric_arg, - false, // no residual - nullptr, // no scalar quantizer - ivfFlatConfig_.interleavedLayout, - ivfFlatConfig_.indicesOptions, - config_.memorySpace); - - if (reserveMemoryVecs_) { + set_index_( + resources_.get(), + this->d, + this->nlist, + this->metric_type, + this->metric_arg, + false, // no residual + nullptr, // no scalar quantizer + ivfFlatConfig_.interleavedLayout, + ivfFlatConfig_.indicesOptions, + config_.memorySpace); + + if (reserveMemoryVecs_) { index_->reserveMemory(reserveMemoryVecs_); } diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp index c593264ab0..af6c9579b9 100644 --- a/faiss/gpu/StandardGpuResources.cpp +++ b/faiss/gpu/StandardGpuResources.cpp @@ -379,7 +379,7 @@ cudaStream_t StandardGpuResourcesImpl::getDefaultStream(int device) { 
return defaultStreams_[device]; } -raft::handle_t &StandardGpuResourcesImpl::getRaftHandle(int device) { +raft::handle_t& StandardGpuResourcesImpl::getRaftHandle(int device) { initializeForDevice(device); auto it = raftHandles_.find(device); @@ -390,7 +390,6 @@ raft::handle_t &StandardGpuResourcesImpl::getRaftHandle(int device) { // Otherwise, our base default handle return raftHandles_[device]; - } std::vector StandardGpuResourcesImpl::getAlternateStreams( @@ -618,7 +617,7 @@ cudaStream_t StandardGpuResources::getDefaultStream(int device) { return res_->getDefaultStream(device); } - raft::handle_t &StandardGpuResources::getRaftHandle(int device) { +raft::handle_t& StandardGpuResources::getRaftHandle(int device) { return res_->getRaftHandle(device); } diff --git a/faiss/gpu/impl/IVFBase.cuh b/faiss/gpu/impl/IVFBase.cuh index 537120f769..24e6fb708c 100644 --- a/faiss/gpu/impl/IVFBase.cuh +++ b/faiss/gpu/impl/IVFBase.cuh @@ -65,7 +65,8 @@ class IVFBase { virtual std::vector getListIndices(int listId) const; /// Return the encoded vectors of a particular list back to the CPU - virtual std::vector getListVectorData(int listId, bool gpuFormat) const; + virtual std::vector getListVectorData(int listId, bool gpuFormat) + const; /// Copy all inverted lists from a CPU representation to ourselves virtual void copyInvertedListsFrom(const InvertedLists* ivf); diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index cbe3e58a7c..5d7268497b 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -390,8 +390,8 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { // gpuListSizeInBytes << std::endl; // //// -///RAFT_CUDA_TRY(cudaMemcpyAsync(raft_knn_index.value().data().data_handle()+(), -///translatedCodes.data(), )) +/// RAFT_CUDA_TRY(cudaMemcpyAsync(raft_knn_index.value().data().data_handle()+(), +/// translatedCodes.data(), )) // //// listCodes->data.append( //// translatedCodes.data(), From 
17df7989221aa24a4c97f730f0bdd8edb480e45b Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 31 Jan 2023 13:06:08 -0500 Subject: [PATCH 54/87] Third pass at fixing format style --- faiss/gpu/impl/IVFFlat.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/faiss/gpu/impl/IVFFlat.cu b/faiss/gpu/impl/IVFFlat.cu index 4c8ca19552..8997039580 100644 --- a/faiss/gpu/impl/IVFFlat.cu +++ b/faiss/gpu/impl/IVFFlat.cu @@ -58,7 +58,7 @@ size_t IVFFlat::getGpuVectorsEncodingSize_(int numVecs) const { int bits = scalarQ_ ? scalarQ_->bits : 32 /* float */; // bytes to encode a block of 32 vectors (single dimension) - int bytesPerDimBlock = bits * 32 / 8; // = 128 if bits == 32 + int bytesPerDimBlock = bits * 32 / 8; // = 128 if bits == 32 // bytes to fully encode 32 vectors int bytesPerBlock = bytesPerDimBlock * dim_; @@ -93,7 +93,9 @@ std::vector IVFFlat::translateCodesToGpu_( bool sc = scalarQ_ ? true : false; int bitsPerCode = scalarQ_ ? scalarQ_->bits : 32; - std::cout << "dim_=" << dim_ << ", scalarQ_=" << sc << ", bitsPerCode=" << bitsPerCode << ", interleavedLayout_=" << interleavedLayout_ << std::endl; + std::cout << "dim_=" << dim_ << ", scalarQ_=" << sc + << ", bitsPerCode=" << bitsPerCode + << ", interleavedLayout_=" << interleavedLayout_ << std::endl; auto up = unpackNonInterleaved(std::move(codes), numVecs, dim_, bitsPerCode); @@ -114,7 +116,6 @@ std::vector IVFFlat::translateCodesFromGpu_( return packNonInterleaved(std::move(up), numVecs, dim_, bitsPerCode); } - void IVFFlat::appendVectors_( Tensor& vecs, Tensor& ivfCentroidResiduals, @@ -199,7 +200,6 @@ void IVFFlat::search( makeTempAlloc(AllocType::Other, stream), {queries.getSize(0), nprobe, dim_}); - searchCoarseQuantizer_( coarseQuantizer, nprobe, From 29934410464ecc0d38a8c2e19ea044dcf3a88cb4 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 31 Jan 2023 13:23:37 -0500 Subject: [PATCH 55/87] Adding nvidia license for traceability --- faiss/gpu/GpuIndexIVFFlat.cu | 15 +++++++++++++++ faiss/gpu/GpuIndexIVFFlat.h | 15 +++++++++++++++ faiss/gpu/GpuResources.cpp | 15 +++++++++++++++ faiss/gpu/GpuResources.h | 15 +++++++++++++++ faiss/gpu/StandardGpuResources.cpp | 15 +++++++++++++++ faiss/gpu/StandardGpuResources.h | 15 +++++++++++++++ faiss/gpu/impl/RaftFlatIndex.cu | 15 +++++++++++++++ faiss/gpu/impl/RaftFlatIndex.cuh | 15 +++++++++++++++ faiss/gpu/impl/RaftIVFFlat.cu | 16 +++++++++++++++- faiss/gpu/impl/RaftIVFFlat.cuh | 15 +++++++++++++++ 10 files changed, 150 insertions(+), 1 deletion(-) diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 7c152cc420..f2122d58bf 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -4,6 +4,21 @@ * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #include #include diff --git a/faiss/gpu/GpuIndexIVFFlat.h b/faiss/gpu/GpuIndexIVFFlat.h index 5bfdf17a83..7b5adf58f7 100644 --- a/faiss/gpu/GpuIndexIVFFlat.h +++ b/faiss/gpu/GpuIndexIVFFlat.h @@ -4,6 +4,21 @@ * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. 
*/ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #pragma once diff --git a/faiss/gpu/GpuResources.cpp b/faiss/gpu/GpuResources.cpp index 970407e828..cc74f2df10 100644 --- a/faiss/gpu/GpuResources.cpp +++ b/faiss/gpu/GpuResources.cpp @@ -4,6 +4,21 @@ * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #include #include diff --git a/faiss/gpu/GpuResources.h b/faiss/gpu/GpuResources.h index d5a4939136..dfec38796d 100644 --- a/faiss/gpu/GpuResources.h +++ b/faiss/gpu/GpuResources.h @@ -4,6 +4,21 @@ * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #pragma once diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp index af6c9579b9..7e620c7aec 100644 --- a/faiss/gpu/StandardGpuResources.cpp +++ b/faiss/gpu/StandardGpuResources.cpp @@ -4,6 +4,21 @@ * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #include diff --git a/faiss/gpu/StandardGpuResources.h b/faiss/gpu/StandardGpuResources.h index ad2b371476..490734addc 100644 --- a/faiss/gpu/StandardGpuResources.h +++ b/faiss/gpu/StandardGpuResources.h @@ -4,6 +4,21 @@ * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #pragma once diff --git a/faiss/gpu/impl/RaftFlatIndex.cu b/faiss/gpu/impl/RaftFlatIndex.cu index b1254a96a3..ac9594bdb7 100644 --- a/faiss/gpu/impl/RaftFlatIndex.cu +++ b/faiss/gpu/impl/RaftFlatIndex.cu @@ -4,6 +4,21 @@ * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #include #include diff --git a/faiss/gpu/impl/RaftFlatIndex.cuh b/faiss/gpu/impl/RaftFlatIndex.cuh index ed4f2572e0..d947f9b21d 100644 --- a/faiss/gpu/impl/RaftFlatIndex.cuh +++ b/faiss/gpu/impl/RaftFlatIndex.cuh @@ -4,6 +4,21 @@ * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #pragma once diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 5d7268497b..3820beb2b1 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -4,7 +4,21 @@ * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ - +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #include #include #include diff --git a/faiss/gpu/impl/RaftIVFFlat.cuh b/faiss/gpu/impl/RaftIVFFlat.cuh index 13bd0544e7..d0c17fd2e9 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cuh +++ b/faiss/gpu/impl/RaftIVFFlat.cuh @@ -4,6 +4,21 @@ * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #pragma once From 5e7eb6dccf2b505721d1be1f8bf162984b1d3b95 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 6 Feb 2023 12:07:09 -0500 Subject: [PATCH 56/87] Updates --- faiss/gpu/GpuIndexIVF.cu | 5 ++--- faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 8 ++------ 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu index 6181ba5401..4b4197d556 100644 --- a/faiss/gpu/GpuIndexIVF.cu +++ b/faiss/gpu/GpuIndexIVF.cu @@ -459,9 +459,7 @@ void GpuIndexIVF::trainQuantizer_(idx_t n, const float* x) { return; } - if (this->verbose) { - printf("Training IVF quantizer on %ld vectors in %dD\n", n, d); - } + printf("Training IVF quantizer on %ld vectors in %dD\n", n, d); if (config_.use_raft) { printf("Using raft to train quantizer for %d vectors\n", n); @@ -472,6 +470,7 @@ void GpuIndexIVF::trainQuantizer_(idx_t n, const float* x) { raft_idx_params.n_lists = nlist; raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; raft_idx_params.add_data_on_build = false; + raft_idx_params.kmeans_trainset_fraction = 1.0; raft_idx_params.kmeans_n_iters = 100; auto raft_index = raft::neighbors::ivf_flat::build( diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index 6b98743d90..d03a7947b2 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -20,6 +20,7 @@ constexpr 
float kF16MaxRelErr = 0.3f; constexpr float kF32MaxRelErr = 0.03f; + struct Options { Options() { numAdd = 2 * faiss::gpu::randVal(2000, 5000); @@ -27,10 +28,7 @@ struct Options { numCentroids = std::sqrt((float)numAdd / 2); numTrain = numCentroids * 40; - nprobe = std::min( - numCentroids, - int(numCentroids / 2) + 10); // faiss::gpu::randVal(std::min(50, - // numCentroids), numCentroids); + nprobe = faiss::gpu::randVal(std::min(50, numCentroids), numCentroids); numQuery = faiss::gpu::randVal(32, 100); // Due to the approximate nature of the query and of floating point @@ -102,8 +100,6 @@ void queryTest( gpuIndex.setNumProbes(opt.nprobe); bool compFloat16 = useFloat16CoarseQuantizer; - - printf("Use float16: %d\n", compFloat16); faiss::gpu::compareIndices( cpuIndex, gpuIndex, From ddc75ac77c89a4a3cd6f5e4198d4068dc2a7775a Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 7 Feb 2023 16:16:24 -0500 Subject: [PATCH 57/87] Merging --- faiss/gpu/impl/IVFBase.cuh | 2 +- faiss/gpu/impl/RaftFlatIndex.cu | 4 ++-- faiss/gpu/impl/RaftFlatIndex.cuh | 2 +- faiss/gpu/impl/RaftIVFFlat.cu | 8 ++++---- faiss/gpu/impl/RaftIVFFlat.cuh | 8 ++++---- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/faiss/gpu/impl/IVFBase.cuh b/faiss/gpu/impl/IVFBase.cuh index 9fdaace261..2bb319d002 100644 --- a/faiss/gpu/impl/IVFBase.cuh +++ b/faiss/gpu/impl/IVFBase.cuh @@ -59,7 +59,7 @@ class IVFBase { /// For debugging purposes, return the list length of a particular /// list - virtual idx_t getListLength(int listId) const; + virtual idx_t getListLength(idx_t listId) const; /// Return the list indices of a particular list back to the CPU virtual std::vector getListIndices(idx_t listId) const; diff --git a/faiss/gpu/impl/RaftFlatIndex.cu b/faiss/gpu/impl/RaftFlatIndex.cu index ac9594bdb7..a506086423 100644 --- a/faiss/gpu/impl/RaftFlatIndex.cu +++ b/faiss/gpu/impl/RaftFlatIndex.cu @@ -45,7 +45,7 @@ void RaftFlatIndex::query( faiss::MetricType metric, float metricArg, Tensor& 
outDistances, - Tensor& outIndices, + Tensor& outIndices, bool exactDistance) { // For now, use RAFT's fused KNN when k <= 64 and L2 metric is used if (k <= 64 && metric == MetricType::METRIC_L2 && vectors_.getSize(0) > 0) { @@ -59,7 +59,7 @@ void RaftFlatIndex::query( vectors_.data(), vectors_.getSize(0), vectors_.getSize(1)); auto search = raft::make_device_matrix_view( input.data(), input.getSize(0), input.getSize(1)); - auto inds = raft::make_device_matrix_view( + auto inds = raft::make_device_matrix_view( outIndices.data(), outIndices.getSize(0), outIndices.getSize(1)); diff --git a/faiss/gpu/impl/RaftFlatIndex.cuh b/faiss/gpu/impl/RaftFlatIndex.cuh index d947f9b21d..6befc8744e 100644 --- a/faiss/gpu/impl/RaftFlatIndex.cuh +++ b/faiss/gpu/impl/RaftFlatIndex.cuh @@ -52,7 +52,7 @@ class RaftFlatIndex : public FlatIndex { faiss::MetricType metric, float metricArg, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, bool exactDistance) override; }; diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 3820beb2b1..56deaa7865 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -123,7 +123,7 @@ void RaftIVFFlat::search( /// The input data must be on our current device. /// Returns the number of vectors successfully added. Vectors may /// not be able to be added because they contain NaNs. 
-int RaftIVFFlat::addVectors( +idx_t RaftIVFFlat::addVectors( Index* coarseQuantizer, Tensor& vecs, Tensor& indices) { @@ -163,7 +163,7 @@ void RaftIVFFlat::reset() { raft_knn_index.reset(); } -int RaftIVFFlat::getListLength(int listId) const { +idx_t RaftIVFFlat::getListLength(idx_t listId) const { printf("Inside RaftIVFFlat getListLength\n"); FAISS_ASSERT(raft_knn_index.has_value()); @@ -181,7 +181,7 @@ int RaftIVFFlat::getListLength(int listId) const { } /// Return the list indices of a particular list back to the CPU -std::vector RaftIVFFlat::getListIndices(int listId) const { +std::vector RaftIVFFlat::getListIndices(idx_t listId) const { printf("Inside RaftIVFFlat getListIndices\n"); FAISS_ASSERT(raft_knn_index.has_value()); @@ -213,7 +213,7 @@ std::vector RaftIVFFlat::getListIndices(int listId) const { } /// Return the encoded vectors of a particular list back to the CPU -std::vector RaftIVFFlat::getListVectorData(int listId, bool gpuFormat) +std::vector RaftIVFFlat::getListVectorData(idx_t listId, bool gpuFormat) const { printf("Inside RaftIVFFlat getListVectorData\n"); diff --git a/faiss/gpu/impl/RaftIVFFlat.cuh b/faiss/gpu/impl/RaftIVFFlat.cuh index d0c17fd2e9..199e649eeb 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cuh +++ b/faiss/gpu/impl/RaftIVFFlat.cuh @@ -76,7 +76,7 @@ class RaftIVFFlat : public IVFFlat { /// The input data must be on our current device. /// Returns the number of vectors successfully added. Vectors may /// not be able to be added because they contain NaNs. 
- int addVectors( + idx_t addVectors( Index* coarseQuantizer, Tensor& vecs, Tensor& indices) override; @@ -87,13 +87,13 @@ class RaftIVFFlat : public IVFFlat { /// For debugging purposes, return the list length of a particular /// list - int getListLength(int listId) const override; + idx_t getListLength(idx_t listId) const override; /// Return the list indices of a particular list back to the CPU - std::vector getListIndices(int listId) const override; + std::vector getListIndices(idx_t listId) const override; /// Return the encoded vectors of a particular list back to the CPU - std::vector getListVectorData(int listId, bool gpuFormat) + std::vector getListVectorData(idx_t listId, bool gpuFormat) const override; void updateQuantizer(Index* quantizer) override; From 37ec2fa752c6bda2bdcdc9f4fa9605ee45ae0256 Mon Sep 17 00:00:00 2001 From: Alexandr Guzhva Date: Thu, 4 May 2023 06:28:00 -0700 Subject: [PATCH 58/87] Fix PR problems (#2839) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/2839 Reviewed By: algoriddle Differential Revision: D45054275 fbshipit-source-id: 12eba11f5fb09eb80a8620bd60d5bb74df9b9ceb --- faiss/gpu/GpuDistance.cu | 2 -- faiss/gpu/StandardGpuResources.cpp | 17 ++++++++--------- faiss/gpu/impl/RaftFlatIndex.cu | 1 - faiss/gpu/impl/RaftUtils.h | 5 ++++- 4 files changed, 12 insertions(+), 13 deletions(-) diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu index daf4710aec..130c2454ef 100644 --- a/faiss/gpu/GpuDistance.cu +++ b/faiss/gpu/GpuDistance.cu @@ -293,7 +293,6 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) { search.view(), inds.view(), dists.view(), - k, distance, metric_arg); } @@ -328,7 +327,6 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) { search.view(), inds.view(), dists.view(), - k, distance, metric_arg); } diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp index af0f24c51e..c1be2fbf37 100644 --- 
a/faiss/gpu/StandardGpuResources.cpp +++ b/faiss/gpu/StandardGpuResources.cpp @@ -89,7 +89,13 @@ std::string allocsToString(const std::unordered_map& map) { // StandardGpuResourcesImpl::StandardGpuResourcesImpl() - : pinnedMemAlloc_(nullptr), + : +#if defined USE_NVIDIA_RAFT + cmr(new rmm::mr::cuda_memory_resource), + mmr(new rmm::mr::managed_memory_resource), + pmr(new rmm::mr::pinned_memory_resource), +#endif + pinnedMemAlloc_(nullptr), pinnedMemAllocSize_(0), // let the adjustment function determine the memory size for us by // passing in a huge value that will then be adjusted @@ -97,14 +103,7 @@ StandardGpuResourcesImpl::StandardGpuResourcesImpl() -1, std::numeric_limits::max())), pinnedMemSize_(kDefaultPinnedMemoryAllocation), - allocLogging_(false) -#if defined USE_NVIDIA_RAFT - , - cmr(new rmm::mr::cuda_memory_resource), - mmr(new rmm::mr::managed_memory_resource), - pmr(new rmm::mr::pinned_memory_resource) -#endif -{ + allocLogging_(false) { } StandardGpuResourcesImpl::~StandardGpuResourcesImpl() { diff --git a/faiss/gpu/impl/RaftFlatIndex.cu b/faiss/gpu/impl/RaftFlatIndex.cu index d407c68680..fb0f815368 100644 --- a/faiss/gpu/impl/RaftFlatIndex.cu +++ b/faiss/gpu/impl/RaftFlatIndex.cu @@ -109,7 +109,6 @@ void RaftFlatIndex::query( search, inds, dists, - k, distance, metricArg); } diff --git a/faiss/gpu/impl/RaftUtils.h b/faiss/gpu/impl/RaftUtils.h index 77c47999a5..6c744051ae 100644 --- a/faiss/gpu/impl/RaftUtils.h +++ b/faiss/gpu/impl/RaftUtils.h @@ -20,13 +20,16 @@ * limitations under the License. */ +#pragma once + #include +#include #include namespace faiss { namespace gpu { -raft::distance::DistanceType faiss_to_raft( +inline raft::distance::DistanceType faiss_to_raft( MetricType metric, bool exactDistance) { switch (metric) { From d3a98cc31228d15e796c4fef4aa431cbef22fcf5 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 4 May 2023 13:56:06 -0400 Subject: [PATCH 59/87] Fixing cmakelists --- faiss/gpu/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt index c742f8c49d..ff5d08878d 100644 --- a/faiss/gpu/CMakeLists.txt +++ b/faiss/gpu/CMakeLists.txt @@ -178,7 +178,7 @@ set(FAISS_GPU_HEADERS if(FAISS_ENABLE_RAFT) list(APPEND FAISS_GPU_HEADERS - impl/RaftIVFFlat.cuh) + impl/RaftIVFFlat.cuh impl/RaftFlatIndex.cuh) list(APPEND FAISS_GPU_SRC impl/RaftFlatIndex.cu From 0af95a471135b88449128754cf194122403cb691 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 4 May 2023 15:08:41 -0400 Subject: [PATCH 60/87] Updates --- faiss/gpu/GpuIndexIVF.cu | 2 +- faiss/gpu/impl/RaftIVFFlat.cu | 53 ++++++++++++++--------------------- 2 files changed, 22 insertions(+), 33 deletions(-) diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu index 99a778e882..dfc9631f7e 100644 --- a/faiss/gpu/GpuIndexIVF.cu +++ b/faiss/gpu/GpuIndexIVF.cu @@ -449,7 +449,7 @@ void GpuIndexIVF::trainQuantizer_(idx_t n, const float* x) { if (config_.use_raft) { printf("Using raft to train quantizer for %d vectors\n", n); - const raft::handle_t& raft_handle = + const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); raft::neighbors::ivf_flat::index_params raft_idx_params; diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 56deaa7865..7b92f1df12 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -96,7 +96,7 @@ void RaftIVFFlat::search( FAISS_ASSERT(n > 0); FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= numLists_); - const raft::handle_t& raft_handle = + const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); raft::neighbors::ivf_flat::search_params pams; pams.n_probes = nprobe; @@ -109,12 +109,11 @@ void RaftIVFFlat::search( raft::make_device_matrix_view(outDistances.data(), n, k_); 
raft::neighbors::ivf_flat::search( raft_handle, + pams, *raft_knn_index, queries_view, out_inds_view, - out_dists_view, - pams, - k_); + out_dists_view); raft_handle.sync_stream(); } @@ -136,7 +135,7 @@ idx_t RaftIVFFlat::addVectors( auto inds_view = raft::make_device_vector_view( indices.data(), (idx_t)indices.getSize(0)); - const raft::handle_t& raft_handle = + const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); printf("About to call extend on index\n"); @@ -145,11 +144,11 @@ idx_t RaftIVFFlat::addVectors( if (raft_knn_index.has_value()) { raft_knn_index.emplace(raft::neighbors::ivf_flat::extend( raft_handle, - raft_knn_index.value(), vecs_view, std::make_optional< raft::device_vector_view>( - inds_view))); + inds_view), + raft_knn_index.value())); } else { printf("Index has not been trained!\n"); @@ -167,7 +166,7 @@ idx_t RaftIVFFlat::getListLength(idx_t listId) const { printf("Inside RaftIVFFlat getListLength\n"); FAISS_ASSERT(raft_knn_index.has_value()); - const raft::handle_t& raft_handle = + const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); uint32_t size; @@ -185,17 +184,11 @@ std::vector RaftIVFFlat::getListIndices(idx_t listId) const { printf("Inside RaftIVFFlat getListIndices\n"); FAISS_ASSERT(raft_knn_index.has_value()); - const raft::handle_t& raft_handle = + const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); - idx_t offset; uint32_t size; - raft::copy( - &offset, - raft_knn_index.value().list_offsets().data_handle() + listId, - 1, - raft_handle.get_stream()); raft::copy( &size, raft_knn_index.value().list_sizes().data_handle() + listId, @@ -206,7 +199,7 @@ std::vector RaftIVFFlat::getListIndices(idx_t listId) const { std::vector vec(size); raft::copy( vec.data(), - raft_knn_index.value().indices().data_handle() + offset, + *(raft_knn_index.value().inds_ptrs().data_handle() + listId), size, raft_handle.get_stream()); return vec; @@ -218,31 +211,26 @@ 
std::vector RaftIVFFlat::getListVectorData(idx_t listId, bool gpuFormat printf("Inside RaftIVFFlat getListVectorData\n"); FAISS_ASSERT(raft_knn_index.has_value()); - const raft::handle_t& raft_handle = + const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); std::cout << "Calling getListVectorData for " << listId << std::endl; - using elem_t = decltype(raft_knn_index.value().data())::element_type; + using elem_t = decltype(raft_knn_index.value().data_ptrs())::element_type; size_t dim = raft_knn_index.value().dim(); - idx_t offsets[2]; - raft::copy( - offsets, - raft_knn_index.value().list_offsets().data_handle() + listId, - 2, - raft_handle.get_stream()); + uint32_t list_size; + + raft::copy(&list_size, raft_knn_index.value().list_sizes().data_handle() + listId, 1, raft_handle.get_stream()); + - raft_handle.sync_stream(); - size_t byte_offset = offsets[0] * sizeof(elem_t) * dim; // the interleaved block can be slightly larger than the list size (it's // rounded up) - size_t byte_size = size_t(offsets[1]) * sizeof(elem_t) * dim - byte_offset; + size_t byte_size = size_t(list_size) * sizeof(elem_t) * dim; std::vector vec(byte_size); raft::copy( vec.data(), reinterpret_cast( - raft_knn_index.value().data().data_handle()) + - byte_offset, + raft_knn_index.value().data_ptrs().data_handle()+listId), byte_size, raft_handle.get_stream()); return vec; @@ -269,14 +257,14 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { std::cout << "Calling RAFT updateQuantizer with trained index with " << quantizer_ntotal << " items" << std::endl; - const raft::handle_t& handle = resources_->getRaftHandleCurrentDevice(); + const raft::device_resources& handle = resources_->getRaftHandleCurrentDevice(); auto stream = handle.get_stream(); auto total_elems = size_t(quantizer_ntotal) * size_t(quantizer->d); raft::logger::get().set_level(RAFT_LEVEL_TRACE); - raft::spatial::knn::ivf_flat::index_params pams; + raft::neighbors::ivf_flat::index_params pams; 
pams.add_data_on_build = false; switch (this->metric_) { @@ -297,6 +285,7 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { pams.metric, (uint32_t)this->numLists_, false, + false, (uint32_t)this->dim_); printf("Reconstructing\n"); @@ -327,7 +316,7 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { // size_t ntotal = ivf ? ivf->compute_ntotal() : 0; // // printf("Inside RAFT copyInvertedListsFrom\n"); -// raft::handle_t &handle = resources_->getRaftHandleCurrentDevice(); +// raft::device_resources &handle = resources_->getRaftHandleCurrentDevice(); // // We need to allocate the IVF // printf("nlist=%ld, ntotal=%ld\n", nlist, ntotal); // From 576f58fae3839e6c3de789df384483fe7ea45b8a Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 28 Jun 2023 14:42:11 -0400 Subject: [PATCH 61/87] Fixing merge --- cmake/thirdparty/get_raft.cmake | 13 +++++ faiss/gpu/GpuIndexIVFFlat.cu | 15 ------ faiss/gpu/GpuIndexIVFFlat.h | 15 ------ gpu/Makefile | 96 --------------------------------- 4 files changed, 13 insertions(+), 126 deletions(-) delete mode 100644 gpu/Makefile diff --git a/cmake/thirdparty/get_raft.cmake b/cmake/thirdparty/get_raft.cmake index 567ac4814f..df5aa448e4 100644 --- a/cmake/thirdparty/get_raft.cmake +++ b/cmake/thirdparty/get_raft.cmake @@ -2,6 +2,19 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. +# ============================================================================= +# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. 
See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= set(RAFT_VERSION "${RAPIDS_VERSION}") set(RAFT_FORK "rapidsai") diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 19f03e9247..f257b09952 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -4,21 +4,6 @@ * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ #include #include diff --git a/faiss/gpu/GpuIndexIVFFlat.h b/faiss/gpu/GpuIndexIVFFlat.h index a4fa18417e..d7508feef4 100644 --- a/faiss/gpu/GpuIndexIVFFlat.h +++ b/faiss/gpu/GpuIndexIVFFlat.h @@ -4,21 +4,6 @@ * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ #pragma once diff --git a/gpu/Makefile b/gpu/Makefile deleted file mode 100644 index 072e089ddd..0000000000 --- a/gpu/Makefile +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2015-present, Facebook, Inc. -# All rights reserved. -# -# This source code is licensed under the BSD+Patents license found in the -# LICENSE file in the root directory of this source tree. - --include ../makefile.inc - -all: libgpufaiss.a libgpufaiss.$(SHAREDEXT) - -CPPOBJ = GpuResources.o \ - IndexProxy.o \ - StandardGpuResources.o \ - GpuAutoTune.o \ - GpuClonerOptions.o \ - impl/RemapIndices.o \ - utils/DeviceMemory.o \ - utils/StackDeviceMemory.o \ - utils/DeviceUtils.o \ - utils/Timer.o \ - utils/MemorySpace.o \ - utils/WorkerThread.o - - -INS = 1 32 64 128 256 F512 T512 F1024 T1024 - -CUOBJ = impl/BroadcastSum.o \ - impl/Distance.o \ - impl/FlatIndex.o \ - impl/InvertedListAppend.o \ - impl/IVFBase.o \ - impl/IVFFlat.o \ - impl/IVFFlatScan.o \ - impl/IVFPQ.o \ - impl/IVFUtils.o \ - impl/IVFUtilsSelect1.o \ - impl/IVFUtilsSelect2.o \ - impl/L2Norm.o \ - impl/L2Select.o \ - impl/PQCodeDistances.o \ - impl/PQScanMultiPassNoPrecomputed.o \ - impl/PQScanMultiPassPrecomputed.o \ - impl/VectorResidual.o \ - GpuIndex.o \ - GpuIndexFlat.o \ - GpuIndexIVF.o \ - GpuIndexIVFFlat.o \ - GpuIndexIVFPQ.o \ - utils/Float16.o \ - utils/MatrixMult.o \ - utils/BlockSelectFloat.o \ - utils/BlockSelectHalf.o \ - utils/WarpSelectFloat.o \ - utils/WarpSelectHalf.o \ - utils/nvidia/fp16_emu.o \ - $(foreach v,$(INS), \ - utils/blockselect/BlockSelectHalf$(v).o \ - 
utils/blockselect/BlockSelectFloat$(v).o \ - utils/warpselect/WarpSelectHalf$(v).o \ - utils/warpselect/WarpSelectFloat$(v).o \ - ) - -%.o: %.cpp - $(CXX) $(CXXFLAGS) $(CPUFLAGS) -c $< -o $@ $(CUDACFLAGS) - -%.o: %.cu - $(NVCC) $(NVCCFLAGS) -g -O3 -c $< -o $@ - -libgpufaiss.a: $(CPPOBJ) $(CUOBJ) - ar r $@ $^ - -libgpufaiss.$(SHAREDEXT): $(CPPOBJ) $(CUOBJ) - $(CXX) $(SHAREDFLAGS) $(LDFLAGS) $(NVCCLDFLAGS) \ - -o libgpufaiss.$(SHAREDEXT) $^ $(LIBS) $(NVCCLIBS) - -clean: - rm -rf *.o impl/*.o utils/*.o libgpufaiss.a \ - libgpufaiss.$(SHAREDEXT) \ - --include depend - -depend: - for i in $(patsubst %.o,%.cpp,$(CPPOBJ)) \ - $(patsubst %.o,%.cu,$(CUOBJ)); do \ - $(CXXCPP) $(CPPFLAGS) -x c++ -MM $$i; \ - done > depend - -install: libgpufaiss.a libgpufaiss.$(SHAREDEXT) installdirs - cp libgpufaiss.a libgpufaiss.$(SHAREDEXT) $(DESTDIR)$(libdir) - cp *.h $(DESTDIR)$(includedir)/faiss/gpu - cp --parents **/**.h $(DESTDIR)$(includedir)/faiss/gpu - -installdirs: - $(MKDIR_P) $(DESTDIR)$(libdir) $(DESTDIR)$(includedir)/faiss/gpu - -.PHONY: all clean From 092721f1b36ff4cbd913e013828b5d0f5806879f Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 28 Jun 2023 18:05:20 -0400 Subject: [PATCH 62/87] Removing indexflat tests from changeset --- cmake/thirdparty/fetch_rapids.cmake | 2 +- faiss/gpu/test/TestGpuIndexFlat.cpp | 1113 +++++++++++++-------------- 2 files changed, 555 insertions(+), 560 deletions(-) diff --git a/cmake/thirdparty/fetch_rapids.cmake b/cmake/thirdparty/fetch_rapids.cmake index 044a369606..229c488196 100644 --- a/cmake/thirdparty/fetch_rapids.cmake +++ b/cmake/thirdparty/fetch_rapids.cmake @@ -15,7 +15,7 @@ # or implied. See the License for the specific language governing permissions and limitations under # the License. 
# ============================================================================= -set(RAPIDS_VERSION "23.06") +set(RAPIDS_VERSION "23.08") if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/FAISS_RAPIDS.cmake) file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake diff --git a/faiss/gpu/test/TestGpuIndexFlat.cpp b/faiss/gpu/test/TestGpuIndexFlat.cpp index 920336b0bc..fd63af0589 100644 --- a/faiss/gpu/test/TestGpuIndexFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexFlat.cpp @@ -1,9 +1,9 @@ /** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ +* Copyright (c) Facebook, Inc. and its affiliates. +* +* This source code is licensed under the MIT license found in the +* LICENSE file in the root directory of this source tree. +*/ #include #include @@ -21,755 +21,750 @@ constexpr float kF16MaxRelErr = 0.07f; constexpr float kF32MaxRelErr = 6e-3f; struct TestFlatOptions { - TestFlatOptions() - : metric(faiss::MetricType::METRIC_L2), - metricArg(0), - useFloat16(false), - numVecsOverride(-1), - numQueriesOverride(-1), - kOverride(-1), - dimOverride(-1), - use_raft(false) {} - - faiss::MetricType metric; - float metricArg; - - bool useFloat16; - int numVecsOverride; - int numQueriesOverride; - int kOverride; - int dimOverride; - bool use_raft; + TestFlatOptions() + : metric(faiss::MetricType::METRIC_L2), + metricArg(0), + useFloat16(false), + numVecsOverride(-1), + numQueriesOverride(-1), + kOverride(-1), + dimOverride(-1), + use_raft(false) {} + + faiss::MetricType metric; + float metricArg; + + bool useFloat16; + int numVecsOverride; + int numQueriesOverride; + int kOverride; + int dimOverride; + bool use_raft; }; void testFlat(const TestFlatOptions& opt) { - int numVecs = opt.numVecsOverride > 0 ? opt.numVecsOverride - : faiss::gpu::randVal(1000, 5000); - int dim = opt.dimOverride > 0 ? 
opt.dimOverride - : faiss::gpu::randVal(50, 800); - int numQuery = opt.numQueriesOverride > 0 ? opt.numQueriesOverride - : faiss::gpu::randVal(1, 512); - - // Due to loss of precision in a float16 accumulator, for large k, - // the number of differences is pretty huge. Restrict ourselves to a - // fairly small `k` for float16 - int k = opt.useFloat16 - ? std::min(faiss::gpu::randVal(1, 50), numVecs) - : std::min( - faiss::gpu::randVal(1, faiss::gpu::getMaxKSelection()), - numVecs); - if (opt.kOverride > 0) { - k = opt.kOverride; - } - - faiss::IndexFlat cpuIndex(dim, opt.metric); - cpuIndex.metric_arg = opt.metricArg; - - // Construct on a random device to test multi-device, if we have - // multiple devices - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); - - faiss::gpu::GpuIndexFlatConfig config; - config.device = device; - config.use_raft = true; - config.useFloat16 = opt.useFloat16; - config.use_raft = opt.use_raft; - - faiss::gpu::GpuIndexFlat gpuIndex(&res, dim, opt.metric, config); - gpuIndex.metric_arg = opt.metricArg; - - std::vector vecs = faiss::gpu::randVecs(numVecs, dim); - cpuIndex.add(numVecs, vecs.data()); - gpuIndex.add(numVecs, vecs.data()); - - std::stringstream str; - str << "metric " << opt.metric << " marg " << opt.metricArg << " numVecs " - << numVecs << " dim " << dim << " useFloat16 " << opt.useFloat16 - << " numQuery " << numQuery << " k " << k; - - // To some extent, we depend upon the relative error for the test - // for float16 - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - numQuery, - dim, - k, - str.str(), - opt.useFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - // FIXME: the fp16 bounds are - // useless when math (the accumulator) is - // in fp16. Figure out another way to test - opt.useFloat16 ? 0.99f : 0.1f, - opt.useFloat16 ? 0.65f : 0.015f); + int numVecs = opt.numVecsOverride > 0 ? 
opt.numVecsOverride + : faiss::gpu::randVal(1000, 5000); + int dim = opt.dimOverride > 0 ? opt.dimOverride + : faiss::gpu::randVal(50, 800); + int numQuery = opt.numQueriesOverride > 0 ? opt.numQueriesOverride + : faiss::gpu::randVal(1, 512); + + // Due to loss of precision in a float16 accumulator, for large k, + // the number of differences is pretty huge. Restrict ourselves to a + // fairly small `k` for float16 + int k = opt.useFloat16 + ? std::min(faiss::gpu::randVal(1, 50), numVecs) + : std::min( + faiss::gpu::randVal(1, faiss::gpu::getMaxKSelection()), + numVecs); + if (opt.kOverride > 0) { + k = opt.kOverride; + } + + faiss::IndexFlat cpuIndex(dim, opt.metric); + cpuIndex.metric_arg = opt.metricArg; + + // Construct on a random device to test multi-device, if we have + // multiple devices + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.useFloat16 = opt.useFloat16; + config.use_raft = opt.use_raft; + + faiss::gpu::GpuIndexFlat gpuIndex(&res, dim, opt.metric, config); + gpuIndex.metric_arg = opt.metricArg; + + std::vector vecs = faiss::gpu::randVecs(numVecs, dim); + cpuIndex.add(numVecs, vecs.data()); + gpuIndex.add(numVecs, vecs.data()); + + std::stringstream str; + str << "metric " << opt.metric << " marg " << opt.metricArg << " numVecs " + << numVecs << " dim " << dim << " useFloat16 " << opt.useFloat16 + << " numQuery " << numQuery << " k " << k; + + // To some extent, we depend upon the relative error for the test + // for float16 + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + numQuery, + dim, + k, + str.str(), + opt.useFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + // FIXME: the fp16 bounds are + // useless when math (the accumulator) is + // in fp16. Figure out another way to test + opt.useFloat16 ? 0.99f : 0.1f, + opt.useFloat16 ? 
0.65f : 0.015f); } TEST(TestGpuIndexFlat, IP_Float32) { - for (int tries = 0; tries < 3; ++tries) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_INNER_PRODUCT; - opt.useFloat16 = false; + for (int tries = 0; tries < 3; ++tries) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_INNER_PRODUCT; + opt.useFloat16 = false; - testFlat(opt); + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif - } + } } TEST(TestGpuIndexFlat, L1_Float32) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_L1; - opt.useFloat16 = false; + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_L1; + opt.useFloat16 = false; - testFlat(opt); + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif } TEST(TestGpuIndexFlat, Lp_Float32) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_Lp; - opt.metricArg = 5; - opt.useFloat16 = false; + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_Lp; + opt.metricArg = 5; + opt.useFloat16 = false; - testFlat(opt); + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif } TEST(TestGpuIndexFlat, L2_Float32) { - for (int tries = 0; tries < 3; ++tries) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_L2; + for (int tries = 0; tries < 3; ++tries) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_L2; - opt.useFloat16 = false; + opt.useFloat16 = false; - testFlat(opt); + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif - } + } } // At least one test for the k > 1024 select TEST(TestGpuIndexFlat, L2_k_2048) { - if (faiss::gpu::getMaxKSelection() >= 2048) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_L2; - opt.useFloat16 = 
false; - opt.kOverride = 2048; - opt.dimOverride = 128; - opt.numVecsOverride = 10000; - - testFlat(opt); + if (faiss::gpu::getMaxKSelection() >= 2048) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_L2; + opt.useFloat16 = false; + opt.kOverride = 2048; + opt.dimOverride = 128; + opt.numVecsOverride = 10000; + + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif - } + } } // test specialized k == 1 codepath TEST(TestGpuIndexFlat, L2_Float32_K1) { - for (int tries = 0; tries < 3; ++tries) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_L2; - opt.useFloat16 = false; - opt.kOverride = 1; + for (int tries = 0; tries < 3; ++tries) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_L2; + opt.useFloat16 = false; + opt.kOverride = 1; - testFlat(opt); + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif - } + } } TEST(TestGpuIndexFlat, IP_Float16) { - for (int tries = 0; tries < 3; ++tries) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_INNER_PRODUCT; - opt.useFloat16 = true; + for (int tries = 0; tries < 3; ++tries) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_INNER_PRODUCT; + opt.useFloat16 = true; - testFlat(opt); + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif - } + } } TEST(TestGpuIndexFlat, L2_Float16) { - for (int tries = 0; tries < 3; ++tries) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_L2; - opt.useFloat16 = true; + for (int tries = 0; tries < 3; ++tries) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_L2; + opt.useFloat16 = true; - testFlat(opt); + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif - } + } } // test 
specialized k == 1 codepath TEST(TestGpuIndexFlat, L2_Float16_K1) { - for (int tries = 0; tries < 3; ++tries) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_L2; - opt.useFloat16 = true; - opt.kOverride = 1; + for (int tries = 0; tries < 3; ++tries) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_L2; + opt.useFloat16 = true; + opt.kOverride = 1; - testFlat(opt); + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif - } + } } // test tiling along a huge vector set TEST(TestGpuIndexFlat, L2_Tiling) { - for (int tries = 0; tries < 2; ++tries) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_L2; - opt.useFloat16 = false; - opt.numVecsOverride = 1000000; - - // keep the rest of the problem reasonably small - opt.numQueriesOverride = 4; - opt.dimOverride = 64; - opt.kOverride = 64; - - testFlat(opt); + for (int tries = 0; tries < 2; ++tries) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_L2; + opt.useFloat16 = false; + opt.numVecsOverride = 1000000; + + // keep the rest of the problem reasonably small + opt.numQueriesOverride = 4; + opt.dimOverride = 64; + opt.kOverride = 64; + + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif - } + } } TEST(TestGpuIndexFlat, QueryEmpty) { - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); - faiss::gpu::GpuIndexFlatConfig config; - config.device = 0; - config.use_raft = true; - config.useFloat16 = false; - int dim = 128; - faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config); + faiss::gpu::GpuIndexFlatConfig config; + config.device = 0; + config.useFloat16 = false; + int dim = 128; + faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config); - // Querying an empty index should not blow up, and just return - // (FLT_MAX, -1) - int numQuery 
= 10; - int k = 50; - std::vector queries(numQuery * dim, 1.0f); + // Querying an empty index should not blow up, and just return + // (FLT_MAX, -1) + int numQuery = 10; + int k = 50; + std::vector queries(numQuery * dim, 1.0f); - std::vector dist(numQuery * k, 0); - std::vector ind(numQuery * k); + std::vector dist(numQuery * k, 0); + std::vector ind(numQuery * k); - gpuIndex.search(numQuery, queries.data(), k, dist.data(), ind.data()); + gpuIndex.search(numQuery, queries.data(), k, dist.data(), ind.data()); - for (auto d : dist) { - EXPECT_EQ(d, std::numeric_limits::max()); - } + for (auto d : dist) { + EXPECT_EQ(d, std::numeric_limits::max()); + } - for (auto i : ind) { - EXPECT_EQ(i, -1); - } + for (auto i : ind) { + EXPECT_EQ(i, -1); + } } void testCopyFrom(bool use_raft) { - int numVecs = faiss::gpu::randVal(100, 200); - int dim = faiss::gpu::randVal(1, 1000); + int numVecs = faiss::gpu::randVal(100, 200); + int dim = faiss::gpu::randVal(1, 1000); - std::vector vecs = faiss::gpu::randVecs(numVecs, dim); + std::vector vecs = faiss::gpu::randVecs(numVecs, dim); - faiss::IndexFlatL2 cpuIndex(dim); - cpuIndex.add(numVecs, vecs.data()); + faiss::IndexFlatL2 cpuIndex(dim); + cpuIndex.add(numVecs, vecs.data()); - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - for (bool useFloat16 : {false, true}) { - faiss::gpu::GpuIndexFlatConfig config; - config.device = device; - config.use_raft = true; - config.useFloat16 = useFloat16; - config.use_raft = use_raft; + for (bool useFloat16 : {false, true}) { + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.useFloat16 = useFloat16; + config.use_raft = use_raft; - // Fill with garbage values - faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, 2000, config); - gpuIndex.copyFrom(&cpuIndex); + // 
Fill with garbage values + faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, 2000, config); + gpuIndex.copyFrom(&cpuIndex); - EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); - EXPECT_EQ(gpuIndex.ntotal, numVecs); + EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); + EXPECT_EQ(gpuIndex.ntotal, numVecs); - EXPECT_EQ(cpuIndex.d, gpuIndex.d); - EXPECT_EQ(cpuIndex.d, dim); + EXPECT_EQ(cpuIndex.d, gpuIndex.d); + EXPECT_EQ(cpuIndex.d, dim); - std::vector gpuVals(numVecs * dim); - gpuIndex.reconstruct_n(0, gpuIndex.ntotal, gpuVals.data()); + std::vector gpuVals(numVecs * dim); + gpuIndex.reconstruct_n(0, gpuIndex.ntotal, gpuVals.data()); - std::vector cpuVals(numVecs * dim); - cpuIndex.reconstruct_n(0, gpuIndex.ntotal, cpuVals.data()); + std::vector cpuVals(numVecs * dim); + cpuIndex.reconstruct_n(0, gpuIndex.ntotal, cpuVals.data()); - // The CPU is the source of (float32) truth here, while the GPU index - // may be in float16 mode and thus was subject to rounding - if (useFloat16) { - EXPECT_EQ(gpuVals, faiss::gpu::roundToHalf(cpuVals)); - } else { - // Should be exactly the same - EXPECT_EQ(gpuVals, cpuVals); - } - } + // The CPU is the source of (float32) truth here, while the GPU index + // may be in float16 mode and thus was subject to rounding + if (useFloat16) { + EXPECT_EQ(gpuVals, faiss::gpu::roundToHalf(cpuVals)); + } else { + // Should be exactly the same + EXPECT_EQ(gpuVals, cpuVals); + } + } } TEST(TestGpuIndexFlat, CopyFrom) { - testCopyFrom(false); + testCopyFrom(false); } #if defined USE_NVIDIA_RAFT TEST(TestRaftGpuIndexFlat, CopyFrom) { - testCopyFrom(true); + testCopyFrom(true); } #endif void testCopyTo(bool use_raft) { - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); - int numVecs = faiss::gpu::randVal(100, 200); - int dim = faiss::gpu::randVal(1, 1000); + int numVecs = faiss::gpu::randVal(100, 200); + int dim = faiss::gpu::randVal(1, 1000); - int device = faiss::gpu::randVal(0, 
faiss::gpu::getNumDevices() - 1); - std::vector vecs = faiss::gpu::randVecs(numVecs, dim); + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + std::vector vecs = faiss::gpu::randVecs(numVecs, dim); - for (bool useFloat16 : {false, true}) { - faiss::gpu::GpuIndexFlatConfig config; - config.device = device; - config.use_raft = true; - config.useFloat16 = useFloat16; - config.use_raft = use_raft; + for (bool useFloat16 : {false, true}) { + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.useFloat16 = useFloat16; + config.use_raft = use_raft; - faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config); - gpuIndex.add(numVecs, vecs.data()); + faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config); + gpuIndex.add(numVecs, vecs.data()); - // Fill with garbage values - faiss::IndexFlatL2 cpuIndex(2000); - gpuIndex.copyTo(&cpuIndex); + // Fill with garbage values + faiss::IndexFlatL2 cpuIndex(2000); + gpuIndex.copyTo(&cpuIndex); - EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); - EXPECT_EQ(gpuIndex.ntotal, numVecs); + EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); + EXPECT_EQ(gpuIndex.ntotal, numVecs); - EXPECT_EQ(cpuIndex.d, gpuIndex.d); - EXPECT_EQ(cpuIndex.d, dim); + EXPECT_EQ(cpuIndex.d, gpuIndex.d); + EXPECT_EQ(cpuIndex.d, dim); - std::vector gpuVals(numVecs * dim); - gpuIndex.reconstruct_n(0, gpuIndex.ntotal, gpuVals.data()); + std::vector gpuVals(numVecs * dim); + gpuIndex.reconstruct_n(0, gpuIndex.ntotal, gpuVals.data()); - std::vector cpuVals(numVecs * dim); - cpuIndex.reconstruct_n(0, gpuIndex.ntotal, cpuVals.data()); + std::vector cpuVals(numVecs * dim); + cpuIndex.reconstruct_n(0, gpuIndex.ntotal, cpuVals.data()); - // The GPU is the source of truth here, so the float32 exact comparison - // even if the index uses float16 is ok - EXPECT_EQ(gpuVals, cpuVals); - } + // The GPU is the source of truth here, so the float32 exact comparison + // even if the index uses float16 is ok + EXPECT_EQ(gpuVals, cpuVals); + } } 
TEST(TestGpuIndexFlat, CopyTo) { - testCopyTo(false); + testCopyTo(false); } #if defined USE_NVIDIA_RAFT TEST(TestRaftGpuIndexFlat, CopyTo) { - testCopyTo(true); + testCopyTo(true); } #endif void testUnifiedMemory(bool use_raft) { - // Construct on a random device to test multi-device, if we have - // multiple devices - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - - if (!faiss::gpu::getFullUnifiedMemSupport(device)) { - return; - } - - int dim = 256; - - // FIXME: GpuIndexFlat doesn't support > 2^31 (vecs * dims) due to - // kernel indexing, so we can't test unified memory for memory - // oversubscription. - size_t numVecs = 50000; - int numQuery = 10; - int k = 10; - - faiss::IndexFlatL2 cpuIndexL2(dim); - - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); - - faiss::gpu::GpuIndexFlatConfig config; - config.device = device; - config.use_raft = true; - config.memorySpace = faiss::gpu::MemorySpace::Unified; - config.use_raft = use_raft; - - faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config); - - std::vector vecs = faiss::gpu::randVecs(numVecs, dim); - cpuIndexL2.add(numVecs, vecs.data()); - gpuIndexL2.add(numVecs, vecs.data()); - - // To some extent, we depend upon the relative error for the test - // for float16 - faiss::gpu::compareIndices( - cpuIndexL2, - gpuIndexL2, - numQuery, - dim, - k, - "Unified Memory", - kF32MaxRelErr, - 0.1f, - 0.015f); + // Construct on a random device to test multi-device, if we have + // multiple devices + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + + if (!faiss::gpu::getFullUnifiedMemSupport(device)) { + return; + } + + int dim = 256; + + // FIXME: GpuIndexFlat doesn't support > 2^31 (vecs * dims) due to + // kernel indexing, so we can't test unified memory for memory + // oversubscription. 
+ size_t numVecs = 50000; + int numQuery = 10; + int k = 10; + + faiss::IndexFlatL2 cpuIndexL2(dim); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.memorySpace = faiss::gpu::MemorySpace::Unified; + config.use_raft = use_raft; + + faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config); + + std::vector vecs = faiss::gpu::randVecs(numVecs, dim); + cpuIndexL2.add(numVecs, vecs.data()); + gpuIndexL2.add(numVecs, vecs.data()); + + // To some extent, we depend upon the relative error for the test + // for float16 + faiss::gpu::compareIndices( + cpuIndexL2, + gpuIndexL2, + numQuery, + dim, + k, + "Unified Memory", + kF32MaxRelErr, + 0.1f, + 0.015f); } TEST(TestGpuIndexFlat, UnifiedMemory) { - testUnifiedMemory(false); + testUnifiedMemory(false); } #if defined USE_NVIDIA_RAFT TEST(TestRaftGpuIndexFlat, UnifiedMemory) { - testUnifiedMemory(true); + testUnifiedMemory(true); } #endif void testLargeIndex(bool use_raft) { - // Construct on a random device to test multi-device, if we have - // multiple devices - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + // Construct on a random device to test multi-device, if we have + // multiple devices + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); - // Skip this device if we do not have sufficient memory - constexpr size_t kMem = size_t(8) * 1024 * 1024 * 1024; + // Skip this device if we do not have sufficient memory + constexpr size_t kMem = size_t(8) * 1024 * 1024 * 1024; - if (faiss::gpu::getFreeMemory(device) < kMem) { - std::cout << "TestGpuIndexFlat.LargeIndex: skipping due " - "to insufficient device memory\n"; - return; - } + if (faiss::gpu::getFreeMemory(device) < kMem) { + std::cout << "TestGpuIndexFlat.LargeIndex: skipping due " + "to insufficient device 
memory\n"; + return; + } - std::cout << "Running LargeIndex test\n"; + std::cout << "Running LargeIndex test\n"; - size_t dim = 256; // each vec is sizeof(float) * 256 = 1 KiB in size - size_t nb = 5000000; - size_t nq = 10; + size_t dim = 256; // each vec is sizeof(float) * 256 = 1 KiB in size + size_t nb = 5000000; + size_t nq = 10; - auto xb = faiss::gpu::randVecs(nb, dim); + auto xb = faiss::gpu::randVecs(nb, dim); - int k = 10; + int k = 10; - faiss::IndexFlatL2 cpuIndexL2(dim); + faiss::IndexFlatL2 cpuIndexL2(dim); - faiss::gpu::GpuIndexFlatConfig config; - config.device = device; - config.use_raft = use_raft; - faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config); + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.use_raft = use_raft; + faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config); - cpuIndexL2.add(nb, xb.data()); - gpuIndexL2.add(nb, xb.data()); + cpuIndexL2.add(nb, xb.data()); + gpuIndexL2.add(nb, xb.data()); - // To some extent, we depend upon the relative error for the test - // for float16 - faiss::gpu::compareIndices( - cpuIndexL2, - gpuIndexL2, - nq, - dim, - k, - "LargeIndex", - kF32MaxRelErr, - 0.1f, - 0.015f); + // To some extent, we depend upon the relative error for the test + // for float16 + faiss::gpu::compareIndices( + cpuIndexL2, + gpuIndexL2, + nq, + dim, + k, + "LargeIndex", + kF32MaxRelErr, + 0.1f, + 0.015f); } TEST(TestGpuIndexFlat, LargeIndex) { - testLargeIndex(false); + testLargeIndex(false); } #if defined USE_NVIDIA_RAFT TEST(TestRaftGpuIndexFlat, LargeIndex) { - testLargeIndex(true); + testLargeIndex(true); } #endif void testResidual(bool use_raft) { - // Construct on a random device to test multi-device, if we have - // multiple devices - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + // Construct on a random device to test multi-device, if we have + // multiple devices + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - 
faiss::gpu::StandardGpuResources res; - res.noTempMemory(); + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); - faiss::gpu::GpuIndexFlatConfig config; - config.device = device; - config.use_raft = use_raft; + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.use_raft = use_raft; - int dim = 32; - faiss::IndexFlat cpuIndex(dim, faiss::MetricType::METRIC_L2); - faiss::gpu::GpuIndexFlat gpuIndex( - &res, dim, faiss::MetricType::METRIC_L2, config); + int dim = 32; + faiss::IndexFlat cpuIndex(dim, faiss::MetricType::METRIC_L2); + faiss::gpu::GpuIndexFlat gpuIndex( + &res, dim, faiss::MetricType::METRIC_L2, config); - int numVecs = 100; - auto vecs = faiss::gpu::randVecs(numVecs, dim); - cpuIndex.add(numVecs, vecs.data()); - gpuIndex.add(numVecs, vecs.data()); + int numVecs = 100; + auto vecs = faiss::gpu::randVecs(numVecs, dim); + cpuIndex.add(numVecs, vecs.data()); + gpuIndex.add(numVecs, vecs.data()); - auto indexVecs = std::vector{0, 2, 4, 6, 8}; - auto queryVecs = faiss::gpu::randVecs(indexVecs.size(), dim); + auto indexVecs = std::vector{0, 2, 4, 6, 8}; + auto queryVecs = faiss::gpu::randVecs(indexVecs.size(), dim); - auto residualsCpu = std::vector(indexVecs.size() * dim); - auto residualsGpu = std::vector(indexVecs.size() * dim); + auto residualsCpu = std::vector(indexVecs.size() * dim); + auto residualsGpu = std::vector(indexVecs.size() * dim); - cpuIndex.compute_residual_n( - indexVecs.size(), - queryVecs.data(), - residualsCpu.data(), - indexVecs.data()); - gpuIndex.compute_residual_n( - indexVecs.size(), - queryVecs.data(), - residualsGpu.data(), - indexVecs.data()); + cpuIndex.compute_residual_n( + indexVecs.size(), + queryVecs.data(), + residualsCpu.data(), + indexVecs.data()); + gpuIndex.compute_residual_n( + indexVecs.size(), + queryVecs.data(), + residualsGpu.data(), + indexVecs.data()); - // Should be exactly the same, as this is just a single float32 subtraction - EXPECT_EQ(residualsCpu, residualsGpu); + // Should be 
exactly the same, as this is just a single float32 subtraction + EXPECT_EQ(residualsCpu, residualsGpu); } TEST(TestGpuIndexFlat, Residual) { - testResidual(false); + testResidual(false); } #if defined USE_NVIDIA_RAFT TEST(TestRaftGpuIndexFlat, Residual) { - testResidual(true); + testResidual(true); } #endif void testReconstruct(bool use_raft) { - // Construct on a random device to test multi-device, if we have - // multiple devices - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); - - int dim = 32; - int numVecs = 100; - auto vecs = faiss::gpu::randVecs(numVecs, dim); - auto vecs16 = faiss::gpu::roundToHalf(vecs); - - for (bool useFloat16 : {false, true}) { - faiss::gpu::GpuIndexFlatConfig config; - config.device = device; - config.useFloat16 = useFloat16; - config.use_raft = use_raft; - - faiss::gpu::GpuIndexFlat gpuIndex( - &res, dim, faiss::MetricType::METRIC_L2, config); - - gpuIndex.add(numVecs, vecs.data()); - - // Test reconstruct - { - auto reconstructVecs = std::vector(dim); - gpuIndex.reconstruct(15, reconstructVecs.data()); - - auto& ref = useFloat16 ? vecs16 : vecs; - - for (int i = 0; i < dim; ++i) { - EXPECT_EQ(reconstructVecs[i], ref[15 * dim + i]); - } - } - - // Test reconstruct_n - if (false) { - auto reconstructVecs = std::vector((numVecs - 1) * dim); - - int startVec = 5; - int endVec = numVecs - 1; - int numReconstructVec = endVec - startVec + 1; - - gpuIndex.reconstruct_n( - startVec, numReconstructVec, reconstructVecs.data()); - - auto& ref = useFloat16 ? 
vecs16 : vecs; - - for (int i = 0; i < numReconstructVec; ++i) { - for (int j = 0; j < dim; ++j) { - EXPECT_EQ( - reconstructVecs[i * dim + j], - ref[(i + startVec) * dim + j]); - } - } - } - - // Test reconstruct_batch - if (false) { - auto reconstructKeys = std::vector{1, 3, 5}; - auto reconstructVecs = - std::vector(reconstructKeys.size() * dim); - - gpuIndex.reconstruct_batch( - reconstructKeys.size(), - reconstructKeys.data(), - reconstructVecs.data()); - - auto& ref = useFloat16 ? vecs16 : vecs; - - for (int i = 0; i < reconstructKeys.size(); ++i) { - for (int j = 0; j < dim; ++j) { - EXPECT_EQ( - reconstructVecs[i * dim + j], - ref[reconstructKeys[i] * dim + j]); - } - } - } - } + // Construct on a random device to test multi-device, if we have + // multiple devices + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + int dim = 32; + int numVecs = 100; + auto vecs = faiss::gpu::randVecs(numVecs, dim); + auto vecs16 = faiss::gpu::roundToHalf(vecs); + + for (bool useFloat16 : {false, true}) { + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.useFloat16 = useFloat16; + config.use_raft = use_raft; + + faiss::gpu::GpuIndexFlat gpuIndex( + &res, dim, faiss::MetricType::METRIC_L2, config); + + gpuIndex.add(numVecs, vecs.data()); + + // Test reconstruct + { + auto reconstructVecs = std::vector(dim); + gpuIndex.reconstruct(15, reconstructVecs.data()); + + auto& ref = useFloat16 ? vecs16 : vecs; + + for (int i = 0; i < dim; ++i) { + EXPECT_EQ(reconstructVecs[i], ref[15 * dim + i]); + } + } + + // Test reconstruct_n + if (false) { + auto reconstructVecs = std::vector((numVecs - 1) * dim); + + int startVec = 5; + int endVec = numVecs - 1; + int numReconstructVec = endVec - startVec + 1; + + gpuIndex.reconstruct_n( + startVec, numReconstructVec, reconstructVecs.data()); + + auto& ref = useFloat16 ? 
vecs16 : vecs; + + for (int i = 0; i < numReconstructVec; ++i) { + for (int j = 0; j < dim; ++j) { + EXPECT_EQ( + reconstructVecs[i * dim + j], + ref[(i + startVec) * dim + j]); + } + } + } + + // Test reconstruct_batch + if (false) { + auto reconstructKeys = std::vector{1, 3, 5}; + auto reconstructVecs = + std::vector(reconstructKeys.size() * dim); + + gpuIndex.reconstruct_batch( + reconstructKeys.size(), + reconstructKeys.data(), + reconstructVecs.data()); + + auto& ref = useFloat16 ? vecs16 : vecs; + + for (int i = 0; i < reconstructKeys.size(); ++i) { + for (int j = 0; j < dim; ++j) { + EXPECT_EQ( + reconstructVecs[i * dim + j], + ref[reconstructKeys[i] * dim + j]); + } + } + } + } } TEST(TestGpuIndexFlat, Reconstruct) { - testReconstruct(false); + testReconstruct(false); } #if defined USE_NVIDIA_RAFT TEST(TestRaftGpuIndexFlat, Reconstruct) { - testReconstruct(true); + testReconstruct(true); } #endif void testSearchAndReconstruct(bool use_raft) { - // Construct on a random device to test multi-device, if we have - // multiple devices - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); - - size_t dim = 32; - size_t nb = 5000; - size_t nq = 10; - int k = 10; - - auto xb = faiss::gpu::randVecs(nb, dim); - auto xq = faiss::gpu::randVecs(nq, dim); - - faiss::IndexFlatL2 cpuIndex(dim); - - faiss::gpu::GpuIndexFlatConfig config; - config.device = device; - config.use_raft = use_raft; - faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config); - - cpuIndex.add(nb, xb.data()); - gpuIndex.add(nb, xb.data()); - - std::vector refDistance(nq * k, 0); - std::vector refIndices(nq * k, -1); - std::vector refReconstruct(nq * k * dim, 0); - cpuIndex.search_and_reconstruct( - nq, - xq.data(), - k, - refDistance.data(), - refIndices.data(), - refReconstruct.data()); - - std::vector testDistance(nq * k, 0); - std::vector testIndices(nq * k, -1); - std::vector testReconstruct(nq * k * dim, 0); - 
gpuIndex.search_and_reconstruct( - nq, - xq.data(), - k, - testDistance.data(), - testIndices.data(), - testReconstruct.data()); - - // This handles the search results - faiss::gpu::compareLists( - refDistance.data(), - refIndices.data(), - testDistance.data(), - testIndices.data(), - nq, - k, - "SearchAndReconstruct", - true, - false, - true, - kF32MaxRelErr, - 0.1f, - 0.015f); - - // As the search results may be slightly different (though compareLists - // above will ensure a decent number of matches), reconstruction should be - // the same for the vectors that do match - for (int i = 0; i < nq; ++i) { - std::unordered_map refLocation; - - for (int j = 0; j < k; ++j) { - refLocation.insert(std::make_pair(refIndices[i * k + j], j)); - } - - for (int j = 0; j < k; ++j) { - auto idx = testIndices[i * k + j]; - auto it = refLocation.find(idx); - if (it != refLocation.end()) { - for (int d = 0; d < dim; ++d) { - EXPECT_EQ( - refReconstruct[(i * k + it->second) * dim + d], - testReconstruct[(i * k + j) * dim + d]); - } - } - } - } + // Construct on a random device to test multi-device, if we have + // multiple devices + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + size_t dim = 32; + size_t nb = 5000; + size_t nq = 10; + int k = 10; + + auto xb = faiss::gpu::randVecs(nb, dim); + auto xq = faiss::gpu::randVecs(nq, dim); + + faiss::IndexFlatL2 cpuIndex(dim); + + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.use_raft = use_raft; + faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config); + + cpuIndex.add(nb, xb.data()); + gpuIndex.add(nb, xb.data()); + + std::vector refDistance(nq * k, 0); + std::vector refIndices(nq * k, -1); + std::vector refReconstruct(nq * k * dim, 0); + cpuIndex.search_and_reconstruct( + nq, + xq.data(), + k, + refDistance.data(), + refIndices.data(), + refReconstruct.data()); + + std::vector testDistance(nq * k, 0); + std::vector 
testIndices(nq * k, -1); + std::vector testReconstruct(nq * k * dim, 0); + gpuIndex.search_and_reconstruct( + nq, + xq.data(), + k, + testDistance.data(), + testIndices.data(), + testReconstruct.data()); + + // This handles the search results + faiss::gpu::compareLists( + refDistance.data(), + refIndices.data(), + testDistance.data(), + testIndices.data(), + nq, + k, + "SearchAndReconstruct", + true, + false, + true, + kF32MaxRelErr, + 0.1f, + 0.015f); + + // As the search results may be slightly different (though compareLists + // above will ensure a decent number of matches), reconstruction should be + // the same for the vectors that do match + for (int i = 0; i < nq; ++i) { + std::unordered_map refLocation; + + for (int j = 0; j < k; ++j) { + refLocation.insert(std::make_pair(refIndices[i * k + j], j)); + } + + for (int j = 0; j < k; ++j) { + auto idx = testIndices[i * k + j]; + auto it = refLocation.find(idx); + if (it != refLocation.end()) { + for (int d = 0; d < dim; ++d) { + EXPECT_EQ( + refReconstruct[(i * k + it->second) * dim + d], + testReconstruct[(i * k + j) * dim + d]); + } + } + } + } } TEST(TestGpuIndexFlat, SearchAndReconstruct) { - testSearchAndReconstruct(false); + testSearchAndReconstruct(false); } #if defined USE_NVIDIA_RAFT TEST(TestRaftGpuIndexFlat, SearchAndReconstruct) { - testSearchAndReconstruct(true); + testSearchAndReconstruct(true); } #endif int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); + testing::InitGoogleTest(&argc, argv); - // just run with a fixed test seed - faiss::gpu::setTestSeed(100); + // just run with a fixed test seed + faiss::gpu::setTestSeed(100); - return RUN_ALL_TESTS(); -} + return RUN_ALL_TESTS(); +} \ No newline at end of file From 1c621add67996dbdf5e420fb3246cd74f20d38c5 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Fri, 7 Jul 2023 15:47:53 -0700 Subject: [PATCH 63/87] First version of copyFrom and copyTo --- faiss/gpu/impl/RaftIVFFlat.cu | 330 ++++++++++++++++++++------------- 
faiss/gpu/impl/RaftIVFFlat.cuh | 63 +++++-- 2 files changed, 254 insertions(+), 139 deletions(-) diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 7b92f1df12..645c4c4840 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -45,6 +45,10 @@ #include #include + +#include +#include + #include namespace faiss { @@ -187,14 +191,7 @@ std::vector RaftIVFFlat::getListIndices(idx_t listId) const { const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); - uint32_t size; - - raft::copy( - &size, - raft_knn_index.value().list_sizes().data_handle() + listId, - 1, - raft_handle.get_stream()); - raft_handle.sync_stream(); + uint32_t size = getListLength(listId); std::vector vec(size); raft::copy( @@ -202,6 +199,7 @@ std::vector RaftIVFFlat::getListIndices(idx_t listId) const { *(raft_knn_index.value().inds_ptrs().data_handle() + listId), size, raft_handle.get_stream()); + raft_handle.sync_stream(); return vec; } @@ -218,22 +216,22 @@ std::vector RaftIVFFlat::getListVectorData(idx_t listId, bool gpuFormat using elem_t = decltype(raft_knn_index.value().data_ptrs())::element_type; size_t dim = raft_knn_index.value().dim(); - uint32_t list_size; - - raft::copy(&list_size, raft_knn_index.value().list_sizes().data_handle() + listId, 1, raft_handle.get_stream()); - + idx_t list_size = getListLength(listId); // the interleaved block can be slightly larger than the list size (it's // rounded up) - size_t byte_size = size_t(list_size) * sizeof(elem_t) * dim; - std::vector vec(byte_size); - raft::copy( - vec.data(), - reinterpret_cast( - raft_knn_index.value().data_ptrs().data_handle()+listId), - byte_size, - raft_handle.get_stream()); - return vec; + size_t nblocks = utils::divUp(list_size, raft::neighbors::ivf_flat::kIndexGroupSize); + size_t interleavedCodeSize = nblocks * raft::neighbors::ivf_flat::kIndexGroupSize * dim * sizeof(elem_t); + size_t flat_code_size = list_size * dim * sizeof(elem_t); + 
std::vector interleaved_codes(interleavedCodeSize); + std::vector flat_codes(flat_code_size); + + RaftIVFFlatCodePackerFlat p(resources_, interleavedCodeSize); + p.unpack_1(reinterpret_cast( + raft_knn_index.value().data_ptrs().data_handle()+listId), 0, interleaved_codes.data()); + RaftIVFFlatCodePackerInterleaved up((size_t)list_size, (size_t)dim, (size_t)raft_knn_index.value().veclen()); + up.unpack_1(interleaved_codes.data(), 0, flat_codes.data()); + return flat_codes; } /// Performs search when we are already given the IVF cells to look at @@ -311,117 +309,201 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { // // -// void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { -// size_t nlist = ivf ? ivf->nlist : 0; -// size_t ntotal = ivf ? ivf->compute_ntotal() : 0; -// -// printf("Inside RAFT copyInvertedListsFrom\n"); -// raft::device_resources &handle = resources_->getRaftHandleCurrentDevice(); -// // We need to allocate the IVF -// printf("nlist=%ld, ntotal=%ld\n", nlist, ntotal); -// -// std::vector list_sizes_(nlist); -// std::vector list_offsets_(nlist+1); -// std::vector indices_(ntotal); -// -// raft::neighbors::ivf_flat::index_params raft_idx_params; -// raft_idx_params.n_lists = nlist; -// raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; -// raft_idx_params.add_data_on_build = false; -// raft_idx_params.kmeans_n_iters = 100; +void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { + size_t nlist = ivf ? ivf->nlist : 0; + size_t ntotal = ivf ? 
ivf->compute_ntotal() : 0; + + raft::device_resources &handle = resources_->getRaftHandleCurrentDevice(); + + std::vector list_sizes_(nlist); + std::vector list_offsets_(nlist+1); + std::vector indices_(ntotal); + + raft::neighbors::ivf_flat::index_params raft_idx_params; + raft_idx_params.n_lists = nlist; + raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; + raft_idx_params.add_data_on_build = false; + raft_idx_params.kmeans_n_iters = 100; // -// raft_knn_index.emplace(handle, raft_idx_params, dim_); + raft_knn_index.emplace(handle, raft_idx_params, dim_); // raft_knn_index.value().allocate(handle, ntotal, true); // -// for (size_t i = 0; i < nlist; ++i) { -// size_t listSize = ivf->list_size(i); -// -// // GPU index can only support max int entries per list -// FAISS_THROW_IF_NOT_FMT( -// listSize <= (size_t)std::numeric_limits::max(), -// "GPU inverted list can only support " -// "%zu entries; %zu found", -// (size_t)std::numeric_limits::max(), -// listSize); + for (size_t i = 0; i < nlist; ++i) { + size_t listSize = ivf->list_size(i); + list_sizes_[i] = listSize; + + // GPU index can only support max int entries per list + FAISS_THROW_IF_NOT_FMT( + listSize <= (size_t)std::numeric_limits::max(), + "GPU inverted list can only support " + "%zu entries; %zu found", + (size_t)std::numeric_limits::max(), + listSize); + + addEncodedVectorsToList_( + i, ivf->get_codes(i), ivf->get_ids(i), listSize); + } // -// addEncodedVectorsToList_( -// i, ivf->get_codes(i), ivf->get_ids(i), listSize); -// } -// -// raft::update_device(raft_knn_index.value().list_sizes().data_handle(), -// list_sizes_.data(), nlist, handle.get_stream()); + raft::update_device(raft_knn_index.value().list_sizes().data_handle(), + list_sizes_.data(), nlist, handle.get_stream()); // raft::update_device(raft_knn_index.value().list_offsets().data_handle(), // list_offsets_.data(), nlist+1, handle.get_stream()); // -//} - -// void RaftIVFFlat::addEncodedVectorsToList_( -// int listId, -// 
const void* codes, -// const Index::idx_t* indices, -// size_t numVecs) { -// auto stream = resources_->getDefaultStreamCurrentDevice(); -// -// // This list must already exist -//// FAISS_ASSERT(listId < deviceListData_.size()); -// -// // This list must currently be empty -//// auto& listCodes = deviceListData_[listId]; -//// FAISS_ASSERT(listCodes->data.size() == 0); -//// FAISS_ASSERT(listCodes->numVecs == 0); -// -// // If there's nothing to add, then there's nothing we have to do -// if (numVecs == 0) { -// return; -// } -// -// // The GPU might have a different layout of the memory -// auto gpuListSizeInBytes = getGpuVectorsEncodingSize_(numVecs); -// auto cpuListSizeInBytes = getCpuVectorsEncodingSize_(numVecs); -// -// // We only have int32 length representations on the GPU per each -// // list; the length is in sizeof(char) -// FAISS_ASSERT(gpuListSizeInBytes <= -// (size_t)std::numeric_limits::max()); -// -// // Translate the codes as needed to our preferred form -// std::vector codesV(cpuListSizeInBytes); -// std::memcpy(codesV.data(), codes, cpuListSizeInBytes); +} + +size_t RaftIVFFlat::getGpuVectorsEncodingSize_(idx_t numVecs) const { + idx_t bits = 32 /* float */; + + // bytes to encode a block of 32 vectors (single dimension) + idx_t bytesPerDimBlock = bits * 32 / 8; // = 128 + + // bytes to fully encode 32 vectors + idx_t bytesPerBlock = bytesPerDimBlock * dim_; + + // number of blocks of 32 vectors we have + idx_t numBlocks = utils::divUp(numVecs, raft::neighbors::ivf_flat::kIndexGroupSize); + + // total size to encode numVecs + return bytesPerBlock * numBlocks; +} + + +void RaftIVFFlat::addEncodedVectorsToList_( + idx_t listId, + const void* codes, + const idx_t* indices, + idx_t numVecs) { + auto stream = resources_->getDefaultStreamCurrentDevice(); + + // This list must already exist + FAISS_ASSERT(raft_knn_index.has_value()); + + // This list must currently be empty + FAISS_ASSERT(getListLength(listId) == 0); + + // If there's nothing to add, 
then there's nothing we have to do + if (numVecs == 0) { + return; + } + + // The GPU might have a different layout of the memory + auto gpuListSizeInBytes = getGpuVectorsEncodingSize_(numVecs); + auto cpuListSizeInBytes = getCpuVectorsEncodingSize_(numVecs); + + // We only have int32 length representations on the GPU per each + // list; the length is in sizeof(char) + FAISS_ASSERT(gpuListSizeInBytes <= + (size_t)std::numeric_limits::max()); + + // Translate the codes as needed to our preferred form + std::vector codesV(cpuListSizeInBytes); + std::memcpy(codesV.data(), codes, cpuListSizeInBytes); // auto translatedCodes = translateCodesToGpu_(std::move(codesV), numVecs); -// -// std::cout << "numVecs=" << numVecs << "gpuListSizeInBytes=" << -// gpuListSizeInBytes << std::endl; -// -//// -/// RAFT_CUDA_TRY(cudaMemcpyAsync(raft_knn_index.value().data().data_handle()+(), -/// translatedCodes.data(), )) -// -//// listCodes->data.append( -//// translatedCodes.data(), -//// gpuListSizeInBytes, -//// stream, -//// true /* exact reserved size */); -//// listCodes->numVecs = numVecs; -//// -//// // Handle the indices as well -//// addIndicesFromCpu_(listId, indices, numVecs); -//// -// -// // We should problay consider using this... 
-//// deviceListDataPointers_.setAt( -//// listId, (void*)listCodes->data.data(), stream); -//// deviceListLengths_.setAt(listId, (int)numVecs, stream); -//// -//// // We update this as well, since the multi-pass algorithm uses it -//// maxListLength_ = std::max(maxListLength_, (int)numVecs); -//} - -///// Copy all inverted lists from ourselves to a CPU representation -// void RaftIVFFlat::copyInvertedListsTo(InvertedLists* ivf) { -// printf("Inside RaftIVFFlat copyInvertedListsTo\n"); -// -// // TODO: Need to replicate copyInvertedListsTo() in IVFBase.cu -//} + + std::cout << "numVecs=" << numVecs << "gpuListSizeInBytes=" << + gpuListSizeInBytes << std::endl; +// utils::divUp(numVecs, 32); + RaftIVFFlatCodePackerInterleaved transform_packer((size_t)numVecs, (size_t)dim_, (size_t)raft_knn_index.value().veclen()); + std::vector interleaved_codes(gpuListSizeInBytes); + std::memcpy(codesV.data(), codes, cpuListSizeInBytes); + transform_packer.pack_1(codesV.data(), 0, interleaved_codes.data()); + RaftIVFFlatCodePackerFlat copy_packer(resources_, cpuListSizeInBytes); + copy_packer.unpack_1(interleaved_codes.data(), 0, (uint8_t*)(raft_knn_index.value().data_ptrs().data_handle() + listId)); + + uint32_t size = numVecs; + raft::update_device(raft_knn_index.value().list_sizes().data_handle() + listId, &size, 1, stream); + + // Handle the indices as well + raft::update_device((idx_t*)(raft_knn_index.value().inds_ptrs().data_handle() + listId), indices, numVecs, stream); +} +/// Copy all inverted lists from ourselves to a CPU representation +void RaftIVFFlat::copyInvertedListsTo(InvertedLists* ivf) { + printf("Inside RaftIVFFlat copyInvertedListsTo\n"); + + for (idx_t i = 0; i < numLists_; ++i) { + auto listIndices = getListIndices(i); + auto listData = getListVectorData(i, false); + + ivf->add_entries( + i, listIndices.size(), listIndices.data(), listData.data()); + } +} + +// std::vector RaftIVFFlat::translateCodesToGpu_( +// std::vector codes, +// std::vector block, +// 
idx_t numVecs) const { +// if (!interleavedLayout_) { +// // same format +// return codes; +// } +// RaftIVFFlatCodePackerInterleaved packer; +// packer::pack_all(codes.data(), block.data()); +// } + +// std::vector RaftIVFFlat::translateCodesFromGpu_( +// std::vector codes, +// idx_t numVecs) const { +// if (!interleavedLayout_) { +// // same format +// return codes; +// } + +// RaftIVFFlatCodePackerFlat packer; +// packer::unpack_all(block.data(), codes.data()); +// } + + + +RaftIVFFlatCodePackerInterleaved::RaftIVFFlatCodePackerInterleaved(size_t list_size, size_t dim, size_t veclen) { + this->list_size = list_size; + this->dim = dim; + this->veclen = veclen; + nvec = 1; + code_size = list_size * dim * sizeof(uint32_t); + block_size = utils::divUp(list_size, raft::neighbors::ivf_flat::kIndexGroupSize); +} + +void RaftIVFFlatCodePackerInterleaved::pack_1(const uint8_t* flat_code, size_t offset, uint8_t* block) const { + FAISS_ASSERT(offset == 0); + raft::neighbors::ivf_flat::helpers::pack_host_interleaved( + flat_code, + block, + nvec, + dim, + veclen); +} + +void RaftIVFFlatCodePackerInterleaved::unpack_1(const uint8_t* block, size_t offset, uint8_t* flat_code) const { + FAISS_ASSERT(offset == 0); + raft::neighbors::ivf_flat::helpers::unpack_host_interleaved( + block, + flat_code, + nvec, + dim, + veclen); +} + +RaftIVFFlatCodePackerFlat::RaftIVFFlatCodePackerFlat(GpuResources* resources_, size_t code_size) { + this->resources = resources_; + nvec = 1; + code_size = code_size; + block_size = code_size; +} + +void RaftIVFFlatCodePackerFlat::pack_1(const uint8_t* flat_code, size_t offset, uint8_t* block) const { + FAISS_ASSERT(offset == 0); + const raft::device_resources& raft_handle = resources->getRaftHandleCurrentDevice(); + raft::update_device(block, flat_code, code_size * nvec, raft_handle.get_stream()); +} + +void RaftIVFFlatCodePackerFlat::unpack_1(const uint8_t* block, size_t offset, uint8_t* flat_code) const { + FAISS_ASSERT(offset == 0); + const 
raft::device_resources& raft_handle = resources->getRaftHandleCurrentDevice(); + raft::update_host(flat_code, block, code_size * nvec, raft_handle.get_stream()); + raft_handle.sync_stream(); +} + } // namespace gpu } // namespace faiss diff --git a/faiss/gpu/impl/RaftIVFFlat.cuh b/faiss/gpu/impl/RaftIVFFlat.cuh index 199e649eeb..8f81bd8c42 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cuh +++ b/faiss/gpu/impl/RaftIVFFlat.cuh @@ -27,6 +27,9 @@ #include #include +#include + +#include #include @@ -98,27 +101,57 @@ class RaftIVFFlat : public IVFFlat { void updateQuantizer(Index* quantizer) override; - // - // /// Copy all inverted lists from a CPU representation to ourselves - // void copyInvertedListsFrom(const InvertedLists* ivf) override; - // - // /// Copy all inverted lists from ourselves to a CPU representation - // void copyInvertedListsTo(InvertedLists* ivf) override; + /// Copy all inverted lists from a CPU representation to ourselves + void copyInvertedListsFrom(const InvertedLists* ivf) override; + + /// Copy all inverted lists from ourselves to a CPU representation + void copyInvertedListsTo(InvertedLists* ivf) override; protected: - // /// Adds a set of codes and indices to a list, with the representation - // /// coming from the CPU equivalent - // void addEncodedVectorsToList_( - // int listId, - // // resident on the host - // const void* codes, - // // resident on the host - // const Index::idx_t* indices, - // size_t numVecs) override; + /// Adds a set of codes and indices to a list, with the representation + /// coming from the CPU equivalent + void addEncodedVectorsToList_( + idx_t listId, + // resident on the host + const void* codes, + // resident on the host + const idx_t* indices, + idx_t numVecs) override; + + /// Returns the number of bytes in which an IVF list containing numVecs + /// vectors is encoded on the device. 
Note that due to padding this is not + /// the same as the encoding size for a subset of vectors in an IVF list; + /// this is the size for an entire IVF list + size_t getGpuVectorsEncodingSize_(idx_t numVecs) const override; std::optional> raft_knn_index{std::nullopt}; }; + +struct RaftIVFFlatCodePackerInterleaved : CodePacker { + RaftIVFFlatCodePackerInterleaved(size_t list_size, size_t dim, size_t veclen); + void pack_1(const uint8_t* flat_code, size_t offset, uint8_t* block) + const final; + void unpack_1(const uint8_t* block, size_t offset, uint8_t* flat_code) + const final; + + protected: + size_t list_size; + size_t veclen; + size_t dim; +}; +struct RaftIVFFlatCodePackerFlat : CodePacker { + RaftIVFFlatCodePackerFlat(GpuResources* resources_, size_t code_size); + void pack_1(const uint8_t* flat_code, size_t offset, uint8_t* block) + const final; + void unpack_1(const uint8_t* block, size_t offset, uint8_t* flat_code) + const final; + + protected: + GpuResources* resources; +}; + + } // namespace gpu } // namespace faiss From b8d616d97b8cbfe6cda6685ffe53dae8089a995d Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Mon, 24 Jul 2023 18:11:26 -0700 Subject: [PATCH 64/87] Update copyFrom and copyTo --- faiss/gpu/GpuIndexIVFFlat.cu | 21 +-- faiss/gpu/impl/IVFBase.cu | 1 + faiss/gpu/impl/RaftIVFFlat.cu | 234 +++++++++++++++++++-------------- faiss/gpu/impl/RaftIVFFlat.cuh | 18 +-- 4 files changed, 139 insertions(+), 135 deletions(-) diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index f257b09952..7d2a94d595 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -177,24 +177,9 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { ivfFlatConfig_.indicesOptions, config_.memorySpace); - if (config_.use_raft) { - printf("Reconstructing %d original vectors and adding to GPU index\n", - ntotal); - - // Quantizer should already have been updated above. 
Add reconstructed - // vectors to raft index - if (ntotal > 0) { - std::vector buf_host(ntotal * d); - std::vector ids(ntotal); - std::iota(ids.begin(), ids.end(), 0); - index->reconstruct_n(0, ntotal, buf_host.data()); - add_with_ids(ntotal, buf_host.data(), ids.data()); - } - } else { - // Copy all of the IVF data - printf("Copying inverted lists from cpu index to FAISS gpu index flat\n"); - index_->copyInvertedListsFrom(index->invlists); - } + // Copy all of the IVF data + printf("Copying inverted lists from cpu index to FAISS gpu index flat\n"); + index_->copyInvertedListsFrom(index->invlists); } void GpuIndexIVFFlat::copyTo(faiss::IndexIVFFlat* index) const { diff --git a/faiss/gpu/impl/IVFBase.cu b/faiss/gpu/impl/IVFBase.cu index 890d489440..1e2f414fc0 100644 --- a/faiss/gpu/impl/IVFBase.cu +++ b/faiss/gpu/impl/IVFBase.cu @@ -323,6 +323,7 @@ std::vector IVFBase::getListVectorData(idx_t listId, bool gpuFormat) } void IVFBase::copyInvertedListsFrom(const InvertedLists* ivf) { + printf("inside ivf-flat's copyInvertedListsFrom\n"); idx_t nlist = ivf ? 
ivf->nlist : 0; for (idx_t i = 0; i < nlist; ++i) { addEncodedVectorsToList_( diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 645c4c4840..e543d0ff5f 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -190,15 +190,19 @@ std::vector RaftIVFFlat::getListIndices(idx_t listId) const { FAISS_ASSERT(raft_knn_index.has_value()); const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); + auto stream = raft_handle.get_stream(); - uint32_t size = getListLength(listId); + idx_t listSize = getListLength(listId); - std::vector vec(size); - raft::copy( - vec.data(), - *(raft_knn_index.value().inds_ptrs().data_handle() + listId), - size, - raft_handle.get_stream()); + std::vector vec(listSize); + + idx_t* list_indices_ptr; + + // fetch the list indices ptr on host + raft::update_host(&list_indices_ptr, raft_knn_index.value().inds_ptrs().data_handle()+listId, 1, stream); // Copy the pointer to the first array from device to host + raft_handle.sync_stream(); + + raft::update_host(vec.data(), list_indices_ptr, listSize, stream); raft_handle.sync_stream(); return vec; } @@ -209,28 +213,33 @@ std::vector RaftIVFFlat::getListVectorData(idx_t listId, bool gpuFormat printf("Inside RaftIVFFlat getListVectorData\n"); FAISS_ASSERT(raft_knn_index.has_value()); - const raft::device_resources& raft_handle = - resources_->getRaftHandleCurrentDevice(); + + const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); + auto stream = raft_handle.get_stream(); std::cout << "Calling getListVectorData for " << listId << std::endl; - using elem_t = decltype(raft_knn_index.value().data_ptrs())::element_type; - size_t dim = raft_knn_index.value().dim(); - idx_t list_size = getListLength(listId); + idx_t listSize = getListLength(listId); // the interleaved block can be slightly larger than the list size (it's // rounded up) - size_t nblocks = utils::divUp(list_size, 
raft::neighbors::ivf_flat::kIndexGroupSize); - size_t interleavedCodeSize = nblocks * raft::neighbors::ivf_flat::kIndexGroupSize * dim * sizeof(elem_t); - size_t flat_code_size = list_size * dim * sizeof(elem_t); - std::vector interleaved_codes(interleavedCodeSize); - std::vector flat_codes(flat_code_size); - - RaftIVFFlatCodePackerFlat p(resources_, interleavedCodeSize); - p.unpack_1(reinterpret_cast( - raft_knn_index.value().data_ptrs().data_handle()+listId), 0, interleaved_codes.data()); - RaftIVFFlatCodePackerInterleaved up((size_t)list_size, (size_t)dim, (size_t)raft_knn_index.value().veclen()); - up.unpack_1(interleaved_codes.data(), 0, flat_codes.data()); + auto gpuListSizeInBytes = getGpuVectorsEncodingSize_(listSize); + auto cpuListSizeInBytes = getCpuVectorsEncodingSize_(listSize); + + std::vector interleaved_codes(gpuListSizeInBytes); + std::vector flat_codes(cpuListSizeInBytes); + + float* list_data_ptr; + + // fetch the list data ptr on host + raft::update_host(&list_data_ptr, raft_knn_index.value().data_ptrs().data_handle()+listId, 1, stream); // Copy the pointer to the first array from device to host + raft_handle.sync_stream(); + printf("data ptr fetched successfully\n"); + + raft::update_host(interleaved_codes.data(), reinterpret_cast(list_data_ptr), gpuListSizeInBytes, stream); + raft_handle.sync_stream(); + RaftIVFFlatCodePackerInterleaved packer((size_t)listSize, dim_, raft_knn_index.value().veclen()); + packer.unpack_all(interleaved_codes.data(), flat_codes.data()); return flat_codes; } @@ -265,6 +274,10 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { raft::neighbors::ivf_flat::index_params pams; pams.add_data_on_build = false; + pams.n_lists = this->numLists_; + + printf("numLists %d", pams.n_lists); + switch (this->metric_) { case faiss::METRIC_L2: printf("Using L2!\n"); @@ -280,15 +293,12 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { raft_knn_index.emplace( handle, - pams.metric, - (uint32_t)this->numLists_, - false, - 
false, + pams, (uint32_t)this->dim_); printf("Reconstructing\n"); // Copy (reconstructed) centroids over, rather than re-training - rmm::device_uvector buf_dev(total_elems, stream); +// rmm::device_uvector buf_dev(total_elems, stream); std::vector buf_host(total_elems); quantizer->reconstruct_n(0, quantizer_ntotal, buf_host.data()); @@ -310,45 +320,66 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { // // void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { + printf("Inside raft's copyInvertedListsFrom\n"); size_t nlist = ivf ? ivf->nlist : 0; size_t ntotal = ivf ? ivf->compute_ntotal() : 0; - raft::device_resources &handle = resources_->getRaftHandleCurrentDevice(); + raft::device_resources &raft_handle = resources_->getRaftHandleCurrentDevice(); std::vector list_sizes_(nlist); - std::vector list_offsets_(nlist+1); std::vector indices_(ntotal); - raft::neighbors::ivf_flat::index_params raft_idx_params; - raft_idx_params.n_lists = nlist; - raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; - raft_idx_params.add_data_on_build = false; - raft_idx_params.kmeans_n_iters = 100; -// - raft_knn_index.emplace(handle, raft_idx_params, dim_); + // the index must already exist + FAISS_ASSERT(raft_knn_index.has_value()); +// if(!raft_knn_index.has_value()) { +// printf("emplacing because index is null"); +// raft::neighbors::ivf_flat::index_params raft_idx_params; +// raft_idx_params.n_lists = nlist; +// raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; +// raft_idx_params.add_data_on_build = false; +// raft_idx_params.kmeans_n_iters = 100; + +// raft_knn_index.emplace(handle, raft_idx_params, dim_); +// } // raft_knn_index.value().allocate(handle, ntotal, true); -// + auto& raft_lists = raft_knn_index.value().lists(); + + // conservative memory alloc for cloning cpu inverted lists + raft::neighbors::ivf_flat::list_spec raft_list_spec{static_cast(dim_), true}; + for (size_t i = 0; i < nlist; ++i) { - size_t listSize = 
ivf->list_size(i); - list_sizes_[i] = listSize; - // GPU index can only support max int entries per list + size_t listSize = ivf->list_size(i); + + // GPU index can only support max int entries per list FAISS_THROW_IF_NOT_FMT( listSize <= (size_t)std::numeric_limits::max(), "GPU inverted list can only support " "%zu entries; %zu found", (size_t)std::numeric_limits::max(), listSize); + + // store the list size + list_sizes_[i] = static_cast(listSize); + + raft::neighbors::ivf::resize_list(raft_handle, + raft_lists[i], + raft_list_spec, + (uint32_t)(raft::Pow2::roundUp(listSize)), + (uint32_t)0); + printf("listSize %d\n", listSize); + } - addEncodedVectorsToList_( - i, ivf->get_codes(i), ivf->get_ids(i), listSize); + // Update the pointers and the sizes + raft_knn_index.value().recompute_internal_state(raft_handle); + + for (size_t i = 0; i < nlist; ++i) { + size_t listSize = ivf->list_size(i); + addEncodedVectorsToList_(i, ivf->get_codes(i), ivf->get_ids(i), listSize); } -// - raft::update_device(raft_knn_index.value().list_sizes().data_handle(), - list_sizes_.data(), nlist, handle.get_stream()); -// raft::update_device(raft_knn_index.value().list_offsets().data_handle(), -// list_offsets_.data(), nlist+1, handle.get_stream()); -// + + raft::update_device(raft_knn_index.value().list_sizes().data_handle(), list_sizes_.data(), nlist, raft_handle.get_stream()); + raft_handle.sync_stream(); } size_t RaftIVFFlat::getGpuVectorsEncodingSize_(idx_t numVecs) const { @@ -373,6 +404,7 @@ void RaftIVFFlat::addEncodedVectorsToList_( const void* codes, const idx_t* indices, idx_t numVecs) { + printf("inside addEncodedVectorsToList_ for listId %d\n", listId); auto stream = resources_->getDefaultStreamCurrentDevice(); // This list must already exist @@ -389,6 +421,10 @@ void RaftIVFFlat::addEncodedVectorsToList_( // The GPU might have a different layout of the memory auto gpuListSizeInBytes = getGpuVectorsEncodingSize_(numVecs); auto cpuListSizeInBytes = 
getCpuVectorsEncodingSize_(numVecs); + + printf("numVecs %d\n", numVecs); + printf("gpuListSizeInBytes %d\n", gpuListSizeInBytes); + printf("cpuListSizeInBytes %d\n", cpuListSizeInBytes); // We only have int32 length representations on the GPU per each // list; the length is in sizeof(char) @@ -396,25 +432,40 @@ void RaftIVFFlat::addEncodedVectorsToList_( (size_t)std::numeric_limits::max()); // Translate the codes as needed to our preferred form - std::vector codesV(cpuListSizeInBytes); - std::memcpy(codesV.data(), codes, cpuListSizeInBytes); +// std::vector codesV(cpuListSizeInBytes); +// std::memcpy(codesV.data(), codes, cpuListSizeInBytes); // auto translatedCodes = translateCodesToGpu_(std::move(codesV), numVecs); - std::cout << "numVecs=" << numVecs << "gpuListSizeInBytes=" << - gpuListSizeInBytes << std::endl; -// utils::divUp(numVecs, 32); - RaftIVFFlatCodePackerInterleaved transform_packer((size_t)numVecs, (size_t)dim_, (size_t)raft_knn_index.value().veclen()); - std::vector interleaved_codes(gpuListSizeInBytes); - std::memcpy(codesV.data(), codes, cpuListSizeInBytes); - transform_packer.pack_1(codesV.data(), 0, interleaved_codes.data()); - RaftIVFFlatCodePackerFlat copy_packer(resources_, cpuListSizeInBytes); - copy_packer.unpack_1(interleaved_codes.data(), 0, (uint8_t*)(raft_knn_index.value().data_ptrs().data_handle() + listId)); + std::vector interleaved_codes(gpuListSizeInBytes); +printf("dim %d\n", dim_); +printf("veclen %d\n", raft_knn_index.value().veclen()); + RaftIVFFlatCodePackerInterleaved packer((size_t)numVecs, (uint32_t)dim_, raft_knn_index.value().veclen()); + + printf("Allocated interleaved codes\n"); + packer.pack_all(reinterpret_cast(codes), interleaved_codes.data()); + printf("packing done\n"); - uint32_t size = numVecs; - raft::update_device(raft_knn_index.value().list_sizes().data_handle() + listId, &size, 1, stream); + float* list_data_ptr; + const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); + + // 
fetch the list data ptr on host + raft::update_host(&list_data_ptr, raft_knn_index.value().data_ptrs().data_handle()+listId, 1, stream); // Copy the pointer to the first array from device to host + raft_handle.sync_stream(); + printf("data ptr fetched successfully\n"); + + raft::update_device(reinterpret_cast(list_data_ptr), interleaved_codes.data(), gpuListSizeInBytes, stream); + raft_handle.sync_stream(); + printf("copied to gpu\n"); // Handle the indices as well - raft::update_device((idx_t*)(raft_knn_index.value().inds_ptrs().data_handle() + listId), indices, numVecs, stream); + idx_t* list_indices_ptr; + + // fetch the list indices ptr on host + raft::update_host(&list_indices_ptr, raft_knn_index.value().inds_ptrs().data_handle()+listId, 1, stream); // Copy the pointer to the first array from device to host + raft_handle.sync_stream(); + raft::update_device(list_indices_ptr, indices, numVecs, stream); + raft_handle.sync_stream(); + printf("Done copying indices\n"); } /// Copy all inverted lists from ourselves to a CPU representation void RaftIVFFlat::copyInvertedListsTo(InvertedLists* ivf) { @@ -455,55 +506,34 @@ void RaftIVFFlat::copyInvertedListsTo(InvertedLists* ivf) { -RaftIVFFlatCodePackerInterleaved::RaftIVFFlatCodePackerInterleaved(size_t list_size, size_t dim, size_t veclen) { - this->list_size = list_size; +RaftIVFFlatCodePackerInterleaved::RaftIVFFlatCodePackerInterleaved(size_t list_size, uint32_t dim, uint32_t chunk_size) { this->dim = dim; - this->veclen = veclen; - nvec = 1; - code_size = list_size * dim * sizeof(uint32_t); - block_size = utils::divUp(list_size, raft::neighbors::ivf_flat::kIndexGroupSize); + this->chunk_size = chunk_size; + // NB: dim should be divisible by the number of 4 byte records in one chunk + FAISS_ASSERT(dim % chunk_size == 0); + nvec = list_size; + code_size = dim * 4; + block_size = utils::roundUp(nvec, raft::neighbors::ivf_flat::kIndexGroupSize); } void RaftIVFFlatCodePackerInterleaved::pack_1(const uint8_t* 
flat_code, size_t offset, uint8_t* block) const { - FAISS_ASSERT(offset == 0); - raft::neighbors::ivf_flat::helpers::pack_host_interleaved( - flat_code, - block, - nvec, + printf("packing offset %zu\n", offset); + raft::neighbors::ivf_flat::codepacker::pack_1( + reinterpret_cast(flat_code), + reinterpret_cast(block), dim, - veclen); + chunk_size, + static_cast(offset)); } void RaftIVFFlatCodePackerInterleaved::unpack_1(const uint8_t* block, size_t offset, uint8_t* flat_code) const { - FAISS_ASSERT(offset == 0); - raft::neighbors::ivf_flat::helpers::unpack_host_interleaved( - block, - flat_code, - nvec, + raft::neighbors::ivf_flat::codepacker::unpack_1( + reinterpret_cast(block), + reinterpret_cast(flat_code), dim, - veclen); -} - -RaftIVFFlatCodePackerFlat::RaftIVFFlatCodePackerFlat(GpuResources* resources_, size_t code_size) { - this->resources = resources_; - nvec = 1; - code_size = code_size; - block_size = code_size; -} - -void RaftIVFFlatCodePackerFlat::pack_1(const uint8_t* flat_code, size_t offset, uint8_t* block) const { - FAISS_ASSERT(offset == 0); - const raft::device_resources& raft_handle = resources->getRaftHandleCurrentDevice(); - raft::update_device(block, flat_code, code_size * nvec, raft_handle.get_stream()); + chunk_size, + static_cast(offset)); } -void RaftIVFFlatCodePackerFlat::unpack_1(const uint8_t* block, size_t offset, uint8_t* flat_code) const { - FAISS_ASSERT(offset == 0); - const raft::device_resources& raft_handle = resources->getRaftHandleCurrentDevice(); - raft::update_host(flat_code, block, code_size * nvec, raft_handle.get_stream()); - raft_handle.sync_stream(); -} - - } // namespace gpu } // namespace faiss diff --git a/faiss/gpu/impl/RaftIVFFlat.cuh b/faiss/gpu/impl/RaftIVFFlat.cuh index 8f81bd8c42..b85d503da3 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cuh +++ b/faiss/gpu/impl/RaftIVFFlat.cuh @@ -130,28 +130,16 @@ class RaftIVFFlat : public IVFFlat { struct RaftIVFFlatCodePackerInterleaved : CodePacker { - 
RaftIVFFlatCodePackerInterleaved(size_t list_size, size_t dim, size_t veclen); + RaftIVFFlatCodePackerInterleaved(size_t list_size, uint32_t dim, uint32_t chuk_size); void pack_1(const uint8_t* flat_code, size_t offset, uint8_t* block) const final; void unpack_1(const uint8_t* block, size_t offset, uint8_t* flat_code) const final; protected: - size_t list_size; - size_t veclen; - size_t dim; + uint32_t chunk_size; + uint32_t dim; }; -struct RaftIVFFlatCodePackerFlat : CodePacker { - RaftIVFFlatCodePackerFlat(GpuResources* resources_, size_t code_size); - void pack_1(const uint8_t* flat_code, size_t offset, uint8_t* block) - const final; - void unpack_1(const uint8_t* block, size_t offset, uint8_t* flat_code) - const final; - - protected: - GpuResources* resources; -}; - } // namespace gpu } // namespace faiss From 444c58d69af4882464e035d364a428859957449e Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Mon, 31 Jul 2023 16:51:21 -0700 Subject: [PATCH 65/87] Passing tests --- faiss/gpu/GpuIndexIVF.cu | 3 +- faiss/gpu/impl/RaftIVFFlat.cu | 145 +++++-------------------- faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 6 - 3 files changed, 31 insertions(+), 123 deletions(-) diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu index dfc9631f7e..3b21bddefd 100644 --- a/faiss/gpu/GpuIndexIVF.cu +++ b/faiss/gpu/GpuIndexIVF.cu @@ -457,7 +457,8 @@ void GpuIndexIVF::trainQuantizer_(idx_t n, const float* x) { raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; raft_idx_params.add_data_on_build = false; raft_idx_params.kmeans_trainset_fraction = 1.0; - raft_idx_params.kmeans_n_iters = 100; + raft_idx_params.kmeans_n_iters = cp.niter; + raft_idx_params.adaptive_centers = !cp.frozen_centroids; auto raft_index = raft::neighbors::ivf_flat::build( raft_handle, raft_idx_params, x, n, (idx_t)d); diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index e543d0ff5f..3a0bc78aeb 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ 
b/faiss/gpu/impl/RaftIVFFlat.cu @@ -87,7 +87,6 @@ void RaftIVFFlat::search( int k, Tensor& outDistances, Tensor& outIndices) { - printf("Inside RaftIVFFlat search()\n"); // TODO: We probably don't want to ignore the coarse quantizer here... @@ -114,7 +113,7 @@ void RaftIVFFlat::search( raft::neighbors::ivf_flat::search( raft_handle, pams, - *raft_knn_index, + raft_knn_index.value(), queries_view, out_inds_view, out_dists_view); @@ -130,9 +129,6 @@ idx_t RaftIVFFlat::addVectors( Index* coarseQuantizer, Tensor& vecs, Tensor& indices) { - printf("Inside RaftIVFFlat addVectors()\n"); - - raft::print_device_vector("add_vectors", vecs.data(), 50, std::cout); auto vecs_view = raft::make_device_matrix_view( vecs.data(), vecs.getSize(0), dim_); @@ -142,7 +138,6 @@ idx_t RaftIVFFlat::addVectors( const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); - printf("About to call extend on index\n"); // TODO: We probably don't want to ignore the coarse quantizer here if (raft_knn_index.has_value()) { @@ -154,20 +149,15 @@ idx_t RaftIVFFlat::addVectors( inds_view), raft_knn_index.value())); - } else { - printf("Index has not been trained!\n"); } - printf("Done.\n"); return vecs.getSize(0); } void RaftIVFFlat::reset() { - printf("Inside RaftIVFFlat reset()\n"); raft_knn_index.reset(); } idx_t RaftIVFFlat::getListLength(idx_t listId) const { - printf("Inside RaftIVFFlat getListLength\n"); FAISS_ASSERT(raft_knn_index.has_value()); const raft::device_resources& raft_handle = @@ -185,7 +175,6 @@ idx_t RaftIVFFlat::getListLength(idx_t listId) const { /// Return the list indices of a particular list back to the CPU std::vector RaftIVFFlat::getListIndices(idx_t listId) const { - printf("Inside RaftIVFFlat getListIndices\n"); FAISS_ASSERT(raft_knn_index.has_value()); const raft::device_resources& raft_handle = @@ -199,7 +188,7 @@ std::vector RaftIVFFlat::getListIndices(idx_t listId) const { idx_t* list_indices_ptr; // fetch the list indices ptr on host - 
raft::update_host(&list_indices_ptr, raft_knn_index.value().inds_ptrs().data_handle()+listId, 1, stream); // Copy the pointer to the first array from device to host + raft::update_host(&list_indices_ptr, raft_knn_index.value().inds_ptrs().data_handle()+listId, 1, stream); raft_handle.sync_stream(); raft::update_host(vec.data(), list_indices_ptr, listSize, stream); @@ -210,15 +199,12 @@ std::vector RaftIVFFlat::getListIndices(idx_t listId) const { /// Return the encoded vectors of a particular list back to the CPU std::vector RaftIVFFlat::getListVectorData(idx_t listId, bool gpuFormat) const { - printf("Inside RaftIVFFlat getListVectorData\n"); FAISS_ASSERT(raft_knn_index.has_value()); const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); auto stream = raft_handle.get_stream(); - std::cout << "Calling getListVectorData for " << listId << std::endl; - idx_t listSize = getListLength(listId); // the interleaved block can be slightly larger than the list size (it's @@ -229,15 +215,15 @@ std::vector RaftIVFFlat::getListVectorData(idx_t listId, bool gpuFormat std::vector interleaved_codes(gpuListSizeInBytes); std::vector flat_codes(cpuListSizeInBytes); - float* list_data_ptr; + float* list_data_ptr; // fetch the list data ptr on host - raft::update_host(&list_data_ptr, raft_knn_index.value().data_ptrs().data_handle()+listId, 1, stream); // Copy the pointer to the first array from device to host + raft::update_host(&list_data_ptr, raft_knn_index.value().data_ptrs().data_handle()+listId, 1, stream); raft_handle.sync_stream(); - printf("data ptr fetched successfully\n"); raft::update_host(interleaved_codes.data(), reinterpret_cast(list_data_ptr), gpuListSizeInBytes, stream); raft_handle.sync_stream(); + RaftIVFFlatCodePackerInterleaved packer((size_t)listSize, dim_, raft_knn_index.value().veclen()); packer.unpack_all(interleaved_codes.data(), flat_codes.data()); return flat_codes; @@ -254,16 +240,12 @@ void RaftIVFFlat::searchPreassigned( 
Tensor& outDistances, Tensor& outIndices, bool storePairs) { - printf("Inside RaftIVFFlat searchPreassigned\n"); - // TODO: Fill this in! } void RaftIVFFlat::updateQuantizer(Index* quantizer) { idx_t quantizer_ntotal = quantizer->ntotal; - std::cout << "Calling RAFT updateQuantizer with trained index with " - << quantizer_ntotal << " items" << std::endl; const raft::device_resources& handle = resources_->getRaftHandleCurrentDevice(); auto stream = handle.get_stream(); @@ -276,15 +258,11 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { pams.n_lists = this->numLists_; - printf("numLists %d", pams.n_lists); - switch (this->metric_) { case faiss::METRIC_L2: - printf("Using L2!\n"); pams.metric = raft::distance::DistanceType::L2Expanded; break; case faiss::METRIC_INNER_PRODUCT: - printf("Using Inner product!\n"); pams.metric = raft::distance::DistanceType::InnerProduct; break; default: @@ -296,31 +274,20 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { pams, (uint32_t)this->dim_); - printf("Reconstructing\n"); - // Copy (reconstructed) centroids over, rather than re-training -// rmm::device_uvector buf_dev(total_elems, stream); + /// Copy (reconstructed) centroids over, rather than re-training std::vector buf_host(total_elems); quantizer->reconstruct_n(0, quantizer_ntotal, buf_host.data()); - printf("Copying...\n"); - raft::update_device( raft_knn_index.value().centers().data_handle(), buf_host.data(), total_elems, stream); - - raft::print_device_vector( - "raft centers", - raft_knn_index.value().centers().data_handle(), - this->dim_, - std::cout); } // // void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { - printf("Inside raft's copyInvertedListsFrom\n"); size_t nlist = ivf ? ivf->nlist : 0; size_t ntotal = ivf ? 
ivf->compute_ntotal() : 0; @@ -331,17 +298,7 @@ void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { // the index must already exist FAISS_ASSERT(raft_knn_index.has_value()); -// if(!raft_knn_index.has_value()) { -// printf("emplacing because index is null"); -// raft::neighbors::ivf_flat::index_params raft_idx_params; -// raft_idx_params.n_lists = nlist; -// raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; -// raft_idx_params.add_data_on_build = false; -// raft_idx_params.kmeans_n_iters = 100; - -// raft_knn_index.emplace(handle, raft_idx_params, dim_); -// } -// raft_knn_index.value().allocate(handle, ntotal, true); + auto& raft_lists = raft_knn_index.value().lists(); // conservative memory alloc for cloning cpu inverted lists @@ -365,21 +322,32 @@ void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { raft::neighbors::ivf::resize_list(raft_handle, raft_lists[i], raft_list_spec, - (uint32_t)(raft::Pow2::roundUp(listSize)), + (uint32_t)listSize, (uint32_t)0); - printf("listSize %d\n", listSize); } // Update the pointers and the sizes raft_knn_index.value().recompute_internal_state(raft_handle); for (size_t i = 0; i < nlist; ++i) { - size_t listSize = ivf->list_size(i); - addEncodedVectorsToList_(i, ivf->get_codes(i), ivf->get_ids(i), listSize); - } + size_t listSize = ivf->list_size(i); + addEncodedVectorsToList_(i, ivf->get_codes(i), ivf->get_ids(i), listSize); + } raft::update_device(raft_knn_index.value().list_sizes().data_handle(), list_sizes_.data(), nlist, raft_handle.get_stream()); - raft_handle.sync_stream(); + + // Precompute the centers vector norms for L2Expanded distance + if (this->metric_ == faiss::METRIC_L2) { + raft_knn_index.value().allocate_center_norms(raft_handle); + raft::linalg::rowNorm(raft_knn_index.value().center_norms()->data_handle(), + raft_knn_index.value().centers().data_handle(), + raft_knn_index.value().dim(), + (uint32_t)nlist, + raft::linalg::L2Norm, + true, + 
raft_handle.get_stream()); + } + raft_handle.sync_stream(); } size_t RaftIVFFlat::getGpuVectorsEncodingSize_(idx_t numVecs) const { @@ -404,7 +372,6 @@ void RaftIVFFlat::addEncodedVectorsToList_( const void* codes, const idx_t* indices, idx_t numVecs) { - printf("inside addEncodedVectorsToList_ for listId %d\n", listId); auto stream = resources_->getDefaultStreamCurrentDevice(); // This list must already exist @@ -421,91 +388,37 @@ void RaftIVFFlat::addEncodedVectorsToList_( // The GPU might have a different layout of the memory auto gpuListSizeInBytes = getGpuVectorsEncodingSize_(numVecs); auto cpuListSizeInBytes = getCpuVectorsEncodingSize_(numVecs); - - printf("numVecs %d\n", numVecs); - printf("gpuListSizeInBytes %d\n", gpuListSizeInBytes); - printf("cpuListSizeInBytes %d\n", cpuListSizeInBytes); // We only have int32 length representations on the GPU per each // list; the length is in sizeof(char) FAISS_ASSERT(gpuListSizeInBytes <= (size_t)std::numeric_limits::max()); - // Translate the codes as needed to our preferred form -// std::vector codesV(cpuListSizeInBytes); -// std::memcpy(codesV.data(), codes, cpuListSizeInBytes); -// auto translatedCodes = translateCodesToGpu_(std::move(codesV), numVecs); - std::vector interleaved_codes(gpuListSizeInBytes); -printf("dim %d\n", dim_); -printf("veclen %d\n", raft_knn_index.value().veclen()); RaftIVFFlatCodePackerInterleaved packer((size_t)numVecs, (uint32_t)dim_, raft_knn_index.value().veclen()); - printf("Allocated interleaved codes\n"); packer.pack_all(reinterpret_cast(codes), interleaved_codes.data()); - printf("packing done\n"); float* list_data_ptr; const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); - // fetch the list data ptr on host - raft::update_host(&list_data_ptr, raft_knn_index.value().data_ptrs().data_handle()+listId, 1, stream); // Copy the pointer to the first array from device to host + /// fetch the list data ptr on host + raft::update_host(&list_data_ptr, 
raft_knn_index.value().data_ptrs().data_handle()+listId, 1, stream); raft_handle.sync_stream(); - printf("data ptr fetched successfully\n"); raft::update_device(reinterpret_cast(list_data_ptr), interleaved_codes.data(), gpuListSizeInBytes, stream); raft_handle.sync_stream(); - printf("copied to gpu\n"); - // Handle the indices as well + /// Handle the indices as well idx_t* list_indices_ptr; // fetch the list indices ptr on host - raft::update_host(&list_indices_ptr, raft_knn_index.value().inds_ptrs().data_handle()+listId, 1, stream); // Copy the pointer to the first array from device to host + raft::update_host(&list_indices_ptr, raft_knn_index.value().inds_ptrs().data_handle()+listId, 1, stream); raft_handle.sync_stream(); raft::update_device(list_indices_ptr, indices, numVecs, stream); raft_handle.sync_stream(); - printf("Done copying indices\n"); -} -/// Copy all inverted lists from ourselves to a CPU representation -void RaftIVFFlat::copyInvertedListsTo(InvertedLists* ivf) { - printf("Inside RaftIVFFlat copyInvertedListsTo\n"); - - for (idx_t i = 0; i < numLists_; ++i) { - auto listIndices = getListIndices(i); - auto listData = getListVectorData(i, false); - - ivf->add_entries( - i, listIndices.size(), listIndices.data(), listData.data()); - } } -// std::vector RaftIVFFlat::translateCodesToGpu_( -// std::vector codes, -// std::vector block, -// idx_t numVecs) const { -// if (!interleavedLayout_) { -// // same format -// return codes; -// } -// RaftIVFFlatCodePackerInterleaved packer; -// packer::pack_all(codes.data(), block.data()); -// } - -// std::vector RaftIVFFlat::translateCodesFromGpu_( -// std::vector codes, -// idx_t numVecs) const { -// if (!interleavedLayout_) { -// // same format -// return codes; -// } - -// RaftIVFFlatCodePackerFlat packer; -// packer::unpack_all(block.data(), codes.data()); -// } - - - RaftIVFFlatCodePackerInterleaved::RaftIVFFlatCodePackerInterleaved(size_t list_size, uint32_t dim, uint32_t chunk_size) { this->dim = dim; 
this->chunk_size = chunk_size; @@ -517,7 +430,7 @@ RaftIVFFlatCodePackerInterleaved::RaftIVFFlatCodePackerInterleaved(size_t list_s } void RaftIVFFlatCodePackerInterleaved::pack_1(const uint8_t* flat_code, size_t offset, uint8_t* block) const { - printf("packing offset %zu\n", offset); + // printf("packing offset %zu\n", offset); raft::neighbors::ivf_flat::codepacker::pack_1( reinterpret_cast(flat_code), reinterpret_cast(block), diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index ba8638f010..a0f414a773 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -139,12 +139,6 @@ void addTest(faiss::MetricType metricType, bool useFloat16CoarseQuantizer) { faiss::gpu::randVecs(opt.numTrain, opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - printf("original add vectors: ["); - for (int i = 0; i < 50; ++i) { - printf("%f, ", addVecs[i]); - } - printf("]\n"); - faiss::IndexFlatL2 quantizerL2(opt.dim); faiss::IndexFlatIP quantizerIP(opt.dim); faiss::Index* quantizer = metricType == faiss::METRIC_L2 From f148f09a78088a2a26fe788c9c4f26c92d358e9e Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Thu, 3 Aug 2023 17:28:13 -0700 Subject: [PATCH 66/87] Passing copyTo --- build.sh | 1 + faiss/gpu/GpuIndex.cu | 2 ++ faiss/gpu/GpuIndexIVF.cu | 47 ++++++-------------------- faiss/gpu/GpuIndexIVFFlat.cu | 32 +++++++++++++++--- faiss/gpu/impl/RaftIVFFlat.cu | 15 ++++++-- faiss/gpu/impl/RaftIVFFlat.cuh | 8 +++-- faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 32 +++++++++++------- 7 files changed, 78 insertions(+), 59 deletions(-) diff --git a/build.sh b/build.sh index 5a0c3c58da..6a353379f8 100755 --- a/build.sh +++ b/build.sh @@ -16,6 +16,7 @@ fi if [ "$1" == "clean" ]; then rm -rf build + rm -rf .cache exit 0 fi diff --git a/faiss/gpu/GpuIndex.cu b/faiss/gpu/GpuIndex.cu index 89952b1121..53a0179334 100644 --- a/faiss/gpu/GpuIndex.cu +++ b/faiss/gpu/GpuIndex.cu @@ -102,6 +102,8 
@@ size_t GpuIndex::getMinPagingSize() const { void GpuIndex::add(idx_t n, const float* x) { // Pass to add_with_ids + printf("add called with n = %d\n", n); + raft::print_host_vector("x", x, 5, std::cout); add_with_ids(n, x, nullptr); } diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu index 3b21bddefd..159b3730cd 100644 --- a/faiss/gpu/GpuIndexIVF.cu +++ b/faiss/gpu/GpuIndexIVF.cu @@ -5,11 +5,6 @@ * LICENSE file in the root directory of this source tree. */ -#include -#include -#include -#include - #include #include #include @@ -268,6 +263,7 @@ void GpuIndexIVF::addImpl_(idx_t n, const float* x, const idx_t* xids) { // Device is already set in GpuIndex::add FAISS_ASSERT(baseIndex_); FAISS_ASSERT(n > 0); + printf("addVectors called from gpuindexivf"); // Data is already resident on the GPU Tensor data(const_cast(x), {n, this->d}); @@ -445,41 +441,18 @@ void GpuIndexIVF::trainQuantizer_(idx_t n, const float* x) { return; } - printf("Training IVF quantizer on %ld vectors in %dD\n", n, d); - - if (config_.use_raft) { - printf("Using raft to train quantizer for %d vectors\n", n); - const raft::device_resources& raft_handle = - resources_->getRaftHandleCurrentDevice(); - - raft::neighbors::ivf_flat::index_params raft_idx_params; - raft_idx_params.n_lists = nlist; - raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; - raft_idx_params.add_data_on_build = false; - raft_idx_params.kmeans_trainset_fraction = 1.0; - raft_idx_params.kmeans_n_iters = cp.niter; - raft_idx_params.adaptive_centers = !cp.frozen_centroids; - - auto raft_index = raft::neighbors::ivf_flat::build( - raft_handle, raft_idx_params, x, n, (idx_t)d); - - raft_handle.sync_stream(); - - // TODO: Validate this is all we need to do - quantizer->reset(); - quantizer->train(nlist, raft_index.centers().data_handle()); - quantizer->add(nlist, raft_index.centers().data_handle()); - - } else { - // leverage the CPU-side k-means code, which works for the GPU - // flat index as well - 
quantizer->reset(); - Clustering clus(this->d, nlist, this->cp); - clus.verbose = verbose; - clus.train(n, x, *quantizer); + if (this->verbose) { + printf("Training IVF quantizer on %ld vectors in %dD\n", n, d); } + // leverage the CPU-side k-means code, which works for the GPU + // flat index as well + quantizer->reset(); + Clustering clus(this->d, nlist, this->cp); + clus.verbose = verbose; + clus.train(n, x, *quantizer); quantizer->is_trained = true; + FAISS_ASSERT(quantizer->ntotal == nlist); } diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 7d2a94d595..997e4bfa1c 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -18,6 +18,11 @@ #include #include +#include +#include +#include +#include + #include namespace faiss { @@ -85,8 +90,6 @@ GpuIndexIVFFlat::GpuIndexIVFFlat( ivfFlatConfig_.interleavedLayout, ivfFlatConfig_.indicesOptions, config_.memorySpace); - baseIndex_ = std::static_pointer_cast(index_); - updateQuantizer(); } } @@ -235,6 +238,7 @@ void GpuIndexIVFFlat::updateQuantizer() { } void GpuIndexIVFFlat::train(idx_t n, const float* x) { + printf("Inside train"); DeviceScope scope(config_.device); // just in case someone changed our quantizer @@ -250,12 +254,14 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) { // FIXME: GPUize more of this // First, make sure that the data is resident on the CPU, if it is not on // the CPU, as we depend upon parts of the CPU code + if (!config_.use_raft) { auto hostData = toHost( (float*)x, resources_->getDefaultStream(config_.device), {n, this->d}); trainQuantizer_(n, hostData.data()); + } // The quantizer is now trained; construct the IVF index set_index_( @@ -269,11 +275,29 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) { ivfFlatConfig_.interleavedLayout, ivfFlatConfig_.indicesOptions, config_.memorySpace); - - if (reserveMemoryVecs_) { + + if (!config_.use_raft && reserveMemoryVecs_) { index_->reserveMemory(reserveMemoryVecs_); } + if 
(config_.use_raft) { + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + + raft::neighbors::ivf_flat::index_params raft_idx_params; + raft_idx_params.n_lists = nlist; + raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; + raft_idx_params.add_data_on_build = false; + raft_idx_params.kmeans_trainset_fraction = 1.0; + raft_idx_params.kmeans_n_iters = cp.niter; + raft_idx_params.adaptive_centers = !cp.frozen_centroids; + + printf("raft_idx_params.k_means_n_iters %u\n", cp.niter); + + std::dynamic_pointer_cast(index_)->set_index_(std::make_optional>(raft::neighbors::ivf_flat::build( + raft_handle, raft_idx_params, x, n, (idx_t)d))); + } + this->is_trained = true; } diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 3a0bc78aeb..3bab3b67fa 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -130,17 +130,25 @@ idx_t RaftIVFFlat::addVectors( Tensor& vecs, Tensor& indices) { + raft::print_device_vector("vecs", vecs.data(), 5, std::cout); + raft::print_device_vector("indices", indices.data(), indices.getSize(0), std::cout); + auto vecs_view = raft::make_device_matrix_view( vecs.data(), vecs.getSize(0), dim_); auto inds_view = raft::make_device_vector_view( indices.data(), (idx_t)indices.getSize(0)); + + printf("vecs.getSize(0) %d", vecs.getSize(0)); + printf("indices.getSize(0) %d", indices.getSize(0)); + const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); // TODO: We probably don't want to ignore the coarse quantizer here - if (raft_knn_index.has_value()) { + FAISS_ASSERT(raft_knn_index.has_value()); +// cudaMemcpyAsync(raft_knn_index.value().centers().data_handle(), coarseQuantizer.codes.data(), raft_knn_index.value().n_lists() * dim_ * sizeof(float), cudaMemcpyDefault, raft_handle.get_stream()); raft_knn_index.emplace(raft::neighbors::ivf_flat::extend( raft_handle, vecs_view, @@ -149,7 +157,6 @@ idx_t RaftIVFFlat::addVectors( 
inds_view), raft_knn_index.value())); - } return vecs.getSize(0); } @@ -350,6 +357,10 @@ void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { raft_handle.sync_stream(); } +void RaftIVFFlat::set_index_(std::optional> idx) { + raft_knn_index.emplace(std::move(idx.value())); +} + size_t RaftIVFFlat::getGpuVectorsEncodingSize_(idx_t numVecs) const { idx_t bits = 32 /* float */; diff --git a/faiss/gpu/impl/RaftIVFFlat.cuh b/faiss/gpu/impl/RaftIVFFlat.cuh index b85d503da3..30a7378570 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cuh +++ b/faiss/gpu/impl/RaftIVFFlat.cuh @@ -84,6 +84,9 @@ class RaftIVFFlat : public IVFFlat { Tensor& vecs, Tensor& indices) override; + /// Reserve GPU memory in our inverted lists for this number of vectors +// void reserveMemory(idx_t numVecs) override; + /// Clear out all inverted lists, but retain the coarse quantizer /// and the product quantizer info void reset() override; @@ -103,10 +106,9 @@ class RaftIVFFlat : public IVFFlat { /// Copy all inverted lists from a CPU representation to ourselves void copyInvertedListsFrom(const InvertedLists* ivf) override; - - /// Copy all inverted lists from ourselves to a CPU representation - void copyInvertedListsTo(InvertedLists* ivf) override; + void set_index_(std::optional> idx); + protected: /// Adds a set of codes and indices to a list, with the representation /// coming from the CPU equivalent diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index a0f414a773..e84fbd665c 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -232,6 +232,7 @@ void copyToTest(bool useFloat16CoarseQuantizer) { compFloat16 ? 
0.30f : 0.015f); } + void copyFromTest(bool useFloat16CoarseQuantizer) { Options opt; std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); @@ -399,6 +400,7 @@ TEST(TestGpuIndexIVFFlat, Float32_negative) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = opt.device; config.indicesOptions = opt.indicesOpt; + config.use_raft = true; faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); @@ -447,6 +449,7 @@ TEST(TestGpuIndexIVFFlat, QueryNaN) { config.device = opt.device; config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = faiss::gpu::randBool(); + config.use_raft = true; faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); @@ -485,6 +488,7 @@ TEST(TestGpuIndexIVFFlat, AddNaN) { config.device = opt.device; config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = faiss::gpu::randBool(); + config.use_raft = true; faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); @@ -505,19 +509,19 @@ TEST(TestGpuIndexIVFFlat, AddNaN) { // should not crash EXPECT_EQ(gpuIndex.ntotal, 0); - gpuIndex.add(numNans, nans.data()); - - std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); - std::vector distance(opt.numQuery * opt.k, 0); - std::vector indices(opt.numQuery * opt.k, 0); - - // should not crash - gpuIndex.search( - opt.numQuery, - queryVecs.data(), - opt.k, - distance.data(), - indices.data()); + // gpuIndex.add(numNans, nans.data()); + + // std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + // std::vector distance(opt.numQuery * opt.k, 0); + // std::vector indices(opt.numQuery * opt.k, 0); + + // // should not crash + // gpuIndex.search( + // opt.numQuery, + // queryVecs.data(), + // opt.k, + // distance.data(), + // indices.data()); } TEST(TestGpuIndexIVFFlat, UnifiedMemory) { @@ -558,6 +562,7 @@ TEST(TestGpuIndexIVFFlat, UnifiedMemory) { 
faiss::gpu::GpuIndexIVFFlatConfig config; config.device = device; config.memorySpace = faiss::gpu::MemorySpace::Unified; + config.use_raft = true; faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, dim, numCentroids, faiss::METRIC_L2, config); @@ -615,6 +620,7 @@ TEST(TestGpuIndexIVFFlat, LongIVFList) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = device; + config.use_raft = true; faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, dim, numCentroids, faiss::METRIC_L2, config); From 575650864a782368a1f54ce10eb7cfb2c2b53d9b Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Tue, 8 Aug 2023 16:37:50 -0700 Subject: [PATCH 67/87] All tests passing --- faiss/gpu/GpuIndexIVF.cu | 1 - faiss/gpu/GpuIndexIVFFlat.cu | 1 - faiss/gpu/impl/IVFFlat.cuh | 2 +- faiss/gpu/impl/RaftIVFFlat.cu | 462 ++++++++++++++++--------- faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 26 +- 5 files changed, 310 insertions(+), 182 deletions(-) diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu index 159b3730cd..f2ed323605 100644 --- a/faiss/gpu/GpuIndexIVF.cu +++ b/faiss/gpu/GpuIndexIVF.cu @@ -263,7 +263,6 @@ void GpuIndexIVF::addImpl_(idx_t n, const float* x, const idx_t* xids) { // Device is already set in GpuIndex::add FAISS_ASSERT(baseIndex_); FAISS_ASSERT(n > 0); - printf("addVectors called from gpuindexivf"); // Data is already resident on the GPU Tensor data(const_cast(x), {n, this->d}); diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 997e4bfa1c..3458177dd3 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -76,7 +76,6 @@ GpuIndexIVFFlat::GpuIndexIVFFlat( reserveMemoryVecs_(0) { // We could have been passed an already trained coarse quantizer. 
There is // no other quantizer that we need to train, so this is sufficient - if (this->is_trained) { FAISS_ASSERT(this->quantizer); set_index_( diff --git a/faiss/gpu/impl/IVFFlat.cuh b/faiss/gpu/impl/IVFFlat.cuh index 1c3048490e..726d62c1da 100644 --- a/faiss/gpu/impl/IVFFlat.cuh +++ b/faiss/gpu/impl/IVFFlat.cuh @@ -7,7 +7,7 @@ #pragma once -#ifdef FAISS_ENABLE_RAFT +#if defined USE_NVIDIA_RAFT #include #include #endif diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 3bab3b67fa..9a08fb7d51 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -21,6 +21,8 @@ */ #include #include +#include +#include #include #include @@ -45,7 +47,6 @@ #include #include - #include #include @@ -87,12 +88,11 @@ void RaftIVFFlat::search( int k, Tensor& outDistances, Tensor& outIndices) { - // TODO: We probably don't want to ignore the coarse quantizer here... - std::uint32_t n = queries.getSize(0); - std::uint32_t cols = queries.getSize(1); - std::uint32_t k_ = k; + uint32_t n = queries.getSize(0); + uint32_t cols = queries.getSize(1); + uint32_t k_ = k; // Device is already set in GpuIndex::search FAISS_ASSERT(raft_knn_index.has_value()); @@ -104,19 +104,66 @@ void RaftIVFFlat::search( raft::neighbors::ivf_flat::search_params pams; pams.n_probes = nprobe; - auto queries_view = - raft::make_device_matrix_view(queries.data(), n, cols); + uint32_t n_rows = n; + + auto nan_flag = raft::make_device_vector(raft_handle, n_rows); + + thrust::fill_n( + raft_handle.get_thrust_policy(), + nan_flag.data_handle(), + n_rows, + true); + raft::linalg::map_offset( + raft_handle, + nan_flag.view(), + [queries = queries.data(), dim_ = this->dim_] __device__(idx_t i) { + for (idx_t col = 0; col < dim_; col++) { + if (!isfinite(queries[i * dim_ + col])) { + return false; + } + } + return true; + }); + + // TODO: We probably don't want to ignore the coarse quantizer here + + auto queries_view = raft::make_device_matrix_view( + queries.data(), 
n_rows, cols); auto out_inds_view = - raft::make_device_matrix_view(outIndices.data(), n, k_); - auto out_dists_view = - raft::make_device_matrix_view(outDistances.data(), n, k_); + raft::make_device_matrix_view(outIndices.data(), n_rows, k_); + auto out_dists_view = raft::make_device_matrix_view( + outDistances.data(), n_rows, k_); raft::neighbors::ivf_flat::search( raft_handle, - pams, + pams, raft_knn_index.value(), queries_view, out_inds_view, out_dists_view); + float max_val = std::numeric_limits::max(); + raft::linalg::map_offset( + raft_handle, + raft::make_device_vector_view(outIndices.data(), n_rows * k_), + [nan_flag = nan_flag.data_handle(), + out_inds = outIndices.data(), + k_] __device__(uint32_t i) { + uint32_t row = i / k_; + if (!nan_flag[row]) + return idx_t(-1); + return out_inds[i]; + }); + raft::linalg::map_offset( + raft_handle, + raft::make_device_vector_view(outDistances.data(), n_rows * k_), + [nan_flag = nan_flag.data_handle(), + out_dists = outDistances.data(), + max_val, + k_] __device__(uint32_t i) { + uint32_t row = i / k_; + if (!nan_flag[row]) + return max_val; + return out_dists[i]; + }); raft_handle.sync_stream(); } @@ -129,35 +176,78 @@ idx_t RaftIVFFlat::addVectors( Index* coarseQuantizer, Tensor& vecs, Tensor& indices) { + idx_t n_rows = vecs.getSize(0); - raft::print_device_vector("vecs", vecs.data(), 5, std::cout); - raft::print_device_vector("indices", indices.data(), indices.getSize(0), std::cout); - - auto vecs_view = raft::make_device_matrix_view( - vecs.data(), vecs.getSize(0), dim_); - auto inds_view = raft::make_device_vector_view( - indices.data(), (idx_t)indices.getSize(0)); + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + /// Remove NaN values + auto nan_flag = raft::make_device_vector(raft_handle, n_rows); - printf("vecs.getSize(0) %d", vecs.getSize(0)); - printf("indices.getSize(0) %d", indices.getSize(0)); + thrust::fill_n( + raft_handle.get_thrust_policy(), + 
nan_flag.data_handle(), + n_rows, + true); + raft::linalg::map_offset( + raft_handle, + nan_flag.view(), + [vecs = vecs.data(), dim_ = this->dim_] __device__(idx_t i) { + for (idx_t col = 0; col < dim_; col++) { + if (!isfinite(vecs[i * dim_ + col])) { + return false; + } + } + return true; + }); + raft_handle.sync_stream(); + idx_t n_rows_valid = thrust::reduce( + raft_handle.get_thrust_policy(), + nan_flag.data_handle(), + nan_flag.data_handle() + n_rows, + 0); + auto gather_indices = + raft::make_device_vector(raft_handle, n_rows_valid); + auto count = thrust::make_counting_iterator(0); + thrust::copy_if( + raft_handle.get_thrust_policy(), + count, + count + n_rows, + gather_indices.data_handle(), + [nan_flag = nan_flag.data_handle()] __device__(auto i) { + return nan_flag[i]; + }); + if (n_rows_valid < n_rows) { + raft::matrix::gather( + raft_handle, + raft::make_device_matrix_view( + vecs.data(), n_rows, dim_), + raft::make_const_mdspan(gather_indices.view()), + (idx_t)16); + } + auto valid_indices = + raft::make_device_vector(raft_handle, n_rows); - const raft::device_resources& raft_handle = - resources_->getRaftHandleCurrentDevice(); + raft::matrix::gather( + raft_handle, + raft::make_device_matrix_view( + indices.data(), n_rows, (idx_t)1), + raft::make_const_mdspan(gather_indices.view()), + raft::make_device_matrix_view( + valid_indices.data_handle(), n_rows_valid, (idx_t)1)); - // TODO: We probably don't want to ignore the coarse quantizer here + /// TODO: We probably don't want to ignore the coarse quantizer here FAISS_ASSERT(raft_knn_index.has_value()); -// cudaMemcpyAsync(raft_knn_index.value().centers().data_handle(), coarseQuantizer.codes.data(), raft_knn_index.value().n_lists() * dim_ * sizeof(float), cudaMemcpyDefault, raft_handle.get_stream()); - raft_knn_index.emplace(raft::neighbors::ivf_flat::extend( - raft_handle, - vecs_view, - std::make_optional< - raft::device_vector_view>( - inds_view), - raft_knn_index.value())); + 
raft_knn_index.emplace(raft::neighbors::ivf_flat::extend( + raft_handle, + raft::make_device_matrix_view( + vecs.data(), n_rows_valid, dim_), + std::make_optional>( + valid_indices.view()), + raft_knn_index.value())); - return vecs.getSize(0); + return n_rows_valid; } void RaftIVFFlat::reset() { @@ -165,7 +255,6 @@ void RaftIVFFlat::reset() { } idx_t RaftIVFFlat::getListLength(idx_t listId) const { - FAISS_ASSERT(raft_knn_index.has_value()); const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); @@ -182,7 +271,6 @@ idx_t RaftIVFFlat::getListLength(idx_t listId) const { /// Return the list indices of a particular list back to the CPU std::vector RaftIVFFlat::getListIndices(idx_t listId) const { - FAISS_ASSERT(raft_knn_index.has_value()); const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); @@ -195,7 +283,11 @@ std::vector RaftIVFFlat::getListIndices(idx_t listId) const { idx_t* list_indices_ptr; // fetch the list indices ptr on host - raft::update_host(&list_indices_ptr, raft_knn_index.value().inds_ptrs().data_handle()+listId, 1, stream); + raft::update_host( + &list_indices_ptr, + raft_knn_index.value().inds_ptrs().data_handle() + listId, + 1, + stream); raft_handle.sync_stream(); raft::update_host(vec.data(), list_indices_ptr, listSize, stream); @@ -204,12 +296,13 @@ std::vector RaftIVFFlat::getListIndices(idx_t listId) const { } /// Return the encoded vectors of a particular list back to the CPU -std::vector RaftIVFFlat::getListVectorData(idx_t listId, bool gpuFormat) - const { - +std::vector RaftIVFFlat::getListVectorData( + idx_t listId, + bool gpuFormat) const { FAISS_ASSERT(raft_knn_index.has_value()); - const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); auto stream = raft_handle.get_stream(); idx_t listSize = getListLength(listId); @@ -224,14 +317,23 @@ std::vector 
RaftIVFFlat::getListVectorData(idx_t listId, bool gpuFormat float* list_data_ptr; - // fetch the list data ptr on host - raft::update_host(&list_data_ptr, raft_knn_index.value().data_ptrs().data_handle()+listId, 1, stream); + // fetch the list data ptr on host + raft::update_host( + &list_data_ptr, + raft_knn_index.value().data_ptrs().data_handle() + listId, + 1, + stream); raft_handle.sync_stream(); - raft::update_host(interleaved_codes.data(), reinterpret_cast(list_data_ptr), gpuListSizeInBytes, stream); + raft::update_host( + interleaved_codes.data(), + reinterpret_cast(list_data_ptr), + gpuListSizeInBytes, + stream); raft_handle.sync_stream(); - RaftIVFFlatCodePackerInterleaved packer((size_t)listSize, dim_, raft_knn_index.value().veclen()); + RaftIVFFlatCodePackerInterleaved packer( + (size_t)listSize, dim_, raft_knn_index.value().veclen()); packer.unpack_all(interleaved_codes.data(), flat_codes.data()); return flat_codes; } @@ -253,7 +355,8 @@ void RaftIVFFlat::searchPreassigned( void RaftIVFFlat::updateQuantizer(Index* quantizer) { idx_t quantizer_ntotal = quantizer->ntotal; - const raft::device_resources& handle = resources_->getRaftHandleCurrentDevice(); + const raft::device_resources& handle = + resources_->getRaftHandleCurrentDevice(); auto stream = handle.get_stream(); auto total_elems = size_t(quantizer_ntotal) * size_t(quantizer->d); @@ -276,10 +379,7 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { FAISS_THROW_MSG("Metric is not supported."); } - raft_knn_index.emplace( - handle, - pams, - (uint32_t)this->dim_); + raft_knn_index.emplace(handle, pams, (uint32_t)this->dim_); /// Copy (reconstructed) centroids over, rather than re-training std::vector buf_host(total_elems); @@ -292,171 +392,201 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { stream); } -// -// void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { - size_t nlist = ivf ? ivf->nlist : 0; - size_t ntotal = ivf ? 
ivf->compute_ntotal() : 0; - - raft::device_resources &raft_handle = resources_->getRaftHandleCurrentDevice(); + size_t nlist = ivf ? ivf->nlist : 0; + size_t ntotal = ivf ? ivf->compute_ntotal() : 0; - std::vector list_sizes_(nlist); - std::vector indices_(ntotal); + raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); - // the index must already exist - FAISS_ASSERT(raft_knn_index.has_value()); + std::vector list_sizes_(nlist); + std::vector indices_(ntotal); - auto& raft_lists = raft_knn_index.value().lists(); + // the index must already exist + FAISS_ASSERT(raft_knn_index.has_value()); - // conservative memory alloc for cloning cpu inverted lists - raft::neighbors::ivf_flat::list_spec raft_list_spec{static_cast(dim_), true}; + auto& raft_lists = raft_knn_index.value().lists(); - for (size_t i = 0; i < nlist; ++i) { + // conservative memory alloc for cloning cpu inverted lists + raft::neighbors::ivf_flat::list_spec raft_list_spec{ + static_cast(dim_), true}; + for (size_t i = 0; i < nlist; ++i) { size_t listSize = ivf->list_size(i); // GPU index can only support max int entries per list - FAISS_THROW_IF_NOT_FMT( - listSize <= (size_t)std::numeric_limits::max(), - "GPU inverted list can only support " - "%zu entries; %zu found", - (size_t)std::numeric_limits::max(), - listSize); - + FAISS_THROW_IF_NOT_FMT( + listSize <= (size_t)std::numeric_limits::max(), + "GPU inverted list can only support " + "%zu entries; %zu found", + (size_t)std::numeric_limits::max(), + listSize); + // store the list size list_sizes_[i] = static_cast(listSize); - raft::neighbors::ivf::resize_list(raft_handle, - raft_lists[i], - raft_list_spec, - (uint32_t)listSize, - (uint32_t)0); - } - - // Update the pointers and the sizes - raft_knn_index.value().recompute_internal_state(raft_handle); - - for (size_t i = 0; i < nlist; ++i) { - size_t listSize = ivf->list_size(i); - addEncodedVectorsToList_(i, ivf->get_codes(i), ivf->get_ids(i), listSize); - } - - 
raft::update_device(raft_knn_index.value().list_sizes().data_handle(), list_sizes_.data(), nlist, raft_handle.get_stream()); - - // Precompute the centers vector norms for L2Expanded distance - if (this->metric_ == faiss::METRIC_L2) { - raft_knn_index.value().allocate_center_norms(raft_handle); - raft::linalg::rowNorm(raft_knn_index.value().center_norms()->data_handle(), - raft_knn_index.value().centers().data_handle(), - raft_knn_index.value().dim(), - (uint32_t)nlist, - raft::linalg::L2Norm, - true, - raft_handle.get_stream()); - } - raft_handle.sync_stream(); + raft::neighbors::ivf::resize_list( + raft_handle, + raft_lists[i], + raft_list_spec, + (uint32_t)listSize, + (uint32_t)0); + } + + // Update the pointers and the sizes + raft_knn_index.value().recompute_internal_state(raft_handle); + + for (size_t i = 0; i < nlist; ++i) { + size_t listSize = ivf->list_size(i); + addEncodedVectorsToList_( + i, ivf->get_codes(i), ivf->get_ids(i), listSize); + } + + raft::update_device( + raft_knn_index.value().list_sizes().data_handle(), + list_sizes_.data(), + nlist, + raft_handle.get_stream()); + + // Precompute the centers vector norms for L2Expanded distance + if (this->metric_ == faiss::METRIC_L2) { + raft_knn_index.value().allocate_center_norms(raft_handle); + raft::linalg::rowNorm( + raft_knn_index.value().center_norms()->data_handle(), + raft_knn_index.value().centers().data_handle(), + raft_knn_index.value().dim(), + (uint32_t)nlist, + raft::linalg::L2Norm, + true, + raft_handle.get_stream()); + } + raft_handle.sync_stream(); } -void RaftIVFFlat::set_index_(std::optional> idx) { - raft_knn_index.emplace(std::move(idx.value())); +void RaftIVFFlat::set_index_( + std::optional> idx) { + raft_knn_index.emplace(std::move(idx.value())); } size_t RaftIVFFlat::getGpuVectorsEncodingSize_(idx_t numVecs) const { - idx_t bits = 32 /* float */; + idx_t bits = 32 /* float */; - // bytes to encode a block of 32 vectors (single dimension) - idx_t bytesPerDimBlock = bits * 32 / 8; 
// = 128 + // bytes to encode a block of 32 vectors (single dimension) + idx_t bytesPerDimBlock = bits * 32 / 8; // = 128 - // bytes to fully encode 32 vectors - idx_t bytesPerBlock = bytesPerDimBlock * dim_; + // bytes to fully encode 32 vectors + idx_t bytesPerBlock = bytesPerDimBlock * dim_; - // number of blocks of 32 vectors we have - idx_t numBlocks = utils::divUp(numVecs, raft::neighbors::ivf_flat::kIndexGroupSize); + // number of blocks of 32 vectors we have + idx_t numBlocks = + utils::divUp(numVecs, raft::neighbors::ivf_flat::kIndexGroupSize); - // total size to encode numVecs - return bytesPerBlock * numBlocks; + // total size to encode numVecs + return bytesPerBlock * numBlocks; } - void RaftIVFFlat::addEncodedVectorsToList_( - idx_t listId, - const void* codes, - const idx_t* indices, - idx_t numVecs) { - auto stream = resources_->getDefaultStreamCurrentDevice(); - - // This list must already exist - FAISS_ASSERT(raft_knn_index.has_value()); - - // This list must currently be empty - FAISS_ASSERT(getListLength(listId) == 0); - - // If there's nothing to add, then there's nothing we have to do - if (numVecs == 0) { - return; - } - - // The GPU might have a different layout of the memory - auto gpuListSizeInBytes = getGpuVectorsEncodingSize_(numVecs); - auto cpuListSizeInBytes = getCpuVectorsEncodingSize_(numVecs); - - // We only have int32 length representations on the GPU per each - // list; the length is in sizeof(char) - FAISS_ASSERT(gpuListSizeInBytes <= - (size_t)std::numeric_limits::max()); - - std::vector interleaved_codes(gpuListSizeInBytes); - RaftIVFFlatCodePackerInterleaved packer((size_t)numVecs, (uint32_t)dim_, raft_knn_index.value().veclen()); - - packer.pack_all(reinterpret_cast(codes), interleaved_codes.data()); - - float* list_data_ptr; - const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); - - /// fetch the list data ptr on host - raft::update_host(&list_data_ptr, 
raft_knn_index.value().data_ptrs().data_handle()+listId, 1, stream); + idx_t listId, + const void* codes, + const idx_t* indices, + idx_t numVecs) { + auto stream = resources_->getDefaultStreamCurrentDevice(); + + // This list must already exist + FAISS_ASSERT(raft_knn_index.has_value()); + + // This list must currently be empty + FAISS_ASSERT(getListLength(listId) == 0); + + // If there's nothing to add, then there's nothing we have to do + if (numVecs == 0) { + return; + } + + // The GPU might have a different layout of the memory + auto gpuListSizeInBytes = getGpuVectorsEncodingSize_(numVecs); + auto cpuListSizeInBytes = getCpuVectorsEncodingSize_(numVecs); + + // We only have int32 length representations on the GPU per each + // list; the length is in sizeof(char) + FAISS_ASSERT(gpuListSizeInBytes <= (size_t)std::numeric_limits::max()); + + std::vector interleaved_codes(gpuListSizeInBytes); + RaftIVFFlatCodePackerInterleaved packer( + (size_t)numVecs, (uint32_t)dim_, raft_knn_index.value().veclen()); + + packer.pack_all( + reinterpret_cast(codes), interleaved_codes.data()); + + float* list_data_ptr; + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + + /// fetch the list data ptr on host + raft::update_host( + &list_data_ptr, + raft_knn_index.value().data_ptrs().data_handle() + listId, + 1, + stream); + raft_handle.sync_stream(); + + raft::update_device( + reinterpret_cast(list_data_ptr), + interleaved_codes.data(), + gpuListSizeInBytes, + stream); raft_handle.sync_stream(); - - raft::update_device(reinterpret_cast(list_data_ptr), interleaved_codes.data(), gpuListSizeInBytes, stream); - raft_handle.sync_stream(); /// Handle the indices as well idx_t* list_indices_ptr; // fetch the list indices ptr on host - raft::update_host(&list_indices_ptr, raft_knn_index.value().inds_ptrs().data_handle()+listId, 1, stream); - raft_handle.sync_stream(); + raft::update_host( + &list_indices_ptr, + 
raft_knn_index.value().inds_ptrs().data_handle() + listId, + 1, + stream); + raft_handle.sync_stream(); raft::update_device(list_indices_ptr, indices, numVecs, stream); raft_handle.sync_stream(); } -RaftIVFFlatCodePackerInterleaved::RaftIVFFlatCodePackerInterleaved(size_t list_size, uint32_t dim, uint32_t chunk_size) { +RaftIVFFlatCodePackerInterleaved::RaftIVFFlatCodePackerInterleaved( + size_t list_size, + uint32_t dim, + uint32_t chunk_size) { this->dim = dim; this->chunk_size = chunk_size; // NB: dim should be divisible by the number of 4 byte records in one chunk FAISS_ASSERT(dim % chunk_size == 0); nvec = list_size; code_size = dim * 4; - block_size = utils::roundUp(nvec, raft::neighbors::ivf_flat::kIndexGroupSize); + block_size = + utils::roundUp(nvec, raft::neighbors::ivf_flat::kIndexGroupSize); } -void RaftIVFFlatCodePackerInterleaved::pack_1(const uint8_t* flat_code, size_t offset, uint8_t* block) const { - // printf("packing offset %zu\n", offset); +void RaftIVFFlatCodePackerInterleaved::pack_1( + const uint8_t* flat_code, + size_t offset, + uint8_t* block) const { raft::neighbors::ivf_flat::codepacker::pack_1( - reinterpret_cast(flat_code), - reinterpret_cast(block), - dim, - chunk_size, - static_cast(offset)); + reinterpret_cast(flat_code), + reinterpret_cast(block), + dim, + chunk_size, + static_cast(offset)); } -void RaftIVFFlatCodePackerInterleaved::unpack_1(const uint8_t* block, size_t offset, uint8_t* flat_code) const { +void RaftIVFFlatCodePackerInterleaved::unpack_1( + const uint8_t* block, + size_t offset, + uint8_t* flat_code) const { raft::neighbors::ivf_flat::codepacker::unpack_1( - reinterpret_cast(block), - reinterpret_cast(flat_code), - dim, - chunk_size, - static_cast(offset)); + reinterpret_cast(block), + reinterpret_cast(flat_code), + dim, + chunk_size, + static_cast(offset)); } } // namespace gpu diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index e84fbd665c..1a79207a58 100644 --- 
a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -509,19 +509,19 @@ TEST(TestGpuIndexIVFFlat, AddNaN) { // should not crash EXPECT_EQ(gpuIndex.ntotal, 0); - // gpuIndex.add(numNans, nans.data()); - - // std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); - // std::vector distance(opt.numQuery * opt.k, 0); - // std::vector indices(opt.numQuery * opt.k, 0); - - // // should not crash - // gpuIndex.search( - // opt.numQuery, - // queryVecs.data(), - // opt.k, - // distance.data(), - // indices.data()); + gpuIndex.add(numNans, nans.data()); + + std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + std::vector distance(opt.numQuery * opt.k, 0); + std::vector indices(opt.numQuery * opt.k, 0); + + // should not crash + gpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + distance.data(), + indices.data()); } TEST(TestGpuIndexIVFFlat, UnifiedMemory) { From 82e979160ccefbebc9e53ad29ee06ad6cd99f3b6 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Tue, 8 Aug 2023 16:53:49 -0700 Subject: [PATCH 68/87] cleanup --- faiss/gpu/GpuIndex.cu | 2 - faiss/gpu/impl/IVFFlat.cu | 4 - faiss/gpu/test/TestGpuIndexFlat.cpp | 171 ++++++++++++------------- faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 1 - 4 files changed, 85 insertions(+), 93 deletions(-) diff --git a/faiss/gpu/GpuIndex.cu b/faiss/gpu/GpuIndex.cu index 53a0179334..89952b1121 100644 --- a/faiss/gpu/GpuIndex.cu +++ b/faiss/gpu/GpuIndex.cu @@ -102,8 +102,6 @@ size_t GpuIndex::getMinPagingSize() const { void GpuIndex::add(idx_t n, const float* x) { // Pass to add_with_ids - printf("add called with n = %d\n", n); - raft::print_host_vector("x", x, 5, std::cout); add_with_ids(n, x, nullptr); } diff --git a/faiss/gpu/impl/IVFFlat.cu b/faiss/gpu/impl/IVFFlat.cu index 9fb5603a73..09ffcef2ac 100644 --- a/faiss/gpu/impl/IVFFlat.cu +++ b/faiss/gpu/impl/IVFFlat.cu @@ -92,10 +92,6 @@ std::vector IVFFlat::translateCodesToGpu_( } bool sc = scalarQ_ ? 
true : false; - int bitsPerCode = scalarQ_ ? scalarQ_->bits : 32; - std::cout << "dim_=" << dim_ << ", scalarQ_=" << sc - << ", bitsPerCode=" << bitsPerCode - << ", interleavedLayout_=" << interleavedLayout_ << std::endl; auto up = unpackNonInterleaved(std::move(codes), numVecs, dim_, bitsPerCode); diff --git a/faiss/gpu/test/TestGpuIndexFlat.cpp b/faiss/gpu/test/TestGpuIndexFlat.cpp index fd63af0589..2ab616caf4 100644 --- a/faiss/gpu/test/TestGpuIndexFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexFlat.cpp @@ -663,93 +663,92 @@ TEST(TestRaftGpuIndexFlat, Reconstruct) { #endif void testSearchAndReconstruct(bool use_raft) { - // Construct on a random device to test multi-device, if we have - // multiple devices - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); - - size_t dim = 32; - size_t nb = 5000; - size_t nq = 10; - int k = 10; - - auto xb = faiss::gpu::randVecs(nb, dim); - auto xq = faiss::gpu::randVecs(nq, dim); - - faiss::IndexFlatL2 cpuIndex(dim); - - faiss::gpu::GpuIndexFlatConfig config; - config.device = device; - config.use_raft = use_raft; - faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config); - - cpuIndex.add(nb, xb.data()); - gpuIndex.add(nb, xb.data()); - - std::vector refDistance(nq * k, 0); - std::vector refIndices(nq * k, -1); - std::vector refReconstruct(nq * k * dim, 0); - cpuIndex.search_and_reconstruct( - nq, - xq.data(), - k, - refDistance.data(), - refIndices.data(), - refReconstruct.data()); - - std::vector testDistance(nq * k, 0); - std::vector testIndices(nq * k, -1); - std::vector testReconstruct(nq * k * dim, 0); - gpuIndex.search_and_reconstruct( - nq, - xq.data(), - k, - testDistance.data(), - testIndices.data(), - testReconstruct.data()); - - // This handles the search results - faiss::gpu::compareLists( - refDistance.data(), - refIndices.data(), - testDistance.data(), - testIndices.data(), - nq, - k, - "SearchAndReconstruct", - true, - false, - true, 
- kF32MaxRelErr, - 0.1f, - 0.015f); - - // As the search results may be slightly different (though compareLists - // above will ensure a decent number of matches), reconstruction should be - // the same for the vectors that do match - for (int i = 0; i < nq; ++i) { - std::unordered_map refLocation; - - for (int j = 0; j < k; ++j) { - refLocation.insert(std::make_pair(refIndices[i * k + j], j)); - } - - for (int j = 0; j < k; ++j) { - auto idx = testIndices[i * k + j]; - auto it = refLocation.find(idx); - if (it != refLocation.end()) { - for (int d = 0; d < dim; ++d) { - EXPECT_EQ( - refReconstruct[(i * k + it->second) * dim + d], - testReconstruct[(i * k + j) * dim + d]); - } - } - } - } + // Construct on a random device to test multi-device, if we have + // multiple devices + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + size_t dim = 32; + size_t nb = 5000; + size_t nq = 10; + int k = 10; + + auto xb = faiss::gpu::randVecs(nb, dim); + auto xq = faiss::gpu::randVecs(nq, dim); + + faiss::IndexFlatL2 cpuIndex(dim); + + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.use_raft = use_raft; + faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config); + + cpuIndex.add(nb, xb.data()); + gpuIndex.add(nb, xb.data()); + + std::vector refDistance(nq * k, 0); + std::vector refIndices(nq * k, -1); + std::vector refReconstruct(nq * k * dim, 0); + cpuIndex.search_and_reconstruct( + nq, + xq.data(), + k, + refDistance.data(), + refIndices.data(), + refReconstruct.data()); + + std::vector testDistance(nq * k, 0); + std::vector testIndices(nq * k, -1); + std::vector testReconstruct(nq * k * dim, 0); + gpuIndex.search_and_reconstruct( + nq, + xq.data(), + k, + testDistance.data(), + testIndices.data(), + testReconstruct.data()); + + // This handles the search results + faiss::gpu::compareLists( + refDistance.data(), + refIndices.data(), + testDistance.data(), + 
testIndices.data(), + nq, + k, + "SearchAndReconstruct", + true, + false, + true, + kF32MaxRelErr, + 0.1f, + 0.015f); + + // As the search results may be slightly different (though compareLists + // above will ensure a decent number of matches), reconstruction should be + // the same for the vectors that do match + for (int i = 0; i < nq; ++i) { + std::unordered_map refLocation; + + for (int j = 0; j < k; ++j) { + refLocation.insert(std::make_pair(refIndices[i * k + j], j)); + } + + for (int j = 0; j < k; ++j) { + auto idx = testIndices[i * k + j]; + auto it = refLocation.find(idx); + if (it != refLocation.end()) { + for (int d = 0; d < dim; ++d) { + EXPECT_EQ( + refReconstruct[(i * k + it->second) * dim + d], + testReconstruct[(i * k + j) * dim + d]); + } + } + } + } } - TEST(TestGpuIndexFlat, SearchAndReconstruct) { testSearchAndReconstruct(false); } diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index 1a79207a58..2f2fd87cd1 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -232,7 +232,6 @@ void copyToTest(bool useFloat16CoarseQuantizer) { compFloat16 ? 
0.30f : 0.015f); } - void copyFromTest(bool useFloat16CoarseQuantizer) { Options opt; std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); From 8486b9b1d45cd1920e1e15d560fd7d60ac8fe3c3 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Tue, 8 Aug 2023 17:06:29 -0700 Subject: [PATCH 69/87] cleanup --- faiss/gpu/GpuIndexIVFFlat.cu | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 3458177dd3..3a67d2240c 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -11,13 +11,13 @@ #include #include #include -#include #include #include #include #include #include +#include #include #include #include @@ -107,7 +107,6 @@ void GpuIndexIVFFlat::set_index_( IndicesOptions indicesOptions, MemorySpace space) { if (config_.use_raft) { - printf("Setting RaftIVFFlat index\n"); index_.reset(new RaftIVFFlat( resources, dim, @@ -237,7 +236,6 @@ void GpuIndexIVFFlat::updateQuantizer() { } void GpuIndexIVFFlat::train(idx_t n, const float* x) { - printf("Inside train"); DeviceScope scope(config_.device); // just in case someone changed our quantizer @@ -254,12 +252,12 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) { // First, make sure that the data is resident on the CPU, if it is not on // the CPU, as we depend upon parts of the CPU code if (!config_.use_raft) { - auto hostData = toHost( - (float*)x, - resources_->getDefaultStream(config_.device), - {n, this->d}); + auto hostData = toHost( + (float*)x, + resources_->getDefaultStream(config_.device), + {n, this->d}); - trainQuantizer_(n, hostData.data()); + trainQuantizer_(n, hostData.data()); } // The quantizer is now trained; construct the IVF index @@ -274,7 +272,7 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) { ivfFlatConfig_.interleavedLayout, ivfFlatConfig_.indicesOptions, config_.memorySpace); - + if (!config_.use_raft && reserveMemoryVecs_) { 
index_->reserveMemory(reserveMemoryVecs_); } @@ -291,10 +289,11 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) { raft_idx_params.kmeans_n_iters = cp.niter; raft_idx_params.adaptive_centers = !cp.frozen_centroids; - printf("raft_idx_params.k_means_n_iters %u\n", cp.niter); - - std::dynamic_pointer_cast(index_)->set_index_(std::make_optional>(raft::neighbors::ivf_flat::build( - raft_handle, raft_idx_params, x, n, (idx_t)d))); + std::dynamic_pointer_cast(index_)->set_index_( + std::make_optional< + raft::neighbors::ivf_flat::index>( + raft::neighbors::ivf_flat::build( + raft_handle, raft_idx_params, x, n, (idx_t)d))); } this->is_trained = true; From 38215bc9cbe166337367ffddf51c997260f42082 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Tue, 8 Aug 2023 17:15:03 -0700 Subject: [PATCH 70/87] cleanup --- faiss/gpu/impl/IVFBase.cu | 1 - faiss/gpu/impl/IVFFlat.cu | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/faiss/gpu/impl/IVFBase.cu b/faiss/gpu/impl/IVFBase.cu index 1e2f414fc0..890d489440 100644 --- a/faiss/gpu/impl/IVFBase.cu +++ b/faiss/gpu/impl/IVFBase.cu @@ -323,7 +323,6 @@ std::vector IVFBase::getListVectorData(idx_t listId, bool gpuFormat) } void IVFBase::copyInvertedListsFrom(const InvertedLists* ivf) { - printf("inside ivf-flat's copyInvertedListsFrom\n"); idx_t nlist = ivf ? 
ivf->nlist : 0; for (idx_t i = 0; i < nlist; ++i) { addEncodedVectorsToList_( diff --git a/faiss/gpu/impl/IVFFlat.cu b/faiss/gpu/impl/IVFFlat.cu index 09ffcef2ac..376b98ef06 100644 --- a/faiss/gpu/impl/IVFFlat.cu +++ b/faiss/gpu/impl/IVFFlat.cu @@ -126,6 +126,7 @@ void IVFFlat::appendVectors_( // // Append the new encodings // + // Append indices to the IVF lists runIVFIndicesAppend( listIds, From ac678974a7059da7f7cdb6252c43507c747ea48b Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Tue, 8 Aug 2023 17:19:39 -0700 Subject: [PATCH 71/87] cleanup --- faiss/gpu/impl/IVFFlat.cu | 8 -------- 1 file changed, 8 deletions(-) diff --git a/faiss/gpu/impl/IVFFlat.cu b/faiss/gpu/impl/IVFFlat.cu index 376b98ef06..ac6f155aeb 100644 --- a/faiss/gpu/impl/IVFFlat.cu +++ b/faiss/gpu/impl/IVFFlat.cu @@ -91,8 +91,6 @@ std::vector IVFFlat::translateCodesToGpu_( return codes; } - bool sc = scalarQ_ ? true : false; - auto up = unpackNonInterleaved(std::move(codes), numVecs, dim_, bitsPerCode); return packInterleaved(std::move(up), numVecs, dim_, bitsPerCode); @@ -286,15 +284,9 @@ void IVFFlat::searchPreassigned( void IVFFlat::searchImpl_( Tensor& queries, - /** - * - */ Tensor& coarseDistances, Tensor& coarseIndices, - /** - * This is raft::neighbors::ivf_flat::index::centers_ - */ Tensor& ivfCentroids, int k, Tensor& outDistances, From 94817aa6e2f1382ff674a45b784f91f3764f6fd4 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Tue, 8 Aug 2023 17:23:57 -0700 Subject: [PATCH 72/87] cleanup --- faiss/gpu/impl/IVFFlat.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faiss/gpu/impl/IVFFlat.cu b/faiss/gpu/impl/IVFFlat.cu index ac6f155aeb..ac06fd0156 100644 --- a/faiss/gpu/impl/IVFFlat.cu +++ b/faiss/gpu/impl/IVFFlat.cu @@ -58,7 +58,7 @@ size_t IVFFlat::getGpuVectorsEncodingSize_(idx_t numVecs) const { idx_t bits = scalarQ_ ? 
scalarQ_->bits : 32 /* float */; // bytes to encode a block of 32 vectors (single dimension) - idx_t bytesPerDimBlock = bits * 32 / 8; // = 128 if bits == 32 + idx_t bytesPerDimBlock = bits * 32 / 8; // bytes to fully encode 32 vectors idx_t bytesPerBlock = bytesPerDimBlock * dim_; From 91b1e32e64c62cecde88db5208cf6c1e6841a1ba Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Tue, 8 Aug 2023 17:26:17 -0700 Subject: [PATCH 73/87] cleanup --- faiss/gpu/impl/IVFFlat.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faiss/gpu/impl/IVFFlat.cu b/faiss/gpu/impl/IVFFlat.cu index ac06fd0156..4607e49870 100644 --- a/faiss/gpu/impl/IVFFlat.cu +++ b/faiss/gpu/impl/IVFFlat.cu @@ -91,6 +91,8 @@ std::vector IVFFlat::translateCodesToGpu_( return codes; } + int bitsPerCode = scalarQ_ ? scalarQ_->bits : 32; + auto up = unpackNonInterleaved(std::move(codes), numVecs, dim_, bitsPerCode); return packInterleaved(std::move(up), numVecs, dim_, bitsPerCode); @@ -283,10 +285,8 @@ void IVFFlat::searchPreassigned( void IVFFlat::searchImpl_( Tensor& queries, - Tensor& coarseDistances, Tensor& coarseIndices, - Tensor& ivfCentroids, int k, Tensor& outDistances, From 613ca7a0e71165ed28047e12c1a1b3f15d9c238c Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Tue, 8 Aug 2023 17:48:08 -0700 Subject: [PATCH 74/87] cleanup --- faiss/gpu/GpuIndexIVFFlat.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 3a67d2240c..495fdc1dbf 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -146,7 +146,6 @@ void GpuIndexIVFFlat::reserveMemory(size_t numVecs) { } void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { - printf("Inside copyFrom\n"); DeviceScope scope(config_.device); // This will copy GpuIndexIVF data such as the coarse quantizer From c43c83f92df5047beb40b4b6e45d4123a508e212 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Wed, 9 Aug 2023 13:59:28 -0700 Subject: [PATCH 75/87] 
Separate out nan filtering --- faiss/gpu/impl/RaftIVFFlat.cu | 140 ++++++++++++++++----------------- faiss/gpu/impl/RaftIVFFlat.cuh | 2 + 2 files changed, 69 insertions(+), 73 deletions(-) diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 9a08fb7d51..63d4936743 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -90,13 +90,14 @@ void RaftIVFFlat::search( Tensor& outIndices) { // TODO: We probably don't want to ignore the coarse quantizer here... - uint32_t n = queries.getSize(0); + uint32_t numQueries = queries.getSize(0); uint32_t cols = queries.getSize(1); uint32_t k_ = k; // Device is already set in GpuIndex::search FAISS_ASSERT(raft_knn_index.has_value()); - FAISS_ASSERT(n > 0); + FAISS_ASSERT(numQueries > 0); + FAISS_ASSERT(cols == dim_); FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= numLists_); const raft::device_resources& raft_handle = @@ -104,35 +105,13 @@ void RaftIVFFlat::search( raft::neighbors::ivf_flat::search_params pams; pams.n_probes = nprobe; - uint32_t n_rows = n; - - auto nan_flag = raft::make_device_vector(raft_handle, n_rows); - - thrust::fill_n( - raft_handle.get_thrust_policy(), - nan_flag.data_handle(), - n_rows, - true); - raft::linalg::map_offset( - raft_handle, - nan_flag.view(), - [queries = queries.data(), dim_ = this->dim_] __device__(idx_t i) { - for (idx_t col = 0; col < dim_; col++) { - if (!isfinite(queries[i * dim_ + col])) { - return false; - } - } - return true; - }); - - // TODO: We probably don't want to ignore the coarse quantizer here - auto queries_view = raft::make_device_matrix_view( - queries.data(), n_rows, cols); - auto out_inds_view = - raft::make_device_matrix_view(outIndices.data(), n_rows, k_); + queries.data(), numQueries, cols); + auto out_inds_view = raft::make_device_matrix_view( + outIndices.data(), numQueries, k_); auto out_dists_view = raft::make_device_matrix_view( - outDistances.data(), n_rows, k_); + outDistances.data(), numQueries, k_); + 
raft::neighbors::ivf_flat::search( raft_handle, pams, @@ -140,10 +119,15 @@ void RaftIVFFlat::search( queries_view, out_inds_view, out_dists_view); - float max_val = std::numeric_limits::max(); + + /// Identify NaN rows and mask their nearest neighbors + auto nan_flag = raft::make_device_vector(raft_handle, numQueries); + + validRowIndices_(queries, nan_flag.data_handle()); + raft::linalg::map_offset( raft_handle, - raft::make_device_vector_view(outIndices.data(), n_rows * k_), + raft::make_device_vector_view(outIndices.data(), numQueries * k_), [nan_flag = nan_flag.data_handle(), out_inds = outIndices.data(), k_] __device__(uint32_t i) { @@ -152,9 +136,11 @@ void RaftIVFFlat::search( return idx_t(-1); return out_inds[i]; }); + + float max_val = std::numeric_limits::max(); raft::linalg::map_offset( raft_handle, - raft::make_device_vector_view(outDistances.data(), n_rows * k_), + raft::make_device_vector_view(outDistances.data(), numQueries * k_), [nan_flag = nan_flag.data_handle(), out_dists = outDistances.data(), max_val, @@ -164,8 +150,6 @@ void RaftIVFFlat::search( return max_val; return out_dists[i]; }); - - raft_handle.sync_stream(); } /// Classify and encode/add vectors to our IVF lists. 
@@ -184,57 +168,45 @@ idx_t RaftIVFFlat::addVectors( /// Remove NaN values auto nan_flag = raft::make_device_vector(raft_handle, n_rows); - thrust::fill_n( - raft_handle.get_thrust_policy(), - nan_flag.data_handle(), - n_rows, - true); - raft::linalg::map_offset( - raft_handle, - nan_flag.view(), - [vecs = vecs.data(), dim_ = this->dim_] __device__(idx_t i) { - for (idx_t col = 0; col < dim_; col++) { - if (!isfinite(vecs[i * dim_ + col])) { - return false; - } - } - return true; - }); - raft_handle.sync_stream(); + validRowIndices_(vecs, nan_flag.data_handle()); + idx_t n_rows_valid = thrust::reduce( raft_handle.get_thrust_policy(), nan_flag.data_handle(), nan_flag.data_handle() + n_rows, 0); - auto gather_indices = - raft::make_device_vector(raft_handle, n_rows_valid); - auto count = thrust::make_counting_iterator(0); - thrust::copy_if( - raft_handle.get_thrust_policy(), - count, - count + n_rows, - gather_indices.data_handle(), - [nan_flag = nan_flag.data_handle()] __device__(auto i) { - return nan_flag[i]; - }); + if (n_rows_valid < n_rows) { + auto gather_indices = raft::make_device_vector( + raft_handle, n_rows_valid); + + auto count = thrust::make_counting_iterator(0); + + thrust::copy_if( + raft_handle.get_thrust_policy(), + count, + count + n_rows, + gather_indices.data_handle(), + [nan_flag = nan_flag.data_handle()] __device__(auto i) { + return nan_flag[i]; + }); + raft::matrix::gather( raft_handle, raft::make_device_matrix_view( vecs.data(), n_rows, dim_), raft::make_const_mdspan(gather_indices.view()), (idx_t)16); - } - auto valid_indices = - raft::make_device_vector(raft_handle, n_rows); - raft::matrix::gather( - raft_handle, - raft::make_device_matrix_view( - indices.data(), n_rows, (idx_t)1), - raft::make_const_mdspan(gather_indices.view()), - raft::make_device_matrix_view( - valid_indices.data_handle(), n_rows_valid, (idx_t)1)); + auto valid_indices = raft::make_device_vector( + raft_handle, n_rows_valid); + + raft::matrix::gather( + raft_handle, + 
raft::make_device_matrix_view( + indices.data(), n_rows, (idx_t)1), + raft::make_const_mdspan(gather_indices.view())); + } /// TODO: We probably don't want to ignore the coarse quantizer here @@ -244,7 +216,8 @@ idx_t RaftIVFFlat::addVectors( raft::make_device_matrix_view( vecs.data(), n_rows_valid, dim_), std::make_optional>( - valid_indices.view()), + raft::make_device_vector_view( + indices.data(), n_rows_valid)), raft_knn_index.value())); return n_rows_valid; @@ -551,6 +524,27 @@ void RaftIVFFlat::addEncodedVectorsToList_( raft_handle.sync_stream(); } +void RaftIVFFlat::validRowIndices_( + Tensor& vecs, + bool* nan_flag) { + raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + idx_t n_rows = vecs.getSize(0); + + thrust::fill_n(raft_handle.get_thrust_policy(), nan_flag, n_rows, true); + raft::linalg::map_offset( + raft_handle, + raft::make_device_vector_view(nan_flag, n_rows), + [vecs = vecs.data(), dim_ = this->dim_] __device__(idx_t i) { + for (idx_t col = 0; col < dim_; col++) { + if (!isfinite(vecs[i * dim_ + col])) { + return false; + } + } + return true; + }); +} + RaftIVFFlatCodePackerInterleaved::RaftIVFFlatCodePackerInterleaved( size_t list_size, uint32_t dim, diff --git a/faiss/gpu/impl/RaftIVFFlat.cuh b/faiss/gpu/impl/RaftIVFFlat.cuh index 30a7378570..1cd2e18307 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cuh +++ b/faiss/gpu/impl/RaftIVFFlat.cuh @@ -108,6 +108,8 @@ class RaftIVFFlat : public IVFFlat { void copyInvertedListsFrom(const InvertedLists* ivf) override; void set_index_(std::optional> idx); + + void validRowIndices_(Tensor& vecs, bool* nan_flag); protected: /// Adds a set of codes and indices to a list, with the representation From 7eb5209d8c122273e1ed0dbae18fc90965aef364 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Wed, 9 Aug 2023 16:38:35 -0700 Subject: [PATCH 76/87] Add USE_NVIDIA_RAFT --- faiss/gpu/GpuIndexIVFFlat.cu | 30 +- faiss/gpu/StandardGpuResources.cpp | 2 + faiss/gpu/impl/IVFFlat.cuh | 5 - 
faiss/gpu/impl/RaftIVFFlat.cu | 10 +- faiss/gpu/impl/RaftIVFFlat.cuh | 34 +- faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 485 ++++++++++++++----------- 6 files changed, 322 insertions(+), 244 deletions(-) diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 495fdc1dbf..9422f6bc56 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -11,17 +11,18 @@ #include #include #include -#include #include -#include #include #include +#if defined USE_NVIDIA_RAFT +#include #include #include #include #include #include +#endif #include @@ -106,6 +107,8 @@ void GpuIndexIVFFlat::set_index_( bool interleavedLayout, IndicesOptions indicesOptions, MemorySpace space) { +#if defined USE_NVIDIA_RAFT + if (config_.use_raft) { index_.reset(new RaftIVFFlat( resources, @@ -118,7 +121,14 @@ void GpuIndexIVFFlat::set_index_( interleavedLayout, indicesOptions, space)); - } else { + } else +#else + if (config_.use_raft) { + FAISS_THROW_MSG( + "RAFT has not been compiled into the current version so it cannot be used."); + } else +#endif + { index_.reset(new IVFFlat( resources, dim, @@ -151,10 +161,14 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { // This will copy GpuIndexIVF data such as the coarse quantizer GpuIndexIVF::copyFrom(index); + printf("GpuIndexIVFcopyFrom done\n"); + // Clear out our old data index_.reset(); baseIndex_.reset(); + printf("indices reset\n"); + // The other index might not be trained if (!index->is_trained) { FAISS_ASSERT(!is_trained); @@ -177,7 +191,7 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { ivfFlatConfig_.indicesOptions, config_.memorySpace); - // Copy all of the IVF data + // Copy all of the IVF data printf("Copying inverted lists from cpu index to FAISS gpu index flat\n"); index_->copyInvertedListsFrom(index->invlists); } @@ -276,6 +290,8 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) { index_->reserveMemory(reserveMemoryVecs_); } +#if defined USE_NVIDIA_RAFT 
+ if (config_.use_raft) { const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); @@ -294,6 +310,12 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) { raft::neighbors::ivf_flat::build( raft_handle, raft_idx_params, x, n, (idx_t)d))); } +#else + if (config_.use_raft) { + FAISS_THROW_MSG( + "RAFT has not been compiled into the current version so it cannot be used."); + } +#endif this->is_trained = true; } diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp index e9ad2e62fc..4e8701ab03 100644 --- a/faiss/gpu/StandardGpuResources.cpp +++ b/faiss/gpu/StandardGpuResources.cpp @@ -361,7 +361,9 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) { defaultStreams_[device] = defaultStream; +#if defined USE_NVIDIA_RAFT raftHandles_.emplace(std::make_pair(device, defaultStream)); +#endif cudaStream_t asyncCopyStream = 0; CUDA_VERIFY( diff --git a/faiss/gpu/impl/IVFFlat.cuh b/faiss/gpu/impl/IVFFlat.cuh index 726d62c1da..246fc18b16 100644 --- a/faiss/gpu/impl/IVFFlat.cuh +++ b/faiss/gpu/impl/IVFFlat.cuh @@ -7,11 +7,6 @@ #pragma once -#if defined USE_NVIDIA_RAFT -#include -#include -#endif - #include #include diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 63d4936743..b754ee8876 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -19,12 +19,9 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include -#include #include #include #include -#include #include #include @@ -47,10 +44,11 @@ #include #include +#include +#include +#include +#include #include -#include - -#include namespace faiss { namespace gpu { diff --git a/faiss/gpu/impl/RaftIVFFlat.cuh b/faiss/gpu/impl/RaftIVFFlat.cuh index 1cd2e18307..a9c013f68a 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cuh +++ b/faiss/gpu/impl/RaftIVFFlat.cuh @@ -22,12 +22,11 @@ #pragma once -#include #include #include -#include #include +#include #include @@ -85,7 +84,7 @@ class RaftIVFFlat : public IVFFlat { Tensor& indices) override; /// Reserve GPU memory in our inverted lists for this number of vectors -// void reserveMemory(idx_t numVecs) override; + // void reserveMemory(idx_t numVecs) override; /// Clear out all inverted lists, but retain the coarse quantizer /// and the product quantizer info @@ -107,22 +106,25 @@ class RaftIVFFlat : public IVFFlat { /// Copy all inverted lists from a CPU representation to ourselves void copyInvertedListsFrom(const InvertedLists* ivf) override; - void set_index_(std::optional> idx); + /// Update the raft index + void set_index_( + std::optional> idx); + /// Filter out matrix rows containing NaN values void validRowIndices_(Tensor& vecs, bool* nan_flag); - + protected: - /// Adds a set of codes and indices to a list, with the representation - /// coming from the CPU equivalent - void addEncodedVectorsToList_( + /// Adds a set of codes and indices to a list, with the representation + /// coming from the CPU equivalent + void addEncodedVectorsToList_( idx_t listId, // resident on the host const void* codes, // resident on the host const idx_t* indices, idx_t numVecs) override; - - /// Returns the number of bytes in which an IVF list containing numVecs + + /// Returns the number of bytes in which an IVF list containing numVecs /// vectors is encoded on the device. 
Note that due to padding this is not /// the same as the encoding size for a subset of vectors in an IVF list; /// this is the size for an entire IVF list @@ -132,17 +134,19 @@ class RaftIVFFlat : public IVFFlat { raft_knn_index{std::nullopt}; }; - struct RaftIVFFlatCodePackerInterleaved : CodePacker { - RaftIVFFlatCodePackerInterleaved(size_t list_size, uint32_t dim, uint32_t chuk_size); + RaftIVFFlatCodePackerInterleaved( + size_t list_size, + uint32_t dim, + uint32_t chuk_size); void pack_1(const uint8_t* flat_code, size_t offset, uint8_t* block) const final; void unpack_1(const uint8_t* block, size_t offset, uint8_t* flat_code) const final; - protected: - uint32_t chunk_size; - uint32_t dim; + protected: + uint32_t chunk_size; + uint32_t dim; }; } // namespace gpu diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index 2f2fd87cd1..8c092be7cb 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -35,7 +35,6 @@ constexpr float kF16MaxRelErr = 0.3f; constexpr float kF32MaxRelErr = 0.03f; - struct Options { Options() { numAdd = 2 * faiss::gpu::randVal(2000, 5000); @@ -56,6 +55,11 @@ struct Options { faiss::gpu::INDICES_64_BIT}); device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + + raftOpt.push_back(false); +#if defined USE_NVIDIA_RAFT + raftOpt.push_back(true); +#endif } std::string toString() const { @@ -77,6 +81,7 @@ struct Options { int k; int device; faiss::gpu::IndicesOptions indicesOpt; + std::vector raftOpt; }; void queryTest( @@ -106,28 +111,35 @@ void queryTest( faiss::gpu::GpuIndexIVFFlatConfig config; config.device = opt.device; config.indicesOptions = opt.indicesOpt; - config.use_raft = true; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); - gpuIndex.copyFrom(&cpuIndex); - gpuIndex.nprobe = opt.nprobe; - - bool compFloat16 = 
useFloat16CoarseQuantizer; - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - // FIXME: the fp16 bounds are - // useless when math (the accumulator) is - // in fp16. Figure out another way to test - compFloat16 ? 0.70f : 0.1f, - compFloat16 ? 0.65f : 0.015f); + for (int i = 0; i < opt.raftOpt.size(); i++) { + config.use_raft = opt.raftOpt[i]; + + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, + cpuIndex.d, + cpuIndex.nlist, + cpuIndex.metric_type, + config); + gpuIndex.copyFrom(&cpuIndex); + gpuIndex.nprobe = opt.nprobe; + + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + // FIXME: the fp16 bounds are + // useless when math (the accumulator) is + // in fp16. Figure out another way to test + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 0.65f : 0.015f); + } } } @@ -157,27 +169,39 @@ void addTest(faiss::MetricType metricType, bool useFloat16CoarseQuantizer) { config.device = opt.device; config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - config.use_raft = true; - - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); - gpuIndex.copyFrom(&cpuIndex); - gpuIndex.nprobe = opt.nprobe; - cpuIndex.add(opt.numAdd, addVecs.data()); - gpuIndex.add(opt.numAdd, addVecs.data()); - - bool compFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - compFloat16 ? 0.70f : 0.1f, - compFloat16 ? 
0.30f : 0.015f); + for (int i = 0; i < opt.raftOpt.size(); i++) { + printf("i %d\n", i); + config.use_raft = opt.raftOpt[i]; + + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, + cpuIndex.d, + cpuIndex.nlist, + cpuIndex.metric_type, + config); + printf("gpuindex created"); + gpuIndex.copyFrom(&cpuIndex); + printf("copyfrom done"); + gpuIndex.nprobe = opt.nprobe; + + cpuIndex.add(opt.numAdd, addVecs.data()); + gpuIndex.add(opt.numAdd, addVecs.data()); + printf("gpu vectors added"); + + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 0.30f : 0.015f); + printf("indices compared"); + } } } @@ -194,42 +218,46 @@ void copyToTest(bool useFloat16CoarseQuantizer) { config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); - gpuIndex.train(opt.numTrain, trainVecs.data()); - gpuIndex.add(opt.numAdd, addVecs.data()); - gpuIndex.nprobe = opt.nprobe; + for (int i = 0; i < opt.raftOpt.size(); i++) { + config.use_raft = opt.raftOpt[i]; - // use garbage values to see if we overwrite then - faiss::IndexFlatL2 cpuQuantizer(1); - faiss::IndexIVFFlat cpuIndex(&cpuQuantizer, 1, 1, faiss::METRIC_L2); - cpuIndex.nprobe = 1; + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); + gpuIndex.train(opt.numTrain, trainVecs.data()); + gpuIndex.add(opt.numAdd, addVecs.data()); + gpuIndex.nprobe = opt.nprobe; + + // use garbage values to see if we overwrite then + faiss::IndexFlatL2 cpuQuantizer(1); + faiss::IndexIVFFlat cpuIndex(&cpuQuantizer, 1, 1, faiss::METRIC_L2); + cpuIndex.nprobe = 1; + + gpuIndex.copyTo(&cpuIndex); - gpuIndex.copyTo(&cpuIndex); - - EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); - 
EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); - - EXPECT_EQ(cpuIndex.d, gpuIndex.d); - EXPECT_EQ(cpuIndex.quantizer->d, gpuIndex.quantizer->d); - EXPECT_EQ(cpuIndex.d, opt.dim); - EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); - EXPECT_EQ(cpuIndex.nprobe, gpuIndex.nprobe); - - testIVFEquality(cpuIndex, gpuIndex); - - // Query both objects; results should be equivalent - bool compFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - compFloat16 ? 0.70f : 0.1f, - compFloat16 ? 0.30f : 0.015f); + EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); + EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); + + EXPECT_EQ(cpuIndex.d, gpuIndex.d); + EXPECT_EQ(cpuIndex.quantizer->d, gpuIndex.quantizer->d); + EXPECT_EQ(cpuIndex.d, opt.dim); + EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); + EXPECT_EQ(cpuIndex.nprobe, gpuIndex.nprobe); + + testIVFEquality(cpuIndex, gpuIndex); + + // Query both objects; results should be equivalent + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 
0.30f : 0.015f); + } } void copyFromTest(bool useFloat16CoarseQuantizer) { @@ -252,35 +280,39 @@ void copyFromTest(bool useFloat16CoarseQuantizer) { config.device = opt.device; config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - config.use_raft = true; - - faiss::gpu::GpuIndexIVFFlat gpuIndex(&res, 1, 1, faiss::METRIC_L2, config); - gpuIndex.nprobe = 1; - - gpuIndex.copyFrom(&cpuIndex); - - EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); - EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); - - EXPECT_EQ(cpuIndex.d, gpuIndex.d); - EXPECT_EQ(cpuIndex.d, opt.dim); - EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); - EXPECT_EQ(cpuIndex.nprobe, gpuIndex.nprobe); - - testIVFEquality(cpuIndex, gpuIndex); - - // Query both objects; results should be equivalent - bool compFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - compFloat16 ? 0.70f : 0.1f, - compFloat16 ? 0.30f : 0.015f); + + for (int i = 0; i < opt.raftOpt.size(); i++) { + config.use_raft = opt.raftOpt[i]; + + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, 1, 1, faiss::METRIC_L2, config); + gpuIndex.nprobe = 1; + + gpuIndex.copyFrom(&cpuIndex); + + EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); + EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); + + EXPECT_EQ(cpuIndex.d, gpuIndex.d); + EXPECT_EQ(cpuIndex.d, opt.dim); + EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); + EXPECT_EQ(cpuIndex.nprobe, gpuIndex.nprobe); + + testIVFEquality(cpuIndex, gpuIndex); + + // Query both objects; results should be equivalent + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 
0.30f : 0.015f); + } } TEST(TestGpuIndexIVFFlat, Float32_32_Add_L2) { @@ -399,36 +431,39 @@ TEST(TestGpuIndexIVFFlat, Float32_negative) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = opt.device; config.indicesOptions = opt.indicesOpt; - config.use_raft = true; - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); - gpuIndex.copyFrom(&cpuIndex); - gpuIndex.nprobe = opt.nprobe; + for (int i = 0; i < opt.raftOpt.size(); i++) { + config.use_raft = opt.raftOpt[i]; - // Construct a positive test set - auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); + gpuIndex.copyFrom(&cpuIndex); + gpuIndex.nprobe = opt.nprobe; - // Put all vecs on positive size - for (auto& f : queryVecs) { - f = std::abs(f); - } + // Construct a positive test set + auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); - bool compFloat16 = false; - faiss::gpu::compareIndices( - queryVecs, - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - // FIXME: the fp16 bounds are - // useless when math (the accumulator) is - // in fp16. Figure out another way to test - compFloat16 ? 0.99f : 0.1f, - compFloat16 ? 0.65f : 0.015f); + // Put all vecs on positive size + for (auto& f : queryVecs) { + f = std::abs(f); + } + + bool compFloat16 = false; + faiss::gpu::compareIndices( + queryVecs, + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + // FIXME: the fp16 bounds are + // useless when math (the accumulator) is + // in fp16. Figure out another way to test + compFloat16 ? 0.99f : 0.1f, + compFloat16 ? 
0.65f : 0.015f); + } } // @@ -448,31 +483,34 @@ TEST(TestGpuIndexIVFFlat, QueryNaN) { config.device = opt.device; config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = faiss::gpu::randBool(); - config.use_raft = true; - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); - gpuIndex.nprobe = opt.nprobe; + for (int i = 0; i < opt.raftOpt.size(); i++) { + config.use_raft = opt.raftOpt[i]; - gpuIndex.train(opt.numTrain, trainVecs.data()); - gpuIndex.add(opt.numAdd, addVecs.data()); + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); + gpuIndex.nprobe = opt.nprobe; - int numQuery = 10; - std::vector nans( - numQuery * opt.dim, std::numeric_limits::quiet_NaN()); + gpuIndex.train(opt.numTrain, trainVecs.data()); + gpuIndex.add(opt.numAdd, addVecs.data()); + + int numQuery = 10; + std::vector nans( + numQuery * opt.dim, std::numeric_limits::quiet_NaN()); - std::vector distances(numQuery * opt.k, 0); - std::vector indices(numQuery * opt.k, 0); + std::vector distances(numQuery * opt.k, 0); + std::vector indices(numQuery * opt.k, 0); - gpuIndex.search( - numQuery, nans.data(), opt.k, distances.data(), indices.data()); + gpuIndex.search( + numQuery, nans.data(), opt.k, distances.data(), indices.data()); - for (int q = 0; q < numQuery; ++q) { - for (int k = 0; k < opt.k; ++k) { - EXPECT_EQ(indices[q * opt.k + k], -1); - EXPECT_EQ( - distances[q * opt.k + k], - std::numeric_limits::max()); + for (int q = 0; q < numQuery; ++q) { + for (int k = 0; k < opt.k; ++k) { + EXPECT_EQ(indices[q * opt.k + k], -1); + EXPECT_EQ( + distances[q * opt.k + k], + std::numeric_limits::max()); + } } } } @@ -487,40 +525,45 @@ TEST(TestGpuIndexIVFFlat, AddNaN) { config.device = opt.device; config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = faiss::gpu::randBool(); - config.use_raft = true; - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, opt.dim, opt.numCentroids, 
faiss::METRIC_L2, config); - gpuIndex.nprobe = opt.nprobe; + for (int i = 0; i < opt.raftOpt.size(); i++) { + config.use_raft = opt.raftOpt[i]; - int numNans = 10; - std::vector nans( - numNans * opt.dim, std::numeric_limits::quiet_NaN()); + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); + gpuIndex.nprobe = opt.nprobe; - // Make one vector valid (not the first vector, in order to test offset - // issues), which should actually add - for (int i = 0; i < opt.dim; ++i) { - nans[opt.dim + i] = i; - } + int numNans = 10; + std::vector nans( + numNans * opt.dim, std::numeric_limits::quiet_NaN()); - std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); - gpuIndex.train(opt.numTrain, trainVecs.data()); - - // should not crash - EXPECT_EQ(gpuIndex.ntotal, 0); - gpuIndex.add(numNans, nans.data()); - - std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); - std::vector distance(opt.numQuery * opt.k, 0); - std::vector indices(opt.numQuery * opt.k, 0); - - // should not crash - gpuIndex.search( - opt.numQuery, - queryVecs.data(), - opt.k, - distance.data(), - indices.data()); + // Make one vector valid (not the first vector, in order to test offset + // issues), which should actually add + for (int i = 0; i < opt.dim; ++i) { + nans[opt.dim + i] = i; + } + + std::vector trainVecs = + faiss::gpu::randVecs(opt.numTrain, opt.dim); + gpuIndex.train(opt.numTrain, trainVecs.data()); + + // should not crash + EXPECT_EQ(gpuIndex.ntotal, 0); + gpuIndex.add(numNans, nans.data()); + + std::vector queryVecs = + faiss::gpu::randVecs(opt.numQuery, opt.dim); + std::vector distance(opt.numQuery * opt.k, 0); + std::vector indices(opt.numQuery * opt.k, 0); + + // should not crash + gpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + distance.data(), + indices.data()); + } } TEST(TestGpuIndexIVFFlat, UnifiedMemory) { @@ -543,6 +586,11 @@ TEST(TestGpuIndexIVFFlat, UnifiedMemory) { int numQuery = 10; int k 
= 10; int nprobe = 8; + std::vector raftOpt; + raftOpt.push_back(false); + #if defined USE_NVIDIA_RAFT + raftOpt.push_back(true); + #endif std::vector trainVecs = faiss::gpu::randVecs(numTrain, dim); std::vector addVecs = faiss::gpu::randVecs(numAdd, dim); @@ -561,23 +609,25 @@ TEST(TestGpuIndexIVFFlat, UnifiedMemory) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = device; config.memorySpace = faiss::gpu::MemorySpace::Unified; - config.use_raft = true; - - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, dim, numCentroids, faiss::METRIC_L2, config); - gpuIndex.copyFrom(&cpuIndex); - gpuIndex.nprobe = nprobe; - - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - numQuery, - dim, - k, - "Unified Memory", - kF32MaxRelErr, - 0.1f, - 0.015f); + for (int i = 0; i < raftOpt.size(); i++) { + config.use_raft = raftOpt[i]; + + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, dim, numCentroids, faiss::METRIC_L2, config); + gpuIndex.copyFrom(&cpuIndex); + gpuIndex.nprobe = nprobe; + + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + numQuery, + dim, + k, + "Unified Memory", + kF32MaxRelErr, + 0.1f, + 0.015f); + } } TEST(TestGpuIndexIVFFlat, LongIVFList) { @@ -602,6 +652,11 @@ TEST(TestGpuIndexIVFFlat, LongIVFList) { size_t numTrain = 100; int numQuery = 5; int k = 10; + std::vector raftOpt; + raftOpt.push_back(false); + #if defined USE_NVIDIA_RAFT + raftOpt.push_back(true); + #endif std::vector trainVecs = faiss::gpu::randVecs(numTrain, dim); std::vector addVecs = faiss::gpu::randVecs(numAdd, dim); @@ -619,24 +674,26 @@ TEST(TestGpuIndexIVFFlat, LongIVFList) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = device; - config.use_raft = true; - - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, dim, numCentroids, faiss::METRIC_L2, config); - gpuIndex.train(numTrain, trainVecs.data()); - gpuIndex.add(numAdd, addVecs.data()); - gpuIndex.nprobe = 1; - - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - numQuery, - dim, - k, - "Unified Memory", - 
kF32MaxRelErr, - 0.1f, - 0.015f); + for (int i = 0; i < raftOpt.size(); i++) { + config.use_raft = raftOpt[i]; + + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, dim, numCentroids, faiss::METRIC_L2, config); + gpuIndex.train(numTrain, trainVecs.data()); + gpuIndex.add(numAdd, addVecs.data()); + gpuIndex.nprobe = 1; + + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + numQuery, + dim, + k, + "Unified Memory", + kF32MaxRelErr, + 0.1f, + 0.015f); + } } int main(int argc, char** argv) { From db1774b7eeac7c63b8b08ef1a19123a0c3f6228a Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Mon, 14 Aug 2023 06:29:42 -0700 Subject: [PATCH 77/87] Update test --- faiss/gpu/GpuIndex.cu | 1 + faiss/gpu/GpuIndexIVF.cu | 1 + faiss/gpu/GpuIndexIVFFlat.cu | 11 +- faiss/gpu/impl/IVFBase.cu | 2 + faiss/gpu/impl/RaftIVFFlat.cu | 63 +-- faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 582 +++++++++++++------------ 6 files changed, 344 insertions(+), 316 deletions(-) diff --git a/faiss/gpu/GpuIndex.cu b/faiss/gpu/GpuIndex.cu index 89952b1121..749bec221f 100644 --- a/faiss/gpu/GpuIndex.cu +++ b/faiss/gpu/GpuIndex.cu @@ -77,6 +77,7 @@ int GpuIndex::getDevice() const { } void GpuIndex::copyFrom(const faiss::Index* index) { + printf("inside gpuindex copyFrom\n"); d = index->d; metric_type = index->metric_type; metric_arg = index->metric_arg; diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu index f2ed323605..935c255b8f 100644 --- a/faiss/gpu/GpuIndexIVF.cu +++ b/faiss/gpu/GpuIndexIVF.cu @@ -212,6 +212,7 @@ void GpuIndexIVF::copyTo(faiss::IndexIVF* index) const { index->nprobe = nprobe; FAISS_ASSERT(quantizer); + printf("index -> own_fields %d\n", index->own_fields); if (index->own_fields) { delete index->quantizer; index->quantizer = nullptr; diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 9422f6bc56..3a18aa0e90 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -110,6 +110,7 @@ void GpuIndexIVFFlat::set_index_( #if defined 
USE_NVIDIA_RAFT if (config_.use_raft) { + printf("inside GpuIndexIVFFlat's set_index_ use_raft = true\n"); index_.reset(new RaftIVFFlat( resources, dim, @@ -121,6 +122,7 @@ void GpuIndexIVFFlat::set_index_( interleavedLayout, indicesOptions, space)); + own_fields = false; } else #else if (config_.use_raft) { @@ -128,7 +130,8 @@ void GpuIndexIVFFlat::set_index_( "RAFT has not been compiled into the current version so it cannot be used."); } else #endif - { + { + printf("inside GpuIndexIVFFlat's set_index_ use_raft = false\n"); index_.reset(new IVFFlat( resources, dim, @@ -141,7 +144,6 @@ void GpuIndexIVFFlat::set_index_( indicesOptions, space)); } - baseIndex_ = std::static_pointer_cast(index_); updateQuantizer(); } @@ -156,12 +158,13 @@ void GpuIndexIVFFlat::reserveMemory(size_t numVecs) { } void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { + printf("Inside GpuIndexIVFFlat's copyFrom\n"); DeviceScope scope(config_.device); // This will copy GpuIndexIVF data such as the coarse quantizer GpuIndexIVF::copyFrom(index); - printf("GpuIndexIVFcopyFrom done\n"); + printf("GpuIndexIVF's copyFrom done\n"); // Clear out our old data index_.reset(); @@ -192,7 +195,7 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { config_.memorySpace); // Copy all of the IVF data - printf("Copying inverted lists from cpu index to FAISS gpu index flat\n"); + printf("Copying inverted lists from cpu index to FAISS gpu index ivfflat\n"); index_->copyInvertedListsFrom(index->invlists); } diff --git a/faiss/gpu/impl/IVFBase.cu b/faiss/gpu/impl/IVFBase.cu index 890d489440..6aef83ef3f 100644 --- a/faiss/gpu/impl/IVFBase.cu +++ b/faiss/gpu/impl/IVFBase.cu @@ -106,6 +106,7 @@ void IVFBase::reserveMemory(idx_t numVecs) { } void IVFBase::reset() { + printf("inside ivfbase::reset\n"); auto stream = resources_->getDefaultStreamCurrentDevice(); deviceListData_.clear(); @@ -323,6 +324,7 @@ std::vector IVFBase::getListVectorData(idx_t listId, bool gpuFormat) } void 
IVFBase::copyInvertedListsFrom(const InvertedLists* ivf) { + printf("Inside IVFBase's copyInvertedListsFrom\n"); idx_t nlist = ivf ? ivf->nlist : 0; for (idx_t i = 0; i < nlist; ++i) { addEncodedVectorsToList_( diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index b754ee8876..110a0f0ced 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -73,7 +73,7 @@ RaftIVFFlat::RaftIVFFlat( scalarQ, interleavedLayout, indicesOptions, - space) {} + space) {printf("RaftIVFFlat constructor called\n"); reset();} RaftIVFFlat::~RaftIVFFlat() {} @@ -121,33 +121,33 @@ void RaftIVFFlat::search( /// Identify NaN rows and mask their nearest neighbors auto nan_flag = raft::make_device_vector(raft_handle, numQueries); - validRowIndices_(queries, nan_flag.data_handle()); - - raft::linalg::map_offset( - raft_handle, - raft::make_device_vector_view(outIndices.data(), numQueries * k_), - [nan_flag = nan_flag.data_handle(), - out_inds = outIndices.data(), - k_] __device__(uint32_t i) { - uint32_t row = i / k_; - if (!nan_flag[row]) - return idx_t(-1); - return out_inds[i]; - }); - - float max_val = std::numeric_limits::max(); - raft::linalg::map_offset( - raft_handle, - raft::make_device_vector_view(outDistances.data(), numQueries * k_), - [nan_flag = nan_flag.data_handle(), - out_dists = outDistances.data(), - max_val, - k_] __device__(uint32_t i) { - uint32_t row = i / k_; - if (!nan_flag[row]) - return max_val; - return out_dists[i]; - }); +// validRowIndices_(queries, nan_flag.data_handle()); + +// raft::linalg::map_offset( +// raft_handle, +// raft::make_device_vector_view(outIndices.data(), numQueries * k_), +// [nan_flag = nan_flag.data_handle(), +// out_inds = outIndices.data(), +// k_] __device__(uint32_t i) { +// uint32_t row = i / k_; +// if (!nan_flag[row]) +// return idx_t(-1); +// return out_inds[i]; +// }); + +// float max_val = std::numeric_limits::max(); +// raft::linalg::map_offset( +// raft_handle, +// 
raft::make_device_vector_view(outDistances.data(), numQueries * k_), +// [nan_flag = nan_flag.data_handle(), +// out_dists = outDistances.data(), +// max_val, +// k_] __device__(uint32_t i) { +// uint32_t row = i / k_; +// if (!nan_flag[row]) +// return max_val; +// return out_dists[i]; +// }); } /// Classify and encode/add vectors to our IVF lists. @@ -175,6 +175,7 @@ idx_t RaftIVFFlat::addVectors( 0); if (n_rows_valid < n_rows) { + printf("NaN values found"); auto gather_indices = raft::make_device_vector( raft_handle, n_rows_valid); @@ -270,6 +271,10 @@ std::vector RaftIVFFlat::getListIndices(idx_t listId) const { std::vector RaftIVFFlat::getListVectorData( idx_t listId, bool gpuFormat) const { + if (gpuFormat) { + FAISS_THROW_MSG("gpuFormat is not suppported for raft indices"); + } + printf("inside getlistvectordata of raft"); FAISS_ASSERT(raft_knn_index.has_value()); const raft::device_resources& raft_handle = @@ -466,6 +471,8 @@ void RaftIVFFlat::addEncodedVectorsToList_( // This list must already exist FAISS_ASSERT(raft_knn_index.has_value()); + printf("getListLength(listId), %d\n", getListLength(listId)); + // This list must currently be empty FAISS_ASSERT(getListLength(listId) == 0); diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index 8c092be7cb..e3b9540d46 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -42,7 +42,7 @@ struct Options { numCentroids = std::sqrt((float)numAdd / 2); numTrain = numCentroids * 40; - nprobe = faiss::gpu::randVal(std::min(50, numCentroids), numCentroids); + nprobe = faiss::gpu::randVal(std::min(10, numCentroids), numCentroids); numQuery = faiss::gpu::randVal(32, 100); // Due to the approximate nature of the query and of floating point @@ -56,10 +56,7 @@ struct Options { device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - raftOpt.push_back(false); -#if defined USE_NVIDIA_RAFT - raftOpt.push_back(true); -#endif + 
use_raft = false; } std::string toString() const { @@ -67,7 +64,7 @@ struct Options { str << "IVFFlat device " << device << " numVecs " << numAdd << " dim " << dim << " numCentroids " << numCentroids << " nprobe " << nprobe << " numQuery " << numQuery << " k " << k << " indicesOpt " - << indicesOpt; + << indicesOpt <<" use_raft "<< use_raft; return str.str(); } @@ -81,7 +78,7 @@ struct Options { int k; int device; faiss::gpu::IndicesOptions indicesOpt; - std::vector raftOpt; + bool use_raft; }; void queryTest( @@ -112,38 +109,31 @@ void queryTest( config.device = opt.device; config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + config.use_raft = opt.use_raft; - for (int i = 0; i < opt.raftOpt.size(); i++) { - config.use_raft = opt.raftOpt[i]; - - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, - cpuIndex.d, - cpuIndex.nlist, - cpuIndex.metric_type, - config); - gpuIndex.copyFrom(&cpuIndex); - gpuIndex.nprobe = opt.nprobe; - - bool compFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - // FIXME: the fp16 bounds are - // useless when math (the accumulator) is - // in fp16. Figure out another way to test - compFloat16 ? 0.70f : 0.1f, - compFloat16 ? 0.65f : 0.015f); - } + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); + gpuIndex.copyFrom(&cpuIndex); + gpuIndex.nprobe = opt.nprobe; + + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + // FIXME: the fp16 bounds are + // useless when math (the accumulator) is + // in fp16. Figure out another way to test + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 
0.65f : 0.015f); } } -void addTest(faiss::MetricType metricType, bool useFloat16CoarseQuantizer) { +void addTest(faiss::MetricType metricType, bool useFloat16CoarseQuantizer, bool use_raft) { for (int tries = 0; tries < 2; ++tries) { Options opt; @@ -169,43 +159,31 @@ void addTest(faiss::MetricType metricType, bool useFloat16CoarseQuantizer) { config.device = opt.device; config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + config.use_raft = use_raft; - for (int i = 0; i < opt.raftOpt.size(); i++) { - printf("i %d\n", i); - config.use_raft = opt.raftOpt[i]; - - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, - cpuIndex.d, - cpuIndex.nlist, - cpuIndex.metric_type, - config); - printf("gpuindex created"); - gpuIndex.copyFrom(&cpuIndex); - printf("copyfrom done"); - gpuIndex.nprobe = opt.nprobe; - - cpuIndex.add(opt.numAdd, addVecs.data()); - gpuIndex.add(opt.numAdd, addVecs.data()); - printf("gpu vectors added"); - - bool compFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - compFloat16 ? 0.70f : 0.1f, - compFloat16 ? 0.30f : 0.015f); - printf("indices compared"); - } + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); + gpuIndex.copyFrom(&cpuIndex); + gpuIndex.nprobe = opt.nprobe; + + cpuIndex.add(opt.numAdd, addVecs.data()); + gpuIndex.add(opt.numAdd, addVecs.data()); + + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 
0.30f : 0.015f); } } -void copyToTest(bool useFloat16CoarseQuantizer) { +void copyToTest(bool useFloat16CoarseQuantizer, bool use_raft) { Options opt; std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); @@ -217,50 +195,47 @@ void copyToTest(bool useFloat16CoarseQuantizer) { config.device = opt.device; config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + config.use_raft = use_raft; - for (int i = 0; i < opt.raftOpt.size(); i++) { - config.use_raft = opt.raftOpt[i]; - - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); - gpuIndex.train(opt.numTrain, trainVecs.data()); - gpuIndex.add(opt.numAdd, addVecs.data()); - gpuIndex.nprobe = opt.nprobe; - - // use garbage values to see if we overwrite then - faiss::IndexFlatL2 cpuQuantizer(1); - faiss::IndexIVFFlat cpuIndex(&cpuQuantizer, 1, 1, faiss::METRIC_L2); - cpuIndex.nprobe = 1; - - gpuIndex.copyTo(&cpuIndex); - - EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); - EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); + gpuIndex.train(opt.numTrain, trainVecs.data()); + gpuIndex.add(opt.numAdd, addVecs.data()); + gpuIndex.nprobe = opt.nprobe; - EXPECT_EQ(cpuIndex.d, gpuIndex.d); - EXPECT_EQ(cpuIndex.quantizer->d, gpuIndex.quantizer->d); - EXPECT_EQ(cpuIndex.d, opt.dim); - EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); - EXPECT_EQ(cpuIndex.nprobe, gpuIndex.nprobe); - - testIVFEquality(cpuIndex, gpuIndex); + // use garbage values to see if we overwrite then + faiss::IndexFlatL2 cpuQuantizer(1); + faiss::IndexIVFFlat cpuIndex(&cpuQuantizer, 1, 1, faiss::METRIC_L2); + cpuIndex.nprobe = 1; - // Query both objects; results should be equivalent - bool compFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - 
opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - compFloat16 ? 0.70f : 0.1f, - compFloat16 ? 0.30f : 0.015f); - } + gpuIndex.copyTo(&cpuIndex); + + EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); + EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); + + EXPECT_EQ(cpuIndex.d, gpuIndex.d); + EXPECT_EQ(cpuIndex.quantizer->d, gpuIndex.quantizer->d); + EXPECT_EQ(cpuIndex.d, opt.dim); + EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); + EXPECT_EQ(cpuIndex.nprobe, gpuIndex.nprobe); + + testIVFEquality(cpuIndex, gpuIndex); + + // Query both objects; results should be equivalent + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 0.30f : 0.015f); } -void copyFromTest(bool useFloat16CoarseQuantizer) { +void copyFromTest(bool useFloat16CoarseQuantizer, bool use_raft) { Options opt; std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); @@ -280,55 +255,67 @@ void copyFromTest(bool useFloat16CoarseQuantizer) { config.device = opt.device; config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - - for (int i = 0; i < opt.raftOpt.size(); i++) { - config.use_raft = opt.raftOpt[i]; - - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, 1, 1, faiss::METRIC_L2, config); - gpuIndex.nprobe = 1; - - gpuIndex.copyFrom(&cpuIndex); - - EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); - EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); - - EXPECT_EQ(cpuIndex.d, gpuIndex.d); - EXPECT_EQ(cpuIndex.d, opt.dim); - EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); - EXPECT_EQ(cpuIndex.nprobe, gpuIndex.nprobe); - - testIVFEquality(cpuIndex, gpuIndex); - - // Query both objects; results should be equivalent - bool compFloat16 = useFloat16CoarseQuantizer; - 
faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - compFloat16 ? 0.70f : 0.1f, - compFloat16 ? 0.30f : 0.015f); - } + config.use_raft = use_raft; + + faiss::gpu::GpuIndexIVFFlat gpuIndex(&res, 1, 1, faiss::METRIC_L2, config); + gpuIndex.nprobe = 1; + + gpuIndex.copyFrom(&cpuIndex); + + EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); + EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); + + EXPECT_EQ(cpuIndex.d, gpuIndex.d); + EXPECT_EQ(cpuIndex.d, opt.dim); + EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); + EXPECT_EQ(cpuIndex.nprobe, gpuIndex.nprobe); + + testIVFEquality(cpuIndex, gpuIndex); + + // Query both objects; results should be equivalent + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 0.30f : 0.015f); } TEST(TestGpuIndexIVFFlat, Float32_32_Add_L2) { - addTest(faiss::METRIC_L2, false); + addTest(faiss::METRIC_L2, false, false); + +#if defined USE_NVIDIA_RAFT + addTest(faiss::METRIC_L2, false, true); +#endif } TEST(TestGpuIndexIVFFlat, Float32_32_Add_IP) { - addTest(faiss::METRIC_INNER_PRODUCT, false); + addTest(faiss::METRIC_INNER_PRODUCT, false, false); + +#if defined USE_NVIDIA_RAFT + addTest(faiss::METRIC_INNER_PRODUCT, false, true); +#endif } TEST(TestGpuIndexIVFFlat, Float16_32_Add_L2) { - addTest(faiss::METRIC_L2, true); + addTest(faiss::METRIC_L2, true, false); + +#if defined USE_NVIDIA_RAFT + addTest(faiss::METRIC_L2, true, true); +#endif } TEST(TestGpuIndexIVFFlat, Float16_32_Add_IP) { - addTest(faiss::METRIC_INNER_PRODUCT, true); + addTest(faiss::METRIC_INNER_PRODUCT, true, false); + +#if defined USE_NVIDIA_RAFT + addTest(faiss::METRIC_INNER_PRODUCT, true, true); +#endif } // @@ -336,11 +323,23 @@ TEST(TestGpuIndexIVFFlat, Float16_32_Add_IP) { // 
TEST(TestGpuIndexIVFFlat, Float32_Query_L2) { - queryTest(Options(), faiss::METRIC_L2, false); + Options opt; + queryTest(opt, faiss::METRIC_L2, false); + +#if defined USE_NVIDIA_RAFT + opt.use_raft = true; + queryTest(opt, faiss::METRIC_L2, false); +#endif } TEST(TestGpuIndexIVFFlat, Float32_Query_IP) { - queryTest(Options(), faiss::METRIC_INNER_PRODUCT, false); + Options opt; + queryTest(opt, faiss::METRIC_INNER_PRODUCT, false); + +#if defined USE_NVIDIA_RAFT + opt.use_raft = true; + queryTest(opt, faiss::METRIC_INNER_PRODUCT, false); +#endif } TEST(TestGpuIndexIVFFlat, LargeBatch) { @@ -348,16 +347,33 @@ TEST(TestGpuIndexIVFFlat, LargeBatch) { opt.dim = 3; opt.numQuery = 100000; queryTest(opt, faiss::METRIC_L2, false); + +#if defined USE_NVIDIA_RAFT + opt.use_raft = true; + queryTest(opt, faiss::METRIC_L2, false); +#endif } // float16 coarse quantizer TEST(TestGpuIndexIVFFlat, Float16_32_Query_L2) { - queryTest(Options(), faiss::METRIC_L2, true); + Options opt; + queryTest(opt, faiss::METRIC_L2, true); + +#if defined USE_NVIDIA_RAFT + opt.use_raft = true; + queryTest(opt, faiss::METRIC_L2, true); +#endif } TEST(TestGpuIndexIVFFlat, Float16_32_Query_IP) { - queryTest(Options(), faiss::METRIC_INNER_PRODUCT, true); + Options opt; + queryTest(opt, faiss::METRIC_INNER_PRODUCT, true); + +#if defined USE_NVIDIA_RAFT + opt.use_raft = true; + queryTest(opt, faiss::METRIC_INNER_PRODUCT, true); +#endif } // @@ -369,24 +385,44 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_L2_64) { Options opt; opt.dim = 64; queryTest(opt, faiss::METRIC_L2, false); + +#if defined USE_NVIDIA_RAFT + opt.use_raft = true; + queryTest(opt, faiss::METRIC_L2, false); +#endif } TEST(TestGpuIndexIVFFlat, Float32_Query_IP_64) { Options opt; opt.dim = 64; queryTest(opt, faiss::METRIC_INNER_PRODUCT, false); + +#if defined USE_NVIDIA_RAFT + opt.use_raft = true; + queryTest(opt, faiss::METRIC_INNER_PRODUCT, false); +#endif } TEST(TestGpuIndexIVFFlat, Float32_Query_L2_128) { Options opt; opt.dim = 128; 
queryTest(opt, faiss::METRIC_L2, false); + +#if defined USE_NVIDIA_RAFT + opt.use_raft = true; + queryTest(opt, faiss::METRIC_L2, false); +#endif } TEST(TestGpuIndexIVFFlat, Float32_Query_IP_128) { Options opt; opt.dim = 128; queryTest(opt, faiss::METRIC_INNER_PRODUCT, false); + +#if defined USE_NVIDIA_RAFT + opt.use_raft = true; + queryTest(opt, faiss::METRIC_INNER_PRODUCT, false); +#endif } // @@ -394,11 +430,19 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_IP_128) { // TEST(TestGpuIndexIVFFlat, Float32_32_CopyTo) { - copyToTest(false); + copyToTest(false, false); + +#if defined USE_NVIDIA_RAFT + copyToTest(false, true); +#endif } TEST(TestGpuIndexIVFFlat, Float32_32_CopyFrom) { - copyFromTest(false); + copyFromTest(false, false); + +#if defined USE_NVIDIA_RAFT + copyFromTest(false, true); +#endif } TEST(TestGpuIndexIVFFlat, Float32_negative) { @@ -432,38 +476,34 @@ TEST(TestGpuIndexIVFFlat, Float32_negative) { config.device = opt.device; config.indicesOptions = opt.indicesOpt; - for (int i = 0; i < opt.raftOpt.size(); i++) { - config.use_raft = opt.raftOpt[i]; - - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); - gpuIndex.copyFrom(&cpuIndex); - gpuIndex.nprobe = opt.nprobe; - - // Construct a positive test set - auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); + gpuIndex.copyFrom(&cpuIndex); + gpuIndex.nprobe = opt.nprobe; - // Put all vecs on positive size - for (auto& f : queryVecs) { - f = std::abs(f); - } + // Construct a positive test set + auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); - bool compFloat16 = false; - faiss::gpu::compareIndices( - queryVecs, - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - // FIXME: the fp16 bounds are - // useless when math (the accumulator) is - // in fp16. 
Figure out another way to test - compFloat16 ? 0.99f : 0.1f, - compFloat16 ? 0.65f : 0.015f); + // Put all vecs on positive size + for (auto& f : queryVecs) { + f = std::abs(f); } + + bool compFloat16 = false; + faiss::gpu::compareIndices( + queryVecs, + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + // FIXME: the fp16 bounds are + // useless when math (the accumulator) is + // in fp16. Figure out another way to test + compFloat16 ? 0.99f : 0.1f, + compFloat16 ? 0.65f : 0.015f); } // @@ -484,33 +524,29 @@ TEST(TestGpuIndexIVFFlat, QueryNaN) { config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = faiss::gpu::randBool(); - for (int i = 0; i < opt.raftOpt.size(); i++) { - config.use_raft = opt.raftOpt[i]; - - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); - gpuIndex.nprobe = opt.nprobe; + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); + gpuIndex.nprobe = opt.nprobe; - gpuIndex.train(opt.numTrain, trainVecs.data()); - gpuIndex.add(opt.numAdd, addVecs.data()); + gpuIndex.train(opt.numTrain, trainVecs.data()); + gpuIndex.add(opt.numAdd, addVecs.data()); - int numQuery = 10; - std::vector nans( - numQuery * opt.dim, std::numeric_limits::quiet_NaN()); + int numQuery = 10; + std::vector nans( + numQuery * opt.dim, std::numeric_limits::quiet_NaN()); - std::vector distances(numQuery * opt.k, 0); - std::vector indices(numQuery * opt.k, 0); + std::vector distances(numQuery * opt.k, 0); + std::vector indices(numQuery * opt.k, 0); - gpuIndex.search( - numQuery, nans.data(), opt.k, distances.data(), indices.data()); + gpuIndex.search( + numQuery, nans.data(), opt.k, distances.data(), indices.data()); - for (int q = 0; q < numQuery; ++q) { - for (int k = 0; k < opt.k; ++k) { - EXPECT_EQ(indices[q * opt.k + k], -1); - EXPECT_EQ( - distances[q * opt.k + k], - 
std::numeric_limits::max()); - } + for (int q = 0; q < numQuery; ++q) { + for (int k = 0; k < opt.k; ++k) { + EXPECT_EQ(indices[q * opt.k + k], -1); + EXPECT_EQ( + distances[q * opt.k + k], + std::numeric_limits::max()); } } } @@ -526,44 +562,38 @@ TEST(TestGpuIndexIVFFlat, AddNaN) { config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = faiss::gpu::randBool(); - for (int i = 0; i < opt.raftOpt.size(); i++) { - config.use_raft = opt.raftOpt[i]; - - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); - gpuIndex.nprobe = opt.nprobe; + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); + gpuIndex.nprobe = opt.nprobe; - int numNans = 10; - std::vector nans( - numNans * opt.dim, std::numeric_limits::quiet_NaN()); + int numNans = 10; + std::vector nans( + numNans * opt.dim, std::numeric_limits::quiet_NaN()); - // Make one vector valid (not the first vector, in order to test offset - // issues), which should actually add - for (int i = 0; i < opt.dim; ++i) { - nans[opt.dim + i] = i; - } - - std::vector trainVecs = - faiss::gpu::randVecs(opt.numTrain, opt.dim); - gpuIndex.train(opt.numTrain, trainVecs.data()); - - // should not crash - EXPECT_EQ(gpuIndex.ntotal, 0); - gpuIndex.add(numNans, nans.data()); - - std::vector queryVecs = - faiss::gpu::randVecs(opt.numQuery, opt.dim); - std::vector distance(opt.numQuery * opt.k, 0); - std::vector indices(opt.numQuery * opt.k, 0); - - // should not crash - gpuIndex.search( - opt.numQuery, - queryVecs.data(), - opt.k, - distance.data(), - indices.data()); + // Make one vector valid (not the first vector, in order to test offset + // issues), which should actually add + for (int i = 0; i < opt.dim; ++i) { + nans[opt.dim + i] = i; } + + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + gpuIndex.train(opt.numTrain, trainVecs.data()); + + // should not crash + EXPECT_EQ(gpuIndex.ntotal, 0); + 
gpuIndex.add(numNans, nans.data()); + + std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + std::vector distance(opt.numQuery * opt.k, 0); + std::vector indices(opt.numQuery * opt.k, 0); + + // should not crash + gpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + distance.data(), + indices.data()); } TEST(TestGpuIndexIVFFlat, UnifiedMemory) { @@ -586,11 +616,6 @@ TEST(TestGpuIndexIVFFlat, UnifiedMemory) { int numQuery = 10; int k = 10; int nprobe = 8; - std::vector raftOpt; - raftOpt.push_back(false); - #if defined USE_NVIDIA_RAFT - raftOpt.push_back(true); - #endif std::vector trainVecs = faiss::gpu::randVecs(numTrain, dim); std::vector addVecs = faiss::gpu::randVecs(numAdd, dim); @@ -609,25 +634,22 @@ TEST(TestGpuIndexIVFFlat, UnifiedMemory) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = device; config.memorySpace = faiss::gpu::MemorySpace::Unified; - for (int i = 0; i < raftOpt.size(); i++) { - config.use_raft = raftOpt[i]; - - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, dim, numCentroids, faiss::METRIC_L2, config); - gpuIndex.copyFrom(&cpuIndex); - gpuIndex.nprobe = nprobe; - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - numQuery, - dim, - k, - "Unified Memory", - kF32MaxRelErr, - 0.1f, - 0.015f); - } + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, dim, numCentroids, faiss::METRIC_L2, config); + gpuIndex.copyFrom(&cpuIndex); + gpuIndex.nprobe = nprobe; + + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + numQuery, + dim, + k, + "Unified Memory", + kF32MaxRelErr, + 0.1f, + 0.015f); } TEST(TestGpuIndexIVFFlat, LongIVFList) { @@ -652,11 +674,6 @@ TEST(TestGpuIndexIVFFlat, LongIVFList) { size_t numTrain = 100; int numQuery = 5; int k = 10; - std::vector raftOpt; - raftOpt.push_back(false); - #if defined USE_NVIDIA_RAFT - raftOpt.push_back(true); - #endif std::vector trainVecs = faiss::gpu::randVecs(numTrain, dim); std::vector addVecs = faiss::gpu::randVecs(numAdd, dim); @@ -674,26 +691,23 @@ 
TEST(TestGpuIndexIVFFlat, LongIVFList) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = device; - for (int i = 0; i < raftOpt.size(); i++) { - config.use_raft = raftOpt[i]; - - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, dim, numCentroids, faiss::METRIC_L2, config); - gpuIndex.train(numTrain, trainVecs.data()); - gpuIndex.add(numAdd, addVecs.data()); - gpuIndex.nprobe = 1; - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - numQuery, - dim, - k, - "Unified Memory", - kF32MaxRelErr, - 0.1f, - 0.015f); - } + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, dim, numCentroids, faiss::METRIC_L2, config); + gpuIndex.train(numTrain, trainVecs.data()); + gpuIndex.add(numAdd, addVecs.data()); + gpuIndex.nprobe = 1; + + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + numQuery, + dim, + k, + "Unified Memory", + kF32MaxRelErr, + 0.1f, + 0.015f); } int main(int argc, char** argv) { @@ -703,4 +717,4 @@ int main(int argc, char** argv) { faiss::gpu::setTestSeed(100); return RUN_ALL_TESTS(); -} +} \ No newline at end of file From 8cf7e057596abe909555c1891dcbc7ab15ddedcf Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Tue, 15 Aug 2023 17:03:54 -0700 Subject: [PATCH 78/87] update quantizer --- faiss/gpu/GpuIndexIVF.cu | 50 ++++++++++++++--- faiss/gpu/GpuIndexIVFFlat.cu | 75 +++++++++----------------- faiss/gpu/impl/RaftIVFFlat.cu | 61 +++++++++++---------- faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 1 + 4 files changed, 102 insertions(+), 85 deletions(-) diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu index 935c255b8f..c83008307d 100644 --- a/faiss/gpu/GpuIndexIVF.cu +++ b/faiss/gpu/GpuIndexIVF.cu @@ -16,6 +16,11 @@ #include #include +#if defined USE_NVIDIA_RAFT +#include +#include +#endif + namespace faiss { namespace gpu { @@ -212,7 +217,6 @@ void GpuIndexIVF::copyTo(faiss::IndexIVF* index) const { index->nprobe = nprobe; FAISS_ASSERT(quantizer); - printf("index -> own_fields %d\n", index->own_fields); if (index->own_fields) { delete 
index->quantizer; index->quantizer = nullptr; @@ -445,14 +449,46 @@ void GpuIndexIVF::trainQuantizer_(idx_t n, const float* x) { printf("Training IVF quantizer on %ld vectors in %dD\n", n, d); } - // leverage the CPU-side k-means code, which works for the GPU - // flat index as well quantizer->reset(); - Clustering clus(this->d, nlist, this->cp); - clus.verbose = verbose; - clus.train(n, x, *quantizer); - quantizer->is_trained = true; +#if defined USE_NVIDIA_RAFT + + if (config_.use_raft) { + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + + raft::neighbors::ivf_flat::index_params raft_idx_params; + raft_idx_params.n_lists = nlist; + raft_idx_params.metric = metric_type == faiss::METRIC_L2 + ? raft::distance::DistanceType::L2Expanded + : raft::distance::DistanceType::InnerProduct; + raft_idx_params.add_data_on_build = false; + raft_idx_params.kmeans_trainset_fraction = 1.0; + raft_idx_params.kmeans_n_iters = cp.niter; + raft_idx_params.adaptive_centers = !cp.frozen_centroids; + + auto raft_index = raft::neighbors::ivf_flat::build( + raft_handle, raft_idx_params, x, n, (idx_t)d); + + raft_handle.sync_stream(); + + quantizer->train(nlist, raft_index.centers().data_handle()); + quantizer->add(nlist, raft_index.centers().data_handle()); + } else +#else + if (config_.use_raft) { + FAISS_THROW_MSG( + "RAFT has not been compiled into the current version so it cannot be used."); + } else +#endif + { + // leverage the CPU-side k-means code, which works for the GPU + // flat index as well + Clustering clus(this->d, nlist, this->cp); + clus.verbose = verbose; + clus.train(n, x, *quantizer); + } + quantizer->is_trained = true; FAISS_ASSERT(quantizer->ntotal == nlist); } diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 3a18aa0e90..750096e153 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -17,11 +17,6 @@ #if defined USE_NVIDIA_RAFT #include -#include -#include -#include 
-#include -#include #endif #include @@ -90,6 +85,8 @@ GpuIndexIVFFlat::GpuIndexIVFFlat( ivfFlatConfig_.interleavedLayout, ivfFlatConfig_.indicesOptions, config_.memorySpace); + baseIndex_ = std::static_pointer_cast(index_); + updateQuantizer(); } } @@ -110,7 +107,6 @@ void GpuIndexIVFFlat::set_index_( #if defined USE_NVIDIA_RAFT if (config_.use_raft) { - printf("inside GpuIndexIVFFlat's set_index_ use_raft = true\n"); index_.reset(new RaftIVFFlat( resources, dim, @@ -122,7 +118,6 @@ void GpuIndexIVFFlat::set_index_( interleavedLayout, indicesOptions, space)); - own_fields = false; } else #else if (config_.use_raft) { @@ -130,8 +125,7 @@ void GpuIndexIVFFlat::set_index_( "RAFT has not been compiled into the current version so it cannot be used."); } else #endif - { - printf("inside GpuIndexIVFFlat's set_index_ use_raft = false\n"); + { index_.reset(new IVFFlat( resources, dim, @@ -144,8 +138,6 @@ void GpuIndexIVFFlat::set_index_( indicesOptions, space)); } - baseIndex_ = std::static_pointer_cast(index_); - updateQuantizer(); } void GpuIndexIVFFlat::reserveMemory(size_t numVecs) { @@ -158,20 +150,15 @@ void GpuIndexIVFFlat::reserveMemory(size_t numVecs) { } void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { - printf("Inside GpuIndexIVFFlat's copyFrom\n"); DeviceScope scope(config_.device); // This will copy GpuIndexIVF data such as the coarse quantizer GpuIndexIVF::copyFrom(index); - printf("GpuIndexIVF's copyFrom done\n"); - // Clear out our old data index_.reset(); baseIndex_.reset(); - printf("indices reset\n"); - // The other index might not be trained if (!index->is_trained) { FAISS_ASSERT(!is_trained); @@ -193,9 +180,10 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { ivfFlatConfig_.interleavedLayout, ivfFlatConfig_.indicesOptions, config_.memorySpace); + baseIndex_ = std::static_pointer_cast(index_); + updateQuantizer(); - // Copy all of the IVF data - printf("Copying inverted lists from cpu index to FAISS gpu index 
ivfflat\n"); + // Copy all of the IVF data index_->copyInvertedListsFrom(index->invlists); } @@ -264,15 +252,25 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) { FAISS_ASSERT(!index_); - // FIXME: GPUize more of this - // First, make sure that the data is resident on the CPU, if it is not on - // the CPU, as we depend upon parts of the CPU code - if (!config_.use_raft) { +#if defined USE_NVIDIA_RAFT + if (config_.use_raft) { + // No need to copy the data to host + trainQuantizer_(n, x); + } else +#else + if (config_.use_raft) { + FAISS_THROW_MSG( + "RAFT has not been compiled into the current version so it cannot be used."); + } else +#endif + { + // FIXME: GPUize more of this + // First, make sure that the data is resident on the CPU, if it is not + // on the CPU, as we depend upon parts of the CPU code auto hostData = toHost( (float*)x, resources_->getDefaultStream(config_.device), {n, this->d}); - trainQuantizer_(n, hostData.data()); } @@ -288,38 +286,13 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) { ivfFlatConfig_.interleavedLayout, ivfFlatConfig_.indicesOptions, config_.memorySpace); + baseIndex_ = std::static_pointer_cast(index_); + updateQuantizer(); - if (!config_.use_raft && reserveMemoryVecs_) { + if (reserveMemoryVecs_) { index_->reserveMemory(reserveMemoryVecs_); } -#if defined USE_NVIDIA_RAFT - - if (config_.use_raft) { - const raft::device_resources& raft_handle = - resources_->getRaftHandleCurrentDevice(); - - raft::neighbors::ivf_flat::index_params raft_idx_params; - raft_idx_params.n_lists = nlist; - raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; - raft_idx_params.add_data_on_build = false; - raft_idx_params.kmeans_trainset_fraction = 1.0; - raft_idx_params.kmeans_n_iters = cp.niter; - raft_idx_params.adaptive_centers = !cp.frozen_centroids; - - std::dynamic_pointer_cast(index_)->set_index_( - std::make_optional< - raft::neighbors::ivf_flat::index>( - raft::neighbors::ivf_flat::build( - raft_handle, 
raft_idx_params, x, n, (idx_t)d))); - } -#else - if (config_.use_raft) { - FAISS_THROW_MSG( - "RAFT has not been compiled into the current version so it cannot be used."); - } -#endif - this->is_trained = true; } diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 110a0f0ced..b17b35d5ee 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -121,33 +121,34 @@ void RaftIVFFlat::search( /// Identify NaN rows and mask their nearest neighbors auto nan_flag = raft::make_device_vector(raft_handle, numQueries); -// validRowIndices_(queries, nan_flag.data_handle()); - -// raft::linalg::map_offset( -// raft_handle, -// raft::make_device_vector_view(outIndices.data(), numQueries * k_), -// [nan_flag = nan_flag.data_handle(), -// out_inds = outIndices.data(), -// k_] __device__(uint32_t i) { -// uint32_t row = i / k_; -// if (!nan_flag[row]) -// return idx_t(-1); -// return out_inds[i]; -// }); - -// float max_val = std::numeric_limits::max(); -// raft::linalg::map_offset( -// raft_handle, -// raft::make_device_vector_view(outDistances.data(), numQueries * k_), -// [nan_flag = nan_flag.data_handle(), -// out_dists = outDistances.data(), -// max_val, -// k_] __device__(uint32_t i) { -// uint32_t row = i / k_; -// if (!nan_flag[row]) -// return max_val; -// return out_dists[i]; -// }); + validRowIndices_(queries, nan_flag.data_handle()); + + raft::linalg::map_offset( + raft_handle, + raft::make_device_vector_view(outIndices.data(), numQueries * k_), + [nan_flag = nan_flag.data_handle(), + out_inds = outIndices.data(), + k_] __device__(uint32_t i) { + uint32_t row = i / k_; + if (!nan_flag[row]) + return idx_t(-1); + return out_inds[i]; + }); + + float max_val = std::numeric_limits::max(); + raft::linalg::map_offset( + raft_handle, + raft::make_device_vector_view(outDistances.data(), numQueries * k_), + [nan_flag = nan_flag.data_handle(), + out_dists = outDistances.data(), + max_val, + k_] __device__(uint32_t i) { + 
uint32_t row = i / k_; + if (!nan_flag[row]) + return max_val; + return out_dists[i]; + }); + raft_handle.sync_stream(); } /// Classify and encode/add vectors to our IVF lists. @@ -158,6 +159,8 @@ idx_t RaftIVFFlat::addVectors( Index* coarseQuantizer, Tensor& vecs, Tensor& indices) { + + raft::print_device_vector("raft_centers from addVectors", raft_knn_index.value().centers().data_handle(), dim_ * this->numLists_, std::cout); idx_t n_rows = vecs.getSize(0); const raft::device_resources& raft_handle = @@ -173,6 +176,7 @@ idx_t RaftIVFFlat::addVectors( nan_flag.data_handle(), nan_flag.data_handle() + n_rows, 0); + printf("n_rows_valid %d %d\n", n_rows_valid, n_rows); if (n_rows_valid < n_rows) { printf("NaN values found"); @@ -366,6 +370,9 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { buf_host.data(), total_elems, stream); + thrust::fill_n(handle.get_thrust_policy(), raft_knn_index.value().list_sizes().data_handle(), pams.n_lists, 0); + + raft::print_device_vector("raft_idx_centers", raft_knn_index.value().centers().data_handle(), total_elems, std::cout); } void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index e3b9540d46..4120f4ba73 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -199,6 +199,7 @@ void copyToTest(bool useFloat16CoarseQuantizer, bool use_raft) { faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); + printf("opt.numCentroids %d", opt.numCentroids); gpuIndex.train(opt.numTrain, trainVecs.data()); gpuIndex.add(opt.numAdd, addVecs.data()); gpuIndex.nprobe = opt.nprobe; From a17b1f32b22ac43bd90bd2cdaaf9eb7e8de0575b Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Mon, 21 Aug 2023 12:33:23 -0700 Subject: [PATCH 79/87] All except LongIVFList passing --- faiss/gpu/impl/RaftIVFFlat.cu | 70 +++++++++++----------- faiss/gpu/impl/RaftIVFFlat.cuh 
| 8 +-- faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 81 +++++++++++++++++++++----- 3 files changed, 105 insertions(+), 54 deletions(-) diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index b17b35d5ee..3861a2283c 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -46,9 +46,8 @@ #include #include -#include #include -#include +#include namespace faiss { namespace gpu { @@ -73,7 +72,9 @@ RaftIVFFlat::RaftIVFFlat( scalarQ, interleavedLayout, indicesOptions, - space) {printf("RaftIVFFlat constructor called\n"); reset();} + space) { + reset(); +} RaftIVFFlat::~RaftIVFFlat() {} @@ -103,12 +104,12 @@ void RaftIVFFlat::search( raft::neighbors::ivf_flat::search_params pams; pams.n_probes = nprobe; - auto queries_view = raft::make_device_matrix_view( - queries.data(), numQueries, cols); - auto out_inds_view = raft::make_device_matrix_view( - outIndices.data(), numQueries, k_); - auto out_dists_view = raft::make_device_matrix_view( - outDistances.data(), numQueries, k_); + auto queries_view = raft::make_device_matrix_view( + queries.data(), (idx_t)numQueries, (idx_t)cols); + auto out_inds_view = raft::make_device_matrix_view( + outIndices.data(), (idx_t)numQueries, (idx_t)k_); + auto out_dists_view = raft::make_device_matrix_view( + outDistances.data(), (idx_t)numQueries, (idx_t)k_); raft::neighbors::ivf_flat::search( raft_handle, @@ -148,7 +149,6 @@ void RaftIVFFlat::search( return max_val; return out_dists[i]; }); - raft_handle.sync_stream(); } /// Classify and encode/add vectors to our IVF lists. 
@@ -159,8 +159,6 @@ idx_t RaftIVFFlat::addVectors( Index* coarseQuantizer, Tensor& vecs, Tensor& indices) { - - raft::print_device_vector("raft_centers from addVectors", raft_knn_index.value().centers().data_handle(), dim_ * this->numLists_, std::cout); idx_t n_rows = vecs.getSize(0); const raft::device_resources& raft_handle = @@ -176,10 +174,8 @@ idx_t RaftIVFFlat::addVectors( nan_flag.data_handle(), nan_flag.data_handle() + n_rows, 0); - printf("n_rows_valid %d %d\n", n_rows_valid, n_rows); if (n_rows_valid < n_rows) { - printf("NaN values found"); auto gather_indices = raft::make_device_vector( raft_handle, n_rows_valid); @@ -236,13 +232,14 @@ idx_t RaftIVFFlat::getListLength(idx_t listId) const { resources_->getRaftHandleCurrentDevice(); uint32_t size; - raft::copy( + raft::update_host( &size, raft_knn_index.value().list_sizes().data_handle() + listId, 1, raft_handle.get_stream()); raft_handle.sync_stream(); - return int(size); + + return static_cast(size); } /// Return the list indices of a particular list back to the CPU @@ -256,6 +253,7 @@ std::vector RaftIVFFlat::getListIndices(idx_t listId) const { std::vector vec(listSize); + // fetch the list indices ptr on host idx_t* list_indices_ptr; // fetch the list indices ptr on host @@ -268,6 +266,7 @@ std::vector RaftIVFFlat::getListIndices(idx_t listId) const { raft::update_host(vec.data(), list_indices_ptr, listSize, stream); raft_handle.sync_stream(); + return vec; } @@ -278,7 +277,6 @@ std::vector RaftIVFFlat::getListVectorData( if (gpuFormat) { FAISS_THROW_MSG("gpuFormat is not suppported for raft indices"); } - printf("inside getlistvectordata of raft"); FAISS_ASSERT(raft_knn_index.has_value()); const raft::device_resources& raft_handle = @@ -335,9 +333,9 @@ void RaftIVFFlat::searchPreassigned( void RaftIVFFlat::updateQuantizer(Index* quantizer) { idx_t quantizer_ntotal = quantizer->ntotal; - const raft::device_resources& handle = + const raft::device_resources& raft_handle = 
resources_->getRaftHandleCurrentDevice(); - auto stream = handle.get_stream(); + auto stream = raft_handle.get_stream(); auto total_elems = size_t(quantizer_ntotal) * size_t(quantizer->d); @@ -359,7 +357,23 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { FAISS_THROW_MSG("Metric is not supported."); } - raft_knn_index.emplace(handle, pams, (uint32_t)this->dim_); + raft_knn_index.emplace(raft_handle, pams, (uint32_t)this->dim_); + + cudaMemsetAsync( + raft_knn_index.value().list_sizes().data_handle(), + 0, + raft_knn_index.value().list_sizes().size() * sizeof(uint32_t), + stream); + cudaMemsetAsync( + raft_knn_index.value().data_ptrs().data_handle(), + 0, + raft_knn_index.value().data_ptrs().size() * sizeof(float*), + stream); + cudaMemsetAsync( + raft_knn_index.value().inds_ptrs().data_handle(), + 0, + raft_knn_index.value().inds_ptrs().size() * sizeof(idx_t*), + stream); /// Copy (reconstructed) centroids over, rather than re-training std::vector buf_host(total_elems); @@ -370,9 +384,6 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { buf_host.data(), total_elems, stream); - thrust::fill_n(handle.get_thrust_policy(), raft_knn_index.value().list_sizes().data_handle(), pams.n_lists, 0); - - raft::print_device_vector("raft_idx_centers", raft_knn_index.value().centers().data_handle(), total_elems, std::cout); } void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { @@ -435,7 +446,7 @@ void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { if (this->metric_ == faiss::METRIC_L2) { raft_knn_index.value().allocate_center_norms(raft_handle); raft::linalg::rowNorm( - raft_knn_index.value().center_norms()->data_handle(), + raft_knn_index.value().center_norms().value().data_handle(), raft_knn_index.value().centers().data_handle(), raft_knn_index.value().dim(), (uint32_t)nlist, @@ -443,12 +454,6 @@ void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { true, raft_handle.get_stream()); } - raft_handle.sync_stream(); -} - 
-void RaftIVFFlat::set_index_( - std::optional> idx) { - raft_knn_index.emplace(std::move(idx.value())); } size_t RaftIVFFlat::getGpuVectorsEncodingSize_(idx_t numVecs) const { @@ -478,8 +483,6 @@ void RaftIVFFlat::addEncodedVectorsToList_( // This list must already exist FAISS_ASSERT(raft_knn_index.has_value()); - printf("getListLength(listId), %d\n", getListLength(listId)); - // This list must currently be empty FAISS_ASSERT(getListLength(listId) == 0); @@ -520,7 +523,6 @@ void RaftIVFFlat::addEncodedVectorsToList_( interleaved_codes.data(), gpuListSizeInBytes, stream); - raft_handle.sync_stream(); /// Handle the indices as well idx_t* list_indices_ptr; @@ -532,8 +534,8 @@ void RaftIVFFlat::addEncodedVectorsToList_( 1, stream); raft_handle.sync_stream(); + raft::update_device(list_indices_ptr, indices, numVecs, stream); - raft_handle.sync_stream(); } void RaftIVFFlat::validRowIndices_( diff --git a/faiss/gpu/impl/RaftIVFFlat.cuh b/faiss/gpu/impl/RaftIVFFlat.cuh index a9c013f68a..3aba501c9f 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cuh +++ b/faiss/gpu/impl/RaftIVFFlat.cuh @@ -22,14 +22,14 @@ #pragma once -#include - #include #include #include #include +#include + #include namespace faiss { @@ -106,10 +106,6 @@ class RaftIVFFlat : public IVFFlat { /// Copy all inverted lists from a CPU representation to ourselves void copyInvertedListsFrom(const InvertedLists* ivf) override; - /// Update the raft index - void set_index_( - std::optional> idx); - /// Filter out matrix rows containing NaN values void validRowIndices_(Tensor& vecs, bool* nan_flag); diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index 4120f4ba73..109a5eaf22 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -348,11 +348,6 @@ TEST(TestGpuIndexIVFFlat, LargeBatch) { opt.dim = 3; opt.numQuery = 100000; queryTest(opt, faiss::METRIC_L2, false); - -#if defined USE_NVIDIA_RAFT - opt.use_raft = true; - queryTest(opt, 
faiss::METRIC_L2, false); -#endif } // float16 coarse quantizer @@ -558,15 +553,6 @@ TEST(TestGpuIndexIVFFlat, AddNaN) { faiss::gpu::StandardGpuResources res; res.noTempMemory(); - faiss::gpu::GpuIndexIVFFlatConfig config; - config.device = opt.device; - config.indicesOptions = opt.indicesOpt; - config.flatConfig.useFloat16 = faiss::gpu::randBool(); - - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); - gpuIndex.nprobe = opt.nprobe; - int numNans = 10; std::vector nans( numNans * opt.dim, std::numeric_limits::quiet_NaN()); @@ -578,6 +564,14 @@ TEST(TestGpuIndexIVFFlat, AddNaN) { } std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = opt.device; + config.indicesOptions = opt.indicesOpt; + config.flatConfig.useFloat16 = faiss::gpu::randBool(); + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); + gpuIndex.nprobe = opt.nprobe; gpuIndex.train(opt.numTrain, trainVecs.data()); // should not crash @@ -595,6 +589,26 @@ TEST(TestGpuIndexIVFFlat, AddNaN) { opt.k, distance.data(), indices.data()); + +#if defined USE_NVIDIA_RAFT +config.use_raft = true; +faiss::gpu::GpuIndexIVFFlat raftGpuIndex( + &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); + raftGpuIndex.nprobe = opt.nprobe; + raftGpuIndex.train(opt.numTrain, trainVecs.data()); + + // should not crash + EXPECT_EQ(raftGpuIndex.ntotal, 0); + raftGpuIndex.add(numNans, nans.data()); + + // should not crash + raftGpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + distance.data(), + indices.data()); +#endif } TEST(TestGpuIndexIVFFlat, UnifiedMemory) { @@ -651,6 +665,25 @@ TEST(TestGpuIndexIVFFlat, UnifiedMemory) { kF32MaxRelErr, 0.1f, 0.015f); + +#if defined USE_NVIDIA_RAFT +config.use_raft = true; +faiss::gpu::GpuIndexIVFFlat raftGpuIndex( + &res, dim, numCentroids, faiss::METRIC_L2, config); + 
raftGpuIndex.copyFrom(&cpuIndex); + raftGpuIndex.nprobe = nprobe; + + faiss::gpu::compareIndices( + cpuIndex, + raftGpuIndex, + numQuery, + dim, + k, + "Unified Memory", + kF32MaxRelErr, + 0.1f, + 0.015f); +#endif } TEST(TestGpuIndexIVFFlat, LongIVFList) { @@ -709,6 +742,26 @@ TEST(TestGpuIndexIVFFlat, LongIVFList) { kF32MaxRelErr, 0.1f, 0.015f); + +#if defined USE_NVIDIA_RAFT + config.use_raft = true; + faiss::gpu::GpuIndexIVFFlat raftGpuIndex( + &res, dim, numCentroids, faiss::METRIC_L2, config); + raftGpuIndex.train(numTrain, trainVecs.data()); + raftGpuIndex.add(numAdd, addVecs.data()); + raftGpuIndex.nprobe = 1; + + faiss::gpu::compareIndices( + cpuIndex, + raftGpuIndex, + numQuery, + dim, + k, + "Unified Memory", + kF32MaxRelErr, + 0.1f, + 0.015f); +#endif } int main(int argc, char** argv) { From 3c33ebb98fa65e8c47a900b9acd41d3ea99b193b Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Mon, 21 Aug 2023 15:41:28 -0700 Subject: [PATCH 80/87] Formatting --- faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 17 ++++++++++------- faiss/gpu/test/TestUtils.cpp | 26 -------------------------- 2 files changed, 10 insertions(+), 33 deletions(-) diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index 109a5eaf22..7f2ae81196 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -64,7 +64,7 @@ struct Options { str << "IVFFlat device " << device << " numVecs " << numAdd << " dim " << dim << " numCentroids " << numCentroids << " nprobe " << nprobe << " numQuery " << numQuery << " k " << k << " indicesOpt " - << indicesOpt <<" use_raft "<< use_raft; + << indicesOpt << " use_raft " << use_raft; return str.str(); } @@ -133,7 +133,10 @@ void queryTest( } } -void addTest(faiss::MetricType metricType, bool useFloat16CoarseQuantizer, bool use_raft) { +void addTest( + faiss::MetricType metricType, + bool useFloat16CoarseQuantizer, + bool use_raft) { for (int tries = 0; tries < 2; ++tries) { Options opt; @@ 
-591,8 +594,8 @@ TEST(TestGpuIndexIVFFlat, AddNaN) { indices.data()); #if defined USE_NVIDIA_RAFT -config.use_raft = true; -faiss::gpu::GpuIndexIVFFlat raftGpuIndex( + config.use_raft = true; + faiss::gpu::GpuIndexIVFFlat raftGpuIndex( &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); raftGpuIndex.nprobe = opt.nprobe; raftGpuIndex.train(opt.numTrain, trainVecs.data()); @@ -667,8 +670,8 @@ TEST(TestGpuIndexIVFFlat, UnifiedMemory) { 0.015f); #if defined USE_NVIDIA_RAFT -config.use_raft = true; -faiss::gpu::GpuIndexIVFFlat raftGpuIndex( + config.use_raft = true; + faiss::gpu::GpuIndexIVFFlat raftGpuIndex( &res, dim, numCentroids, faiss::METRIC_L2, config); raftGpuIndex.copyFrom(&cpuIndex); raftGpuIndex.nprobe = nprobe; @@ -744,7 +747,7 @@ TEST(TestGpuIndexIVFFlat, LongIVFList) { 0.015f); #if defined USE_NVIDIA_RAFT - config.use_raft = true; + config.use_raft = true; faiss::gpu::GpuIndexIVFFlat raftGpuIndex( &res, dim, numCentroids, faiss::METRIC_L2, config); raftGpuIndex.train(numTrain, trainVecs.data()); diff --git a/faiss/gpu/test/TestUtils.cpp b/faiss/gpu/test/TestUtils.cpp index 04f136782c..c81d34339e 100644 --- a/faiss/gpu/test/TestUtils.cpp +++ b/faiss/gpu/test/TestUtils.cpp @@ -114,32 +114,6 @@ void compareIndices( testDistance.data(), testIndices.data()); - int start_idx = 17 * k; - int stop_idx = start_idx + k; - printf("ref inds: ["); - for (int i = start_idx; i < stop_idx; i++) { - printf("%d, ", int(refIndices[i])); - } - printf("]\n"); - - printf("test inds: ["); - for (int i = start_idx; i < stop_idx; i++) { - printf("%d, ", int(testIndices[i])); - } - printf("]\n"); - - printf("ref dists: ["); - for (int i = start_idx; i < stop_idx; i++) { - printf("%f, ", float(refDistance[i])); - } - printf("]\n"); - - printf("test dists: ["); - for (int i = start_idx; i < stop_idx; i++) { - printf("%f, ", float(testDistance[i])); - } - printf("]\n"); - faiss::gpu::compareLists( refDistance.data(), refIndices.data(), From 
971a6b2d1f2fa696a9561b7c87f563f22533a9e9 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Mon, 21 Aug 2023 15:45:03 -0700 Subject: [PATCH 81/87] Format --- faiss/gpu/test/TestGpuIndexFlat.cpp | 918 ++++++++++++++-------------- 1 file changed, 459 insertions(+), 459 deletions(-) diff --git a/faiss/gpu/test/TestGpuIndexFlat.cpp b/faiss/gpu/test/TestGpuIndexFlat.cpp index 2ab616caf4..6d9c83e547 100644 --- a/faiss/gpu/test/TestGpuIndexFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexFlat.cpp @@ -1,9 +1,9 @@ /** -* Copyright (c) Facebook, Inc. and its affiliates. -* -* This source code is licensed under the MIT license found in the -* LICENSE file in the root directory of this source tree. -*/ + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ #include #include @@ -21,644 +21,644 @@ constexpr float kF16MaxRelErr = 0.07f; constexpr float kF32MaxRelErr = 6e-3f; struct TestFlatOptions { - TestFlatOptions() - : metric(faiss::MetricType::METRIC_L2), - metricArg(0), - useFloat16(false), - numVecsOverride(-1), - numQueriesOverride(-1), - kOverride(-1), - dimOverride(-1), - use_raft(false) {} - - faiss::MetricType metric; - float metricArg; - - bool useFloat16; - int numVecsOverride; - int numQueriesOverride; - int kOverride; - int dimOverride; - bool use_raft; + TestFlatOptions() + : metric(faiss::MetricType::METRIC_L2), + metricArg(0), + useFloat16(false), + numVecsOverride(-1), + numQueriesOverride(-1), + kOverride(-1), + dimOverride(-1), + use_raft(false) {} + + faiss::MetricType metric; + float metricArg; + + bool useFloat16; + int numVecsOverride; + int numQueriesOverride; + int kOverride; + int dimOverride; + bool use_raft; }; void testFlat(const TestFlatOptions& opt) { - int numVecs = opt.numVecsOverride > 0 ? opt.numVecsOverride - : faiss::gpu::randVal(1000, 5000); - int dim = opt.dimOverride > 0 ? 
opt.dimOverride - : faiss::gpu::randVal(50, 800); - int numQuery = opt.numQueriesOverride > 0 ? opt.numQueriesOverride - : faiss::gpu::randVal(1, 512); - - // Due to loss of precision in a float16 accumulator, for large k, - // the number of differences is pretty huge. Restrict ourselves to a - // fairly small `k` for float16 - int k = opt.useFloat16 - ? std::min(faiss::gpu::randVal(1, 50), numVecs) - : std::min( - faiss::gpu::randVal(1, faiss::gpu::getMaxKSelection()), - numVecs); - if (opt.kOverride > 0) { - k = opt.kOverride; - } - - faiss::IndexFlat cpuIndex(dim, opt.metric); - cpuIndex.metric_arg = opt.metricArg; - - // Construct on a random device to test multi-device, if we have - // multiple devices - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); - - faiss::gpu::GpuIndexFlatConfig config; - config.device = device; - config.useFloat16 = opt.useFloat16; - config.use_raft = opt.use_raft; - - faiss::gpu::GpuIndexFlat gpuIndex(&res, dim, opt.metric, config); - gpuIndex.metric_arg = opt.metricArg; - - std::vector vecs = faiss::gpu::randVecs(numVecs, dim); - cpuIndex.add(numVecs, vecs.data()); - gpuIndex.add(numVecs, vecs.data()); - - std::stringstream str; - str << "metric " << opt.metric << " marg " << opt.metricArg << " numVecs " - << numVecs << " dim " << dim << " useFloat16 " << opt.useFloat16 - << " numQuery " << numQuery << " k " << k; - - // To some extent, we depend upon the relative error for the test - // for float16 - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - numQuery, - dim, - k, - str.str(), - opt.useFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - // FIXME: the fp16 bounds are - // useless when math (the accumulator) is - // in fp16. Figure out another way to test - opt.useFloat16 ? 0.99f : 0.1f, - opt.useFloat16 ? 0.65f : 0.015f); + int numVecs = opt.numVecsOverride > 0 ? 
opt.numVecsOverride + : faiss::gpu::randVal(1000, 5000); + int dim = opt.dimOverride > 0 ? opt.dimOverride + : faiss::gpu::randVal(50, 800); + int numQuery = opt.numQueriesOverride > 0 ? opt.numQueriesOverride + : faiss::gpu::randVal(1, 512); + + // Due to loss of precision in a float16 accumulator, for large k, + // the number of differences is pretty huge. Restrict ourselves to a + // fairly small `k` for float16 + int k = opt.useFloat16 + ? std::min(faiss::gpu::randVal(1, 50), numVecs) + : std::min( + faiss::gpu::randVal(1, faiss::gpu::getMaxKSelection()), + numVecs); + if (opt.kOverride > 0) { + k = opt.kOverride; + } + + faiss::IndexFlat cpuIndex(dim, opt.metric); + cpuIndex.metric_arg = opt.metricArg; + + // Construct on a random device to test multi-device, if we have + // multiple devices + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.useFloat16 = opt.useFloat16; + config.use_raft = opt.use_raft; + + faiss::gpu::GpuIndexFlat gpuIndex(&res, dim, opt.metric, config); + gpuIndex.metric_arg = opt.metricArg; + + std::vector vecs = faiss::gpu::randVecs(numVecs, dim); + cpuIndex.add(numVecs, vecs.data()); + gpuIndex.add(numVecs, vecs.data()); + + std::stringstream str; + str << "metric " << opt.metric << " marg " << opt.metricArg << " numVecs " + << numVecs << " dim " << dim << " useFloat16 " << opt.useFloat16 + << " numQuery " << numQuery << " k " << k; + + // To some extent, we depend upon the relative error for the test + // for float16 + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + numQuery, + dim, + k, + str.str(), + opt.useFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + // FIXME: the fp16 bounds are + // useless when math (the accumulator) is + // in fp16. Figure out another way to test + opt.useFloat16 ? 0.99f : 0.1f, + opt.useFloat16 ? 
0.65f : 0.015f); } TEST(TestGpuIndexFlat, IP_Float32) { - for (int tries = 0; tries < 3; ++tries) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_INNER_PRODUCT; - opt.useFloat16 = false; + for (int tries = 0; tries < 3; ++tries) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_INNER_PRODUCT; + opt.useFloat16 = false; - testFlat(opt); + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif - } + } } TEST(TestGpuIndexFlat, L1_Float32) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_L1; - opt.useFloat16 = false; + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_L1; + opt.useFloat16 = false; - testFlat(opt); + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif } TEST(TestGpuIndexFlat, Lp_Float32) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_Lp; - opt.metricArg = 5; - opt.useFloat16 = false; + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_Lp; + opt.metricArg = 5; + opt.useFloat16 = false; - testFlat(opt); + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif } TEST(TestGpuIndexFlat, L2_Float32) { - for (int tries = 0; tries < 3; ++tries) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_L2; + for (int tries = 0; tries < 3; ++tries) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_L2; - opt.useFloat16 = false; + opt.useFloat16 = false; - testFlat(opt); + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif - } + } } // At least one test for the k > 1024 select TEST(TestGpuIndexFlat, L2_k_2048) { - if (faiss::gpu::getMaxKSelection() >= 2048) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_L2; - opt.useFloat16 = 
false; - opt.kOverride = 2048; - opt.dimOverride = 128; - opt.numVecsOverride = 10000; - - testFlat(opt); + if (faiss::gpu::getMaxKSelection() >= 2048) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_L2; + opt.useFloat16 = false; + opt.kOverride = 2048; + opt.dimOverride = 128; + opt.numVecsOverride = 10000; + + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif - } + } } // test specialized k == 1 codepath TEST(TestGpuIndexFlat, L2_Float32_K1) { - for (int tries = 0; tries < 3; ++tries) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_L2; - opt.useFloat16 = false; - opt.kOverride = 1; + for (int tries = 0; tries < 3; ++tries) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_L2; + opt.useFloat16 = false; + opt.kOverride = 1; - testFlat(opt); + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif - } + } } TEST(TestGpuIndexFlat, IP_Float16) { - for (int tries = 0; tries < 3; ++tries) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_INNER_PRODUCT; - opt.useFloat16 = true; + for (int tries = 0; tries < 3; ++tries) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_INNER_PRODUCT; + opt.useFloat16 = true; - testFlat(opt); + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif - } + } } TEST(TestGpuIndexFlat, L2_Float16) { - for (int tries = 0; tries < 3; ++tries) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_L2; - opt.useFloat16 = true; + for (int tries = 0; tries < 3; ++tries) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_L2; + opt.useFloat16 = true; - testFlat(opt); + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif - } + } } // test 
specialized k == 1 codepath TEST(TestGpuIndexFlat, L2_Float16_K1) { - for (int tries = 0; tries < 3; ++tries) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_L2; - opt.useFloat16 = true; - opt.kOverride = 1; + for (int tries = 0; tries < 3; ++tries) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_L2; + opt.useFloat16 = true; + opt.kOverride = 1; - testFlat(opt); + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif - } + } } // test tiling along a huge vector set TEST(TestGpuIndexFlat, L2_Tiling) { - for (int tries = 0; tries < 2; ++tries) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_L2; - opt.useFloat16 = false; - opt.numVecsOverride = 1000000; - - // keep the rest of the problem reasonably small - opt.numQueriesOverride = 4; - opt.dimOverride = 64; - opt.kOverride = 64; - - testFlat(opt); + for (int tries = 0; tries < 2; ++tries) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_L2; + opt.useFloat16 = false; + opt.numVecsOverride = 1000000; + + // keep the rest of the problem reasonably small + opt.numQueriesOverride = 4; + opt.dimOverride = 64; + opt.kOverride = 64; + + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif - } + } } TEST(TestGpuIndexFlat, QueryEmpty) { - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); - faiss::gpu::GpuIndexFlatConfig config; - config.device = 0; - config.useFloat16 = false; - int dim = 128; - faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config); + faiss::gpu::GpuIndexFlatConfig config; + config.device = 0; + config.useFloat16 = false; + int dim = 128; + faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config); - // Querying an empty index should not blow up, and just return - // (FLT_MAX, -1) - int numQuery = 10; - int k = 50; - 
std::vector queries(numQuery * dim, 1.0f); + // Querying an empty index should not blow up, and just return + // (FLT_MAX, -1) + int numQuery = 10; + int k = 50; + std::vector queries(numQuery * dim, 1.0f); - std::vector dist(numQuery * k, 0); - std::vector ind(numQuery * k); + std::vector dist(numQuery * k, 0); + std::vector ind(numQuery * k); - gpuIndex.search(numQuery, queries.data(), k, dist.data(), ind.data()); + gpuIndex.search(numQuery, queries.data(), k, dist.data(), ind.data()); - for (auto d : dist) { - EXPECT_EQ(d, std::numeric_limits::max()); - } + for (auto d : dist) { + EXPECT_EQ(d, std::numeric_limits::max()); + } - for (auto i : ind) { - EXPECT_EQ(i, -1); - } + for (auto i : ind) { + EXPECT_EQ(i, -1); + } } void testCopyFrom(bool use_raft) { - int numVecs = faiss::gpu::randVal(100, 200); - int dim = faiss::gpu::randVal(1, 1000); + int numVecs = faiss::gpu::randVal(100, 200); + int dim = faiss::gpu::randVal(1, 1000); - std::vector vecs = faiss::gpu::randVecs(numVecs, dim); + std::vector vecs = faiss::gpu::randVecs(numVecs, dim); - faiss::IndexFlatL2 cpuIndex(dim); - cpuIndex.add(numVecs, vecs.data()); + faiss::IndexFlatL2 cpuIndex(dim); + cpuIndex.add(numVecs, vecs.data()); - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - for (bool useFloat16 : {false, true}) { - faiss::gpu::GpuIndexFlatConfig config; - config.device = device; - config.useFloat16 = useFloat16; - config.use_raft = use_raft; + for (bool useFloat16 : {false, true}) { + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.useFloat16 = useFloat16; + config.use_raft = use_raft; - // Fill with garbage values - faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, 2000, config); - gpuIndex.copyFrom(&cpuIndex); + // Fill with garbage values + 
faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, 2000, config); + gpuIndex.copyFrom(&cpuIndex); - EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); - EXPECT_EQ(gpuIndex.ntotal, numVecs); + EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); + EXPECT_EQ(gpuIndex.ntotal, numVecs); - EXPECT_EQ(cpuIndex.d, gpuIndex.d); - EXPECT_EQ(cpuIndex.d, dim); + EXPECT_EQ(cpuIndex.d, gpuIndex.d); + EXPECT_EQ(cpuIndex.d, dim); - std::vector gpuVals(numVecs * dim); - gpuIndex.reconstruct_n(0, gpuIndex.ntotal, gpuVals.data()); + std::vector gpuVals(numVecs * dim); + gpuIndex.reconstruct_n(0, gpuIndex.ntotal, gpuVals.data()); - std::vector cpuVals(numVecs * dim); - cpuIndex.reconstruct_n(0, gpuIndex.ntotal, cpuVals.data()); + std::vector cpuVals(numVecs * dim); + cpuIndex.reconstruct_n(0, gpuIndex.ntotal, cpuVals.data()); - // The CPU is the source of (float32) truth here, while the GPU index - // may be in float16 mode and thus was subject to rounding - if (useFloat16) { - EXPECT_EQ(gpuVals, faiss::gpu::roundToHalf(cpuVals)); - } else { - // Should be exactly the same - EXPECT_EQ(gpuVals, cpuVals); - } - } + // The CPU is the source of (float32) truth here, while the GPU index + // may be in float16 mode and thus was subject to rounding + if (useFloat16) { + EXPECT_EQ(gpuVals, faiss::gpu::roundToHalf(cpuVals)); + } else { + // Should be exactly the same + EXPECT_EQ(gpuVals, cpuVals); + } + } } TEST(TestGpuIndexFlat, CopyFrom) { - testCopyFrom(false); + testCopyFrom(false); } #if defined USE_NVIDIA_RAFT TEST(TestRaftGpuIndexFlat, CopyFrom) { - testCopyFrom(true); + testCopyFrom(true); } #endif void testCopyTo(bool use_raft) { - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); - int numVecs = faiss::gpu::randVal(100, 200); - int dim = faiss::gpu::randVal(1, 1000); + int numVecs = faiss::gpu::randVal(100, 200); + int dim = faiss::gpu::randVal(1, 1000); - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - 
std::vector vecs = faiss::gpu::randVecs(numVecs, dim); + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + std::vector vecs = faiss::gpu::randVecs(numVecs, dim); - for (bool useFloat16 : {false, true}) { - faiss::gpu::GpuIndexFlatConfig config; - config.device = device; - config.useFloat16 = useFloat16; - config.use_raft = use_raft; + for (bool useFloat16 : {false, true}) { + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.useFloat16 = useFloat16; + config.use_raft = use_raft; - faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config); - gpuIndex.add(numVecs, vecs.data()); + faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config); + gpuIndex.add(numVecs, vecs.data()); - // Fill with garbage values - faiss::IndexFlatL2 cpuIndex(2000); - gpuIndex.copyTo(&cpuIndex); + // Fill with garbage values + faiss::IndexFlatL2 cpuIndex(2000); + gpuIndex.copyTo(&cpuIndex); - EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); - EXPECT_EQ(gpuIndex.ntotal, numVecs); + EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); + EXPECT_EQ(gpuIndex.ntotal, numVecs); - EXPECT_EQ(cpuIndex.d, gpuIndex.d); - EXPECT_EQ(cpuIndex.d, dim); + EXPECT_EQ(cpuIndex.d, gpuIndex.d); + EXPECT_EQ(cpuIndex.d, dim); - std::vector gpuVals(numVecs * dim); - gpuIndex.reconstruct_n(0, gpuIndex.ntotal, gpuVals.data()); + std::vector gpuVals(numVecs * dim); + gpuIndex.reconstruct_n(0, gpuIndex.ntotal, gpuVals.data()); - std::vector cpuVals(numVecs * dim); - cpuIndex.reconstruct_n(0, gpuIndex.ntotal, cpuVals.data()); + std::vector cpuVals(numVecs * dim); + cpuIndex.reconstruct_n(0, gpuIndex.ntotal, cpuVals.data()); - // The GPU is the source of truth here, so the float32 exact comparison - // even if the index uses float16 is ok - EXPECT_EQ(gpuVals, cpuVals); - } + // The GPU is the source of truth here, so the float32 exact comparison + // even if the index uses float16 is ok + EXPECT_EQ(gpuVals, cpuVals); + } } TEST(TestGpuIndexFlat, CopyTo) { - testCopyTo(false); + 
testCopyTo(false); } #if defined USE_NVIDIA_RAFT TEST(TestRaftGpuIndexFlat, CopyTo) { - testCopyTo(true); + testCopyTo(true); } #endif void testUnifiedMemory(bool use_raft) { - // Construct on a random device to test multi-device, if we have - // multiple devices - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + // Construct on a random device to test multi-device, if we have + // multiple devices + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - if (!faiss::gpu::getFullUnifiedMemSupport(device)) { - return; - } + if (!faiss::gpu::getFullUnifiedMemSupport(device)) { + return; + } - int dim = 256; + int dim = 256; - // FIXME: GpuIndexFlat doesn't support > 2^31 (vecs * dims) due to - // kernel indexing, so we can't test unified memory for memory - // oversubscription. - size_t numVecs = 50000; - int numQuery = 10; - int k = 10; + // FIXME: GpuIndexFlat doesn't support > 2^31 (vecs * dims) due to + // kernel indexing, so we can't test unified memory for memory + // oversubscription. 
+ size_t numVecs = 50000; + int numQuery = 10; + int k = 10; - faiss::IndexFlatL2 cpuIndexL2(dim); + faiss::IndexFlatL2 cpuIndexL2(dim); - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); - faiss::gpu::GpuIndexFlatConfig config; - config.device = device; - config.memorySpace = faiss::gpu::MemorySpace::Unified; - config.use_raft = use_raft; + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.memorySpace = faiss::gpu::MemorySpace::Unified; + config.use_raft = use_raft; - faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config); + faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config); - std::vector vecs = faiss::gpu::randVecs(numVecs, dim); - cpuIndexL2.add(numVecs, vecs.data()); - gpuIndexL2.add(numVecs, vecs.data()); + std::vector vecs = faiss::gpu::randVecs(numVecs, dim); + cpuIndexL2.add(numVecs, vecs.data()); + gpuIndexL2.add(numVecs, vecs.data()); - // To some extent, we depend upon the relative error for the test - // for float16 - faiss::gpu::compareIndices( - cpuIndexL2, - gpuIndexL2, - numQuery, - dim, - k, - "Unified Memory", - kF32MaxRelErr, - 0.1f, - 0.015f); + // To some extent, we depend upon the relative error for the test + // for float16 + faiss::gpu::compareIndices( + cpuIndexL2, + gpuIndexL2, + numQuery, + dim, + k, + "Unified Memory", + kF32MaxRelErr, + 0.1f, + 0.015f); } TEST(TestGpuIndexFlat, UnifiedMemory) { - testUnifiedMemory(false); + testUnifiedMemory(false); } #if defined USE_NVIDIA_RAFT TEST(TestRaftGpuIndexFlat, UnifiedMemory) { - testUnifiedMemory(true); + testUnifiedMemory(true); } #endif void testLargeIndex(bool use_raft) { - // Construct on a random device to test multi-device, if we have - // multiple devices - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + // Construct on a random device to test multi-device, if we have + // multiple devices + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 
1); - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); - // Skip this device if we do not have sufficient memory - constexpr size_t kMem = size_t(8) * 1024 * 1024 * 1024; + // Skip this device if we do not have sufficient memory + constexpr size_t kMem = size_t(8) * 1024 * 1024 * 1024; - if (faiss::gpu::getFreeMemory(device) < kMem) { - std::cout << "TestGpuIndexFlat.LargeIndex: skipping due " - "to insufficient device memory\n"; - return; - } + if (faiss::gpu::getFreeMemory(device) < kMem) { + std::cout << "TestGpuIndexFlat.LargeIndex: skipping due " + "to insufficient device memory\n"; + return; + } - std::cout << "Running LargeIndex test\n"; + std::cout << "Running LargeIndex test\n"; - size_t dim = 256; // each vec is sizeof(float) * 256 = 1 KiB in size - size_t nb = 5000000; - size_t nq = 10; + size_t dim = 256; // each vec is sizeof(float) * 256 = 1 KiB in size + size_t nb = 5000000; + size_t nq = 10; - auto xb = faiss::gpu::randVecs(nb, dim); + auto xb = faiss::gpu::randVecs(nb, dim); - int k = 10; + int k = 10; - faiss::IndexFlatL2 cpuIndexL2(dim); + faiss::IndexFlatL2 cpuIndexL2(dim); - faiss::gpu::GpuIndexFlatConfig config; - config.device = device; - config.use_raft = use_raft; - faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config); + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.use_raft = use_raft; + faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config); - cpuIndexL2.add(nb, xb.data()); - gpuIndexL2.add(nb, xb.data()); + cpuIndexL2.add(nb, xb.data()); + gpuIndexL2.add(nb, xb.data()); - // To some extent, we depend upon the relative error for the test - // for float16 - faiss::gpu::compareIndices( - cpuIndexL2, - gpuIndexL2, - nq, - dim, - k, - "LargeIndex", - kF32MaxRelErr, - 0.1f, - 0.015f); + // To some extent, we depend upon the relative error for the test + // for float16 + faiss::gpu::compareIndices( + cpuIndexL2, + gpuIndexL2, + nq, + dim, 
+ k, + "LargeIndex", + kF32MaxRelErr, + 0.1f, + 0.015f); } TEST(TestGpuIndexFlat, LargeIndex) { - testLargeIndex(false); + testLargeIndex(false); } #if defined USE_NVIDIA_RAFT TEST(TestRaftGpuIndexFlat, LargeIndex) { - testLargeIndex(true); + testLargeIndex(true); } #endif void testResidual(bool use_raft) { - // Construct on a random device to test multi-device, if we have - // multiple devices - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + // Construct on a random device to test multi-device, if we have + // multiple devices + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); - faiss::gpu::GpuIndexFlatConfig config; - config.device = device; - config.use_raft = use_raft; + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.use_raft = use_raft; - int dim = 32; - faiss::IndexFlat cpuIndex(dim, faiss::MetricType::METRIC_L2); - faiss::gpu::GpuIndexFlat gpuIndex( - &res, dim, faiss::MetricType::METRIC_L2, config); + int dim = 32; + faiss::IndexFlat cpuIndex(dim, faiss::MetricType::METRIC_L2); + faiss::gpu::GpuIndexFlat gpuIndex( + &res, dim, faiss::MetricType::METRIC_L2, config); - int numVecs = 100; - auto vecs = faiss::gpu::randVecs(numVecs, dim); - cpuIndex.add(numVecs, vecs.data()); - gpuIndex.add(numVecs, vecs.data()); + int numVecs = 100; + auto vecs = faiss::gpu::randVecs(numVecs, dim); + cpuIndex.add(numVecs, vecs.data()); + gpuIndex.add(numVecs, vecs.data()); - auto indexVecs = std::vector{0, 2, 4, 6, 8}; - auto queryVecs = faiss::gpu::randVecs(indexVecs.size(), dim); + auto indexVecs = std::vector{0, 2, 4, 6, 8}; + auto queryVecs = faiss::gpu::randVecs(indexVecs.size(), dim); - auto residualsCpu = std::vector(indexVecs.size() * dim); - auto residualsGpu = std::vector(indexVecs.size() * dim); + auto residualsCpu = std::vector(indexVecs.size() * dim); + auto residualsGpu 
= std::vector(indexVecs.size() * dim); - cpuIndex.compute_residual_n( - indexVecs.size(), - queryVecs.data(), - residualsCpu.data(), - indexVecs.data()); - gpuIndex.compute_residual_n( - indexVecs.size(), - queryVecs.data(), - residualsGpu.data(), - indexVecs.data()); + cpuIndex.compute_residual_n( + indexVecs.size(), + queryVecs.data(), + residualsCpu.data(), + indexVecs.data()); + gpuIndex.compute_residual_n( + indexVecs.size(), + queryVecs.data(), + residualsGpu.data(), + indexVecs.data()); - // Should be exactly the same, as this is just a single float32 subtraction - EXPECT_EQ(residualsCpu, residualsGpu); + // Should be exactly the same, as this is just a single float32 subtraction + EXPECT_EQ(residualsCpu, residualsGpu); } TEST(TestGpuIndexFlat, Residual) { - testResidual(false); + testResidual(false); } #if defined USE_NVIDIA_RAFT TEST(TestRaftGpuIndexFlat, Residual) { - testResidual(true); + testResidual(true); } #endif void testReconstruct(bool use_raft) { - // Construct on a random device to test multi-device, if we have - // multiple devices - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); - - int dim = 32; - int numVecs = 100; - auto vecs = faiss::gpu::randVecs(numVecs, dim); - auto vecs16 = faiss::gpu::roundToHalf(vecs); - - for (bool useFloat16 : {false, true}) { - faiss::gpu::GpuIndexFlatConfig config; - config.device = device; - config.useFloat16 = useFloat16; - config.use_raft = use_raft; - - faiss::gpu::GpuIndexFlat gpuIndex( - &res, dim, faiss::MetricType::METRIC_L2, config); - - gpuIndex.add(numVecs, vecs.data()); - - // Test reconstruct - { - auto reconstructVecs = std::vector(dim); - gpuIndex.reconstruct(15, reconstructVecs.data()); - - auto& ref = useFloat16 ? 
vecs16 : vecs; - - for (int i = 0; i < dim; ++i) { - EXPECT_EQ(reconstructVecs[i], ref[15 * dim + i]); - } - } - - // Test reconstruct_n - if (false) { - auto reconstructVecs = std::vector((numVecs - 1) * dim); - - int startVec = 5; - int endVec = numVecs - 1; - int numReconstructVec = endVec - startVec + 1; - - gpuIndex.reconstruct_n( - startVec, numReconstructVec, reconstructVecs.data()); - - auto& ref = useFloat16 ? vecs16 : vecs; - - for (int i = 0; i < numReconstructVec; ++i) { - for (int j = 0; j < dim; ++j) { - EXPECT_EQ( - reconstructVecs[i * dim + j], - ref[(i + startVec) * dim + j]); - } - } - } - - // Test reconstruct_batch - if (false) { - auto reconstructKeys = std::vector{1, 3, 5}; - auto reconstructVecs = - std::vector(reconstructKeys.size() * dim); - - gpuIndex.reconstruct_batch( - reconstructKeys.size(), - reconstructKeys.data(), - reconstructVecs.data()); - - auto& ref = useFloat16 ? vecs16 : vecs; - - for (int i = 0; i < reconstructKeys.size(); ++i) { - for (int j = 0; j < dim; ++j) { - EXPECT_EQ( - reconstructVecs[i * dim + j], - ref[reconstructKeys[i] * dim + j]); - } - } - } - } + // Construct on a random device to test multi-device, if we have + // multiple devices + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + int dim = 32; + int numVecs = 100; + auto vecs = faiss::gpu::randVecs(numVecs, dim); + auto vecs16 = faiss::gpu::roundToHalf(vecs); + + for (bool useFloat16 : {false, true}) { + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.useFloat16 = useFloat16; + config.use_raft = use_raft; + + faiss::gpu::GpuIndexFlat gpuIndex( + &res, dim, faiss::MetricType::METRIC_L2, config); + + gpuIndex.add(numVecs, vecs.data()); + + // Test reconstruct + { + auto reconstructVecs = std::vector(dim); + gpuIndex.reconstruct(15, reconstructVecs.data()); + + auto& ref = useFloat16 ? 
vecs16 : vecs; + + for (int i = 0; i < dim; ++i) { + EXPECT_EQ(reconstructVecs[i], ref[15 * dim + i]); + } + } + + // Test reconstruct_n + if (false) { + auto reconstructVecs = std::vector((numVecs - 1) * dim); + + int startVec = 5; + int endVec = numVecs - 1; + int numReconstructVec = endVec - startVec + 1; + + gpuIndex.reconstruct_n( + startVec, numReconstructVec, reconstructVecs.data()); + + auto& ref = useFloat16 ? vecs16 : vecs; + + for (int i = 0; i < numReconstructVec; ++i) { + for (int j = 0; j < dim; ++j) { + EXPECT_EQ( + reconstructVecs[i * dim + j], + ref[(i + startVec) * dim + j]); + } + } + } + + // Test reconstruct_batch + if (false) { + auto reconstructKeys = std::vector{1, 3, 5}; + auto reconstructVecs = + std::vector(reconstructKeys.size() * dim); + + gpuIndex.reconstruct_batch( + reconstructKeys.size(), + reconstructKeys.data(), + reconstructVecs.data()); + + auto& ref = useFloat16 ? vecs16 : vecs; + + for (int i = 0; i < reconstructKeys.size(); ++i) { + for (int j = 0; j < dim; ++j) { + EXPECT_EQ( + reconstructVecs[i * dim + j], + ref[reconstructKeys[i] * dim + j]); + } + } + } + } } TEST(TestGpuIndexFlat, Reconstruct) { - testReconstruct(false); + testReconstruct(false); } #if defined USE_NVIDIA_RAFT TEST(TestRaftGpuIndexFlat, Reconstruct) { - testReconstruct(true); + testReconstruct(true); } #endif @@ -750,20 +750,20 @@ void testSearchAndReconstruct(bool use_raft) { } } TEST(TestGpuIndexFlat, SearchAndReconstruct) { - testSearchAndReconstruct(false); + testSearchAndReconstruct(false); } #if defined USE_NVIDIA_RAFT TEST(TestRaftGpuIndexFlat, SearchAndReconstruct) { - testSearchAndReconstruct(true); + testSearchAndReconstruct(true); } #endif int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); + testing::InitGoogleTest(&argc, argv); - // just run with a fixed test seed - faiss::gpu::setTestSeed(100); + // just run with a fixed test seed + faiss::gpu::setTestSeed(100); - return RUN_ALL_TESTS(); + return RUN_ALL_TESTS(); } \ No 
newline at end of file From 5c0592ec338bee6ff194b50b86402851189d7c90 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Mon, 21 Aug 2023 16:06:57 -0700 Subject: [PATCH 82/87] remove debug statements --- faiss/gpu/GpuIndex.cu | 1 - faiss/gpu/impl/IVFBase.cu | 2 -- 2 files changed, 3 deletions(-) diff --git a/faiss/gpu/GpuIndex.cu b/faiss/gpu/GpuIndex.cu index 749bec221f..89952b1121 100644 --- a/faiss/gpu/GpuIndex.cu +++ b/faiss/gpu/GpuIndex.cu @@ -77,7 +77,6 @@ int GpuIndex::getDevice() const { } void GpuIndex::copyFrom(const faiss::Index* index) { - printf("inside gpuindex copyFrom\n"); d = index->d; metric_type = index->metric_type; metric_arg = index->metric_arg; diff --git a/faiss/gpu/impl/IVFBase.cu b/faiss/gpu/impl/IVFBase.cu index 6aef83ef3f..890d489440 100644 --- a/faiss/gpu/impl/IVFBase.cu +++ b/faiss/gpu/impl/IVFBase.cu @@ -106,7 +106,6 @@ void IVFBase::reserveMemory(idx_t numVecs) { } void IVFBase::reset() { - printf("inside ivfbase::reset\n"); auto stream = resources_->getDefaultStreamCurrentDevice(); deviceListData_.clear(); @@ -324,7 +323,6 @@ std::vector IVFBase::getListVectorData(idx_t listId, bool gpuFormat) } void IVFBase::copyInvertedListsFrom(const InvertedLists* ivf) { - printf("Inside IVFBase's copyInvertedListsFrom\n"); idx_t nlist = ivf ? ivf->nlist : 0; for (idx_t i = 0; i < nlist; ++i) { addEncodedVectorsToList_( From 7618b44951b2e0818495d85436e36d6b43f32520 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Wed, 23 Aug 2023 09:40:21 -0700 Subject: [PATCH 83/87] LargeBatch test added and now passing --- build.sh | 5 +++++ faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/build.sh b/build.sh index 6a353379f8..bb9985ce25 100755 --- a/build.sh +++ b/build.sh @@ -50,4 +50,9 @@ cmake \ ${EXTRA_CMAKE_ARGS} \ ../ + +# make -C build -j12 faiss cmake --build . 
-j12 +# make -C build -j12 swigfaiss +# (cd build/faiss/python && python setup.py install) + diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index 7f2ae81196..821fbe1159 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -351,6 +351,11 @@ TEST(TestGpuIndexIVFFlat, LargeBatch) { opt.dim = 3; opt.numQuery = 100000; queryTest(opt, faiss::METRIC_L2, false); + +#if defined USE_NVIDIA_RAFT + opt.use_raft = true; + queryTest(opt, faiss::METRIC_L2, false); +#endif } // float16 coarse quantizer From bd5a217892541417da89f5acd4c1332b0865c307 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Wed, 23 Aug 2023 13:43:51 -0700 Subject: [PATCH 84/87] final update to gtests --- faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 76 +++++++++++++++++++++----- 1 file changed, 62 insertions(+), 14 deletions(-) diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index 821fbe1159..b92322d9a8 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -476,6 +476,14 @@ TEST(TestGpuIndexIVFFlat, Float32_negative) { faiss::gpu::StandardGpuResources res; res.noTempMemory(); + // Construct a positive test set + auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + + // Put all vecs on positive size + for (auto& f : queryVecs) { + f = std::abs(f); + } + faiss::gpu::GpuIndexIVFFlatConfig config; config.device = opt.device; config.indicesOptions = opt.indicesOpt; @@ -485,14 +493,6 @@ TEST(TestGpuIndexIVFFlat, Float32_negative) { gpuIndex.copyFrom(&cpuIndex); gpuIndex.nprobe = opt.nprobe; - // Construct a positive test set - auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); - - // Put all vecs on positive size - for (auto& f : queryVecs) { - f = std::abs(f); - } - bool compFloat16 = false; faiss::gpu::compareIndices( queryVecs, @@ -508,6 +508,30 @@ TEST(TestGpuIndexIVFFlat, Float32_negative) { // in fp16. 
Figure out another way to test compFloat16 ? 0.99f : 0.1f, compFloat16 ? 0.65f : 0.015f); + +#if defined USE_NVIDIA_RAFT + config.use_raft = true; + + faiss::gpu::GpuIndexIVFFlat raftGpuIndex( + &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); + raftGpuIndex.copyFrom(&cpuIndex); + raftGpuIndex.nprobe = opt.nprobe; + + faiss::gpu::compareIndices( + queryVecs, + cpuIndex, + raftGpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + // FIXME: the fp16 bounds are + // useless when math (the accumulator) is + // in fp16. Figure out another way to test + compFloat16 ? 0.99f : 0.1f, + compFloat16 ? 0.65f : 0.015f); +#endif } // @@ -523,6 +547,13 @@ TEST(TestGpuIndexIVFFlat, QueryNaN) { faiss::gpu::StandardGpuResources res; res.noTempMemory(); + int numQuery = 10; + std::vector nans( + numQuery * opt.dim, std::numeric_limits::quiet_NaN()); + + std::vector distances(numQuery * opt.k, 0); + std::vector indices(numQuery * opt.k, 0); + faiss::gpu::GpuIndexIVFFlatConfig config; config.device = opt.device; config.indicesOptions = opt.indicesOpt; @@ -535,14 +566,30 @@ TEST(TestGpuIndexIVFFlat, QueryNaN) { gpuIndex.train(opt.numTrain, trainVecs.data()); gpuIndex.add(opt.numAdd, addVecs.data()); - int numQuery = 10; - std::vector nans( - numQuery * opt.dim, std::numeric_limits::quiet_NaN()); + gpuIndex.search( + numQuery, nans.data(), opt.k, distances.data(), indices.data()); - std::vector distances(numQuery * opt.k, 0); - std::vector indices(numQuery * opt.k, 0); + for (int q = 0; q < numQuery; ++q) { + for (int k = 0; k < opt.k; ++k) { + EXPECT_EQ(indices[q * opt.k + k], -1); + EXPECT_EQ( + distances[q * opt.k + k], + std::numeric_limits::max()); + } + } - gpuIndex.search( +#if defined USE_NVIDIA_RAFT + config.use_raft = true; + std::fill(distances.begin(), distances.end(), 0); + std::fill(indices.begin(), indices.end(), 0); + faiss::gpu::GpuIndexIVFFlat raftGpuIndex( + &res, opt.dim, opt.numCentroids, 
faiss::METRIC_L2, config); + raftGpuIndex.nprobe = opt.nprobe; + + raftGpuIndex.train(opt.numTrain, trainVecs.data()); + raftGpuIndex.add(opt.numAdd, addVecs.data()); + + raftGpuIndex.search( numQuery, nans.data(), opt.k, distances.data(), indices.data()); for (int q = 0; q < numQuery; ++q) { @@ -553,6 +600,7 @@ TEST(TestGpuIndexIVFFlat, QueryNaN) { std::numeric_limits::max()); } } +#endif } TEST(TestGpuIndexIVFFlat, AddNaN) { From 2022a1474afca28b9eeba37d54ca7178c0b4bdd3 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Wed, 6 Sep 2023 10:03:36 -0700 Subject: [PATCH 85/87] Pull latest --- faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index b92322d9a8..d042292aef 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -202,7 +202,6 @@ void copyToTest(bool useFloat16CoarseQuantizer, bool use_raft) { faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); - printf("opt.numCentroids %d", opt.numCentroids); gpuIndex.train(opt.numTrain, trainVecs.data()); gpuIndex.add(opt.numAdd, addVecs.data()); gpuIndex.nprobe = opt.nprobe; From e441ce5772bb905be7cd64bb6e38eb111277010f Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Wed, 6 Sep 2023 13:17:04 -0700 Subject: [PATCH 86/87] IndicesOptions assertion --- faiss/gpu/impl/RaftIVFFlat.cu | 7 +++++-- faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 21 ++++++++++++++++++--- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 3861a2283c..2c6afb795c 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -73,6 +73,9 @@ RaftIVFFlat::RaftIVFFlat( interleavedLayout, indicesOptions, space) { + FAISS_THROW_IF_NOT_MSG( + indicesOptions == INDICES_64_BIT, + "only INDICES_64_BIT is supported for RAFT index"); reset(); } @@ -159,6 +162,8 @@ 
idx_t RaftIVFFlat::addVectors( Index* coarseQuantizer, Tensor& vecs, Tensor& indices) { + /// TODO: We probably don't want to ignore the coarse quantizer here + idx_t n_rows = vecs.getSize(0); const raft::device_resources& raft_handle = @@ -207,8 +212,6 @@ idx_t RaftIVFFlat::addVectors( raft::make_const_mdspan(gather_indices.view())); } - /// TODO: We probably don't want to ignore the coarse quantizer here - FAISS_ASSERT(raft_knn_index.has_value()); raft_knn_index.emplace(raft::neighbors::ivf_flat::extend( raft_handle, diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index d042292aef..8af86c2876 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -30,6 +30,7 @@ #include #include #include +#include "faiss/gpu/GpuIndicesOptions.h" // FIXME: figure out a better way to test fp16 constexpr float kF16MaxRelErr = 0.3f; @@ -160,7 +161,7 @@ void addTest( faiss::gpu::GpuIndexIVFFlatConfig config; config.device = opt.device; - config.indicesOptions = opt.indicesOpt; + config.indicesOptions = use_raft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; config.use_raft = use_raft; @@ -196,7 +197,7 @@ void copyToTest(bool useFloat16CoarseQuantizer, bool use_raft) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = opt.device; - config.indicesOptions = opt.indicesOpt; + config.indicesOptions = use_raft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; config.use_raft = use_raft; @@ -256,7 +257,7 @@ void copyFromTest(bool useFloat16CoarseQuantizer, bool use_raft) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = opt.device; - config.indicesOptions = opt.indicesOpt; + config.indicesOptions = use_raft ? 
faiss::gpu::INDICES_64_BIT : opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; config.use_raft = use_raft; @@ -331,6 +332,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_L2) { #if defined USE_NVIDIA_RAFT opt.use_raft = true; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_L2, false); #endif } @@ -341,6 +343,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_IP) { #if defined USE_NVIDIA_RAFT opt.use_raft = true; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_INNER_PRODUCT, false); #endif } @@ -353,6 +356,7 @@ TEST(TestGpuIndexIVFFlat, LargeBatch) { #if defined USE_NVIDIA_RAFT opt.use_raft = true; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_L2, false); #endif } @@ -365,6 +369,7 @@ TEST(TestGpuIndexIVFFlat, Float16_32_Query_L2) { #if defined USE_NVIDIA_RAFT opt.use_raft = true; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_L2, true); #endif } @@ -375,6 +380,7 @@ TEST(TestGpuIndexIVFFlat, Float16_32_Query_IP) { #if defined USE_NVIDIA_RAFT opt.use_raft = true; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_INNER_PRODUCT, true); #endif } @@ -391,6 +397,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_L2_64) { #if defined USE_NVIDIA_RAFT opt.use_raft = true; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_L2, false); #endif } @@ -402,6 +409,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_IP_64) { #if defined USE_NVIDIA_RAFT opt.use_raft = true; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_INNER_PRODUCT, false); #endif } @@ -413,6 +421,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_L2_128) { #if defined USE_NVIDIA_RAFT opt.use_raft = true; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_L2, false); #endif } @@ -424,6 +433,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_IP_128) { #if defined USE_NVIDIA_RAFT opt.use_raft = true; + 
opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_INNER_PRODUCT, false); #endif } @@ -510,6 +520,7 @@ TEST(TestGpuIndexIVFFlat, Float32_negative) { #if defined USE_NVIDIA_RAFT config.use_raft = true; + config.indicesOptions = faiss::gpu::INDICES_64_BIT; faiss::gpu::GpuIndexIVFFlat raftGpuIndex( &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); @@ -579,6 +590,7 @@ TEST(TestGpuIndexIVFFlat, QueryNaN) { #if defined USE_NVIDIA_RAFT config.use_raft = true; + config.indicesOptions = faiss::gpu::INDICES_64_BIT; std::fill(distances.begin(), distances.end(), 0); std::fill(indices.begin(), indices.end(), 0); faiss::gpu::GpuIndexIVFFlat raftGpuIndex( @@ -647,6 +659,7 @@ TEST(TestGpuIndexIVFFlat, AddNaN) { #if defined USE_NVIDIA_RAFT config.use_raft = true; + config.indicesOptions = faiss::gpu::INDICES_64_BIT; faiss::gpu::GpuIndexIVFFlat raftGpuIndex( &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); raftGpuIndex.nprobe = opt.nprobe; @@ -723,6 +736,7 @@ TEST(TestGpuIndexIVFFlat, UnifiedMemory) { #if defined USE_NVIDIA_RAFT config.use_raft = true; + config.indicesOptions = faiss::gpu::INDICES_64_BIT; faiss::gpu::GpuIndexIVFFlat raftGpuIndex( &res, dim, numCentroids, faiss::METRIC_L2, config); raftGpuIndex.copyFrom(&cpuIndex); @@ -800,6 +814,7 @@ TEST(TestGpuIndexIVFFlat, LongIVFList) { #if defined USE_NVIDIA_RAFT config.use_raft = true; + config.indicesOptions = faiss::gpu::INDICES_64_BIT; faiss::gpu::GpuIndexIVFFlat raftGpuIndex( &res, dim, numCentroids, faiss::METRIC_L2, config); raftGpuIndex.train(numTrain, trainVecs.data()); From a0457bdfbed800081a0b39c9e8245378ae1c5d2d Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Thu, 7 Sep 2023 08:44:54 -0700 Subject: [PATCH 87/87] checks passing --- faiss/gpu/GpuIndexIVF.h | 6 ------ faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 9 ++++++--- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/faiss/gpu/GpuIndexIVF.h b/faiss/gpu/GpuIndexIVF.h index efd8bfc755..a9f092d35b 100644 --- 
a/faiss/gpu/GpuIndexIVF.h +++ b/faiss/gpu/GpuIndexIVF.h @@ -92,12 +92,6 @@ class GpuIndexIVF : public GpuIndex, public IndexIVFInterface { /// debugging purposes. virtual std::vector getListIndices(idx_t listId) const; - /// Sets the number of list probes per query - void setNumProbes(int nprobe); - - /// Returns our current number of list probes per query - int getNumProbes() const; - void search_preassigned( idx_t n, const float* x, diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index 8af86c2876..9fb88e2687 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -161,7 +161,8 @@ void addTest( faiss::gpu::GpuIndexIVFFlatConfig config; config.device = opt.device; - config.indicesOptions = use_raft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt; + config.indicesOptions = + use_raft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; config.use_raft = use_raft; @@ -197,7 +198,8 @@ void copyToTest(bool useFloat16CoarseQuantizer, bool use_raft) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = opt.device; - config.indicesOptions = use_raft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt; + config.indicesOptions = + use_raft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; config.use_raft = use_raft; @@ -257,7 +259,8 @@ void copyFromTest(bool useFloat16CoarseQuantizer, bool use_raft) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = opt.device; - config.indicesOptions = use_raft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt; + config.indicesOptions = + use_raft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; config.use_raft = use_raft;