From 861d194fefbe602c8c310824a0e8e1f6aae2f752 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 8 Jan 2019 11:31:46 -0500 Subject: [PATCH 01/87] For #669. Adding install target to gpu Makefile --- gpu/Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/gpu/Makefile b/gpu/Makefile index 4714eda7c7..072e089ddd 100644 --- a/gpu/Makefile +++ b/gpu/Makefile @@ -85,4 +85,12 @@ depend: $(CXXCPP) $(CPPFLAGS) -x c++ -MM $$i; \ done > depend +install: libgpufaiss.a libgpufaiss.$(SHAREDEXT) installdirs + cp libgpufaiss.a libgpufaiss.$(SHAREDEXT) $(DESTDIR)$(libdir) + cp *.h $(DESTDIR)$(includedir)/faiss/gpu + cp --parents **/**.h $(DESTDIR)$(includedir)/faiss/gpu + +installdirs: + $(MKDIR_P) $(DESTDIR)$(libdir) $(DESTDIR)$(includedir)/faiss/gpu + .PHONY: all clean From 60d654fd29ced2d9002d456535a08871e0c9b8e9 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 3 Jun 2022 09:10:13 -0400 Subject: [PATCH 02/87] Starting integration of raft --- CMakeLists.txt | 18 +++ cmake/thirdparty/get_raft.cmake | 51 +++++++ faiss/gpu/CMakeLists.txt | 7 +- faiss/gpu/raft/RaftIndexIVFFlat.cu | 223 +++++++++++++++++++++++++++++ faiss/gpu/raft/RaftIndexIVFFlat.h | 48 +++++++ 5 files changed, 346 insertions(+), 1 deletion(-) create mode 100644 cmake/thirdparty/get_raft.cmake create mode 100644 faiss/gpu/raft/RaftIndexIVFFlat.cu create mode 100644 faiss/gpu/raft/RaftIndexIVFFlat.h diff --git a/CMakeLists.txt b/CMakeLists.txt index a7b1fc6ce3..71a05ab7dc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,6 +6,18 @@ cmake_minimum_required(VERSION 3.17 FATAL_ERROR) +file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.08/RAPIDS.cmake + ${CMAKE_BINARY_DIR}/RAPIDS.cmake) +include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) +include(rapids-cmake) +include(rapids-cpm) +include(rapids-cuda) +include(rapids-export) +include(rapids-find) + +rapids_cuda_init_architectures(faiss) + + project(faiss VERSION 1.6.4 DESCRIPTION "A library for efficient similarity 
search and clustering of dense vectors." @@ -20,6 +32,7 @@ list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") # Valid values are "generic", "avx2". option(FAISS_OPT_LEVEL "" "generic") option(FAISS_ENABLE_GPU "Enable support for GPU indexes." ON) +option(FAISS_ENABLE_RAFT "Enable RAFT for GPU indexes." OFF) option(FAISS_ENABLE_PYTHON "Build Python extension." ON) option(FAISS_ENABLE_C_API "Build C API." OFF) @@ -28,6 +41,11 @@ if(FAISS_ENABLE_GPU) enable_language(CUDA) endif() +if(FAISS_ENABLE_RAFT) + rapids_cpm_init() + include(cmake/thirdparty/get_raft.cmake) +endif() + add_subdirectory(faiss) if(FAISS_ENABLE_GPU) diff --git a/cmake/thirdparty/get_raft.cmake b/cmake/thirdparty/get_raft.cmake new file mode 100644 index 0000000000..b171570137 --- /dev/null +++ b/cmake/thirdparty/get_raft.cmake @@ -0,0 +1,51 @@ +#============================================================================= +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#============================================================================= + + +set(RAFT_VERSION "22.04") +set(RAFT_FORK "rapidsai") +set(RAFT_PINNED_TAG "branch-${RAFT_VERSION}") + +function(find_and_configure_raft) + set(oneValueArgs VERSION FORK PINNED_TAG) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + #----------------------------------------------------- + # Invoke CPM find_package() + #----------------------------------------------------- + rapids_cpm_find(raft ${PKG_VERSION} + GLOBAL_TARGETS raft::raft + BUILD_EXPORT_SET projname-exports + INSTALL_EXPORT_SET projname-exports + CPM_ARGS + GIT_REPOSITORY https://github.com/${PKG_FORK}/raft.git + GIT_TAG ${PKG_PINNED_TAG} + SOURCE_SUBDIR cpp + OPTIONS + "BUILD_TESTS OFF" + "BUILD_BENCH OFF" + "RAFT_COMPILE_LIBRARIES OFF" + ) +endfunction() + +# Change pinned tag here to test a commit in CI +# To use a different RAFT locally, set the CMake variable +# CPM_raft_SOURCE=/path/to/local/raft +find_and_configure_raft(VERSION ${RAFT_VERSION}.00 + FORK ${RAFT_FORK} + PINNED_TAG ${RAFT_PINNED_TAG} + ) \ No newline at end of file diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt index 16af761a03..30a45a7cbd 100644 --- a/faiss/gpu/CMakeLists.txt +++ b/faiss/gpu/CMakeLists.txt @@ -162,6 +162,11 @@ set(FAISS_GPU_HEADERS utils/warpselect/WarpSelectImpl.cuh ) +if(FAISS_ENABLE_RAFT) + list(APPEND FAISS_GPU_HEADERS raft/RaftIndexIVFFlat.h) + list(APPEND FAISS_GPU_SRC raft/RaftIndexIVFFlat.cu) +endif() + # Export FAISS_GPU_HEADERS variable to parent scope. 
set(FAISS_GPU_HEADERS ${FAISS_GPU_HEADERS} PARENT_SCOPE) @@ -176,7 +181,7 @@ foreach(header ${FAISS_GPU_HEADERS}) endforeach() find_package(CUDAToolkit REQUIRED) -target_link_libraries(faiss PRIVATE CUDA::cudart CUDA::cublas) +target_link_libraries(faiss PRIVATE CUDA::cudart CUDA::cublas $<$:raft::raft>) target_link_libraries(faiss_avx2 PRIVATE CUDA::cudart CUDA::cublas) target_compile_options(faiss PRIVATE $<$:-Xfatbin=-compress-all>) target_compile_options(faiss_avx2 PRIVATE $<$:-Xfatbin=-compress-all>) diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu new file mode 100644 index 0000000000..b411e0180a --- /dev/null +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -0,0 +1,223 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace faiss { +namespace gpu { + +RaftIndexIVFFlat::RaftIndexIVFFlat( + GpuResourcesProvider* provider, + const faiss::IndexIVFFlat* index, + GpuIndexIVFFlatConfig config) + : GpuIndexIVF( + provider, + index->d, + index->metric_type, + index->metric_arg, + index->nlist, + config), + ivfFlatConfig_(config), + reserveMemoryVecs_(0) { + copyFrom(index); +} + +RaftIndexIVFFlat::RaftIndexIVFFlat( + GpuResourcesProvider* provider, + int dims, + int nlist, + faiss::MetricType metric, + GpuIndexIVFFlatConfig config) + : GpuIndexIVF(provider, dims, metric, 0, nlist, config), + ivfFlatConfig_(config), + reserveMemoryVecs_(0) { + // faiss::Index params + this->is_trained = false; + + // We haven't trained ourselves, so don't construct the IVFFlat + // index yet +} + +RaftIndexIVFFlat::~RaftIndexIVFFlat() {} + +void RaftIndexIVFFlat::reserveMemory(size_t numVecs) { + reserveMemoryVecs_ = numVecs; + if (index_) { + DeviceScope scope(config_.device); + 
index_->reserveMemory(numVecs); + } +} + +void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { + DeviceScope scope(config_.device); + + GpuIndexIVF::copyFrom(index); + + // Clear out our old data + index_.reset(); + + // The other index might not be trained + if (!index->is_trained) { + FAISS_ASSERT(!is_trained); + return; + } + + // Otherwise, we can populate ourselves from the other index + FAISS_ASSERT(is_trained); + + // Copy our lists as well + index_.reset(new IVFFlat( + resources_.get(), + quantizer->getGpuData(), + index->metric_type, + index->metric_arg, + false, // no residual + nullptr, // no scalar quantizer + ivfFlatConfig_.interleavedLayout, + ivfFlatConfig_.indicesOptions, + config_.memorySpace)); + + // Copy all of the IVF data + index_->copyInvertedListsFrom(index->invlists); +} + +void RaftIndexIVFFlat::reset() { + if (index_) { + DeviceScope scope(config_.device); + + index_->reset(); + this->ntotal = 0; + } else { + FAISS_ASSERT(this->ntotal == 0); + } +} + +void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { + // For now, only support <= max int results + FAISS_THROW_IF_NOT_FMT( + n <= (Index::idx_t)std::numeric_limits::max(), + "GPU index only supports up to %d indices", + std::numeric_limits::max()); + + DeviceScope scope(config_.device); + + if (this->is_trained) { + FAISS_ASSERT(quantizer->is_trained); + FAISS_ASSERT(quantizer->ntotal == nlist); + FAISS_ASSERT(index_); + return; + } + + FAISS_ASSERT(!index_); + + // FIXME: GPUize more of this + // First, make sure that the data is resident on the CPU, if it is not on + // the CPU, as we depend upon parts of the CPU code + auto hostData = toHost( + (float*)x, + resources_->getDefaultStream(config_.device), + {(int)n, (int)this->d}); + + // TODO: I think this can be done on GPU through RAFT k-means + trainQuantizer_(n, hostData.data()); + + // The quantizer is now trained; construct the IVF index + index_.reset(new IVFFlat( + resources_.get(), + 
quantizer->getGpuData(), + this->metric_type, + this->metric_arg, + false, // no residual + nullptr, // no scalar quantizer + ivfFlatConfig_.interleavedLayout, + ivfFlatConfig_.indicesOptions, + config_.memorySpace)); + + if (reserveMemoryVecs_) { + index_->reserveMemory(reserveMemoryVecs_); + } + + this->is_trained = true; +} + +int RaftIndexIVFFlat::getListLength(int listId) const { + FAISS_ASSERT(index_); + DeviceScope scope(config_.device); + + return index_->getListLength(listId); +} + +std::vector RaftIndexIVFFlat::getListVectorData( + int listId, + bool gpuFormat) const { + FAISS_ASSERT(index_); + DeviceScope scope(config_.device); + + return index_->getListVectorData(listId, gpuFormat); +} + +std::vector RaftIndexIVFFlat::getListIndices(int listId) const { + FAISS_ASSERT(index_); + DeviceScope scope(config_.device); + + return index_->getListIndices(listId); +} + +void RaftIndexIVFFlat::addImpl_( + int n, + const float* x, + const Index::idx_t* xids) { + // Device is already set in GpuIndex::add + FAISS_ASSERT(index_); + FAISS_ASSERT(n > 0); + + // Data is already resident on the GPU + Tensor data(const_cast(x), {n, (int)this->d}); + Tensor labels(const_cast(xids), {n}); + + // Not all vectors may be able to be added (some may contain NaNs etc) + index_->addVectors(data, labels); + + // but keep the ntotal based on the total number of vectors that we + // attempted to add + ntotal += n; +} + +void RaftIndexIVFFlat::searchImpl_( + int n, + const float* x, + int k, + float* distances, + Index::idx_t* labels) const { + // Device is already set in GpuIndex::search + FAISS_ASSERT(index_); + FAISS_ASSERT(n > 0); + FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= nlist); + + // Data is already resident on the GPU + Tensor queries(const_cast(x), {n, (int)this->d}); + Tensor outDistances(distances, {n, k}); + Tensor outLabels( + const_cast(labels), {n, k}); + + index_->query(queries, nprobe, k, outDistances, outLabels); +} + +} // namespace gpu +} // namespace faiss diff 
--git a/faiss/gpu/raft/RaftIndexIVFFlat.h b/faiss/gpu/raft/RaftIndexIVFFlat.h new file mode 100644 index 0000000000..0ae2a8535a --- /dev/null +++ b/faiss/gpu/raft/RaftIndexIVFFlat.h @@ -0,0 +1,48 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +namespace faiss { +struct IndexIVFFlat; +} + +namespace faiss { +namespace gpu { + +class IVFFlat; +class GpuIndexFlat; + +/// Wrapper around the GPU implementation that looks like +/// faiss::gpu::GpuIndexIVFFlat +class RaftIndexIVFFlat : public GpuIndexIVFFlat { + public: + /// Construct from a pre-existing faiss::IndexIVFFlat instance, copying + /// data over to the given GPU, if the input index is trained. + RaftIndexIVFFlat( + GpuResourcesProvider* provider, + const faiss::IndexIVFFlat* index, + GpuIndexIVFFlatConfig config = GpuIndexIVFFlatConfig()); + + /// Constructs a new instance with an empty flat quantizer; the user + /// provides the number of lists desired. + RaftIndexIVFFlat( + GpuResourcesProvider* provider, + int dims, + int nlist, + faiss::MetricType metric, + GpuIndexIVFFlatConfig config = GpuIndexIVFFlatConfig()); + + ~RaftIndexIVFFlat() override; +}; + +} // namespace gpu +} // namespace faiss From d474bf3d3d798e57027f7eeb550890916b213152 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Fri, 3 Jun 2022 11:26:46 -0400 Subject: [PATCH 03/87] Adding proper inherited member definitions --- faiss/gpu/raft/RaftIndexIVFFlat.cu | 16 +++------- faiss/gpu/raft/RaftIndexIVFFlat.h | 51 ++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 11 deletions(-) diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index b411e0180a..638d1b56c3 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -26,15 +27,10 @@ RaftIndexIVFFlat::RaftIndexIVFFlat( GpuResourcesProvider* provider, const faiss::IndexIVFFlat* index, GpuIndexIVFFlatConfig config) - : GpuIndexIVF( + : GpuIndexIVFFlat( provider, - index->d, - index->metric_type, - index->metric_arg, - index->nlist, - config), - ivfFlatConfig_(config), - reserveMemoryVecs_(0) { + index, + config) { copyFrom(index); } @@ -44,9 +40,7 @@ RaftIndexIVFFlat::RaftIndexIVFFlat( int nlist, faiss::MetricType metric, GpuIndexIVFFlatConfig config) - : GpuIndexIVF(provider, dims, metric, 0, nlist, config), - ivfFlatConfig_(config), - reserveMemoryVecs_(0) { + : GpuIndexIVFFlat(provider, dims, nlist, metric, config) { // faiss::Index params this->is_trained = false; diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.h b/faiss/gpu/raft/RaftIndexIVFFlat.h index 0ae2a8535a..2de0782a85 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.h +++ b/faiss/gpu/raft/RaftIndexIVFFlat.h @@ -42,6 +42,57 @@ class RaftIndexIVFFlat : public GpuIndexIVFFlat { GpuIndexIVFFlatConfig config = GpuIndexIVFFlatConfig()); ~RaftIndexIVFFlat() override; + + /// Reserve GPU memory in our inverted lists for this number of vectors + void reserveMemory(size_t numVecs); + + /// Initialize ourselves from the given CPU index; will overwrite + /// all data in ourselves + void copyFrom(const faiss::IndexIVFFlat* index); + + /// Copy ourselves to the given CPU index; will overwrite all data + /// in the index 
instance + void copyTo(faiss::IndexIVFFlat* index) const; + + /// After adding vectors, one can call this to reclaim device memory + /// to exactly the amount needed. Returns space reclaimed in bytes + size_t reclaimMemory(); + + /// Clears out all inverted lists, but retains the coarse centroid + /// information + void reset() override; + + /// Trains the coarse quantizer based on the given vector data + void train(Index::idx_t n, const float* x) override; + + /// Returns the number of vectors present in a particular inverted list + int getListLength(int listId) const override; + + /// Return the encoded vector data contained in a particular inverted list, + /// for debugging purposes. + /// If gpuFormat is true, the data is returned as it is encoded in the + /// GPU-side representation. + /// Otherwise, it is converted to the CPU format. + /// compliant format, while the native GPU format may differ. + std::vector getListVectorData(int listId, bool gpuFormat = false) + const override; + + /// Return the vector indices contained in a particular inverted list, for + /// debugging purposes. + std::vector getListIndices(int listId) const override; + + protected: + /// Called from GpuIndex for add/add_with_ids + void addImpl_(int n, const float* x, const Index::idx_t* ids) override; + + /// Called from GpuIndex for search + void searchImpl_( + int n, + const float* x, + int k, + float* distances, + Index::idx_t* labels) const override; + }; } // namespace gpu From 8baee52ce307d12c57a47c205a5e873266652455 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Fri, 3 Jun 2022 11:47:30 -0400 Subject: [PATCH 04/87] Updating raft ivf flat --- faiss/gpu/raft/RaftIndexIVFFlat.cu | 51 ------------------------------ faiss/gpu/raft/RaftIndexIVFFlat.h | 15 --------- 2 files changed, 66 deletions(-) diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index 638d1b56c3..ac2f439141 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -50,57 +50,6 @@ RaftIndexIVFFlat::RaftIndexIVFFlat( RaftIndexIVFFlat::~RaftIndexIVFFlat() {} -void RaftIndexIVFFlat::reserveMemory(size_t numVecs) { - reserveMemoryVecs_ = numVecs; - if (index_) { - DeviceScope scope(config_.device); - index_->reserveMemory(numVecs); - } -} - -void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { - DeviceScope scope(config_.device); - - GpuIndexIVF::copyFrom(index); - - // Clear out our old data - index_.reset(); - - // The other index might not be trained - if (!index->is_trained) { - FAISS_ASSERT(!is_trained); - return; - } - - // Otherwise, we can populate ourselves from the other index - FAISS_ASSERT(is_trained); - - // Copy our lists as well - index_.reset(new IVFFlat( - resources_.get(), - quantizer->getGpuData(), - index->metric_type, - index->metric_arg, - false, // no residual - nullptr, // no scalar quantizer - ivfFlatConfig_.interleavedLayout, - ivfFlatConfig_.indicesOptions, - config_.memorySpace)); - - // Copy all of the IVF data - index_->copyInvertedListsFrom(index->invlists); -} - -void RaftIndexIVFFlat::reset() { - if (index_) { - DeviceScope scope(config_.device); - - index_->reset(); - this->ntotal = 0; - } else { - FAISS_ASSERT(this->ntotal == 0); - } -} void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { // For now, only support <= max int results diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.h b/faiss/gpu/raft/RaftIndexIVFFlat.h index 2de0782a85..df2dbd2060 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.h +++ b/faiss/gpu/raft/RaftIndexIVFFlat.h 
@@ -43,21 +43,6 @@ class RaftIndexIVFFlat : public GpuIndexIVFFlat { ~RaftIndexIVFFlat() override; - /// Reserve GPU memory in our inverted lists for this number of vectors - void reserveMemory(size_t numVecs); - - /// Initialize ourselves from the given CPU index; will overwrite - /// all data in ourselves - void copyFrom(const faiss::IndexIVFFlat* index); - - /// Copy ourselves to the given CPU index; will overwrite all data - /// in the index instance - void copyTo(faiss::IndexIVFFlat* index) const; - - /// After adding vectors, one can call this to reclaim device memory - /// to exactly the amount needed. Returns space reclaimed in bytes - size_t reclaimMemory(); - /// Clears out all inverted lists, but retains the coarse centroid /// information void reset() override; From 2eb94f1d538434edc53745a312f790c7734491ec Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 14 Jun 2022 15:43:53 -0400 Subject: [PATCH 05/87] adding raftIVFFlat implementation --- faiss/gpu/CMakeLists.txt | 4 +- faiss/gpu/raft/RaftIVFFlat.cu | 267 +++++++++++++++++++++++++++++ faiss/gpu/raft/RaftIVFFlat.cuh | 82 +++++++++ faiss/gpu/raft/RaftIndexIVFFlat.cu | 4 +- faiss/gpu/raft/RaftIndexIVFFlat.h | 2 +- 5 files changed, 354 insertions(+), 5 deletions(-) create mode 100644 faiss/gpu/raft/RaftIVFFlat.cu create mode 100644 faiss/gpu/raft/RaftIVFFlat.cuh diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt index 30a45a7cbd..8a3341d094 100644 --- a/faiss/gpu/CMakeLists.txt +++ b/faiss/gpu/CMakeLists.txt @@ -163,8 +163,8 @@ set(FAISS_GPU_HEADERS ) if(FAISS_ENABLE_RAFT) - list(APPEND FAISS_GPU_HEADERS raft/RaftIndexIVFFlat.h) - list(APPEND FAISS_GPU_SRC raft/RaftIndexIVFFlat.cu) + list(APPEND FAISS_GPU_HEADERS raft/RaftIndexIVFFlat.h raft/RaftIVFFlat.cuh) + list(APPEND FAISS_GPU_SRC raft/RaftIndexIVFFlat.cu raft/RaftIVFFlat.cu) endif() # Export FAISS_GPU_HEADERS variable to parent scope. 
diff --git a/faiss/gpu/raft/RaftIVFFlat.cu b/faiss/gpu/raft/RaftIVFFlat.cu new file mode 100644 index 0000000000..b51a7d9dc0 --- /dev/null +++ b/faiss/gpu/raft/RaftIVFFlat.cu @@ -0,0 +1,267 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { +namespace gpu { + +RaftIVFFlat::RaftIVFFlat( + GpuResources* res, + FlatIndex* quantizer, + faiss::MetricType metric, + float metricArg, + bool useResidual, + faiss::ScalarQuantizer* scalarQ, + bool interleavedLayout, + IndicesOptions indicesOptions, + MemorySpace space) : + IVFFlat(res, quantizer, metric, metricArg, useResidual, scalarQ, interleavedLayout, indicesOptions, space) {} + +RaftIVFFlat::~RaftIVFFlat() {} + +size_t RaftIVFFlat::getGpuVectorsEncodingSize_(int numVecs) const { + if (interleavedLayout_) { + // bits per scalar code + int bits = scalarQ_ ? scalarQ_->bits : 32 /* float */; + + // bytes to encode a block of 32 vectors (single dimension) + int bytesPerDimBlock = bits * 32 / 8; + + // bytes to fully encode 32 vectors + int bytesPerBlock = bytesPerDimBlock * dim_; + + // number of blocks of 32 vectors we have + int numBlocks = utils::divUp(numVecs, 32); + + // total size to encode numVecs + return bytesPerBlock * numBlocks; + } else { + size_t sizePerVector = + (scalarQ_ ? scalarQ_->code_size : sizeof(float) * dim_); + + return (size_t)numVecs * sizePerVector; + } +} + +size_t RaftIVFFlat::getCpuVectorsEncodingSize_(int numVecs) const { + size_t sizePerVector = + (scalarQ_ ? 
scalarQ_->code_size : sizeof(float) * dim_); + + return (size_t)numVecs * sizePerVector; +} + +std::vector RaftIVFFlat::translateCodesToGpu_( + std::vector codes, + size_t numVecs) const { + if (!interleavedLayout_) { + // same format + return codes; + } + + int bitsPerCode = scalarQ_ ? scalarQ_->bits : 32; + + auto up = + unpackNonInterleaved(std::move(codes), numVecs, dim_, bitsPerCode); + return packInterleaved(std::move(up), numVecs, dim_, bitsPerCode); +} + +std::vector RaftIVFFlat::translateCodesFromGpu_( + std::vector codes, + size_t numVecs) const { + if (!interleavedLayout_) { + // same format + return codes; + } + + int bitsPerCode = scalarQ_ ? scalarQ_->bits : 32; + + auto up = unpackInterleaved(std::move(codes), numVecs, dim_, bitsPerCode); + return packNonInterleaved(std::move(up), numVecs, dim_, bitsPerCode); +} + +void RaftIVFFlat::appendVectors_( + Tensor& vecs, + Tensor& indices, + Tensor& uniqueLists, + Tensor& vectorsByUniqueList, + Tensor& uniqueListVectorStart, + Tensor& uniqueListStartOffset, + Tensor& listIds, + Tensor& listOffset, + cudaStream_t stream) { + // + // Append the new encodings + // + + // Calculate residuals for these vectors, if needed + DeviceTensor residuals( + resources_, + makeTempAlloc(AllocType::Other, stream), + {vecs.getSize(0), dim_}); + + if (useResidual_) { + quantizer_->computeResidual(vecs, listIds, residuals); + } + + // Append indices to the IVF lists + runIVFIndicesAppend( + listIds, + listOffset, + indices, + indicesOptions_, + deviceListIndexPointers_, + stream); + + // Append the encoded vectors to the IVF lists + if (interleavedLayout_) { + runIVFFlatInterleavedAppend( + listIds, + listOffset, + uniqueLists, + vectorsByUniqueList, + uniqueListVectorStart, + uniqueListStartOffset, + useResidual_ ? residuals : vecs, + scalarQ_.get(), + deviceListDataPointers_, + resources_, + stream); + } else { + runIVFFlatAppend( + listIds, + listOffset, + useResidual_ ? 
residuals : vecs, + scalarQ_.get(), + deviceListDataPointers_, + stream); + } +} + +void RaftIVFFlat::query( + Tensor& queries, + int nprobe, + int k, + Tensor& outDistances, + Tensor& outIndices) { + auto stream = resources_->getDefaultStreamCurrentDevice(); + + // These are caught at a higher level + FAISS_ASSERT(nprobe <= GPU_MAX_SELECTION_K); + FAISS_ASSERT(k <= GPU_MAX_SELECTION_K); + nprobe = std::min(nprobe, quantizer_->getSize()); + + FAISS_ASSERT(queries.getSize(1) == dim_); + + FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0)); + FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0)); + + // Reserve space for the quantized information + DeviceTensor coarseDistances( + resources_, + makeTempAlloc(AllocType::Other, stream), + {queries.getSize(0), nprobe}); + DeviceTensor coarseIndices( + resources_, + makeTempAlloc(AllocType::Other, stream), + {queries.getSize(0), nprobe}); + + // Find the `nprobe` closest lists; we can use int indices both + // internally and externally + quantizer_->query( + queries, + nprobe, + metric_, + metricArg_, + coarseDistances, + coarseIndices, + false); + + DeviceTensor residualBase( + resources_, + makeTempAlloc(AllocType::Other, stream), + {queries.getSize(0), nprobe, dim_}); + + if (useResidual_) { + // Reconstruct vectors from the quantizer + quantizer_->reconstruct(coarseIndices, residualBase); + } + + if (interleavedLayout_) { + runIVFInterleavedScan( + queries, + coarseIndices, + deviceListDataPointers_, + deviceListIndexPointers_, + indicesOptions_, + deviceListLengths_, + k, + metric_, + useResidual_, + residualBase, + scalarQ_.get(), + outDistances, + outIndices, + resources_); + } else { + runIVFFlatScan( + queries, + coarseIndices, + deviceListDataPointers_, + deviceListIndexPointers_, + indicesOptions_, + deviceListLengths_, + maxListLength_, + k, + metric_, + useResidual_, + residualBase, + scalarQ_.get(), + outDistances, + outIndices, + resources_); + } + + // If the GPU isn't storing indices (they 
are on the CPU side), we + // need to perform the re-mapping here + // FIXME: we might ultimately be calling this function with inputs + // from the CPU, these are unnecessary copies + if (indicesOptions_ == INDICES_CPU) { + HostTensor hostOutIndices(outIndices, stream); + + ivfOffsetToUserIndex( + hostOutIndices.data(), + numLists_, + hostOutIndices.getSize(0), + hostOutIndices.getSize(1), + listOffsetToUserIndex_); + + // Copy back to GPU, since the input to this function is on the + // GPU + outIndices.copyFrom(hostOutIndices, stream); + } +} + +} // namespace gpu +} // namespace faiss diff --git a/faiss/gpu/raft/RaftIVFFlat.cuh b/faiss/gpu/raft/RaftIVFFlat.cuh new file mode 100644 index 0000000000..700a30cc3c --- /dev/null +++ b/faiss/gpu/raft/RaftIVFFlat.cuh @@ -0,0 +1,82 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace faiss { +namespace gpu { + +class RaftIVFFlat : public IVFFlat { + public: + /// Construct from a quantizer that has elemen + RaftIVFFlat( + GpuResources* resources, + /// We do not own this reference + FlatIndex* quantizer, + faiss::MetricType metric, + float metricArg, + bool useResidual, + /// Optional ScalarQuantizer + faiss::ScalarQuantizer* scalarQ, + bool interleavedLayout, + IndicesOptions indicesOptions, + MemorySpace space); + + ~RaftIVFFlat() override; + + /// Find the approximate k nearest neighbors for `queries` against + /// our database + void query( + Tensor& queries, + int nprobe, + int k, + Tensor& outDistances, + Tensor& outIndices); + + protected: + /// Returns the number of bytes in which an IVF list containing numVecs + /// vectors is encoded on the device. 
Note that due to padding this is not + /// the same as the encoding size for a subset of vectors in an IVF list; + /// this is the size for an entire IVF list + size_t getGpuVectorsEncodingSize_(int numVecs) const override; + size_t getCpuVectorsEncodingSize_(int numVecs) const override; + + /// Translate to our preferred GPU encoding + std::vector translateCodesToGpu_( + std::vector codes, + size_t numVecs) const override; + + /// Translate from our preferred GPU encoding + std::vector translateCodesFromGpu_( + std::vector codes, + size_t numVecs) const override; + + /// Encode the vectors that we're adding and append to our IVF lists + void appendVectors_( + Tensor& vecs, + Tensor& indices, + Tensor& uniqueLists, + Tensor& vectorsByUniqueList, + Tensor& uniqueListVectorStart, + Tensor& uniqueListStartOffset, + Tensor& listIds, + Tensor& listOffset, + cudaStream_t stream) override; + + protected: + /// Do we encode the residual from a coarse quantizer or not? + bool useResidual_; + + /// Scalar quantizer for encoded vectors, if any + std::unique_ptr scalarQ_; +}; + +} // namespace gpu +} // namespace faiss diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index ac2f439141..50fa155465 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include @@ -81,7 +81,7 @@ void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { trainQuantizer_(n, hostData.data()); // The quantizer is now trained; construct the IVF index - index_.reset(new IVFFlat( + index_.reset(new RaftIVFFlat( resources_.get(), quantizer->getGpuData(), this->metric_type, diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.h b/faiss/gpu/raft/RaftIndexIVFFlat.h index df2dbd2060..206738a834 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.h +++ b/faiss/gpu/raft/RaftIndexIVFFlat.h @@ -18,7 +18,7 @@ struct IndexIVFFlat; namespace faiss { namespace gpu { -class IVFFlat; 
+class RaftIVFFlat; class GpuIndexFlat; /// Wrapper around the GPU implementation that looks like From f7d4185a5b858dea138dc43bbe2fc65c442a04ff Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 14 Jun 2022 16:30:18 -0400 Subject: [PATCH 06/87] Isolating quantizer training --- faiss/gpu/raft/RaftIndexIVFFlat.cu | 36 +++++++++++++++++++++++++++++- faiss/gpu/raft/RaftIndexIVFFlat.h | 2 ++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index 50fa155465..7cebf3a8da 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -78,9 +78,11 @@ void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { {(int)n, (int)this->d}); // TODO: I think this can be done on GPU through RAFT k-means - trainQuantizer_(n, hostData.data()); + trainQuantizer_impl(n, hostData.data()); // The quantizer is now trained; construct the IVF index + + // TODO: The underlying RaftIVFFlat essentially becomes the `query impl` index_.reset(new RaftIVFFlat( resources_.get(), quantizer->getGpuData(), @@ -106,6 +108,38 @@ int RaftIndexIVFFlat::getListLength(int listId) const { return index_->getListLength(listId); } +void RaftIndexIVFFlat::trainQuantizer_impl(Index::idx_t n, const float* x) { + if (n == 0) { + // nothing to do + return; + } + + if (quantizer->is_trained && (quantizer->ntotal == nlist)) { + if (this->verbose) { + printf("IVF quantizer does not need training.\n"); + } + + return; + } + + if (this->verbose) { + printf("Training IVF quantizer on %ld vectors in %dD\n", n, d); + } + + DeviceScope scope(config_.device); + + // leverage the CPU-side k-means code, which works for the GPU + // flat index as well + quantizer->reset(); + Clustering clus(this->d, nlist, this->cp); + clus.verbose = verbose; + clus.train(n, x, *quantizer); + quantizer->is_trained = true; + + FAISS_ASSERT(quantizer->ntotal == nlist); +} + + std::vector RaftIndexIVFFlat::getListVectorData( 
int listId, bool gpuFormat) const { diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.h b/faiss/gpu/raft/RaftIndexIVFFlat.h index 206738a834..083d0e0eaa 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.h +++ b/faiss/gpu/raft/RaftIndexIVFFlat.h @@ -70,6 +70,8 @@ class RaftIndexIVFFlat : public GpuIndexIVFFlat { /// Called from GpuIndex for add/add_with_ids void addImpl_(int n, const float* x, const Index::idx_t* ids) override; + void trainQuantizer_impl(Index::idx_t n, const float* x); + /// Called from GpuIndex for search void searchImpl_( int n, From 26491cb503e1bd2e96ae8b926f57176556458684 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 14 Jun 2022 17:13:49 -0400 Subject: [PATCH 07/87] iAdding todos where we need to plug in raft functionality --- faiss/gpu/raft/RaftIVFFlat.cu | 1 + faiss/gpu/raft/RaftIndexIVFFlat.cu | 2 ++ 2 files changed, 3 insertions(+) diff --git a/faiss/gpu/raft/RaftIVFFlat.cu b/faiss/gpu/raft/RaftIVFFlat.cu index b51a7d9dc0..092f71e3e0 100644 --- a/faiss/gpu/raft/RaftIVFFlat.cu +++ b/faiss/gpu/raft/RaftIVFFlat.cu @@ -203,6 +203,7 @@ void RaftIVFFlat::query( makeTempAlloc(AllocType::Other, stream), {queries.getSize(0), nprobe, dim_}); + // TODO: This is where we invoke the search function from RAFT if (useResidual_) { // Reconstruct vectors from the quantizer quantizer_->reconstruct(coarseIndices, residualBase); diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index 7cebf3a8da..97064a7ec2 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -131,6 +131,8 @@ void RaftIndexIVFFlat::trainQuantizer_impl(Index::idx_t n, const float* x) { // leverage the CPU-side k-means code, which works for the GPU // flat index as well quantizer->reset(); + + // TODO: Invoke RAFT K-means here Clustering clus(this->d, nlist, this->cp); clus.verbose = verbose; clus.train(n, x, *quantizer); From b4d08c4250b4c7032245d8672215ac0a1970e517 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Fri, 17 Jun 2022 15:11:02 -0400 Subject: [PATCH 08/87] Invocatino of index building has been compiled successfully. Still need to propagate kmeans info down into quantized index --- cmake/thirdparty/get_raft.cmake | 4 +- faiss/gpu/CMakeLists.txt | 2 +- faiss/gpu/impl/IVFFlat.cuh | 2 +- faiss/gpu/raft/RaftIVFFlat.cu | 18 ++++- faiss/gpu/raft/RaftIVFFlat.cuh | 2 +- faiss/gpu/raft/RaftIndexFlat.cu | 116 +++++++++++++++++++++++++++++ faiss/gpu/raft/RaftIndexFlat.h | 101 +++++++++++++++++++++++++ faiss/gpu/raft/RaftIndexIVFFlat.cu | 73 ++++++++++-------- faiss/gpu/raft/RaftIndexIVFFlat.h | 5 ++ 9 files changed, 285 insertions(+), 38 deletions(-) create mode 100644 faiss/gpu/raft/RaftIndexFlat.cu create mode 100644 faiss/gpu/raft/RaftIndexFlat.h diff --git a/cmake/thirdparty/get_raft.cmake b/cmake/thirdparty/get_raft.cmake index b171570137..3fc2d9ae34 100644 --- a/cmake/thirdparty/get_raft.cmake +++ b/cmake/thirdparty/get_raft.cmake @@ -16,8 +16,8 @@ set(RAFT_VERSION "22.04") -set(RAFT_FORK "rapidsai") -set(RAFT_PINNED_TAG "branch-${RAFT_VERSION}") +set(RAFT_FORK "achirkin") +set(RAFT_PINNED_TAG "fea-knn-ivf-flat") function(find_and_configure_raft) set(oneValueArgs VERSION FORK PINNED_TAG) diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt index 8a3341d094..3135443179 100644 --- a/faiss/gpu/CMakeLists.txt +++ b/faiss/gpu/CMakeLists.txt @@ -183,5 +183,5 @@ endforeach() find_package(CUDAToolkit REQUIRED) target_link_libraries(faiss PRIVATE CUDA::cudart CUDA::cublas $<$:raft::raft>) target_link_libraries(faiss_avx2 PRIVATE CUDA::cudart CUDA::cublas) -target_compile_options(faiss PRIVATE $<$:-Xfatbin=-compress-all>) +target_compile_options(faiss PRIVATE $<$:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr>) target_compile_options(faiss_avx2 PRIVATE $<$:-Xfatbin=-compress-all>) diff --git a/faiss/gpu/impl/IVFFlat.cuh b/faiss/gpu/impl/IVFFlat.cuh index 23bbf6cff9..d2b43a9c70 100644 --- a/faiss/gpu/impl/IVFFlat.cuh +++ 
b/faiss/gpu/impl/IVFFlat.cuh @@ -32,7 +32,7 @@ class IVFFlat : public IVFBase { /// Find the approximate k nearest neigbors for `queries` against /// our database - void query( + virtual void query( Tensor& queries, int nprobe, int k, diff --git a/faiss/gpu/raft/RaftIVFFlat.cu b/faiss/gpu/raft/RaftIVFFlat.cu index 092f71e3e0..0e0cead397 100644 --- a/faiss/gpu/raft/RaftIVFFlat.cu +++ b/faiss/gpu/raft/RaftIVFFlat.cu @@ -21,6 +21,10 @@ #include #include #include + +#include +#include + #include #include @@ -164,9 +168,19 @@ void RaftIVFFlat::query( int nprobe, int k, Tensor& outDistances, - Tensor& outIndices) { + Tensor& outIndices) { auto stream = resources_->getDefaultStreamCurrentDevice(); + // TODO: This is where we invoke the search function from RAFT + /** + * template + void cuivflHandle::cuivflSearch(const T* queries, // [numQueries, dim] + uint32_t n_queries, + uint32_t k, + size_t* neighbors, // [numQueries, topK] + float* distances) + */ + // These are caught at a higher level FAISS_ASSERT(nprobe <= GPU_MAX_SELECTION_K); FAISS_ASSERT(k <= GPU_MAX_SELECTION_K); @@ -203,7 +217,7 @@ void RaftIVFFlat::query( makeTempAlloc(AllocType::Other, stream), {queries.getSize(0), nprobe, dim_}); - // TODO: This is where we invoke the search function from RAFT + if (useResidual_) { // Reconstruct vectors from the quantizer quantizer_->reconstruct(coarseIndices, residualBase); diff --git a/faiss/gpu/raft/RaftIVFFlat.cuh b/faiss/gpu/raft/RaftIVFFlat.cuh index 700a30cc3c..4340620639 100644 --- a/faiss/gpu/raft/RaftIVFFlat.cuh +++ b/faiss/gpu/raft/RaftIVFFlat.cuh @@ -38,7 +38,7 @@ class RaftIVFFlat : public IVFFlat { int nprobe, int k, Tensor& outDistances, - Tensor& outIndices); + Tensor& outIndices) override; protected: /// Returns the number of bytes in which an IVF list containing numVecs diff --git a/faiss/gpu/raft/RaftIndexFlat.cu b/faiss/gpu/raft/RaftIndexFlat.cu new file mode 100644 index 0000000000..1c323738c4 --- /dev/null +++ b/faiss/gpu/raft/RaftIndexFlat.cu @@ 
-0,0 +1,116 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { +namespace gpu { + +// +// RaftIndexFlatL2 +// + +RaftIndexFlatL2::RaftIndexFlatL2( + GpuResourcesProvider* provider, + faiss::IndexFlatL2* index, + GpuIndexFlatConfig config) + : GpuIndexFlat(provider, index, config) {} + +RaftIndexFlatL2::RaftIndexFlatL2( + std::shared_ptr resources, + faiss::IndexFlatL2* index, + GpuIndexFlatConfig config) + : GpuIndexFlat(resources, index, config) {} + +RaftIndexFlatL2::RaftIndexFlatL2( + GpuResourcesProvider* provider, + int dims, + GpuIndexFlatConfig config) + : GpuIndexFlat(provider, dims, faiss::METRIC_L2, config) {} + +RaftIndexFlatL2::RaftIndexFlatL2( + std::shared_ptr resources, + int dims, + GpuIndexFlatConfig config) + : GpuIndexFlat(resources, dims, faiss::METRIC_L2, config) {} + +void RaftIndexFlatL2::copyFrom(faiss::IndexFlat* index) { + FAISS_THROW_IF_NOT_MSG( + index->metric_type == metric_type, + "Cannot copy a RaftIndexFlatL2 from an index of " + "different metric_type"); + + GpuIndexFlat::copyFrom(index); +} + +void RaftIndexFlatL2::copyTo(faiss::IndexFlat* index) { + FAISS_THROW_IF_NOT_MSG( + index->metric_type == metric_type, + "Cannot copy a RaftIndexFlatL2 to an index of " + "different metric_type"); + + GpuIndexFlat::copyTo(index); +} + +// +// RaftIndexFlatIP +// + +RaftIndexFlatIP::RaftIndexFlatIP( + GpuResourcesProvider* provider, + faiss::IndexFlatIP* index, + GpuIndexFlatConfig config) + : GpuIndexFlat(provider, index, config) {} + +RaftIndexFlatIP::RaftIndexFlatIP( + std::shared_ptr resources, + faiss::IndexFlatIP* index, + GpuIndexFlatConfig config) + : GpuIndexFlat(resources, index, config) {} + +RaftIndexFlatIP::RaftIndexFlatIP( + GpuResourcesProvider* provider, + int 
dims, + GpuIndexFlatConfig config) + : GpuIndexFlat(provider, dims, faiss::METRIC_INNER_PRODUCT, config) {} + +RaftIndexFlatIP::RaftIndexFlatIP( + std::shared_ptr resources, + int dims, + GpuIndexFlatConfig config) + : GpuIndexFlat(resources, dims, faiss::METRIC_INNER_PRODUCT, config) {} + +void RaftIndexFlatIP::copyFrom(faiss::IndexFlat* index) { + FAISS_THROW_IF_NOT_MSG( + index->metric_type == metric_type, + "Cannot copy a RaftIndexFlatIP from an index of " + "different metric_type"); + + GpuIndexFlat::copyFrom(index); +} + +void RaftIndexFlatIP::copyTo(faiss::IndexFlat* index) { + // The passed in index must be IP + FAISS_THROW_IF_NOT_MSG( + index->metric_type == metric_type, + "Cannot copy a RaftIndexFlatIP to an index of " + "different metric_type"); + + GpuIndexFlat::copyTo(index); +} + +} // namespace gpu +} // namespace faiss diff --git a/faiss/gpu/raft/RaftIndexFlat.h b/faiss/gpu/raft/RaftIndexFlat.h new file mode 100644 index 0000000000..1aa4a51070 --- /dev/null +++ b/faiss/gpu/raft/RaftIndexFlat.h @@ -0,0 +1,101 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include + +namespace faiss { + +struct IndexFlat; +struct IndexFlatL2; +struct IndexFlatIP; + +} // namespace faiss + +namespace faiss { +namespace gpu { + +class FlatIndex; + +/// Wrapper around the GPU implementation that looks like +/// faiss::IndexFlatL2; copies over centroid data from a given +/// faiss::IndexFlat +class RaftIndexFlatL2 : public GpuIndexFlat { + public: + /// Construct from a pre-existing faiss::IndexFlatL2 instance, copying + /// data over to the given GPU + RaftIndexFlatL2( + GpuResourcesProvider* provider, + faiss::IndexFlatL2* index, + GpuIndexFlatConfig config = GpuIndexFlatConfig()); + + RaftIndexFlatL2( + std::shared_ptr resources, + faiss::IndexFlatL2* index, + GpuIndexFlatConfig config = GpuIndexFlatConfig()); + + /// Construct an empty instance that can be added to + RaftIndexFlatL2( + GpuResourcesProvider* provider, + int dims, + GpuIndexFlatConfig config = GpuIndexFlatConfig()); + + RaftIndexFlatL2( + std::shared_ptr resources, + int dims, + GpuIndexFlatConfig config = GpuIndexFlatConfig()); + + /// Initialize ourselves from the given CPU index; will overwrite + /// all data in ourselves + void copyFrom(faiss::IndexFlat* index); + + /// Copy ourselves to the given CPU index; will overwrite all data + /// in the index instance + void copyTo(faiss::IndexFlat* index); +}; + +/// Wrapper around the GPU implementation that looks like +/// faiss::IndexFlatIP; copies over centroid data from a given +/// faiss::IndexFlat +class RaftIndexFlatIP : public GpuIndexFlat { + public: + /// Construct from a pre-existing faiss::IndexFlatIP instance, copying + /// data over to the given GPU + RaftIndexFlatIP( + GpuResourcesProvider* provider, + faiss::IndexFlatIP* index, + GpuIndexFlatConfig config = GpuIndexFlatConfig()); + + RaftIndexFlatIP( + std::shared_ptr resources, + faiss::IndexFlatIP* index, + GpuIndexFlatConfig config = GpuIndexFlatConfig()); + + /// Construct an empty instance that can be added to + 
RaftIndexFlatIP( + GpuResourcesProvider* provider, + int dims, + GpuIndexFlatConfig config = GpuIndexFlatConfig()); + + RaftIndexFlatIP( + std::shared_ptr resources, + int dims, + GpuIndexFlatConfig config = GpuIndexFlatConfig()); + + /// Initialize ourselves from the given CPU index; will overwrite + /// all data in ourselves + void copyFrom(faiss::IndexFlat* index); + + /// Copy ourselves to the given CPU index; will overwrite all data + /// in the index instance + void copyTo(faiss::IndexFlat* index); +}; + +} // namespace gpu +} // namespace faiss diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index 97064a7ec2..1edaac1b87 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -17,6 +17,8 @@ #include #include +#include +#include #include @@ -69,36 +71,45 @@ void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { FAISS_ASSERT(!index_); - // FIXME: GPUize more of this - // First, make sure that the data is resident on the CPU, if it is not on - // the CPU, as we depend upon parts of the CPU code - auto hostData = toHost( - (float*)x, - resources_->getDefaultStream(config_.device), - {(int)n, (int)this->d}); - - // TODO: I think this can be done on GPU through RAFT k-means - trainQuantizer_impl(n, hostData.data()); - - // The quantizer is now trained; construct the IVF index - - // TODO: The underlying RaftIVFFlat essentially becomes the `query impl` - index_.reset(new RaftIVFFlat( - resources_.get(), - quantizer->getGpuData(), - this->metric_type, - this->metric_arg, - false, // no residual - nullptr, // no scalar quantizer - ivfFlatConfig_.interleavedLayout, - ivfFlatConfig_.indicesOptions, - config_.memorySpace)); - - if (reserveMemoryVecs_) { - index_->reserveMemory(reserveMemoryVecs_); - } - - this->is_trained = true; + raft::spatial::knn::ivf_flat_params raft_idx_params; + raft_idx_params.nlist = nlist; + + raft::distance::DistanceType metric = 
raft::distance::DistanceType::L2Expanded; + raft::spatial::knn::approx_knn_build_index(handle, &raft_knn_index, &raft_idx_params, metric, 0.0f, + const_cast(x), n, (faiss::Index::idx_t)d); + +// // FIXME: GPUize more of this +// // First, make sure that the data is resident on the CPU, if it is not on +// // the CPU, as we depend upon parts of the CPU code +// auto hostData = toHost( +// (float*)x, +// resources_->getDefaultStream(config_.device), +// {(int)n, (int)this->d}); +// +// // TODO: I think this can be done on GPU through RAFT k-means +// trainQuantizer_impl(n, hostData.data()); +// +// // The quantizer is now trained; construct the IVF index +// +// // TODO: The underlying RaftIVFFlat essentially becomes the `query impl` +// index_.reset(new RaftIVFFlat( +// resources_.get(), +// +// // TODO: getGpuData returns a `FlatIndex` +// quantizer->getGpuData(), +// this->metric_type, +// this->metric_arg, +// false, // no residual +// nullptr, // no scalar quantizer +// ivfFlatConfig_.interleavedLayout, +// ivfFlatConfig_.indicesOptions, +// config_.memorySpace)); +// +// if (reserveMemoryVecs_) { +// index_->reserveMemory(reserveMemoryVecs_); +// } +// +// this->is_trained = true; } int RaftIndexIVFFlat::getListLength(int listId) const { @@ -132,7 +143,7 @@ void RaftIndexIVFFlat::trainQuantizer_impl(Index::idx_t n, const float* x) { // flat index as well quantizer->reset(); - // TODO: Invoke RAFT K-means here + // TODO: Invoke RAFT K-means here and set resulting trained data on quantizer Clustering clus(this->d, nlist, this->cp); clus.verbose = verbose; clus.train(n, x, *quantizer); diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.h b/faiss/gpu/raft/RaftIndexIVFFlat.h index 083d0e0eaa..dcc28037d6 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.h +++ b/faiss/gpu/raft/RaftIndexIVFFlat.h @@ -9,6 +9,9 @@ #include #include + +#include +#include #include namespace faiss { @@ -80,6 +83,8 @@ class RaftIndexIVFFlat : public GpuIndexIVFFlat { float* distances, Index::idx_t* 
labels) const override; + const raft::handle_t handle; + raft::spatial::knn::knnIndex raft_knn_index; }; } // namespace gpu From bf876f914e4cd10a9e2cf250eaef556e8ede67ed Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 17 Jun 2022 18:03:47 -0400 Subject: [PATCH 09/87] Adding call to search. --- faiss/gpu/raft/RaftIVFFlat.cu | 2 -- faiss/gpu/raft/RaftIndexIVFFlat.cu | 20 +++++++++++++++----- faiss/gpu/raft/RaftIndexIVFFlat.h | 2 +- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/faiss/gpu/raft/RaftIVFFlat.cu b/faiss/gpu/raft/RaftIVFFlat.cu index 0e0cead397..4f74804ea9 100644 --- a/faiss/gpu/raft/RaftIVFFlat.cu +++ b/faiss/gpu/raft/RaftIVFFlat.cu @@ -22,8 +22,6 @@ #include #include -#include -#include #include #include diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index 1edaac1b87..021f3aaeae 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -16,8 +16,6 @@ #include #include -#include -#include #include #include @@ -71,12 +69,14 @@ void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { FAISS_ASSERT(!index_); + // TODO: Populate the rest of the params properly. 
raft::spatial::knn::ivf_flat_params raft_idx_params; raft_idx_params.nlist = nlist; raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded; - raft::spatial::knn::approx_knn_build_index(handle, &raft_knn_index, &raft_idx_params, metric, 0.0f, - const_cast(x), n, (faiss::Index::idx_t)d); + raft::spatial::knn::approx_knn_build_index( + raft_handle, &raft_knn_index, &raft_idx_params, metric, 0.0f, + const_cast(x), n, (faiss::Index::idx_t)d); // // FIXME: GPUize more of this // // First, make sure that the data is resident on the CPU, if it is not on @@ -162,6 +162,9 @@ std::vector RaftIndexIVFFlat::getListVectorData( return index_->getListVectorData(listId, gpuFormat); } +void RaftIndexIVFFlat::reset() { + +} std::vector RaftIndexIVFFlat::getListIndices(int listId) const { FAISS_ASSERT(index_); DeviceScope scope(config_.device); @@ -206,7 +209,14 @@ void RaftIndexIVFFlat::searchImpl_( Tensor outLabels( const_cast(labels), {n, k}); - index_->query(queries, nprobe, k, outDistances, outLabels); + // TODO: Populate the rest of the params properly. + raft::spatial::knn::ivf_flat_params raft_idx_params; + raft_idx_params.nlist = nlist; + + raft::spatial::knn::approx_knn_search( + raft_handle, distances, (int64_t*)labels, + const_cast(&raft_knn_index), + &raft_idx_params, k, const_cast(x), n); } } // namespace gpu diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.h b/faiss/gpu/raft/RaftIndexIVFFlat.h index dcc28037d6..07e0528be2 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.h +++ b/faiss/gpu/raft/RaftIndexIVFFlat.h @@ -83,7 +83,7 @@ class RaftIndexIVFFlat : public GpuIndexIVFFlat { float* distances, Index::idx_t* labels) const override; - const raft::handle_t handle; + const raft::handle_t raft_handle; raft::spatial::knn::knnIndex raft_knn_index; }; From 9b1fc8422dac50ab46cda5704510e78904d439f2 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Fri, 1 Jul 2022 16:48:50 -0400 Subject: [PATCH 10/87] Adding stubs for remaining calls that need to be made from RAFT side in order for FAISS integration to work successfully. --- faiss/gpu/CMakeLists.txt | 4 +- faiss/gpu/raft/RaftIVFFlat.cu | 280 ----------- faiss/gpu/raft/RaftIVFFlat.cuh | 82 --- faiss/gpu/raft/RaftIndexFlat.cu | 116 ----- faiss/gpu/raft/RaftIndexFlat.h | 101 ---- faiss/gpu/raft/RaftIndexIVFFlat.cu | 273 ++++++---- faiss/gpu/raft/RaftIndexIVFFlat.h | 15 +- faiss/gpu/raft/RmmGpuResources.hpp | 636 ++++++++++++++++++++++++ faiss/gpu/test/CMakeLists.txt | 7 +- faiss/gpu/test/TestRaftIndexIVFFlat.cpp | 555 +++++++++++++++++++++ 10 files changed, 1378 insertions(+), 691 deletions(-) delete mode 100644 faiss/gpu/raft/RaftIVFFlat.cu delete mode 100644 faiss/gpu/raft/RaftIVFFlat.cuh delete mode 100644 faiss/gpu/raft/RaftIndexFlat.cu delete mode 100644 faiss/gpu/raft/RaftIndexFlat.h create mode 100644 faiss/gpu/raft/RmmGpuResources.hpp create mode 100644 faiss/gpu/test/TestRaftIndexIVFFlat.cpp diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt index 3135443179..3ed26dca01 100644 --- a/faiss/gpu/CMakeLists.txt +++ b/faiss/gpu/CMakeLists.txt @@ -163,8 +163,8 @@ set(FAISS_GPU_HEADERS ) if(FAISS_ENABLE_RAFT) - list(APPEND FAISS_GPU_HEADERS raft/RaftIndexIVFFlat.h raft/RaftIVFFlat.cuh) - list(APPEND FAISS_GPU_SRC raft/RaftIndexIVFFlat.cu raft/RaftIVFFlat.cu) + list(APPEND FAISS_GPU_HEADERS raft/RaftIndexIVFFlat.h) + list(APPEND FAISS_GPU_SRC raft/RaftIndexIVFFlat.cu) endif() # Export FAISS_GPU_HEADERS variable to parent scope. diff --git a/faiss/gpu/raft/RaftIVFFlat.cu b/faiss/gpu/raft/RaftIVFFlat.cu deleted file mode 100644 index 4f74804ea9..0000000000 --- a/faiss/gpu/raft/RaftIVFFlat.cu +++ /dev/null @@ -1,280 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include -#include - -namespace faiss { -namespace gpu { - -RaftIVFFlat::RaftIVFFlat( - GpuResources* res, - FlatIndex* quantizer, - faiss::MetricType metric, - float metricArg, - bool useResidual, - faiss::ScalarQuantizer* scalarQ, - bool interleavedLayout, - IndicesOptions indicesOptions, - MemorySpace space) : - IVFFlat(res, quantizer, metric, metricArg, useResidual, scalarQ, interleavedLayout, indicesOptions, space) {} - -RaftIVFFlat::~RaftIVFFlat() {} - -size_t RaftIVFFlat::getGpuVectorsEncodingSize_(int numVecs) const { - if (interleavedLayout_) { - // bits per scalar code - int bits = scalarQ_ ? scalarQ_->bits : 32 /* float */; - - // bytes to encode a block of 32 vectors (single dimension) - int bytesPerDimBlock = bits * 32 / 8; - - // bytes to fully encode 32 vectors - int bytesPerBlock = bytesPerDimBlock * dim_; - - // number of blocks of 32 vectors we have - int numBlocks = utils::divUp(numVecs, 32); - - // total size to encode numVecs - return bytesPerBlock * numBlocks; - } else { - size_t sizePerVector = - (scalarQ_ ? scalarQ_->code_size : sizeof(float) * dim_); - - return (size_t)numVecs * sizePerVector; - } -} - -size_t RaftIVFFlat::getCpuVectorsEncodingSize_(int numVecs) const { - size_t sizePerVector = - (scalarQ_ ? scalarQ_->code_size : sizeof(float) * dim_); - - return (size_t)numVecs * sizePerVector; -} - -std::vector RaftIVFFlat::translateCodesToGpu_( - std::vector codes, - size_t numVecs) const { - if (!interleavedLayout_) { - // same format - return codes; - } - - int bitsPerCode = scalarQ_ ? 
scalarQ_->bits : 32; - - auto up = - unpackNonInterleaved(std::move(codes), numVecs, dim_, bitsPerCode); - return packInterleaved(std::move(up), numVecs, dim_, bitsPerCode); -} - -std::vector RaftIVFFlat::translateCodesFromGpu_( - std::vector codes, - size_t numVecs) const { - if (!interleavedLayout_) { - // same format - return codes; - } - - int bitsPerCode = scalarQ_ ? scalarQ_->bits : 32; - - auto up = unpackInterleaved(std::move(codes), numVecs, dim_, bitsPerCode); - return packNonInterleaved(std::move(up), numVecs, dim_, bitsPerCode); -} - -void RaftIVFFlat::appendVectors_( - Tensor& vecs, - Tensor& indices, - Tensor& uniqueLists, - Tensor& vectorsByUniqueList, - Tensor& uniqueListVectorStart, - Tensor& uniqueListStartOffset, - Tensor& listIds, - Tensor& listOffset, - cudaStream_t stream) { - // - // Append the new encodings - // - - // Calculate residuals for these vectors, if needed - DeviceTensor residuals( - resources_, - makeTempAlloc(AllocType::Other, stream), - {vecs.getSize(0), dim_}); - - if (useResidual_) { - quantizer_->computeResidual(vecs, listIds, residuals); - } - - // Append indices to the IVF lists - runIVFIndicesAppend( - listIds, - listOffset, - indices, - indicesOptions_, - deviceListIndexPointers_, - stream); - - // Append the encoded vectors to the IVF lists - if (interleavedLayout_) { - runIVFFlatInterleavedAppend( - listIds, - listOffset, - uniqueLists, - vectorsByUniqueList, - uniqueListVectorStart, - uniqueListStartOffset, - useResidual_ ? residuals : vecs, - scalarQ_.get(), - deviceListDataPointers_, - resources_, - stream); - } else { - runIVFFlatAppend( - listIds, - listOffset, - useResidual_ ? 
residuals : vecs, - scalarQ_.get(), - deviceListDataPointers_, - stream); - } -} - -void RaftIVFFlat::query( - Tensor& queries, - int nprobe, - int k, - Tensor& outDistances, - Tensor& outIndices) { - auto stream = resources_->getDefaultStreamCurrentDevice(); - - // TODO: This is where we invoke the search function from RAFT - /** - * template - void cuivflHandle::cuivflSearch(const T* queries, // [numQueries, dim] - uint32_t n_queries, - uint32_t k, - size_t* neighbors, // [numQueries, topK] - float* distances) - */ - - // These are caught at a higher level - FAISS_ASSERT(nprobe <= GPU_MAX_SELECTION_K); - FAISS_ASSERT(k <= GPU_MAX_SELECTION_K); - nprobe = std::min(nprobe, quantizer_->getSize()); - - FAISS_ASSERT(queries.getSize(1) == dim_); - - FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0)); - FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0)); - - // Reserve space for the quantized information - DeviceTensor coarseDistances( - resources_, - makeTempAlloc(AllocType::Other, stream), - {queries.getSize(0), nprobe}); - DeviceTensor coarseIndices( - resources_, - makeTempAlloc(AllocType::Other, stream), - {queries.getSize(0), nprobe}); - - // Find the `nprobe` closest lists; we can use int indices both - // internally and externally - quantizer_->query( - queries, - nprobe, - metric_, - metricArg_, - coarseDistances, - coarseIndices, - false); - - DeviceTensor residualBase( - resources_, - makeTempAlloc(AllocType::Other, stream), - {queries.getSize(0), nprobe, dim_}); - - - if (useResidual_) { - // Reconstruct vectors from the quantizer - quantizer_->reconstruct(coarseIndices, residualBase); - } - - if (interleavedLayout_) { - runIVFInterleavedScan( - queries, - coarseIndices, - deviceListDataPointers_, - deviceListIndexPointers_, - indicesOptions_, - deviceListLengths_, - k, - metric_, - useResidual_, - residualBase, - scalarQ_.get(), - outDistances, - outIndices, - resources_); - } else { - runIVFFlatScan( - queries, - coarseIndices, - 
deviceListDataPointers_, - deviceListIndexPointers_, - indicesOptions_, - deviceListLengths_, - maxListLength_, - k, - metric_, - useResidual_, - residualBase, - scalarQ_.get(), - outDistances, - outIndices, - resources_); - } - - // If the GPU isn't storing indices (they are on the CPU side), we - // need to perform the re-mapping here - // FIXME: we might ultimately be calling this function with inputs - // from the CPU, these are unnecessary copies - if (indicesOptions_ == INDICES_CPU) { - HostTensor hostOutIndices(outIndices, stream); - - ivfOffsetToUserIndex( - hostOutIndices.data(), - numLists_, - hostOutIndices.getSize(0), - hostOutIndices.getSize(1), - listOffsetToUserIndex_); - - // Copy back to GPU, since the input to this function is on the - // GPU - outIndices.copyFrom(hostOutIndices, stream); - } -} - -} // namespace gpu -} // namespace faiss diff --git a/faiss/gpu/raft/RaftIVFFlat.cuh b/faiss/gpu/raft/RaftIVFFlat.cuh deleted file mode 100644 index 4340620639..0000000000 --- a/faiss/gpu/raft/RaftIVFFlat.cuh +++ /dev/null @@ -1,82 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -#include -#include - -namespace faiss { -namespace gpu { - -class RaftIVFFlat : public IVFFlat { - public: - /// Construct from a quantizer that has elemen - RaftIVFFlat( - GpuResources* resources, - /// We do not own this reference - FlatIndex* quantizer, - faiss::MetricType metric, - float metricArg, - bool useResidual, - /// Optional ScalarQuantizer - faiss::ScalarQuantizer* scalarQ, - bool interleavedLayout, - IndicesOptions indicesOptions, - MemorySpace space); - - ~RaftIVFFlat() override; - - /// Find the approximate k nearest neighbors for `queries` against - /// our database - void query( - Tensor& queries, - int nprobe, - int k, - Tensor& outDistances, - Tensor& outIndices) override; - - protected: - /// Returns the number of bytes in which an IVF list containing numVecs - /// vectors is encoded on the device. Note that due to padding this is not - /// the same as the encoding size for a subset of vectors in an IVF list; - /// this is the size for an entire IVF list - size_t getGpuVectorsEncodingSize_(int numVecs) const override; - size_t getCpuVectorsEncodingSize_(int numVecs) const override; - - /// Translate to our preferred GPU encoding - std::vector translateCodesToGpu_( - std::vector codes, - size_t numVecs) const override; - - /// Translate from our preferred GPU encoding - std::vector translateCodesFromGpu_( - std::vector codes, - size_t numVecs) const override; - - /// Encode the vectors that we're adding and append to our IVF lists - void appendVectors_( - Tensor& vecs, - Tensor& indices, - Tensor& uniqueLists, - Tensor& vectorsByUniqueList, - Tensor& uniqueListVectorStart, - Tensor& uniqueListStartOffset, - Tensor& listIds, - Tensor& listOffset, - cudaStream_t stream) override; - - protected: - /// Do we encode the residual from a coarse quantizer or not? 
- bool useResidual_; - - /// Scalar quantizer for encoded vectors, if any - std::unique_ptr scalarQ_; -}; - -} // namespace gpu -} // namespace faiss diff --git a/faiss/gpu/raft/RaftIndexFlat.cu b/faiss/gpu/raft/RaftIndexFlat.cu deleted file mode 100644 index 1c323738c4..0000000000 --- a/faiss/gpu/raft/RaftIndexFlat.cu +++ /dev/null @@ -1,116 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace faiss { -namespace gpu { - -// -// RaftIndexFlatL2 -// - -RaftIndexFlatL2::RaftIndexFlatL2( - GpuResourcesProvider* provider, - faiss::IndexFlatL2* index, - GpuIndexFlatConfig config) - : GpuIndexFlat(provider, index, config) {} - -RaftIndexFlatL2::RaftIndexFlatL2( - std::shared_ptr resources, - faiss::IndexFlatL2* index, - GpuIndexFlatConfig config) - : GpuIndexFlat(resources, index, config) {} - -RaftIndexFlatL2::RaftIndexFlatL2( - GpuResourcesProvider* provider, - int dims, - GpuIndexFlatConfig config) - : GpuIndexFlat(provider, dims, faiss::METRIC_L2, config) {} - -RaftIndexFlatL2::RaftIndexFlatL2( - std::shared_ptr resources, - int dims, - GpuIndexFlatConfig config) - : GpuIndexFlat(resources, dims, faiss::METRIC_L2, config) {} - -void RaftIndexFlatL2::copyFrom(faiss::IndexFlat* index) { - FAISS_THROW_IF_NOT_MSG( - index->metric_type == metric_type, - "Cannot copy a RaftIndexFlatL2 from an index of " - "different metric_type"); - - GpuIndexFlat::copyFrom(index); -} - -void RaftIndexFlatL2::copyTo(faiss::IndexFlat* index) { - FAISS_THROW_IF_NOT_MSG( - index->metric_type == metric_type, - "Cannot copy a RaftIndexFlatL2 to an index of " - "different metric_type"); - - GpuIndexFlat::copyTo(index); -} - -// -// RaftIndexFlatIP -// - -RaftIndexFlatIP::RaftIndexFlatIP( - GpuResourcesProvider* provider, - 
faiss::IndexFlatIP* index, - GpuIndexFlatConfig config) - : GpuIndexFlat(provider, index, config) {} - -RaftIndexFlatIP::RaftIndexFlatIP( - std::shared_ptr resources, - faiss::IndexFlatIP* index, - GpuIndexFlatConfig config) - : GpuIndexFlat(resources, index, config) {} - -RaftIndexFlatIP::RaftIndexFlatIP( - GpuResourcesProvider* provider, - int dims, - GpuIndexFlatConfig config) - : GpuIndexFlat(provider, dims, faiss::METRIC_INNER_PRODUCT, config) {} - -RaftIndexFlatIP::RaftIndexFlatIP( - std::shared_ptr resources, - int dims, - GpuIndexFlatConfig config) - : GpuIndexFlat(resources, dims, faiss::METRIC_INNER_PRODUCT, config) {} - -void RaftIndexFlatIP::copyFrom(faiss::IndexFlat* index) { - FAISS_THROW_IF_NOT_MSG( - index->metric_type == metric_type, - "Cannot copy a RaftIndexFlatIP from an index of " - "different metric_type"); - - GpuIndexFlat::copyFrom(index); -} - -void RaftIndexFlatIP::copyTo(faiss::IndexFlat* index) { - // The passed in index must be IP - FAISS_THROW_IF_NOT_MSG( - index->metric_type == metric_type, - "Cannot copy a RaftIndexFlatIP to an index of " - "different metric_type"); - - GpuIndexFlat::copyTo(index); -} - -} // namespace gpu -} // namespace faiss diff --git a/faiss/gpu/raft/RaftIndexFlat.h b/faiss/gpu/raft/RaftIndexFlat.h deleted file mode 100644 index 1aa4a51070..0000000000 --- a/faiss/gpu/raft/RaftIndexFlat.h +++ /dev/null @@ -1,101 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -#include -#include - -namespace faiss { - -struct IndexFlat; -struct IndexFlatL2; -struct IndexFlatIP; - -} // namespace faiss - -namespace faiss { -namespace gpu { - -class FlatIndex; - -/// Wrapper around the GPU implementation that looks like -/// faiss::IndexFlatL2; copies over centroid data from a given -/// faiss::IndexFlat -class RaftIndexFlatL2 : public GpuIndexFlat { - public: - /// Construct from a pre-existing faiss::IndexFlatL2 instance, copying - /// data over to the given GPU - RaftIndexFlatL2( - GpuResourcesProvider* provider, - faiss::IndexFlatL2* index, - GpuIndexFlatConfig config = GpuIndexFlatConfig()); - - RaftIndexFlatL2( - std::shared_ptr resources, - faiss::IndexFlatL2* index, - GpuIndexFlatConfig config = GpuIndexFlatConfig()); - - /// Construct an empty instance that can be added to - RaftIndexFlatL2( - GpuResourcesProvider* provider, - int dims, - GpuIndexFlatConfig config = GpuIndexFlatConfig()); - - RaftIndexFlatL2( - std::shared_ptr resources, - int dims, - GpuIndexFlatConfig config = GpuIndexFlatConfig()); - - /// Initialize ourselves from the given CPU index; will overwrite - /// all data in ourselves - void copyFrom(faiss::IndexFlat* index); - - /// Copy ourselves to the given CPU index; will overwrite all data - /// in the index instance - void copyTo(faiss::IndexFlat* index); -}; - -/// Wrapper around the GPU implementation that looks like -/// faiss::IndexFlatIP; copies over centroid data from a given -/// faiss::IndexFlat -class RaftIndexFlatIP : public GpuIndexFlat { - public: - /// Construct from a pre-existing faiss::IndexFlatIP instance, copying - /// data over to the given GPU - RaftIndexFlatIP( - GpuResourcesProvider* provider, - faiss::IndexFlatIP* index, - GpuIndexFlatConfig config = GpuIndexFlatConfig()); - - RaftIndexFlatIP( - std::shared_ptr resources, - faiss::IndexFlatIP* index, - GpuIndexFlatConfig config = GpuIndexFlatConfig()); - - /// Construct an empty instance that can be added to - 
RaftIndexFlatIP( - GpuResourcesProvider* provider, - int dims, - GpuIndexFlatConfig config = GpuIndexFlatConfig()); - - RaftIndexFlatIP( - std::shared_ptr resources, - int dims, - GpuIndexFlatConfig config = GpuIndexFlatConfig()); - - /// Initialize ourselves from the given CPU index; will overwrite - /// all data in ourselves - void copyFrom(faiss::IndexFlat* index); - - /// Copy ourselves to the given CPU index; will overwrite all data - /// in the index instance - void copyTo(faiss::IndexFlat* index); -}; - -} // namespace gpu -} // namespace faiss diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index 021f3aaeae..ffc0b3e2c4 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -12,11 +12,10 @@ #include #include #include -#include #include #include -#include +#include #include @@ -30,7 +29,8 @@ RaftIndexIVFFlat::RaftIndexIVFFlat( : GpuIndexIVFFlat( provider, index, - config) { + config), raft_handle(resources_->getDefaultStream(config_.device)) { + copyFrom(index); } @@ -40,136 +40,181 @@ RaftIndexIVFFlat::RaftIndexIVFFlat( int nlist, faiss::MetricType metric, GpuIndexIVFFlatConfig config) - : GpuIndexIVFFlat(provider, dims, nlist, metric, config) { - // faiss::Index params - this->is_trained = false; + : GpuIndexIVFFlat(provider, dims, nlist, metric, config), + raft_handle(resources_->getDefaultStream(config_.device)) { - // We haven't trained ourselves, so don't construct the IVFFlat - // index yet + this->is_trained = false; } RaftIndexIVFFlat::~RaftIndexIVFFlat() {} +void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { -void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { - // For now, only support <= max int results - FAISS_THROW_IF_NOT_FMT( - n <= (Index::idx_t)std::numeric_limits::max(), - "GPU index only supports up to %d indices", - std::numeric_limits::max()); + printf("Copying from...\n"); + // TODO: Need to copy necessary memory from the index and set 
any needed params. DeviceScope scope(config_.device); - if (this->is_trained) { - FAISS_ASSERT(quantizer->is_trained); - FAISS_ASSERT(quantizer->ntotal == nlist); - FAISS_ASSERT(index_); + GpuIndex::copyFrom(index); + + FAISS_ASSERT(index->nlist > 0); + FAISS_THROW_IF_NOT_FMT( + index->nlist <= (Index::idx_t)std::numeric_limits::max(), + "GPU index only supports %zu inverted lists", + (size_t)std::numeric_limits::max()); + nlist = index->nlist; + + FAISS_THROW_IF_NOT_FMT( + index->nprobe > 0 && index->nprobe <= getMaxKSelection(), + "GPU index only supports nprobe <= %zu; passed %zu", + (size_t)getMaxKSelection(), + index->nprobe); + nprobe = index->nprobe; + + config.device = config_.device; + + FAISS_ASSERT(metric_type != faiss::METRIC_L2 && + metric_type != faiss::METRIC_INNER_PRODUCT); + + if (!index->is_trained) { + // copied in GpuIndex::copyFrom + FAISS_ASSERT(!is_trained && ntotal == 0); return; } - FAISS_ASSERT(!index_); + // copied in GpuIndex::copyFrom + // ntotal can exceed max int, but the number of vectors per inverted + // list cannot exceed this. We check this in the subclasses. + FAISS_ASSERT(is_trained && (ntotal == index->ntotal)); - // TODO: Populate the rest of the params properly. 
- raft::spatial::knn::ivf_flat_params raft_idx_params; - raft_idx_params.nlist = nlist; - - raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded; - raft::spatial::knn::approx_knn_build_index( - raft_handle, &raft_knn_index, &raft_idx_params, metric, 0.0f, - const_cast(x), n, (faiss::Index::idx_t)d); - -// // FIXME: GPUize more of this -// // First, make sure that the data is resident on the CPU, if it is not on -// // the CPU, as we depend upon parts of the CPU code -// auto hostData = toHost( -// (float*)x, -// resources_->getDefaultStream(config_.device), -// {(int)n, (int)this->d}); -// -// // TODO: I think this can be done on GPU through RAFT k-means -// trainQuantizer_impl(n, hostData.data()); -// -// // The quantizer is now trained; construct the IVF index -// -// // TODO: The underlying RaftIVFFlat essentially becomes the `query impl` -// index_.reset(new RaftIVFFlat( -// resources_.get(), -// -// // TODO: getGpuData returns a `FlatIndex` -// quantizer->getGpuData(), -// this->metric_type, -// this->metric_arg, -// false, // no residual -// nullptr, // no scalar quantizer -// ivfFlatConfig_.interleavedLayout, -// ivfFlatConfig_.indicesOptions, -// config_.memorySpace)); -// -// if (reserveMemoryVecs_) { -// index_->reserveMemory(reserveMemoryVecs_); -// } -// -// this->is_trained = true; -} + // Since we're trained, the quantizer must have data + FAISS_ASSERT(index->quantizer->ntotal > 0); -int RaftIndexIVFFlat::getListLength(int listId) const { - FAISS_ASSERT(index_); - DeviceScope scope(config_.device); + raft::spatial::knn::ivf_flat::index_params raft_idx_params; + raft_idx_params.n_lists = nlist; + raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; - return index_->getListLength(listId); + // TODO: Invoke corresponding call on the RAFT side to copy quantizer + /** + * For example: + * raft_knn_index.emplace(raft::spatial::knn::ivf_flat::make_ivf_flat_index( + * raft_handle, raft_idx_params, 
(faiss::Index::idx_t)d); + */ } -void RaftIndexIVFFlat::trainQuantizer_impl(Index::idx_t n, const float* x) { - if (n == 0) { - // nothing to do - return; +void RaftIndexIVFFlat::reserveMemory(size_t numVecs) { + + std::cout << "Reserving memory for " << numVecs << " vectors." << std::endl; + reserveMemoryVecs_ = numVecs; + if (raft_knn_index.has_value()) { + DeviceScope scope(config_.device); + + // TODO: We need to reserve memory on the raft::ivf_flat::index + /** + * For example: + * raft::spatial::knn::ivf_flat::ivf_flat_allocate_ivf_lists( + * raft_handle, *raft_knn_index, numVecs); + */ } +} + +size_t RaftIndexIVFFlat::reclaimMemory() { + std::cout << "Reclaiming memory" << std::endl; - if (quantizer->is_trained && (quantizer->ntotal == nlist)) { - if (this->verbose) { - printf("IVF quantizer does not need training.\n"); - } + // TODO: We need to reclaim memory on the raft::ivf_flat::index + /** + * For example: + * raft::spatial::knn::ivf_flat::ivf_flat_reclaim_ivf_lists( + * raft_handle, *raft_knn_index, numVecs); + */ + return 0; +} +void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { + // For now, only support <= max int results + FAISS_THROW_IF_NOT_FMT( + n <= (Index::idx_t)std::numeric_limits::max(), + "GPU index only supports up to %d indices", + std::numeric_limits::max()); + + DeviceScope scope(config_.device); + + if (this->is_trained) { + FAISS_ASSERT(raft_knn_index.has_value()); return; } - if (this->verbose) { - printf("Training IVF quantizer on %ld vectors in %dD\n", n, d); - } + raft::spatial::knn::ivf_flat::index_params raft_idx_params; + raft_idx_params.n_lists = nlist; + raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; - DeviceScope scope(config_.device); - // leverage the CPU-side k-means code, which works for the GPU - // flat index as well - quantizer->reset(); + // TODO: This should only train the quantizer portion of the index + /** + * For example: + * + * 
raft_knn_index.emplace(raft::spatial::knn::ivf_flat::make_ivf_flat_index( + * raft_handle, raft_idx_params, (faiss::Index::idx_t)d); - // TODO: Invoke RAFT K-means here and set resulting trained data on quantizer - Clustering clus(this->d, nlist, this->cp); - clus.verbose = verbose; - clus.train(n, x, *quantizer); - quantizer->is_trained = true; + * raft::spatial::knn::ivf_flat::ivf_flat_train_quantizer( + * raft_handle, *raft_knn_index, const_cast(x), n); + */ - FAISS_ASSERT(quantizer->ntotal == nlist); + raft_knn_index.emplace( + raft::spatial::knn::ivf_flat::build(raft_handle, raft_idx_params, + const_cast(x), + n, (faiss::Index::idx_t)d, + raft_handle.get_stream())); + + raft_handle.sync_stream(); } +int RaftIndexIVFFlat::getListLength(int listId) const { + FAISS_ASSERT(raft_knn_index.has_value()); + DeviceScope scope(config_.device); + + // TODO: Call function in RAFT to do this. + /** + * For example: + * raft::spatial::knn::ivf_flat::get_list_length( + * raft_handle, *raft_knn_index, listId); + */ + return 0; +} std::vector RaftIndexIVFFlat::getListVectorData( int listId, bool gpuFormat) const { - FAISS_ASSERT(index_); + FAISS_ASSERT(raft_knn_index.has_value()); DeviceScope scope(config_.device); - return index_->getListVectorData(listId, gpuFormat); + // TODO: Invoke corresponding call in raft::ivf_flat + /** + * For example: + * raft::spatial::knn::ivf_flat::get_list_vector_data( + * raft_handle, *raft_knn_index, listId, gpuFormat); + */ + std::vector vec; + return vec; } void RaftIndexIVFFlat::reset() { - + std::cout << "Calling reset()" << std::endl; + raft_knn_index.reset(); } + std::vector RaftIndexIVFFlat::getListIndices(int listId) const { - FAISS_ASSERT(index_); + FAISS_ASSERT(raft_knn_index.has_value()); DeviceScope scope(config_.device); - return index_->getListIndices(listId); + // TODO: Need to invoke corresponding call in raft::ivf_flat + /** + * For example: + * raft::spatial::knn::ivf_flat::get_list_indices( + * raft_handle, *raft_knn_index, 
listId); + */ + std::vector vec; + return vec; } void RaftIndexIVFFlat::addImpl_( @@ -177,19 +222,29 @@ void RaftIndexIVFFlat::addImpl_( const float* x, const Index::idx_t* xids) { // Device is already set in GpuIndex::add - FAISS_ASSERT(index_); + FAISS_ASSERT(raft_knn_index.has_value()); FAISS_ASSERT(n > 0); - // Data is already resident on the GPU + // Data is already resident on the GPU Tensor data(const_cast(x), {n, (int)this->d}); Tensor labels(const_cast(xids), {n}); - // Not all vectors may be able to be added (some may contain NaNs etc) - index_->addVectors(data, labels); - - // but keep the ntotal based on the total number of vectors that we - // attempted to add +// // Not all vectors may be able to be added (some may contain NaNs etc) +// index_->addVectors(data, labels); +// +// // but keep the ntotal based on the total number of vectors that we +// // attempted to add ntotal += n; + + std::cout << "Calling addImpl_ with " << n << " vectors." << std::endl; + + // TODO: Invoke corresponding call in raft::ivf_flat + /** + * For example: + * raft::spatial::knn::ivf_flat::ivf_flat_add_vectors( + * raft_handle, *raft_knn_index, n, x, xids); + */ + } void RaftIndexIVFFlat::searchImpl_( @@ -199,7 +254,7 @@ void RaftIndexIVFFlat::searchImpl_( float* distances, Index::idx_t* labels) const { // Device is already set in GpuIndex::search - FAISS_ASSERT(index_); + FAISS_ASSERT(raft_knn_index.has_value()); FAISS_ASSERT(n > 0); FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= nlist); @@ -210,13 +265,19 @@ void RaftIndexIVFFlat::searchImpl_( const_cast(labels), {n, k}); // TODO: Populate the rest of the params properly. 
- raft::spatial::knn::ivf_flat_params raft_idx_params; - raft_idx_params.nlist = nlist; - - raft::spatial::knn::approx_knn_search( - raft_handle, distances, (int64_t*)labels, - const_cast(&raft_knn_index), - &raft_idx_params, k, const_cast(x), n); + raft::spatial::knn::ivf_flat::search_params raft_idx_params; + raft_idx_params.n_probes = nprobe; + + raft::spatial::knn::ivf_flat::search(raft_handle, + raft_idx_params, + *raft_knn_index, + const_cast(x), + static_cast(n), + static_cast(k), + static_cast(labels), + distances, raft_handle.get_stream()); + + raft_handle.sync_stream(); } } // namespace gpu diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.h b/faiss/gpu/raft/RaftIndexIVFFlat.h index 07e0528be2..4960fa3ae1 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.h +++ b/faiss/gpu/raft/RaftIndexIVFFlat.h @@ -11,7 +11,8 @@ #include #include -#include +#include + #include namespace faiss { @@ -56,6 +57,15 @@ class RaftIndexIVFFlat : public GpuIndexIVFFlat { /// Returns the number of vectors present in a particular inverted list int getListLength(int listId) const override; + /// Reserve GPU memory in our inverted lists for this number of vectors + void reserveMemory(size_t numVecs); + + /// After adding vectors, one can call this to reclaim device memory + /// to exactly the amount needed. Returns space reclaimed in bytes + size_t reclaimMemory(); + + void copyFrom(const faiss::IndexIVFFlat* index); + /// Return the encoded vector data contained in a particular inverted list, /// for debugging purposes. 
/// If gpuFormat is true, the data is returned as it is encoded in the @@ -73,7 +83,6 @@ class RaftIndexIVFFlat : public GpuIndexIVFFlat { /// Called from GpuIndex for add/add_with_ids void addImpl_(int n, const float* x, const Index::idx_t* ids) override; - void trainQuantizer_impl(Index::idx_t n, const float* x); /// Called from GpuIndex for search void searchImpl_( @@ -84,7 +93,7 @@ class RaftIndexIVFFlat : public GpuIndexIVFFlat { Index::idx_t* labels) const override; const raft::handle_t raft_handle; - raft::spatial::knn::knnIndex raft_knn_index; + std::optional> raft_knn_index{std::nullopt}; }; } // namespace gpu diff --git a/faiss/gpu/raft/RmmGpuResources.hpp b/faiss/gpu/raft/RmmGpuResources.hpp new file mode 100644 index 0000000000..e3bc306729 --- /dev/null +++ b/faiss/gpu/raft/RmmGpuResources.hpp @@ -0,0 +1,636 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* +This code contains unnecessary code duplication. These could be deleted +once the relevant changes would be made on the FAISS side. Indeed most of +the logic in the below code is similar to FAISS's standard implementation +and should thus be inherited instead of duplicated. 
This FAISS's issue +once solved should allow the removal of the unnecessary duplicates +in this file : https://github.com/facebookresearch/faiss/issues/2097 +*/ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace faiss { +namespace gpu { + +namespace { + +// How many streams per device we allocate by default (for multi-streaming) +constexpr int kNumStreams = 2; + +// Use 256 MiB of pinned memory for async CPU <-> GPU copies by default +constexpr size_t kDefaultPinnedMemoryAllocation = (size_t)256 * 1024 * 1024; + +// Default temporary memory allocation for <= 4 GiB memory GPUs +constexpr size_t k4GiBTempMem = (size_t)512 * 1024 * 1024; + +// Default temporary memory allocation for <= 8 GiB memory GPUs +constexpr size_t k8GiBTempMem = (size_t)1024 * 1024 * 1024; + +// Maximum temporary memory allocation for all GPUs +constexpr size_t kMaxTempMem = (size_t)1536 * 1024 * 1024; + +std::string allocsToString(const std::unordered_map& map) +{ + // Produce a sorted list of all outstanding allocations by type + std::unordered_map> stats; + + for (auto& entry : map) { + auto& a = entry.second; + + auto it = stats.find(a.type); + if (it != stats.end()) { + stats[a.type].first++; + stats[a.type].second += a.size; + } else { + stats[a.type] = std::make_pair(1, a.size); + } + } + + std::stringstream ss; + for (auto& entry : stats) { + ss << "Alloc type " << allocTypeToString(entry.first) << ": " << entry.second.first + << " allocations, " << entry.second.second << " bytes\n"; + } + + return ss.str(); +} + +} // namespace + +/// RMM implementation of the GpuResources object that provides for a +/// temporary memory manager +class RmmGpuResourcesImpl : public GpuResources { + public: + RmmGpuResourcesImpl() + : pinnedMemAlloc_(nullptr), + pinnedMemAllocSize_(0), + // let the adjustment function determine the memory size for us by passing + // in a 
huge value that will then be adjusted + tempMemSize_(getDefaultTempMemForGPU(-1, std::numeric_limits::max())), + pinnedMemSize_(kDefaultPinnedMemoryAllocation), + allocLogging_(false), + cmr(new rmm::mr::cuda_memory_resource), + mmr(new rmm::mr::managed_memory_resource), + pmr(new rmm::mr::pinned_memory_resource){}; + + ~RmmGpuResourcesImpl() + { + // The temporary memory allocator has allocated memory through us, so clean + // that up before we finish fully de-initializing ourselves + tempMemory_.clear(); + + // Make sure all allocations have been freed + bool allocError = false; + + for (auto& entry : allocs_) { + auto& map = entry.second; + + if (!map.empty()) { + std::cerr << "RmmGpuResources destroyed with allocations outstanding:\n" + << "Device " << entry.first << " outstanding allocations:\n"; + std::cerr << allocsToString(map); + allocError = true; + } + } + + FAISS_ASSERT_MSG(!allocError, "GPU memory allocations not properly cleaned up"); + + for (auto& entry : defaultStreams_) { + DeviceScope scope(entry.first); + + // We created these streams, so are responsible for destroying them + CUDA_VERIFY(cudaStreamDestroy(entry.second)); + } + + for (auto& entry : alternateStreams_) { + DeviceScope scope(entry.first); + + for (auto stream : entry.second) { + CUDA_VERIFY(cudaStreamDestroy(stream)); + } + } + + for (auto& entry : asyncCopyStreams_) { + DeviceScope scope(entry.first); + + CUDA_VERIFY(cudaStreamDestroy(entry.second)); + } + + for (auto& entry : blasHandles_) { + DeviceScope scope(entry.first); + + auto blasStatus = cublasDestroy(entry.second); + FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS); + } + + if (pinnedMemAlloc_) { pmr->deallocate(pinnedMemAlloc_, pinnedMemAllocSize_); } + }; + + /// Disable allocation of temporary memory; all temporary memory + /// requests will call cudaMalloc / cudaFree at the point of use + void noTempMemory() { setTempMemory(0); }; + + /// Specify that we wish to use a certain fixed size of memory on + /// all 
devices as temporary memory. This is the upper bound for the GPU + /// memory that we will reserve. We will never go above 1.5 GiB on any GPU; + /// smaller GPUs (with <= 4 GiB or <= 8 GiB) will use less memory than that. + /// To avoid any temporary memory allocation, pass 0. + void setTempMemory(size_t size) + { + if (tempMemSize_ != size) { + // adjust based on general limits + tempMemSize_ = getDefaultTempMemForGPU(-1, size); + + // We need to re-initialize memory resources for all current devices that + // have been initialized. + // This should be safe to do, even if we are currently running work, because + // the cudaFree call that this implies will force-synchronize all GPUs with + // the CPU + for (auto& p : tempMemory_) { + int device = p.first; + // Free the existing memory first + p.second.reset(); + + // Allocate new + p.second = std::unique_ptr( + new StackDeviceMemory(this, + p.first, + // adjust for this specific device + getDefaultTempMemForGPU(device, tempMemSize_))); + } + } + }; + + /// Set amount of pinned memory to allocate, for async GPU <-> CPU + /// transfers + void setPinnedMemory(size_t size) + { + // Should not call this after devices have been initialized + FAISS_ASSERT(defaultStreams_.size() == 0); + FAISS_ASSERT(!pinnedMemAlloc_); + + pinnedMemSize_ = size; + }; + + /// Called to change the stream for work ordering. We do not own `stream`; + /// i.e., it will not be destroyed when the GpuResources object gets cleaned + /// up. + /// We are guaranteed that all Faiss GPU work is ordered with respect to + /// this stream upon exit from an index or other Faiss GPU call. + void setDefaultStream(int device, cudaStream_t stream) + { + if (isInitialized(device)) { + // A new series of calls may not be ordered with what was the previous + // stream, so if the stream being specified is different, then we need to + // ensure ordering between the two (new stream waits on old). 
+ auto it = userDefaultStreams_.find(device); + cudaStream_t prevStream = nullptr; + + if (it != userDefaultStreams_.end()) { + prevStream = it->second; + } else { + FAISS_ASSERT(defaultStreams_.count(device)); + prevStream = defaultStreams_[device]; + } + + if (prevStream != stream) { streamWait({stream}, {prevStream}); } + } + + userDefaultStreams_[device] = stream; + }; + + /// Revert the default stream to the original stream managed by this resources + /// object, in case someone called `setDefaultStream`. + void revertDefaultStream(int device) + { + if (isInitialized(device)) { + auto it = userDefaultStreams_.find(device); + + if (it != userDefaultStreams_.end()) { + // There was a user stream set that we need to synchronize against + cudaStream_t prevStream = userDefaultStreams_[device]; + + FAISS_ASSERT(defaultStreams_.count(device)); + cudaStream_t newStream = defaultStreams_[device]; + + streamWait({newStream}, {prevStream}); + } + } + + userDefaultStreams_.erase(device); + }; + + /// Returns the stream for the given device on which all Faiss GPU work is + /// ordered. + /// We are guaranteed that all Faiss GPU work is ordered with respect to + /// this stream upon exit from an index or other Faiss GPU call. 
+ cudaStream_t getDefaultStream(int device) + { + initializeForDevice(device); + + auto it = userDefaultStreams_.find(device); + if (it != userDefaultStreams_.end()) { + // There is a user override stream set + return it->second; + } + + // Otherwise, our base default stream + return defaultStreams_[device]; + }; + + /// Called to change the work ordering streams to the null stream + /// for all devices + void setDefaultNullStreamAllDevices() + { + for (int dev = 0; dev < getNumDevices(); ++dev) { + setDefaultStream(dev, nullptr); + } + }; + + /// If enabled, will print every GPU memory allocation and deallocation to + /// standard output + void setLogMemoryAllocations(bool enable) { allocLogging_ = enable; }; + + public: + /// Internal system calls + + /// Initialize resources for this device + void initializeForDevice(int device) + { + if (isInitialized(device)) { return; } + + // If this is the first device that we're initializing, create our + // pinned memory allocation + if (defaultStreams_.empty() && pinnedMemSize_ > 0) { + pinnedMemAlloc_ = pmr->allocate(pinnedMemSize_); + pinnedMemAllocSize_ = pinnedMemSize_; + } + + FAISS_ASSERT(device < getNumDevices()); + DeviceScope scope(device); + + // Make sure that device properties for all devices are cached + auto& prop = getDeviceProperties(device); + + // Also check to make sure we meet our minimum compute capability (3.0) + FAISS_ASSERT_FMT(prop.major >= 3, + "Device id %d with CC %d.%d not supported, " + "need 3.0+ compute capability", + device, + prop.major, + prop.minor); + + // Create streams + cudaStream_t defaultStream = 0; + CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream, cudaStreamNonBlocking)); + + defaultStreams_[device] = defaultStream; + + cudaStream_t asyncCopyStream = 0; + CUDA_VERIFY(cudaStreamCreateWithFlags(&asyncCopyStream, cudaStreamNonBlocking)); + + asyncCopyStreams_[device] = asyncCopyStream; + + std::vector deviceStreams; + for (int j = 0; j < kNumStreams; ++j) { + cudaStream_t 
stream = 0; + CUDA_VERIFY(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + deviceStreams.push_back(stream); + } + + alternateStreams_[device] = std::move(deviceStreams); + + // Create cuBLAS handle + cublasHandle_t blasHandle = 0; + auto blasStatus = cublasCreate(&blasHandle); + FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS); + blasHandles_[device] = blasHandle; + + // For CUDA 10 on V100, enabling tensor core usage would enable automatic + // rounding down of inputs to f16 (though accumulate in f32) which results in + // unacceptable loss of precision in general. + // For CUDA 11 / A100, only enable tensor core support if it doesn't result in + // a loss of precision. +#if CUDA_VERSION >= 11000 + cublasSetMathMode(blasHandle, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); +#endif + + FAISS_ASSERT(allocs_.count(device) == 0); + allocs_[device] = std::unordered_map(); + + FAISS_ASSERT(tempMemory_.count(device) == 0); + auto mem = std::unique_ptr( + new StackDeviceMemory(this, + device, + // adjust for this specific device + getDefaultTempMemForGPU(device, tempMemSize_))); + + tempMemory_.emplace(device, std::move(mem)); + }; + + cublasHandle_t getBlasHandle(int device) + { + initializeForDevice(device); + return blasHandles_[device]; + }; + + std::vector getAlternateStreams(int device) + { + initializeForDevice(device); + return alternateStreams_[device]; + }; + + /// Allocate non-temporary GPU memory + void* allocMemory(const AllocRequest& req) + { + initializeForDevice(req.device); + + // We don't allocate a placeholder for zero-sized allocations + if (req.size == 0) { return nullptr; } + + // Make sure that the allocation is a multiple of 16 bytes for alignment + // purposes + auto adjReq = req; + adjReq.size = utils::roundUp(adjReq.size, (size_t)16); + + void* p = nullptr; + + if (allocLogging_) { std::cout << "RmmGpuResources: alloc " << adjReq.toString() << "\n"; } + + if (adjReq.space == MemorySpace::Temporary) { + // If we don't have 
enough space in our temporary memory manager, we need + // to allocate this request separately + auto& tempMem = tempMemory_[adjReq.device]; + + if (adjReq.size > tempMem->getSizeAvailable()) { + // We need to allocate this ourselves + AllocRequest newReq = adjReq; + newReq.space = MemorySpace::Device; + newReq.type = AllocType::TemporaryMemoryOverflow; + + return allocMemory(newReq); + } + + // Otherwise, we can handle this locally + p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size); + + } else if (adjReq.space == MemorySpace::Device) { + p = cmr->allocate(adjReq.size, adjReq.stream); + } else if (adjReq.space == MemorySpace::Unified) { + p = mmr->allocate(adjReq.size, adjReq.stream); + } else { + FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)adjReq.space); + } + + allocs_[adjReq.device][p] = adjReq; + + return p; + }; + + /// Returns a previous allocation + void deallocMemory(int device, void* p) + { + FAISS_ASSERT(isInitialized(device)); + + if (!p) { return; } + + auto& a = allocs_[device]; + auto it = a.find(p); + FAISS_ASSERT(it != a.end()); + + auto& req = it->second; + + if (allocLogging_) { std::cout << "RmmGpuResources: dealloc " << req.toString() << "\n"; } + + if (req.space == MemorySpace::Temporary) { + tempMemory_[device]->deallocMemory(device, req.stream, req.size, p); + } else if (req.space == MemorySpace::Device) { + cmr->deallocate(p, req.size, req.stream); + } else if (req.space == MemorySpace::Unified) { + mmr->deallocate(p, req.size, req.stream); + } else { + FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)req.space); + } + + a.erase(it); + }; + + size_t getTempMemoryAvailable(int device) const + { + FAISS_ASSERT(isInitialized(device)); + + auto it = tempMemory_.find(device); + FAISS_ASSERT(it != tempMemory_.end()); + + return it->second->getSizeAvailable(); + }; + + /// Export a description of memory used for Python + std::map>> getMemoryInfo() const + { + using AT = std::map>; + + std::map out; + + for 
(auto& entry : allocs_) { + AT outDevice; + + for (auto& a : entry.second) { + auto& v = outDevice[allocTypeToString(a.second.type)]; + v.first++; + v.second += a.second.size; + } + + out[entry.first] = std::move(outDevice); + } + + return out; + }; + + std::pair getPinnedMemory() + { + return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_); + }; + + cudaStream_t getAsyncCopyStream(int device) + { + initializeForDevice(device); + return asyncCopyStreams_[device]; + }; + + private: + /// Have GPU resources been initialized for this device yet? + bool isInitialized(int device) const + { + // Use default streams as a marker for whether or not a certain + // device has been initialized + return defaultStreams_.count(device) != 0; + }; + + /// Adjust the default temporary memory allocation based on the total GPU + /// memory size + static size_t getDefaultTempMemForGPU(int device, size_t requested) + { + auto totalMem = device != -1 ? getDeviceProperties(device).totalGlobalMem + : std::numeric_limits::max(); + + if (totalMem <= (size_t)4 * 1024 * 1024 * 1024) { + // If the GPU has <= 4 GiB of memory, reserve 512 MiB + + if (requested > k4GiBTempMem) { return k4GiBTempMem; } + } else if (totalMem <= (size_t)8 * 1024 * 1024 * 1024) { + // If the GPU has <= 8 GiB of memory, reserve 1 GiB + + if (requested > k8GiBTempMem) { return k8GiBTempMem; } + } else { + // Never use more than 1.5 GiB + if (requested > kMaxTempMem) { return kMaxTempMem; } + } + + // use whatever lower limit the user requested + return requested; + }; + + private: + /// Set of currently outstanding memory allocations per device + /// device -> (alloc request, allocated ptr) + std::unordered_map> allocs_; + + /// Temporary memory provider, per each device + std::unordered_map> tempMemory_; + + /// Our default stream that work is ordered on, one per each device + std::unordered_map defaultStreams_; + + /// This contains particular streams as set by the user for + /// ordering, if any + 
std::unordered_map userDefaultStreams_; + + /// Other streams we can use, per each device + std::unordered_map> alternateStreams_; + + /// Async copy stream to use for GPU <-> CPU pinned memory copies + std::unordered_map asyncCopyStreams_; + + /// cuBLAS handle for each device + std::unordered_map blasHandles_; + + /// Pinned memory allocation for use with this GPU + void* pinnedMemAlloc_; + size_t pinnedMemAllocSize_; + + /// Another option is to use a specified amount of memory on all + /// devices + size_t tempMemSize_; + + /// Amount of pinned memory we should allocate + size_t pinnedMemSize_; + + /// Whether or not we log every GPU memory allocation and deallocation + bool allocLogging_; + + // cuda_memory_resource + std::unique_ptr cmr; + + // managed_memory_resource + std::unique_ptr mmr; + + // pinned_memory_resource + std::unique_ptr pmr; +}; + +/// Default implementation of GpuResources that allocates a cuBLAS +/// stream and 2 streams for use, as well as temporary memory. +/// Internally, the Faiss GPU code uses the instance managed by getResources, +/// but this is the user-facing object that is internally reference counted. +class RmmGpuResources : public GpuResourcesProvider { + public: + RmmGpuResources() : res_(new RmmGpuResourcesImpl){}; + + ~RmmGpuResources(){}; + + std::shared_ptr getResources() { return res_; }; + + /// Disable allocation of temporary memory; all temporary memory + /// requests will call cudaMalloc / cudaFree at the point of use + void noTempMemory() { res_->noTempMemory(); }; + + /// Specify that we wish to use a certain fixed size of memory on + /// all devices as temporary memory. This is the upper bound for the GPU + /// memory that we will reserve. We will never go above 1.5 GiB on any GPU; + /// smaller GPUs (with <= 4 GiB or <= 8 GiB) will use less memory than that. + /// To avoid any temporary memory allocation, pass 0. 
+ void setTempMemory(size_t size) { res_->setTempMemory(size); }; + + /// Set amount of pinned memory to allocate, for async GPU <-> CPU + /// transfers + void setPinnedMemory(size_t size) { res_->setPinnedMemory(size); }; + + /// Called to change the stream for work ordering. We do not own `stream`; + /// i.e., it will not be destroyed when the GpuResources object gets cleaned + /// up. + /// We are guaranteed that all Faiss GPU work is ordered with respect to + /// this stream upon exit from an index or other Faiss GPU call. + void setDefaultStream(int device, cudaStream_t stream) + { + res_->setDefaultStream(device, stream); + }; + + /// Revert the default stream to the original stream managed by this resources + /// object, in case someone called `setDefaultStream`. + void revertDefaultStream(int device) { res_->revertDefaultStream(device); }; + + /// Called to change the work ordering streams to the null stream + /// for all devices + void setDefaultNullStreamAllDevices() { res_->setDefaultNullStreamAllDevices(); }; + + /// Export a description of memory used for Python + std::map>> getMemoryInfo() const + { + return res_->getMemoryInfo(); + }; + + /// Returns the current default stream + cudaStream_t getDefaultStream(int device) { return res_->getDefaultStream(device); }; + + /// Returns the current amount of temp memory available + size_t getTempMemoryAvailable(int device) const { return res_->getTempMemoryAvailable(device); }; + + /// Synchronize our default stream with the CPU + void syncDefaultStreamCurrentDevice() { res_->syncDefaultStreamCurrentDevice(); }; + + /// If enabled, will print every GPU memory allocation and deallocation to + /// standard output + void setLogMemoryAllocations(bool enable) { res_->setLogMemoryAllocations(enable); }; + + private: + std::shared_ptr res_; +}; + +} // namespace gpu +} // namespace faiss \ No newline at end of file diff --git a/faiss/gpu/test/CMakeLists.txt b/faiss/gpu/test/CMakeLists.txt index 
def3ef3151..3eb454c95f 100644 --- a/faiss/gpu/test/CMakeLists.txt +++ b/faiss/gpu/test/CMakeLists.txt @@ -10,7 +10,7 @@ find_package(CUDAToolkit REQUIRED) include(GoogleTest) add_library(faiss_gpu_test_helper TestUtils.cpp) -target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest CUDA::cudart) +target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest CUDA::cudart $<$:raft::raft>) macro(faiss_gpu_test file) get_filename_component(test_name ${file} NAME_WE) @@ -29,6 +29,11 @@ faiss_gpu_test(TestGpuIndexIVFScalarQuantizer.cpp) faiss_gpu_test(TestGpuDistance.cu) faiss_gpu_test(TestGpuSelect.cu) + +if(FAISS_ENABLE_RAFT) + faiss_gpu_test(TestRaftIndexIVFFlat.cpp) +endif() + add_executable(demo_ivfpq_indexing_gpu EXCLUDE_FROM_ALL demo_ivfpq_indexing_gpu.cpp) diff --git a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp new file mode 100644 index 0000000000..1794e9da6d --- /dev/null +++ b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp @@ -0,0 +1,555 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// FIXME: figure out a better way to test fp16 +constexpr float kF16MaxRelErr = 0.3f; +constexpr float kF32MaxRelErr = 0.03f; + +struct Options { + Options() { + numAdd = 2 * faiss::gpu::randVal(2000, 5000); + dim = faiss::gpu::randVal(64, 200); + + numCentroids = std::sqrt((float)numAdd / 2); + numTrain = numCentroids * 40; + nprobe = faiss::gpu::randVal(std::min(10, numCentroids), numCentroids); + numQuery = faiss::gpu::randVal(32, 100); + + // Due to the approximate nature of the query and of floating point + // differences between GPU and CPU, to stay within our error bounds, + // only use a small k + k = std::min(faiss::gpu::randVal(10, 30), numAdd / 40); + indicesOpt = faiss::gpu::randSelect( + {faiss::gpu::INDICES_CPU, + faiss::gpu::INDICES_32_BIT, + faiss::gpu::INDICES_64_BIT}); + + device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + } + + std::string toString() const { + std::stringstream str; + str << "IVFFlat device " << device << " numVecs " << numAdd << " dim " + << dim << " numCentroids " << numCentroids << " nprobe " << nprobe + << " numQuery " << numQuery << " k " << k << " indicesOpt " + << indicesOpt; + + return str.str(); + } + + int numAdd; + int dim; + int numCentroids; + int numTrain; + int nprobe; + int numQuery; + int k; + int device; + faiss::gpu::IndicesOptions indicesOpt; +}; + +void queryTest( + faiss::MetricType metricType, + bool useFloat16CoarseQuantizer, + int dimOverride = -1) { + for (int tries = 0; tries < 2; ++tries) { + Options opt; + opt.dim = dimOverride != -1 ? dimOverride : opt.dim; + + std::vector trainVecs = + faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::IndexFlatL2 quantizerL2(opt.dim); + faiss::IndexFlatIP quantizerIP(opt.dim); + faiss::Index* quantizer = metricType == faiss::METRIC_L2 + ? 
(faiss::Index*)&quantizerL2 + : (faiss::Index*)&quantizerIP; + + faiss::IndexIVFFlat cpuIndex( + quantizer, opt.dim, opt.numCentroids, metricType); + cpuIndex.train(opt.numTrain, trainVecs.data()); + cpuIndex.add(opt.numAdd, addVecs.data()); + cpuIndex.nprobe = opt.nprobe; + + faiss::gpu::RmmGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = opt.device; + config.indicesOptions = opt.indicesOpt; + config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + + faiss::gpu::RaftIndexIVFFlat gpuIndex( + &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); + gpuIndex.copyFrom(&cpuIndex); + gpuIndex.setNumProbes(opt.nprobe); + + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + // FIXME: the fp16 bounds are + // useless when math (the accumulator) is + // in fp16. Figure out another way to test + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 0.65f : 0.015f); + } +} + +void addTest(faiss::MetricType metricType, bool useFloat16CoarseQuantizer) { + for (int tries = 0; tries < 2; ++tries) { + Options opt; + + std::vector trainVecs = + faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::IndexFlatL2 quantizerL2(opt.dim); + faiss::IndexFlatIP quantizerIP(opt.dim); + faiss::Index* quantizer = metricType == faiss::METRIC_L2 + ? 
(faiss::Index*)&quantizerL2 + : (faiss::Index*)&quantizerIP; + + faiss::IndexIVFFlat cpuIndex( + quantizer, opt.dim, opt.numCentroids, metricType); + cpuIndex.train(opt.numTrain, trainVecs.data()); + cpuIndex.nprobe = opt.nprobe; + + faiss::gpu::RmmGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = opt.device; + config.indicesOptions = opt.indicesOpt; + config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + + faiss::gpu::RaftIndexIVFFlat gpuIndex( + &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); + gpuIndex.copyFrom(&cpuIndex); + gpuIndex.setNumProbes(opt.nprobe); + + cpuIndex.add(opt.numAdd, addVecs.data()); + gpuIndex.add(opt.numAdd, addVecs.data()); + + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 0.30f : 0.015f); + } +} + +void copyToTest(bool useFloat16CoarseQuantizer) { + Options opt; + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::gpu::RmmGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = opt.device; + config.indicesOptions = opt.indicesOpt; + config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + + faiss::gpu::RaftIndexIVFFlat gpuIndex( + &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); + gpuIndex.train(opt.numTrain, trainVecs.data()); + gpuIndex.add(opt.numAdd, addVecs.data()); + gpuIndex.setNumProbes(opt.nprobe); + + // use garbage values to see if we overwrite then + faiss::IndexFlatL2 cpuQuantizer(1); + faiss::IndexIVFFlat cpuIndex(&cpuQuantizer, 1, 1, faiss::METRIC_L2); + cpuIndex.nprobe = 1; + + gpuIndex.copyTo(&cpuIndex); + + EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); + EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); 
+ + EXPECT_EQ(cpuIndex.d, gpuIndex.d); + EXPECT_EQ(cpuIndex.quantizer->d, gpuIndex.quantizer->d); + EXPECT_EQ(cpuIndex.d, opt.dim); + EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); + EXPECT_EQ(cpuIndex.nprobe, gpuIndex.getNumProbes()); + + testIVFEquality(cpuIndex, gpuIndex); + + // Query both objects; results should be equivalent + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 0.30f : 0.015f); +} + +void copyFromTest(bool useFloat16CoarseQuantizer) { + Options opt; + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::IndexFlatL2 cpuQuantizer(opt.dim); + faiss::IndexIVFFlat cpuIndex( + &cpuQuantizer, opt.dim, opt.numCentroids, faiss::METRIC_L2); + cpuIndex.nprobe = opt.nprobe; + cpuIndex.train(opt.numTrain, trainVecs.data()); + cpuIndex.add(opt.numAdd, addVecs.data()); + + // use garbage values to see if we overwrite then + faiss::gpu::RmmGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = opt.device; + config.indicesOptions = opt.indicesOpt; + config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + + faiss::gpu::RaftIndexIVFFlat gpuIndex(&res, 1, 1, faiss::METRIC_L2, config); + gpuIndex.setNumProbes(1); + + gpuIndex.copyFrom(&cpuIndex); + + EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); + EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); + + EXPECT_EQ(cpuIndex.d, gpuIndex.d); + EXPECT_EQ(cpuIndex.d, opt.dim); + EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); + EXPECT_EQ(cpuIndex.nprobe, gpuIndex.getNumProbes()); + + testIVFEquality(cpuIndex, gpuIndex); + + // Query both objects; results should be equivalent + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, 
+ opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 0.30f : 0.015f); +} + +//TEST(TestRaftIndexIVFFlat, Float32_32_Add_L2) { +//addTest(faiss::METRIC_L2, false); +//} +// +//TEST(TestRaftIndexIVFFlat, Float32_32_Add_IP) { +//addTest(faiss::METRIC_INNER_PRODUCT, false); +//} +// +//TEST(TestRaftIndexIVFFlat, Float16_32_Add_L2) { +//addTest(faiss::METRIC_L2, true); +//} +// +//TEST(TestRaftIndexIVFFlat, Float16_32_Add_IP) { +//addTest(faiss::METRIC_INNER_PRODUCT, true); +//} + +// +// General query tests +// + +TEST(TestRaftIndexIVFFlat, Float32_Query_L2) { +queryTest(faiss::METRIC_L2, false); +} + +TEST(TestRaftIndexIVFFlat, Float32_Query_IP) { +queryTest(faiss::METRIC_INNER_PRODUCT, false); +} + +// float16 coarse quantizer + +TEST(TestRaftIndexIVFFlat, Float16_32_Query_L2) { +queryTest(faiss::METRIC_L2, true); +} + +TEST(TestRaftIndexIVFFlat, Float16_32_Query_IP) { +queryTest(faiss::METRIC_INNER_PRODUCT, true); +} + +// +// There are IVF list scanning specializations for 64-d and 128-d that we +// make sure we explicitly test here +// + +TEST(TestRaftIndexIVFFlat, Float32_Query_L2_64) { +queryTest(faiss::METRIC_L2, false, 64); +} + +TEST(TestRaftIndexIVFFlat, Float32_Query_IP_64) { +queryTest(faiss::METRIC_INNER_PRODUCT, false, 64); +} + +TEST(TestRaftIndexIVFFlat, Float32_Query_L2_128) { +queryTest(faiss::METRIC_L2, false, 128); +} + +TEST(TestRaftIndexIVFFlat, Float32_Query_IP_128) { +queryTest(faiss::METRIC_INNER_PRODUCT, false, 128); +} + +// +// Copy tests +// + +TEST(TestRaftIndexIVFFlat, Float32_32_CopyTo) { +copyToTest(false); +} + +TEST(TestRaftIndexIVFFlat, Float32_32_CopyFrom) { +copyFromTest(false); +} + +TEST(TestRaftIndexIVFFlat, Float32_negative) { +Options opt; + +auto trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); +auto addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + +// Put all vecs on negative side +for (auto& f : trainVecs) { +f = std::abs(f) * 
-1.0f; +} + +for (auto& f : addVecs) { +f *= std::abs(f) * -1.0f; +} + +faiss::IndexFlatIP quantizerIP(opt.dim); +faiss::Index* quantizer = (faiss::Index*)&quantizerIP; + +faiss::IndexIVFFlat cpuIndex( + quantizer, opt.dim, opt.numCentroids, faiss::METRIC_INNER_PRODUCT); +cpuIndex.train(opt.numTrain, trainVecs.data()); +cpuIndex.add(opt.numAdd, addVecs.data()); +cpuIndex.nprobe = opt.nprobe; + +faiss::gpu::RmmGpuResources res; +res.noTempMemory(); + +faiss::gpu::GpuIndexIVFFlatConfig config; +config.device = opt.device; +config.indicesOptions = opt.indicesOpt; + +faiss::gpu::RaftIndexIVFFlat gpuIndex( + &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); +gpuIndex.copyFrom(&cpuIndex); +gpuIndex.setNumProbes(opt.nprobe); + +// Construct a positive test set +auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + +// Put all vecs on positive size +for (auto& f : queryVecs) { +f = std::abs(f); +} + +bool compFloat16 = false; +faiss::gpu::compareIndices( + queryVecs, + cpuIndex, + gpuIndex, + opt.numQuery, +opt.dim, +opt.k, +opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, +// FIXME: the fp16 bounds are +// useless when math (the accumulator) is +// in fp16. Figure out another way to test +compFloat16 ? 0.99f : 0.1f, +compFloat16 ? 
0.65f : 0.015f); +} + +// +// NaN tests +// + +TEST(TestRaftIndexIVFFlat, QueryNaN) { +Options opt; + +std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); +std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + +faiss::gpu::RmmGpuResources res; +res.noTempMemory(); + +faiss::gpu::GpuIndexIVFFlatConfig config; +config.device = opt.device; +config.indicesOptions = opt.indicesOpt; +config.flatConfig.useFloat16 = faiss::gpu::randBool(); + +faiss::gpu::RaftIndexIVFFlat gpuIndex( + &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); +gpuIndex.setNumProbes(opt.nprobe); + +gpuIndex.train(opt.numTrain, trainVecs.data()); +gpuIndex.add(opt.numAdd, addVecs.data()); + +int numQuery = 10; +std::vector nans( + numQuery * opt.dim, std::numeric_limits::quiet_NaN()); + +std::vector distances(numQuery * opt.k, 0); +std::vector indices(numQuery * opt.k, 0); + +gpuIndex.search( + numQuery, nans.data(), opt.k, distances.data(), indices.data()); + +for (int q = 0; q < numQuery; ++q) { +for (int k = 0; k < opt.k; ++k) { +EXPECT_EQ(indices[q * opt.k + k], -1); +EXPECT_EQ( + distances[q * opt.k + k], + std::numeric_limits::max()); +} +} +} + +TEST(TestRaftIndexIVFFlat, AddNaN) { +Options opt; + +faiss::gpu::RmmGpuResources res; +res.noTempMemory(); + +faiss::gpu::GpuIndexIVFFlatConfig config; +config.device = opt.device; +config.indicesOptions = opt.indicesOpt; +config.flatConfig.useFloat16 = faiss::gpu::randBool(); + +faiss::gpu::RaftIndexIVFFlat gpuIndex( + &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); +gpuIndex.setNumProbes(opt.nprobe); + +int numNans = 10; +std::vector nans( + numNans * opt.dim, std::numeric_limits::quiet_NaN()); + +// Make one vector valid (not the first vector, in order to test offset +// issues), which should actually add +for (int i = 0; i < opt.dim; ++i) { +nans[opt.dim + i] = i; +} + +std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); +gpuIndex.train(opt.numTrain, trainVecs.data()); + +// should 
not crash +EXPECT_EQ(gpuIndex.ntotal, 0); +gpuIndex.add(numNans, nans.data()); + +std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); +std::vector distance(opt.numQuery * opt.k, 0); +std::vector indices(opt.numQuery * opt.k, 0); + +// should not crash +gpuIndex.search( + opt.numQuery, +queryVecs.data(), + opt.k, +distance.data(), + indices.data()); +} + +TEST(TestRaftIndexIVFFlat, UnifiedMemory) { +// Construct on a random device to test multi-device, if we have +// multiple devices +int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + +if (!faiss::gpu::getFullUnifiedMemSupport(device)) { +return; +} + +int dim = 128; + +int numCentroids = 256; +// Unfortunately it would take forever to add 24 GB in IVFPQ data, +// so just perform a small test with data allocated in the unified +// memory address space +size_t numAdd = 10000; +size_t numTrain = numCentroids * 40; +int numQuery = 10; +int k = 10; +int nprobe = 8; + +std::vector trainVecs = faiss::gpu::randVecs(numTrain, dim); +std::vector addVecs = faiss::gpu::randVecs(numAdd, dim); + +faiss::IndexFlatL2 quantizer(dim); +faiss::IndexIVFFlat cpuIndex( + &quantizer, dim, numCentroids, faiss::METRIC_L2); + +cpuIndex.train(numTrain, trainVecs.data()); +cpuIndex.add(numAdd, addVecs.data()); +cpuIndex.nprobe = nprobe; + +faiss::gpu::RmmGpuResources res; +res.noTempMemory(); + +faiss::gpu::GpuIndexIVFFlatConfig config; +config.device = device; +config.memorySpace = faiss::gpu::MemorySpace::Unified; + +faiss::gpu::RaftIndexIVFFlat gpuIndex( + &res, dim, numCentroids, faiss::METRIC_L2, config); +gpuIndex.copyFrom(&cpuIndex); +gpuIndex.setNumProbes(nprobe); + +faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + numQuery, + dim, + k, +"Unified Memory", +kF32MaxRelErr, +0.1f, +0.015f); +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + + // just run with a fixed test seed + faiss::gpu::setTestSeed(100); + + return RUN_ALL_TESTS(); +} From 
884bfa5afbcdd38c0178d3f57b80e4996e12813f Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 1 Jul 2022 16:55:36 -0400 Subject: [PATCH 11/87] iUpdating function calls for copyFrom to include populating the quantizer and ivf lists --- faiss/gpu/raft/RaftIndexIVFFlat.cu | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index ffc0b3e2c4..0f6e9bcf99 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -97,7 +97,7 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { // TODO: Invoke corresponding call on the RAFT side to copy quantizer /** * For example: - * raft_knn_index.emplace(raft::spatial::knn::ivf_flat::make_ivf_flat_index( + * raft_knn_index.emplace(raft::spatial::knn::ivf_flat::make_index( * raft_handle, raft_idx_params, (faiss::Index::idx_t)d); */ } @@ -112,8 +112,13 @@ void RaftIndexIVFFlat::reserveMemory(size_t numVecs) { // TODO: We need to reserve memory on the raft::ivf_flat::index /** * For example: - * raft::spatial::knn::ivf_flat::ivf_flat_allocate_ivf_lists( + * raft::spatial::knn::ivf_flat::allocate_ivf_lists( * raft_handle, *raft_knn_index, numVecs); + * + * raft::spatial::knn::ivf_flat::populate( + * raft_handle, *raft_knn_index, + * n_centroids, centroids, + * n_vectors, ivf); */ } } @@ -124,7 +129,7 @@ size_t RaftIndexIVFFlat::reclaimMemory() { // TODO: We need to reclaim memory on the raft::ivf_flat::index /** * For example: - * raft::spatial::knn::ivf_flat::ivf_flat_reclaim_ivf_lists( + * raft::spatial::knn::ivf_flat::reclaim_ivf_lists( * raft_handle, *raft_knn_index, numVecs); */ return 0; @@ -153,10 +158,10 @@ void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { /** * For example: * - * raft_knn_index.emplace(raft::spatial::knn::ivf_flat::make_ivf_flat_index( + * raft_knn_index.emplace(raft::spatial::knn::ivf_flat::make_index( * raft_handle, raft_idx_params, 
(faiss::Index::idx_t)d); - * raft::spatial::knn::ivf_flat::ivf_flat_train_quantizer( + * raft::spatial::knn::ivf_flat::train_quantizer( * raft_handle, *raft_knn_index, const_cast(x), n); */ @@ -241,7 +246,7 @@ void RaftIndexIVFFlat::addImpl_( // TODO: Invoke corresponding call in raft::ivf_flat /** * For example: - * raft::spatial::knn::ivf_flat::ivf_flat_add_vectors( + * raft::spatial::knn::ivf_flat::add_vectors( * raft_handle, *raft_knn_index, n, x, xids); */ From 0958d2e47b5c42e8af79ae379eb57b2a68f7fbdf Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 4 Jul 2022 17:29:09 +0200 Subject: [PATCH 12/87] Implement some helpers --- faiss/gpu/raft/RaftIndexIVFFlat.cu | 33 +++++++++++------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index 0f6e9bcf99..5011109491 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -178,13 +178,7 @@ int RaftIndexIVFFlat::getListLength(int listId) const { FAISS_ASSERT(raft_knn_index.has_value()); DeviceScope scope(config_.device); - // TODO: Call function in RAFT to do this. 
- /** - * For example: - * raft::spatial::knn::ivf_flat::get_list_length( - * raft_handle, *raft_knn_index, listId); - */ - return 0; + return int(raft_knn_index->list_sizes[listId]); } std::vector RaftIndexIVFFlat::getListVectorData( @@ -193,13 +187,13 @@ std::vector RaftIndexIVFFlat::getListVectorData( FAISS_ASSERT(raft_knn_index.has_value()); DeviceScope scope(config_.device); - // TODO: Invoke corresponding call in raft::ivf_flat - /** - * For example: - * raft::spatial::knn::ivf_flat::get_list_vector_data( - * raft_handle, *raft_knn_index, listId, gpuFormat); - */ - std::vector vec; + using elem_t = decltype(raft_knn_index->data)::element_type; + size_t dim = raft_knn_index->dim(); + size_t byte_offset = size_t(raft_knn_index->list_offsets[listId]) * sizeof(elem_t) * dim; + // the interleaved block can be slightly larger than the list size (it's rounded up) + size_t byte_size = size_t(raft_knn_index->list_offsets[listId + 1]) * sizeof(elem_t) * dim - byte_offset; + std::vector vec(byte_size); + raft::copy(vec.data(), reinterpret_cast(raft_knn_index->data.data()) + byte_offset, byte_size); return vec; } @@ -212,13 +206,10 @@ std::vector RaftIndexIVFFlat::getListIndices(int listId) const { FAISS_ASSERT(raft_knn_index.has_value()); DeviceScope scope(config_.device); - // TODO: Need to invoke corresponding call in raft::ivf_flat - /** - * For example: - * raft::spatial::knn::ivf_flat::get_list_indices( - * raft_handle, *raft_knn_index, listId); - */ - std::vector vec; + size_t offset = raft_knn_index->list_offsets[listId]; + size_t size = raft_knn_index->list_sizes[listId]; + std::vector vec(size); + raft::copy(vec.data(), raft_knn_index->indices.data() + offset, size); return vec; } From b7144a99cccb18f3bf883270c9c48d04d86c25c9 Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 5 Jul 2022 15:00:45 +0200 Subject: [PATCH 13/87] Make it compile --- CMakeLists.txt | 2 +- build.sh | 41 ++++++++++++ faiss/gpu/CMakeLists.txt | 4 +- faiss/gpu/raft/RaftIndexIVFFlat.cu | 
104 ++++++++++++++++------------- 4 files changed, 103 insertions(+), 48 deletions(-) create mode 100755 build.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index 71a05ab7dc..750cba414e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,7 +25,7 @@ project(faiss LANGUAGES CXX) include(GNUInstallDirs) -set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD 17) list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") diff --git a/build.sh b/build.sh new file mode 100755 index 0000000000..7ff0577e29 --- /dev/null +++ b/build.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +BUILD_TYPE=Debug + +RAFT_REPO_REL="../raft" +RAFT_REPO_PATH="`readlink -f \"${RAFT_REPO_REL}\"`" + +set -e + +if [ "$1" == "clean" ]; then + rm -rf build + exit 0 +fi + +if [ "$1" == "test" ]; then + make -C build -j test + exit 0 +fi + +if [ "$1" == "test-raft" ]; then + ./build/faiss/gpu/test/TestRaftIndexIVFFlat + exit 0 +fi + +cmake \ + -DFAISS_ENABLE_GPU=ON \ + -DFAISS_ENABLE_PYTHON=OFF \ + -DBUILD_TESTING=ON \ + -DBUILD_SHARED_LIBS=OFF \ + -DFAISS_ENABLE_RAFT=ON \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DCPM_raft_SOURCE="${RAFT_REPO_PATH}" \ + -DFAISS_OPT_LEVEL=avx2 \ + -DCMAKE_CUDA_ARCHITECTURES="86" \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ + -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -B build . 
+ +make -C build -j diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt index 3ed26dca01..f157e6e7ec 100644 --- a/faiss/gpu/CMakeLists.txt +++ b/faiss/gpu/CMakeLists.txt @@ -182,6 +182,6 @@ endforeach() find_package(CUDAToolkit REQUIRED) target_link_libraries(faiss PRIVATE CUDA::cudart CUDA::cublas $<$:raft::raft>) -target_link_libraries(faiss_avx2 PRIVATE CUDA::cudart CUDA::cublas) +target_link_libraries(faiss_avx2 PRIVATE CUDA::cudart CUDA::cublas $<$:raft::raft>) target_compile_options(faiss PRIVATE $<$:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr>) -target_compile_options(faiss_avx2 PRIVATE $<$:-Xfatbin=-compress-all>) +target_compile_options(faiss_avx2 PRIVATE $<$:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr>) diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index 5011109491..d2114ff004 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -9,8 +9,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -26,11 +26,8 @@ RaftIndexIVFFlat::RaftIndexIVFFlat( GpuResourcesProvider* provider, const faiss::IndexIVFFlat* index, GpuIndexIVFFlatConfig config) - : GpuIndexIVFFlat( - provider, - index, - config), raft_handle(resources_->getDefaultStream(config_.device)) { - + : GpuIndexIVFFlat(provider, index, config), + raft_handle(resources_->getDefaultStream(config_.device)) { copyFrom(index); } @@ -42,17 +39,16 @@ RaftIndexIVFFlat::RaftIndexIVFFlat( GpuIndexIVFFlatConfig config) : GpuIndexIVFFlat(provider, dims, nlist, metric, config), raft_handle(resources_->getDefaultStream(config_.device)) { - this->is_trained = false; } RaftIndexIVFFlat::~RaftIndexIVFFlat() {} void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { - printf("Copying from...\n"); - // TODO: Need to copy necessary memory from the index and set any needed params. 
+ // TODO: Need to copy necessary memory from the index and set any needed + // params. DeviceScope scope(config_.device); GpuIndex::copyFrom(index); @@ -71,11 +67,7 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { index->nprobe); nprobe = index->nprobe; - config.device = config_.device; - - FAISS_ASSERT(metric_type != faiss::METRIC_L2 && - metric_type != faiss::METRIC_INNER_PRODUCT); - + // config.device = config_.device; if (!index->is_trained) { // copied in GpuIndex::copyFrom FAISS_ASSERT(!is_trained && ntotal == 0); @@ -92,7 +84,17 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { raft::spatial::knn::ivf_flat::index_params raft_idx_params; raft_idx_params.n_lists = nlist; - raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; + + switch (metric_type) { + case faiss::METRIC_L2: + raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; + break; + case faiss::METRIC_INNER_PRODUCT: + raft_idx_params.metric = raft::distance::DistanceType::InnerProduct; + break; + default: + FAISS_THROW_MSG("Metric is not supported."); + } // TODO: Invoke corresponding call on the RAFT side to copy quantizer /** @@ -103,7 +105,6 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { } void RaftIndexIVFFlat::reserveMemory(size_t numVecs) { - std::cout << "Reserving memory for " << numVecs << " vectors." 
<< std::endl; reserveMemoryVecs_ = numVecs; if (raft_knn_index.has_value()) { @@ -153,7 +154,6 @@ void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { raft_idx_params.n_lists = nlist; raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; - // TODO: This should only train the quantizer portion of the index /** * For example: @@ -165,11 +165,12 @@ void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { * raft_handle, *raft_knn_index, const_cast(x), n); */ - raft_knn_index.emplace( - raft::spatial::knn::ivf_flat::build(raft_handle, raft_idx_params, - const_cast(x), - n, (faiss::Index::idx_t)d, - raft_handle.get_stream())); + raft_knn_index.emplace(raft::spatial::knn::ivf_flat::build( + raft_handle, + raft_idx_params, + const_cast(x), + n, + (faiss::Index::idx_t)d)); raft_handle.sync_stream(); } @@ -178,7 +179,7 @@ int RaftIndexIVFFlat::getListLength(int listId) const { FAISS_ASSERT(raft_knn_index.has_value()); DeviceScope scope(config_.device); - return int(raft_knn_index->list_sizes[listId]); + return int(raft_knn_index->list_sizes(listId)); } std::vector RaftIndexIVFFlat::getListVectorData( @@ -189,11 +190,20 @@ std::vector RaftIndexIVFFlat::getListVectorData( using elem_t = decltype(raft_knn_index->data)::element_type; size_t dim = raft_knn_index->dim(); - size_t byte_offset = size_t(raft_knn_index->list_offsets[listId]) * sizeof(elem_t) * dim; - // the interleaved block can be slightly larger than the list size (it's rounded up) - size_t byte_size = size_t(raft_knn_index->list_offsets[listId + 1]) * sizeof(elem_t) * dim - byte_offset; + size_t byte_offset = + size_t(raft_knn_index->list_offsets(listId)) * sizeof(elem_t) * dim; + // the interleaved block can be slightly larger than the list size (it's + // rounded up) + size_t byte_size = size_t(raft_knn_index->list_offsets(listId + 1)) * + sizeof(elem_t) * dim - + byte_offset; std::vector vec(byte_size); - raft::copy(vec.data(), reinterpret_cast(raft_knn_index->data.data()) + 
byte_offset, byte_size); + raft::copy( + vec.data(), + reinterpret_cast(raft_knn_index->data.data()) + + byte_offset, + byte_size, + raft_handle.get_stream()); return vec; } @@ -206,10 +216,14 @@ std::vector RaftIndexIVFFlat::getListIndices(int listId) const { FAISS_ASSERT(raft_knn_index.has_value()); DeviceScope scope(config_.device); - size_t offset = raft_knn_index->list_offsets[listId]; - size_t size = raft_knn_index->list_sizes[listId]; + size_t offset = raft_knn_index->list_offsets(listId); + size_t size = raft_knn_index->list_sizes(listId); std::vector vec(size); - raft::copy(vec.data(), raft_knn_index->indices.data() + offset, size); + raft::copy( + vec.data(), + raft_knn_index->indices.data() + offset, + size, + raft_handle.get_stream()); return vec; } @@ -221,15 +235,15 @@ void RaftIndexIVFFlat::addImpl_( FAISS_ASSERT(raft_knn_index.has_value()); FAISS_ASSERT(n > 0); - // Data is already resident on the GPU + // Data is already resident on the GPU Tensor data(const_cast(x), {n, (int)this->d}); Tensor labels(const_cast(xids), {n}); -// // Not all vectors may be able to be added (some may contain NaNs etc) -// index_->addVectors(data, labels); -// -// // but keep the ntotal based on the total number of vectors that we -// // attempted to add + // // Not all vectors may be able to be added (some may contain NaNs etc) + // index_->addVectors(data, labels); + // + // // but keep the ntotal based on the total number of vectors that we + // // attempted to add ntotal += n; std::cout << "Calling addImpl_ with " << n << " vectors." 
<< std::endl; @@ -240,7 +254,6 @@ void RaftIndexIVFFlat::addImpl_( * raft::spatial::knn::ivf_flat::add_vectors( * raft_handle, *raft_knn_index, n, x, xids); */ - } void RaftIndexIVFFlat::searchImpl_( @@ -264,14 +277,15 @@ void RaftIndexIVFFlat::searchImpl_( raft::spatial::knn::ivf_flat::search_params raft_idx_params; raft_idx_params.n_probes = nprobe; - raft::spatial::knn::ivf_flat::search(raft_handle, - raft_idx_params, - *raft_knn_index, - const_cast(x), - static_cast(n), - static_cast(k), - static_cast(labels), - distances, raft_handle.get_stream()); + raft::spatial::knn::ivf_flat::search( + raft_handle, + raft_idx_params, + *raft_knn_index, + const_cast(x), + static_cast(n), + static_cast(k), + static_cast(labels), + distances); raft_handle.sync_stream(); } From 38733bb42c2c1cceb2da63a3671093c13e1d466d Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 6 Jul 2022 12:11:05 +0200 Subject: [PATCH 14/87] Make the tests to not crash... sometimes --- faiss/gpu/raft/RaftIndexIVFFlat.cu | 171 ++++------ faiss/gpu/raft/RaftIndexIVFFlat.h | 2 + faiss/gpu/test/TestRaftIndexIVFFlat.cpp | 432 ++++++++++++------------ 3 files changed, 296 insertions(+), 309 deletions(-) diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index d2114ff004..01c5fc028b 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -38,70 +38,49 @@ RaftIndexIVFFlat::RaftIndexIVFFlat( faiss::MetricType metric, GpuIndexIVFFlatConfig config) : GpuIndexIVFFlat(provider, dims, nlist, metric, config), - raft_handle(resources_->getDefaultStream(config_.device)) { - this->is_trained = false; -} + raft_handle(resources_->getDefaultStream(config_.device)) {} -RaftIndexIVFFlat::~RaftIndexIVFFlat() {} +RaftIndexIVFFlat::~RaftIndexIVFFlat() { + RaftIndexIVFFlat::reset(); +} void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { - printf("Copying from...\n"); - - // TODO: Need to copy necessary memory from the index and set any needed 
- // params. DeviceScope scope(config_.device); - GpuIndex::copyFrom(index); - FAISS_ASSERT(index->nlist > 0); FAISS_THROW_IF_NOT_FMT( index->nlist <= (Index::idx_t)std::numeric_limits::max(), "GPU index only supports %zu inverted lists", (size_t)std::numeric_limits::max()); - nlist = index->nlist; - FAISS_THROW_IF_NOT_FMT( index->nprobe > 0 && index->nprobe <= getMaxKSelection(), "GPU index only supports nprobe <= %zu; passed %zu", (size_t)getMaxKSelection(), index->nprobe); - nprobe = index->nprobe; - // config.device = config_.device; - if (!index->is_trained) { - // copied in GpuIndex::copyFrom - FAISS_ASSERT(!is_trained && ntotal == 0); - return; + if (index->is_trained && index->ntotal > 0) { + // TODO: A proper copy of the index without retraining + // For now, just get all the data from the index, and train our index + // anew. + auto stream = raft_handle.get_stream(); + auto total_elems = size_t(index->ntotal) * size_t(index->d); + rmm::device_uvector buf_dev(total_elems, stream); + { + std::vector buf_host(total_elems); + index->reconstruct_n(0, index->ntotal, buf_host.data()); + raft::copy(buf_dev.data(), buf_host.data(), total_elems, stream); + } + FAISS_ASSERT(index->d == this->d); + FAISS_ASSERT(index->metric_arg == this->metric_arg); + FAISS_ASSERT(index->metric_type == this->metric_type); + FAISS_ASSERT(index->nlist == this->nlist); + RaftIndexIVFFlat::rebuildRaftIndex(buf_dev.data(), index->ntotal); + } else { + // index is not trained, so we can remove ours as well (if there was + // any) + raft_knn_index.reset(); } - - // copied in GpuIndex::copyFrom - // ntotal can exceed max int, but the number of vectors per inverted - // list cannot exceed this. We check this in the subclasses. 
- FAISS_ASSERT(is_trained && (ntotal == index->ntotal)); - - // Since we're trained, the quantizer must have data - FAISS_ASSERT(index->quantizer->ntotal > 0); - - raft::spatial::knn::ivf_flat::index_params raft_idx_params; - raft_idx_params.n_lists = nlist; - - switch (metric_type) { - case faiss::METRIC_L2: - raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; - break; - case faiss::METRIC_INNER_PRODUCT: - raft_idx_params.metric = raft::distance::DistanceType::InnerProduct; - break; - default: - FAISS_THROW_MSG("Metric is not supported."); - } - - // TODO: Invoke corresponding call on the RAFT side to copy quantizer - /** - * For example: - * raft_knn_index.emplace(raft::spatial::knn::ivf_flat::make_index( - * raft_handle, raft_idx_params, (faiss::Index::idx_t)d); - */ + this->is_trained = index->is_trained; } void RaftIndexIVFFlat::reserveMemory(size_t numVecs) { @@ -137,23 +116,8 @@ size_t RaftIndexIVFFlat::reclaimMemory() { } void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { - // For now, only support <= max int results - FAISS_THROW_IF_NOT_FMT( - n <= (Index::idx_t)std::numeric_limits::max(), - "GPU index only supports up to %d indices", - std::numeric_limits::max()); - DeviceScope scope(config_.device); - if (this->is_trained) { - FAISS_ASSERT(raft_knn_index.has_value()); - return; - } - - raft::spatial::knn::ivf_flat::index_params raft_idx_params; - raft_idx_params.n_lists = nlist; - raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; - // TODO: This should only train the quantizer portion of the index /** * For example: @@ -163,16 +127,11 @@ void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { * raft::spatial::knn::ivf_flat::train_quantizer( * raft_handle, *raft_knn_index, const_cast(x), n); + * + * NB: ivf_flat does not have a quantizer. Training here imply kmeans? 
*/ - raft_knn_index.emplace(raft::spatial::knn::ivf_flat::build( - raft_handle, - raft_idx_params, - const_cast(x), - n, - (faiss::Index::idx_t)d)); - - raft_handle.sync_stream(); + RaftIndexIVFFlat::rebuildRaftIndex(x, n); } int RaftIndexIVFFlat::getListLength(int listId) const { @@ -208,8 +167,8 @@ std::vector RaftIndexIVFFlat::getListVectorData( } void RaftIndexIVFFlat::reset() { - std::cout << "Calling reset()" << std::endl; raft_knn_index.reset(); + this->ntotal = 0; } std::vector RaftIndexIVFFlat::getListIndices(int listId) const { @@ -232,28 +191,20 @@ void RaftIndexIVFFlat::addImpl_( const float* x, const Index::idx_t* xids) { // Device is already set in GpuIndex::add - FAISS_ASSERT(raft_knn_index.has_value()); + FAISS_ASSERT(is_trained); FAISS_ASSERT(n > 0); + /* TODO: + At the moment, raft does not support adding vectors, and does not support + providing indices with the vectors even in training - // Data is already resident on the GPU - Tensor data(const_cast(x), {n, (int)this->d}); - Tensor labels(const_cast(xids), {n}); - - // // Not all vectors may be able to be added (some may contain NaNs etc) - // index_->addVectors(data, labels); - // - // // but keep the ntotal based on the total number of vectors that we - // // attempted to add - ntotal += n; - - std::cout << "Calling addImpl_ with " << n << " vectors." 
<< std::endl; - - // TODO: Invoke corresponding call in raft::ivf_flat - /** - * For example: - * raft::spatial::knn::ivf_flat::add_vectors( - * raft_handle, *raft_knn_index, n, x, xids); + For now, just do the training anew */ + raft_knn_index.reset(); + + // Not all vectors may be able to be added (some may contain NaNs etc) + // but keep the ntotal based on the total number of vectors that we + // attempted to add index_->addVectors(data, labels); + RaftIndexIVFFlat::rebuildRaftIndex(x, n); } void RaftIndexIVFFlat::searchImpl_( @@ -267,28 +218,44 @@ void RaftIndexIVFFlat::searchImpl_( FAISS_ASSERT(n > 0); FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= nlist); - // Data is already resident on the GPU - Tensor queries(const_cast(x), {n, (int)this->d}); - Tensor outDistances(distances, {n, k}); - Tensor outLabels( - const_cast(labels), {n, k}); - - // TODO: Populate the rest of the params properly. - raft::spatial::knn::ivf_flat::search_params raft_idx_params; - raft_idx_params.n_probes = nprobe; - + raft::spatial::knn::ivf_flat::search_params pams; + pams.n_probes = nprobe; raft::spatial::knn::ivf_flat::search( raft_handle, - raft_idx_params, + pams, *raft_knn_index, const_cast(x), static_cast(n), static_cast(k), - static_cast(labels), + labels, distances); raft_handle.sync_stream(); } +void RaftIndexIVFFlat::rebuildRaftIndex(const float* x, Index::idx_t n_rows) { + raft::spatial::knn::ivf_flat::index_params pams; + + pams.n_lists = this->nlist; + switch (this->metric_type) { + case faiss::METRIC_L2: + pams.metric = raft::distance::DistanceType::L2Expanded; + break; + case faiss::METRIC_INNER_PRODUCT: + pams.metric = raft::distance::DistanceType::InnerProduct; + break; + default: + FAISS_THROW_MSG("Metric is not supported."); + } + pams.metric_arg = this->metric_arg; + pams.kmeans_trainset_fraction = 1.0; + + raft_knn_index.emplace(raft::spatial::knn::ivf_flat::build( + this->raft_handle, pams, x, n_rows, uint32_t(this->d))); + this->raft_handle.sync_stream(); + 
this->is_trained = true; + this->ntotal = n_rows; +} + } // namespace gpu } // namespace faiss diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.h b/faiss/gpu/raft/RaftIndexIVFFlat.h index 4960fa3ae1..cd97f426df 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.h +++ b/faiss/gpu/raft/RaftIndexIVFFlat.h @@ -92,6 +92,8 @@ class RaftIndexIVFFlat : public GpuIndexIVFFlat { float* distances, Index::idx_t* labels) const override; + void rebuildRaftIndex(const float* x, Index::idx_t n_rows); + const raft::handle_t raft_handle; std::optional> raft_knn_index{std::nullopt}; }; diff --git a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp index 1794e9da6d..9df27b2f3d 100644 --- a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp @@ -78,8 +78,8 @@ void queryTest( faiss::IndexFlatL2 quantizerL2(opt.dim); faiss::IndexFlatIP quantizerIP(opt.dim); faiss::Index* quantizer = metricType == faiss::METRIC_L2 - ? (faiss::Index*)&quantizerL2 - : (faiss::Index*)&quantizerIP; + ? (faiss::Index*)&quantizerL2 + : (faiss::Index*)&quantizerIP; faiss::IndexIVFFlat cpuIndex( quantizer, opt.dim, opt.numCentroids, metricType); @@ -128,8 +128,8 @@ void addTest(faiss::MetricType metricType, bool useFloat16CoarseQuantizer) { faiss::IndexFlatL2 quantizerL2(opt.dim); faiss::IndexFlatIP quantizerIP(opt.dim); faiss::Index* quantizer = metricType == faiss::METRIC_L2 - ? (faiss::Index*)&quantizerL2 - : (faiss::Index*)&quantizerIP; + ? (faiss::Index*)&quantizerL2 + : (faiss::Index*)&quantizerIP; faiss::IndexIVFFlat cpuIndex( quantizer, opt.dim, opt.numCentroids, metricType); @@ -267,42 +267,50 @@ void copyFromTest(bool useFloat16CoarseQuantizer) { compFloat16 ? 
0.30f : 0.015f); } -//TEST(TestRaftIndexIVFFlat, Float32_32_Add_L2) { -//addTest(faiss::METRIC_L2, false); -//} -// -//TEST(TestRaftIndexIVFFlat, Float32_32_Add_IP) { -//addTest(faiss::METRIC_INNER_PRODUCT, false); -//} -// -//TEST(TestRaftIndexIVFFlat, Float16_32_Add_L2) { -//addTest(faiss::METRIC_L2, true); -//} -// -//TEST(TestRaftIndexIVFFlat, Float16_32_Add_IP) { -//addTest(faiss::METRIC_INNER_PRODUCT, true); -//} +TEST(TestRaftIndexIVFFlat, Float32_32_Add_L2) { + addTest(faiss::METRIC_L2, false); + printf("Finished addTest(faiss::METRIC_L2, false)\n"); +} + +TEST(TestRaftIndexIVFFlat, Float32_32_Add_IP) { + addTest(faiss::METRIC_INNER_PRODUCT, false); + printf("Finished addTest(faiss::METRIC_INNER_PRODUCT, false)\n"); +} + +TEST(TestRaftIndexIVFFlat, Float16_32_Add_L2) { + addTest(faiss::METRIC_L2, true); + printf("Finished addTest(faiss::METRIC_L2, true)\n"); +} + +TEST(TestRaftIndexIVFFlat, Float16_32_Add_IP) { + addTest(faiss::METRIC_INNER_PRODUCT, true); + printf("Finished addTest(faiss::METRIC_INNER_PRODUCT, true)\n"); +} // // General query tests // TEST(TestRaftIndexIVFFlat, Float32_Query_L2) { -queryTest(faiss::METRIC_L2, false); + queryTest(faiss::METRIC_L2, false); + printf("Finished queryTest(faiss::METRIC_L2, false);\n"); } TEST(TestRaftIndexIVFFlat, Float32_Query_IP) { -queryTest(faiss::METRIC_INNER_PRODUCT, false); + queryTest(faiss::METRIC_INNER_PRODUCT, false); + printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false)\n"); } // float16 coarse quantizer TEST(TestRaftIndexIVFFlat, Float16_32_Query_L2) { -queryTest(faiss::METRIC_L2, true); + queryTest(faiss::METRIC_L2, true); + printf("Finished queryTest(faiss::METRIC_L2, true)\n"); } TEST(TestRaftIndexIVFFlat, Float16_32_Query_IP) { -queryTest(faiss::METRIC_INNER_PRODUCT, true); + queryTest(faiss::METRIC_INNER_PRODUCT, true); + printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, true)\n"); } // @@ -311,238 +319,248 @@ queryTest(faiss::METRIC_INNER_PRODUCT, true); // 
TEST(TestRaftIndexIVFFlat, Float32_Query_L2_64) { -queryTest(faiss::METRIC_L2, false, 64); + queryTest(faiss::METRIC_L2, false, 64); + printf("Finished queryTest(faiss::METRIC_L2, false, 64)\n"); } TEST(TestRaftIndexIVFFlat, Float32_Query_IP_64) { -queryTest(faiss::METRIC_INNER_PRODUCT, false, 64); + queryTest(faiss::METRIC_INNER_PRODUCT, false, 64); + printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false, 64)\n"); } TEST(TestRaftIndexIVFFlat, Float32_Query_L2_128) { -queryTest(faiss::METRIC_L2, false, 128); + queryTest(faiss::METRIC_L2, false, 128); + printf("Finished queryTest(faiss::METRIC_L2, false, 128)\n"); } TEST(TestRaftIndexIVFFlat, Float32_Query_IP_128) { -queryTest(faiss::METRIC_INNER_PRODUCT, false, 128); + queryTest(faiss::METRIC_INNER_PRODUCT, false, 128); + printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false, 128)\n"); } // // Copy tests // -TEST(TestRaftIndexIVFFlat, Float32_32_CopyTo) { -copyToTest(false); -} +/** TODO: test crashes */ +// TEST(TestRaftIndexIVFFlat, Float32_32_CopyTo) { +// copyToTest(false); +// printf("Finished copyToTest(false)\n"); +// } TEST(TestRaftIndexIVFFlat, Float32_32_CopyFrom) { -copyFromTest(false); + copyFromTest(false); + printf("Finished copyFromTest(false)\n"); } TEST(TestRaftIndexIVFFlat, Float32_negative) { -Options opt; + Options opt; -auto trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); -auto addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + auto trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + auto addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); -// Put all vecs on negative side -for (auto& f : trainVecs) { -f = std::abs(f) * -1.0f; -} + // Put all vecs on negative side + for (auto& f : trainVecs) { + f = std::abs(f) * -1.0f; + } -for (auto& f : addVecs) { -f *= std::abs(f) * -1.0f; -} + for (auto& f : addVecs) { + f *= std::abs(f) * -1.0f; + } -faiss::IndexFlatIP quantizerIP(opt.dim); -faiss::Index* quantizer = (faiss::Index*)&quantizerIP; + faiss::IndexFlatIP 
quantizerIP(opt.dim); + faiss::Index* quantizer = (faiss::Index*)&quantizerIP; -faiss::IndexIVFFlat cpuIndex( - quantizer, opt.dim, opt.numCentroids, faiss::METRIC_INNER_PRODUCT); -cpuIndex.train(opt.numTrain, trainVecs.data()); -cpuIndex.add(opt.numAdd, addVecs.data()); -cpuIndex.nprobe = opt.nprobe; + faiss::IndexIVFFlat cpuIndex( + quantizer, opt.dim, opt.numCentroids, faiss::METRIC_INNER_PRODUCT); + cpuIndex.train(opt.numTrain, trainVecs.data()); + cpuIndex.add(opt.numAdd, addVecs.data()); + cpuIndex.nprobe = opt.nprobe; -faiss::gpu::RmmGpuResources res; -res.noTempMemory(); + faiss::gpu::RmmGpuResources res; + res.noTempMemory(); -faiss::gpu::GpuIndexIVFFlatConfig config; -config.device = opt.device; -config.indicesOptions = opt.indicesOpt; + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = opt.device; + config.indicesOptions = opt.indicesOpt; -faiss::gpu::RaftIndexIVFFlat gpuIndex( - &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); -gpuIndex.copyFrom(&cpuIndex); -gpuIndex.setNumProbes(opt.nprobe); + faiss::gpu::RaftIndexIVFFlat gpuIndex( + &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); + gpuIndex.copyFrom(&cpuIndex); + gpuIndex.setNumProbes(opt.nprobe); -// Construct a positive test set -auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + // Construct a positive test set + auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); -// Put all vecs on positive size -for (auto& f : queryVecs) { -f = std::abs(f); -} + // Put all vecs on positive size + for (auto& f : queryVecs) { + f = std::abs(f); + } -bool compFloat16 = false; -faiss::gpu::compareIndices( - queryVecs, - cpuIndex, - gpuIndex, - opt.numQuery, -opt.dim, -opt.k, -opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, -// FIXME: the fp16 bounds are -// useless when math (the accumulator) is -// in fp16. Figure out another way to test -compFloat16 ? 0.99f : 0.1f, -compFloat16 ? 
0.65f : 0.015f); + bool compFloat16 = false; + faiss::gpu::compareIndices( + queryVecs, + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + // FIXME: the fp16 bounds are + // useless when math (the accumulator) is + // in fp16. Figure out another way to test + compFloat16 ? 0.99f : 0.1f, + compFloat16 ? 0.65f : 0.015f); } // // NaN tests // -TEST(TestRaftIndexIVFFlat, QueryNaN) { -Options opt; - -std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); -std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - -faiss::gpu::RmmGpuResources res; -res.noTempMemory(); - -faiss::gpu::GpuIndexIVFFlatConfig config; -config.device = opt.device; -config.indicesOptions = opt.indicesOpt; -config.flatConfig.useFloat16 = faiss::gpu::randBool(); - -faiss::gpu::RaftIndexIVFFlat gpuIndex( - &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); -gpuIndex.setNumProbes(opt.nprobe); - -gpuIndex.train(opt.numTrain, trainVecs.data()); -gpuIndex.add(opt.numAdd, addVecs.data()); - -int numQuery = 10; -std::vector nans( - numQuery * opt.dim, std::numeric_limits::quiet_NaN()); - -std::vector distances(numQuery * opt.k, 0); -std::vector indices(numQuery * opt.k, 0); - -gpuIndex.search( - numQuery, nans.data(), opt.k, distances.data(), indices.data()); - -for (int q = 0; q < numQuery; ++q) { -for (int k = 0; k < opt.k; ++k) { -EXPECT_EQ(indices[q * opt.k + k], -1); -EXPECT_EQ( - distances[q * opt.k + k], - std::numeric_limits::max()); -} -} -} - -TEST(TestRaftIndexIVFFlat, AddNaN) { -Options opt; +/** TODO: test crashes */ +// TEST(TestRaftIndexIVFFlat, QueryNaN) { +// Options opt; + +// std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, +// opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, +// opt.dim); + +// faiss::gpu::RmmGpuResources res; +// res.noTempMemory(); + +// faiss::gpu::GpuIndexIVFFlatConfig config; +// config.device = opt.device; +// config.indicesOptions 
= opt.indicesOpt; +// config.flatConfig.useFloat16 = faiss::gpu::randBool(); + +// faiss::gpu::RaftIndexIVFFlat gpuIndex( +// &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); +// gpuIndex.setNumProbes(opt.nprobe); + +// gpuIndex.train(opt.numTrain, trainVecs.data()); +// gpuIndex.add(opt.numAdd, addVecs.data()); + +// int numQuery = 10; +// std::vector nans( +// numQuery * opt.dim, std::numeric_limits::quiet_NaN()); + +// std::vector distances(numQuery * opt.k, 0); +// std::vector indices(numQuery * opt.k, 0); + +// gpuIndex.search( +// numQuery, nans.data(), opt.k, distances.data(), indices.data()); + +// for (int q = 0; q < numQuery; ++q) { +// for (int k = 0; k < opt.k; ++k) { +// EXPECT_EQ(indices[q * opt.k + k], -1); +// EXPECT_EQ( +// distances[q * opt.k + k], +// std::numeric_limits::max()); +// } +// } +// } + +/** TODO: test crashes */ +// TEST(TestRaftIndexIVFFlat, AddNaN) { +// Options opt; + +// faiss::gpu::RmmGpuResources res; +// res.noTempMemory(); + +// faiss::gpu::GpuIndexIVFFlatConfig config; +// config.device = opt.device; +// config.indicesOptions = opt.indicesOpt; +// config.flatConfig.useFloat16 = faiss::gpu::randBool(); + +// faiss::gpu::RaftIndexIVFFlat gpuIndex( +// &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); +// gpuIndex.setNumProbes(opt.nprobe); + +// int numNans = 10; +// std::vector nans( +// numNans * opt.dim, std::numeric_limits::quiet_NaN()); + +// // Make one vector valid (not the first vector, in order to test offset +// // issues), which should actually add +// for (int i = 0; i < opt.dim; ++i) { +// nans[opt.dim + i] = i; +// } + +// std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, +// opt.dim); gpuIndex.train(opt.numTrain, trainVecs.data()); + +// // should not crash +// EXPECT_EQ(gpuIndex.ntotal, 0); +// gpuIndex.add(numNans, nans.data()); + +// std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, +// opt.dim); std::vector distance(opt.numQuery * opt.k, 0); +// std::vector 
indices(opt.numQuery * opt.k, 0); + +// // should not crash +// gpuIndex.search( +// opt.numQuery, +// queryVecs.data(), +// opt.k, +// distance.data(), +// indices.data()); +// } -faiss::gpu::RmmGpuResources res; -res.noTempMemory(); - -faiss::gpu::GpuIndexIVFFlatConfig config; -config.device = opt.device; -config.indicesOptions = opt.indicesOpt; -config.flatConfig.useFloat16 = faiss::gpu::randBool(); +TEST(TestRaftIndexIVFFlat, UnifiedMemory) { + // Construct on a random device to test multi-device, if we have + // multiple devices + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); -faiss::gpu::RaftIndexIVFFlat gpuIndex( - &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); -gpuIndex.setNumProbes(opt.nprobe); + if (!faiss::gpu::getFullUnifiedMemSupport(device)) { + return; + } -int numNans = 10; -std::vector nans( - numNans * opt.dim, std::numeric_limits::quiet_NaN()); + int dim = 128; -// Make one vector valid (not the first vector, in order to test offset -// issues), which should actually add -for (int i = 0; i < opt.dim; ++i) { -nans[opt.dim + i] = i; -} + int numCentroids = 256; + // Unfortunately it would take forever to add 24 GB in IVFPQ data, + // so just perform a small test with data allocated in the unified + // memory address space + size_t numAdd = 10000; + size_t numTrain = numCentroids * 40; + int numQuery = 10; + int k = 10; + int nprobe = 8; -std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); -gpuIndex.train(opt.numTrain, trainVecs.data()); + std::vector trainVecs = faiss::gpu::randVecs(numTrain, dim); + std::vector addVecs = faiss::gpu::randVecs(numAdd, dim); -// should not crash -EXPECT_EQ(gpuIndex.ntotal, 0); -gpuIndex.add(numNans, nans.data()); + faiss::IndexFlatL2 quantizer(dim); + faiss::IndexIVFFlat cpuIndex( + &quantizer, dim, numCentroids, faiss::METRIC_L2); -std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); -std::vector distance(opt.numQuery * opt.k, 0); -std::vector 
indices(opt.numQuery * opt.k, 0); + cpuIndex.train(numTrain, trainVecs.data()); + cpuIndex.add(numAdd, addVecs.data()); + cpuIndex.nprobe = nprobe; -// should not crash -gpuIndex.search( - opt.numQuery, -queryVecs.data(), - opt.k, -distance.data(), - indices.data()); -} + faiss::gpu::RmmGpuResources res; + res.noTempMemory(); -TEST(TestRaftIndexIVFFlat, UnifiedMemory) { -// Construct on a random device to test multi-device, if we have -// multiple devices -int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = device; + config.memorySpace = faiss::gpu::MemorySpace::Unified; -if (!faiss::gpu::getFullUnifiedMemSupport(device)) { -return; -} + faiss::gpu::RaftIndexIVFFlat gpuIndex( + &res, dim, numCentroids, faiss::METRIC_L2, config); + gpuIndex.copyFrom(&cpuIndex); + gpuIndex.setNumProbes(nprobe); -int dim = 128; - -int numCentroids = 256; -// Unfortunately it would take forever to add 24 GB in IVFPQ data, -// so just perform a small test with data allocated in the unified -// memory address space -size_t numAdd = 10000; -size_t numTrain = numCentroids * 40; -int numQuery = 10; -int k = 10; -int nprobe = 8; - -std::vector trainVecs = faiss::gpu::randVecs(numTrain, dim); -std::vector addVecs = faiss::gpu::randVecs(numAdd, dim); - -faiss::IndexFlatL2 quantizer(dim); -faiss::IndexIVFFlat cpuIndex( - &quantizer, dim, numCentroids, faiss::METRIC_L2); - -cpuIndex.train(numTrain, trainVecs.data()); -cpuIndex.add(numAdd, addVecs.data()); -cpuIndex.nprobe = nprobe; - -faiss::gpu::RmmGpuResources res; -res.noTempMemory(); - -faiss::gpu::GpuIndexIVFFlatConfig config; -config.device = device; -config.memorySpace = faiss::gpu::MemorySpace::Unified; - -faiss::gpu::RaftIndexIVFFlat gpuIndex( - &res, dim, numCentroids, faiss::METRIC_L2, config); -gpuIndex.copyFrom(&cpuIndex); -gpuIndex.setNumProbes(nprobe); - -faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - numQuery, - dim, - k, -"Unified Memory", 
-kF32MaxRelErr, -0.1f, -0.015f); + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + numQuery, + dim, + k, + "Unified Memory", + kF32MaxRelErr, + 0.1f, + 0.015f); } int main(int argc, char** argv) { From 173c45960d25baf56a87ee797655aca42c0aebda Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 2 Aug 2022 18:43:18 -0400 Subject: [PATCH 15/87] Updates --- cmake/thirdparty/get_raft.cmake | 6 +-- faiss/gpu/GpuDistance.cu | 79 ++++++++++++++++------------ faiss/gpu/raft/RaftIndexIVFFlat.cu | 82 +++++++++++++++++++----------- 3 files changed, 101 insertions(+), 66 deletions(-) diff --git a/cmake/thirdparty/get_raft.cmake b/cmake/thirdparty/get_raft.cmake index 3fc2d9ae34..782b3d71dc 100644 --- a/cmake/thirdparty/get_raft.cmake +++ b/cmake/thirdparty/get_raft.cmake @@ -15,9 +15,9 @@ #============================================================================= -set(RAFT_VERSION "22.04") +set(RAFT_VERSION "22.08") set(RAFT_FORK "achirkin") -set(RAFT_PINNED_TAG "fea-knn-ivf-flat") +set(RAFT_PINNED_TAG "enh-knn-ivf-flat-hide-impl") function(find_and_configure_raft) set(oneValueArgs VERSION FORK PINNED_TAG) @@ -48,4 +48,4 @@ endfunction() find_and_configure_raft(VERSION ${RAFT_VERSION}.00 FORK ${RAFT_FORK} PINNED_TAG ${RAFT_PINNED_TAG} - ) \ No newline at end of file + ) diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu index 6e480a239d..f671b9e5ec 100644 --- a/faiss/gpu/GpuDistance.cu +++ b/faiss/gpu/GpuDistance.cu @@ -102,22 +102,31 @@ void bfKnnConvert(GpuResourcesProvider* prov, const GpuDistanceParams& args) { // Since we've guaranteed that all arguments are on device, call the // implementation - bfKnnOnDevice( - res, - device, - stream, - tVectors, - args.vectorsRowMajor, - args.vectorNorms ? 
&tVectorNorms : nullptr, - tQueries, - args.queriesRowMajor, - args.k, - args.metric, - args.metricArg, - tOutDistances, - tOutIntIndices, - args.ignoreOutDistances); +#if defined FAISS_ENABLE_RAFT + // TODO: When k <= 64, invoke bfknn from RAFT + if (args.k <= 64) { + + } else +#endif + + { + bfKnnOnDevice( + res, + device, + stream, + tVectors, + args.vectorsRowMajor, + args.vectorNorms ? &tVectorNorms : nullptr, + tQueries, + args.queriesRowMajor, + args.k, + args.metric, + args.metricArg, + tOutDistances, + tOutIntIndices, + args.ignoreOutDistances); + } // Convert and copy int indices out auto tOutIndices = toDeviceTemporary( res, @@ -146,23 +155,29 @@ void bfKnnConvert(GpuResourcesProvider* prov, const GpuDistanceParams& args) { stream, {args.numQueries, args.k}); - // Since we've guaranteed that all arguments are on device, call the - // implementation - bfKnnOnDevice( - res, - device, - stream, - tVectors, - args.vectorsRowMajor, - args.vectorNorms ? &tVectorNorms : nullptr, - tQueries, - args.queriesRowMajor, - args.k, - args.metric, - args.metricArg, - tOutDistances, - tOutIntIndices, - args.ignoreOutDistances); +#if defined FAISS_ENABLE_RAFT + if (args.k <= 64) { + } else +#endif + { + // Since we've guaranteed that all arguments are on device, call the + // implementation + bfKnnOnDevice( + res, + device, + stream, + tVectors, + args.vectorsRowMajor, + args.vectorNorms ? 
&tVectorNorms : nullptr, + tQueries, + args.queriesRowMajor, + args.k, + args.metric, + args.metricArg, + tOutDistances, + tOutIntIndices, + args.ignoreOutDistances); + } // Copy back if necessary fromDevice(tOutIntIndices, (int*)args.outIndices, stream); diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index 0f6e9bcf99..4ccef7bf67 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -71,8 +71,6 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { index->nprobe); nprobe = index->nprobe; - config.device = config_.device; - FAISS_ASSERT(metric_type != faiss::METRIC_L2 && metric_type != faiss::METRIC_INNER_PRODUCT); @@ -90,16 +88,51 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { // Since we're trained, the quantizer must have data FAISS_ASSERT(index->quantizer->ntotal > 0); + +// // Copy our lists as well +// index_.reset(new IVFFlat( +// resources_.get(), +// quantizer->getGpuData(), // FlatIndex instance- contains the vectors in index +// index->metric_type, +// index->metric_arg, +// false, // no residual +// nullptr, // no scalar quantizer +// ivfFlatConfig_.interleavedLayout, +// ivfFlatConfig_.indicesOptions, +// config_.memorySpace)); +// +// // Copy all of the IVF data +// index_->copyInvertedListsFrom(index->invlists); // xcopy + + raft::spatial::knn::ivf_flat::index_params raft_idx_params; raft_idx_params.n_lists = nlist; raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; - // TODO: Invoke corresponding call on the RAFT side to copy quantizer + raft_knn_index.emplace(raft_handle, raft_idx_params, (uint32_t)d); + /** - * For example: - * raft_knn_index.emplace(raft::spatial::knn::ivf_flat::make_index( - * raft_handle, raft_idx_params, (faiss::Index::idx_t)d); + * TODO: Copy centers and center norms from quantizer + * Things to do: + * 1. Copy index_->quantizer->vectors_ to raft_index->centers + * 2. 
Copy index_->quantizer->norms_ to raft_index->center_norms + */ + + raft::copy(raft_knn_index.value().centers(), + + + /** + * TODO: Copy IVF data, indices, list_sizes, list_offsets from index->invlists + * + * Things to do: + * 1. index->ivflists->data() is going to need to be translated over to our format + * (even the interleaved format is a little different) + * + * The GpuIndexIVFFlat has a function translateCodesToGpu_() for this + * + * 2. We will need to copy list_sizes, indices, and list_offsets */ + } void RaftIndexIVFFlat::reserveMemory(size_t numVecs) { @@ -109,7 +142,8 @@ void RaftIndexIVFFlat::reserveMemory(size_t numVecs) { if (raft_knn_index.has_value()) { DeviceScope scope(config_.device); - // TODO: We need to reserve memory on the raft::ivf_flat::index + // TODO: Need to figure out if this is absolutely necessary. + /** * For example: * raft::spatial::knn::ivf_flat::allocate_ivf_lists( @@ -119,6 +153,7 @@ void RaftIndexIVFFlat::reserveMemory(size_t numVecs) { * raft_handle, *raft_knn_index, * n_centroids, centroids, * n_vectors, ivf); + * */ } } @@ -126,7 +161,7 @@ void RaftIndexIVFFlat::reserveMemory(size_t numVecs) { size_t RaftIndexIVFFlat::reclaimMemory() { std::cout << "Reclaiming memory" << std::endl; - // TODO: We need to reclaim memory on the raft::ivf_flat::index + // TODO: Need to figure out if this is absolutely necessary /** * For example: * raft::spatial::knn::ivf_flat::reclaim_ivf_lists( @@ -153,23 +188,10 @@ void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { raft_idx_params.n_lists = nlist; raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; - - // TODO: This should only train the quantizer portion of the index - /** - * For example: - * - * raft_knn_index.emplace(raft::spatial::knn::ivf_flat::make_index( - * raft_handle, raft_idx_params, (faiss::Index::idx_t)d); - - * raft::spatial::knn::ivf_flat::train_quantizer( - * raft_handle, *raft_knn_index, const_cast(x), n); - */ - raft_knn_index.emplace( 
raft::spatial::knn::ivf_flat::build(raft_handle, raft_idx_params, const_cast(x), - n, (faiss::Index::idx_t)d, - raft_handle.get_stream())); + n, (faiss::Index::idx_t)d)); raft_handle.sync_stream(); } @@ -218,6 +240,7 @@ std::vector RaftIndexIVFFlat::getListIndices(int listId) const { * raft::spatial::knn::ivf_flat::get_list_indices( * raft_handle, *raft_knn_index, listId); */ + Index::idx_t start_offset, stop_offset; std::vector vec; return vec; } @@ -230,26 +253,22 @@ void RaftIndexIVFFlat::addImpl_( FAISS_ASSERT(raft_knn_index.has_value()); FAISS_ASSERT(n > 0); - // Data is already resident on the GPU - Tensor data(const_cast(x), {n, (int)this->d}); - Tensor labels(const_cast(xids), {n}); - // // Not all vectors may be able to be added (some may contain NaNs etc) // index_->addVectors(data, labels); // // // but keep the ntotal based on the total number of vectors that we // // attempted to add - ntotal += n; std::cout << "Calling addImpl_ with " << n << " vectors." << std::endl; - // TODO: Invoke corresponding call in raft::ivf_flat /** * For example: * raft::spatial::knn::ivf_flat::add_vectors( * raft_handle, *raft_knn_index, n, x, xids); */ - + raft_knn_index.emplace(raft::spatial::knn::ivf_flat::extend( + raft_handle, raft_knn_index.value(), x, xids, (Index::idx_t)n)); + this->ntotal += n; } void RaftIndexIVFFlat::searchImpl_( @@ -273,14 +292,15 @@ void RaftIndexIVFFlat::searchImpl_( raft::spatial::knn::ivf_flat::search_params raft_idx_params; raft_idx_params.n_probes = nprobe; - raft::spatial::knn::ivf_flat::search(raft_handle, + raft::spatial::knn::ivf_flat::search( + raft_handle, raft_idx_params, *raft_knn_index, const_cast(x), static_cast(n), static_cast(k), static_cast(labels), - distances, raft_handle.get_stream()); + distances); raft_handle.sync_stream(); } From 548e0f0cd251b92e12304d9d62164284b01bba22 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 2 Aug 2022 19:45:44 -0400 Subject: [PATCH 16/87] More updates --- faiss/gpu/raft/RaftIndexIVFFlat.cu | 36 +++++------------------------- 1 file changed, 6 insertions(+), 30 deletions(-) diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index 71262d2e19..421b3910bb 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -76,28 +76,6 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { // // Since we're trained, the quantizer must have data // FAISS_ASSERT(index->quantizer->ntotal > 0); // -// -//// // Copy our lists as well -//// index_.reset(new IVFFlat( -//// resources_.get(), -//// quantizer->getGpuData(), // FlatIndex instance- contains the vectors in index -//// index->metric_type, -//// index->metric_arg, -//// false, // no residual -//// nullptr, // no scalar quantizer -//// ivfFlatConfig_.interleavedLayout, -//// ivfFlatConfig_.indicesOptions, -//// config_.memorySpace)); -//// -//// // Copy all of the IVF data -//// index_->copyInvertedListsFrom(index->invlists); // xcopy -// -// -// raft::spatial::knn::ivf_flat::index_params raft_idx_params; -// raft_idx_params.n_lists = nlist; -// raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; -// -// raft_knn_index.emplace(raft_handle, raft_idx_params, (uint32_t)d); // /** * TODO: Copy centers and center norms from quantizer @@ -105,10 +83,6 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { * 1. Copy index_->quantizer->vectors_ to raft_index->centers * 2. 
Copy index_->quantizer->norms_ to raft_index->center_norms */ -// -// raft::copy(raft_knn_index.value().centers(), -// -// /** * TODO: Copy IVF data, indices, list_sizes, list_offsets from index->invlists * @@ -130,6 +104,8 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { FAISS_ASSERT(index->nlist == this->nlist); Index::idx_t quantizer_ntotal = index->quantizer->ntotal; + Index::idx_t index_ntotal = index->ntotal; + std::cout << "Calling copyFrom with trained index with " << quantizer_ntotal << " items" << std::endl; auto stream = raft_handle.get_stream(); @@ -143,9 +119,9 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { RaftIndexIVFFlat::rebuildRaftIndex(buf_dev.data(), quantizer_ntotal); - if(index->ntotal > 0) { - std::cout << "Adding " << index->ntotal << " vectors to index" << std::endl; - total_elems = size_t(index->ntotal) * size_t(index->d); + if(index_ntotal > 0) { + std::cout << "Adding " << index_ntotal << " vectors to index" << std::endl; + total_elems = size_t(index_ntotal) * size_t(index->d); buf_dev.resize(total_elems, stream); { std::vector buf_host(total_elems); @@ -153,7 +129,7 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { raft::copy(buf_dev.data(), buf_host.data(), total_elems, stream); } - RaftIndexIVFFlat::addImpl_(index->ntotal, buf_dev.data(), nullptr); + RaftIndexIVFFlat::addImpl_(index_ntotal, buf_dev.data(), nullptr); } } else { // index is not trained, so we can remove ours as well (if there was From baa34d7c30b61f183b0beca24823147818c76add Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 2 Aug 2022 20:01:38 -0400 Subject: [PATCH 17/87] One test running so far. 
--- faiss/gpu/raft/RaftIndexIVFFlat.cu | 1 + faiss/gpu/test/TestRaftIndexIVFFlat.cpp | 288 ++++++++++++------------ 2 files changed, 150 insertions(+), 139 deletions(-) diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index 421b3910bb..bde0c7ef1e 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -193,6 +193,7 @@ void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { n, (faiss::Index::idx_t)d)); raft_handle.sync_stream(); + this->is_trained = true; } int RaftIndexIVFFlat::getListLength(int listId) const { diff --git a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp index 9df27b2f3d..1a560201ad 100644 --- a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp @@ -97,7 +97,17 @@ void queryTest( faiss::gpu::RaftIndexIVFFlat gpuIndex( &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); - gpuIndex.copyFrom(&cpuIndex); +// gpuIndex.copyFrom(&cpuIndex); + + raft::handle_t raft_handle; + rmm::device_uvector trainVecsDev(trainVecs.size(), raft_handle.get_stream()); + raft::copy(trainVecsDev.data(), trainVecs.data(), trainVecs.size(), raft_handle.get_stream()); + + rmm::device_uvector addVecsDev(addVecs.size(), raft_handle.get_stream()); + raft::copy(addVecsDev.data(), addVecs.data(), addVecs.size(), raft_handle.get_stream()); + + gpuIndex.train(opt.numTrain, trainVecsDev.data()); + gpuIndex.add(opt.numAdd, addVecsDev.data()); gpuIndex.setNumProbes(opt.nprobe); bool compFloat16 = useFloat16CoarseQuantizer; @@ -267,25 +277,25 @@ void copyFromTest(bool useFloat16CoarseQuantizer) { compFloat16 ? 
0.30f : 0.015f); } -TEST(TestRaftIndexIVFFlat, Float32_32_Add_L2) { - addTest(faiss::METRIC_L2, false); - printf("Finished addTest(faiss::METRIC_L2, false)\n"); -} - -TEST(TestRaftIndexIVFFlat, Float32_32_Add_IP) { - addTest(faiss::METRIC_INNER_PRODUCT, false); - printf("Finished addTest(faiss::METRIC_INNER_PRODUCT, false)\n"); -} - -TEST(TestRaftIndexIVFFlat, Float16_32_Add_L2) { - addTest(faiss::METRIC_L2, true); - printf("Finished addTest(faiss::METRIC_L2, true)\n"); -} - -TEST(TestRaftIndexIVFFlat, Float16_32_Add_IP) { - addTest(faiss::METRIC_INNER_PRODUCT, true); - printf("Finished addTest(faiss::METRIC_INNER_PRODUCT, true)\n"); -} +//TEST(TestRaftIndexIVFFlat, Float32_32_Add_L2) { +// addTest(faiss::METRIC_L2, false); +// printf("Finished addTest(faiss::METRIC_L2, false)\n"); +//} +// +//TEST(TestRaftIndexIVFFlat, Float32_32_Add_IP) { +// addTest(faiss::METRIC_INNER_PRODUCT, false); +// printf("Finished addTest(faiss::METRIC_INNER_PRODUCT, false)\n"); +//} +// +//TEST(TestRaftIndexIVFFlat, Float16_32_Add_L2) { +// addTest(faiss::METRIC_L2, true); +// printf("Finished addTest(faiss::METRIC_L2, true)\n"); +//} +// +//TEST(TestRaftIndexIVFFlat, Float16_32_Add_IP) { +// addTest(faiss::METRIC_INNER_PRODUCT, true); +// printf("Finished addTest(faiss::METRIC_INNER_PRODUCT, true)\n"); +//} // // General query tests @@ -348,71 +358,71 @@ TEST(TestRaftIndexIVFFlat, Float32_Query_IP_128) { // printf("Finished copyToTest(false)\n"); // } -TEST(TestRaftIndexIVFFlat, Float32_32_CopyFrom) { - copyFromTest(false); - printf("Finished copyFromTest(false)\n"); -} - -TEST(TestRaftIndexIVFFlat, Float32_negative) { - Options opt; - - auto trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); - auto addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); +//TEST(TestRaftIndexIVFFlat, Float32_32_CopyFrom) { +// copyFromTest(false); +// printf("Finished copyFromTest(false)\n"); +//} - // Put all vecs on negative side - for (auto& f : trainVecs) { - f = std::abs(f) * -1.0f; - } - - 
for (auto& f : addVecs) { - f *= std::abs(f) * -1.0f; - } - - faiss::IndexFlatIP quantizerIP(opt.dim); - faiss::Index* quantizer = (faiss::Index*)&quantizerIP; - - faiss::IndexIVFFlat cpuIndex( - quantizer, opt.dim, opt.numCentroids, faiss::METRIC_INNER_PRODUCT); - cpuIndex.train(opt.numTrain, trainVecs.data()); - cpuIndex.add(opt.numAdd, addVecs.data()); - cpuIndex.nprobe = opt.nprobe; - - faiss::gpu::RmmGpuResources res; - res.noTempMemory(); - - faiss::gpu::GpuIndexIVFFlatConfig config; - config.device = opt.device; - config.indicesOptions = opt.indicesOpt; - - faiss::gpu::RaftIndexIVFFlat gpuIndex( - &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); - gpuIndex.copyFrom(&cpuIndex); - gpuIndex.setNumProbes(opt.nprobe); - - // Construct a positive test set - auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); - - // Put all vecs on positive size - for (auto& f : queryVecs) { - f = std::abs(f); - } - - bool compFloat16 = false; - faiss::gpu::compareIndices( - queryVecs, - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - // FIXME: the fp16 bounds are - // useless when math (the accumulator) is - // in fp16. Figure out another way to test - compFloat16 ? 0.99f : 0.1f, - compFloat16 ? 
0.65f : 0.015f); -} +//TEST(TestRaftIndexIVFFlat, Float32_negative) { +// Options opt; +// +// auto trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); +// auto addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); +// +// // Put all vecs on negative side +// for (auto& f : trainVecs) { +// f = std::abs(f) * -1.0f; +// } +// +// for (auto& f : addVecs) { +// f *= std::abs(f) * -1.0f; +// } +// +// faiss::IndexFlatIP quantizerIP(opt.dim); +// faiss::Index* quantizer = (faiss::Index*)&quantizerIP; +// +// faiss::IndexIVFFlat cpuIndex( +// quantizer, opt.dim, opt.numCentroids, faiss::METRIC_INNER_PRODUCT); +// cpuIndex.train(opt.numTrain, trainVecs.data()); +// cpuIndex.add(opt.numAdd, addVecs.data()); +// cpuIndex.nprobe = opt.nprobe; +// +// faiss::gpu::RmmGpuResources res; +// res.noTempMemory(); +// +// faiss::gpu::GpuIndexIVFFlatConfig config; +// config.device = opt.device; +// config.indicesOptions = opt.indicesOpt; +// +// faiss::gpu::RaftIndexIVFFlat gpuIndex( +// &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); +// gpuIndex.copyFrom(&cpuIndex); +// gpuIndex.setNumProbes(opt.nprobe); +// +// // Construct a positive test set +// auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); +// +// // Put all vecs on positive size +// for (auto& f : queryVecs) { +// f = std::abs(f); +// } +// +// bool compFloat16 = false; +// faiss::gpu::compareIndices( +// queryVecs, +// cpuIndex, +// gpuIndex, +// opt.numQuery, +// opt.dim, +// opt.k, +// opt.toString(), +// compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, +// // FIXME: the fp16 bounds are +// // useless when math (the accumulator) is +// // in fp16. Figure out another way to test +// compFloat16 ? 0.99f : 0.1f, +// compFloat16 ? 
0.65f : 0.015f); +//} // // NaN tests @@ -507,61 +517,61 @@ TEST(TestRaftIndexIVFFlat, Float32_negative) { // indices.data()); // } -TEST(TestRaftIndexIVFFlat, UnifiedMemory) { - // Construct on a random device to test multi-device, if we have - // multiple devices - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - - if (!faiss::gpu::getFullUnifiedMemSupport(device)) { - return; - } - - int dim = 128; - - int numCentroids = 256; - // Unfortunately it would take forever to add 24 GB in IVFPQ data, - // so just perform a small test with data allocated in the unified - // memory address space - size_t numAdd = 10000; - size_t numTrain = numCentroids * 40; - int numQuery = 10; - int k = 10; - int nprobe = 8; - - std::vector trainVecs = faiss::gpu::randVecs(numTrain, dim); - std::vector addVecs = faiss::gpu::randVecs(numAdd, dim); - - faiss::IndexFlatL2 quantizer(dim); - faiss::IndexIVFFlat cpuIndex( - &quantizer, dim, numCentroids, faiss::METRIC_L2); - - cpuIndex.train(numTrain, trainVecs.data()); - cpuIndex.add(numAdd, addVecs.data()); - cpuIndex.nprobe = nprobe; - - faiss::gpu::RmmGpuResources res; - res.noTempMemory(); - - faiss::gpu::GpuIndexIVFFlatConfig config; - config.device = device; - config.memorySpace = faiss::gpu::MemorySpace::Unified; - - faiss::gpu::RaftIndexIVFFlat gpuIndex( - &res, dim, numCentroids, faiss::METRIC_L2, config); - gpuIndex.copyFrom(&cpuIndex); - gpuIndex.setNumProbes(nprobe); - - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - numQuery, - dim, - k, - "Unified Memory", - kF32MaxRelErr, - 0.1f, - 0.015f); -} +//TEST(TestRaftIndexIVFFlat, UnifiedMemory) { +// // Construct on a random device to test multi-device, if we have +// // multiple devices +// int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); +// +// if (!faiss::gpu::getFullUnifiedMemSupport(device)) { +// return; +// } +// +// int dim = 128; +// +// int numCentroids = 256; +// // Unfortunately it would take forever to add 24 GB in 
IVFPQ data, +// // so just perform a small test with data allocated in the unified +// // memory address space +// size_t numAdd = 10000; +// size_t numTrain = numCentroids * 40; +// int numQuery = 10; +// int k = 10; +// int nprobe = 8; +// +// std::vector trainVecs = faiss::gpu::randVecs(numTrain, dim); +// std::vector addVecs = faiss::gpu::randVecs(numAdd, dim); +// +// faiss::IndexFlatL2 quantizer(dim); +// faiss::IndexIVFFlat cpuIndex( +// &quantizer, dim, numCentroids, faiss::METRIC_L2); +// +// cpuIndex.train(numTrain, trainVecs.data()); +// cpuIndex.add(numAdd, addVecs.data()); +// cpuIndex.nprobe = nprobe; +// +// faiss::gpu::RmmGpuResources res; +// res.noTempMemory(); +// +// faiss::gpu::GpuIndexIVFFlatConfig config; +// config.device = device; +// config.memorySpace = faiss::gpu::MemorySpace::Unified; +// +// faiss::gpu::RaftIndexIVFFlat gpuIndex( +// &res, dim, numCentroids, faiss::METRIC_L2, config); +// gpuIndex.copyFrom(&cpuIndex); +// gpuIndex.setNumProbes(nprobe); +// +// faiss::gpu::compareIndices( +// cpuIndex, +// gpuIndex, +// numQuery, +// dim, +// k, +// "Unified Memory", +// kF32MaxRelErr, +// 0.1f, +// 0.015f); +//} int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); From edc59916f9ee2570d01e5bc60e939885257e0f7c Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 3 Aug 2022 10:25:29 -0400 Subject: [PATCH 18/87] Setting add_data_on_build = false; --- faiss/gpu/raft/RaftIndexIVFFlat.cu | 8 +++---- faiss/gpu/test/TestRaftIndexIVFFlat.cpp | 32 ++++++++++++------------- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index bde0c7ef1e..1e29f7f473 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -186,6 +186,7 @@ void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { raft::spatial::knn::ivf_flat::index_params raft_idx_params; raft_idx_params.n_lists = nlist; raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; + raft_idx_params.add_data_on_build = false; raft_knn_index.emplace( raft::spatial::knn::ivf_flat::build(raft_handle, raft_idx_params, @@ -278,11 +279,6 @@ void RaftIndexIVFFlat::addImpl_( std::cout << "Calling addImpl_ with " << n << " vectors." << std::endl; - /** - * For example: - * raft::spatial::knn::ivf_flat::add_vectors( - * raft_handle, *raft_knn_index, n, x, xids); - */ raft_knn_index.emplace(raft::spatial::knn::ivf_flat::extend( raft_handle, raft_knn_index.value(), x, xids, (Index::idx_t)n)); this->ntotal += n; @@ -332,9 +328,11 @@ void RaftIndexIVFFlat::rebuildRaftIndex(const float* x, Index::idx_t n_rows) { } pams.metric_arg = this->metric_arg; pams.kmeans_trainset_fraction = 1.0; + pams.add_data_on_build = false; raft_knn_index.emplace(raft::spatial::knn::ivf_flat::build( this->raft_handle, pams, x, n_rows, uint32_t(this->d))); + this->raft_handle.sync_stream(); this->is_trained = true; this->ntotal = n_rows; diff --git a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp index 1a560201ad..cefcf6654a 100644 --- a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp @@ -306,10 +306,10 @@ TEST(TestRaftIndexIVFFlat, Float32_Query_L2) { printf("Finished queryTest(faiss::METRIC_L2, 
false);\n"); } -TEST(TestRaftIndexIVFFlat, Float32_Query_IP) { - queryTest(faiss::METRIC_INNER_PRODUCT, false); - printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false)\n"); -} +//TEST(TestRaftIndexIVFFlat, Float32_Query_IP) { +// queryTest(faiss::METRIC_INNER_PRODUCT, false); +// printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false)\n"); +//} // float16 coarse quantizer @@ -318,10 +318,10 @@ TEST(TestRaftIndexIVFFlat, Float16_32_Query_L2) { printf("Finished queryTest(faiss::METRIC_L2, true)\n"); } -TEST(TestRaftIndexIVFFlat, Float16_32_Query_IP) { - queryTest(faiss::METRIC_INNER_PRODUCT, true); - printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, true)\n"); -} +//TEST(TestRaftIndexIVFFlat, Float16_32_Query_IP) { +// queryTest(faiss::METRIC_INNER_PRODUCT, true); +// printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, true)\n"); +//} // // There are IVF list scanning specializations for 64-d and 128-d that we @@ -333,20 +333,20 @@ TEST(TestRaftIndexIVFFlat, Float32_Query_L2_64) { printf("Finished queryTest(faiss::METRIC_L2, false, 64)\n"); } -TEST(TestRaftIndexIVFFlat, Float32_Query_IP_64) { - queryTest(faiss::METRIC_INNER_PRODUCT, false, 64); - printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false, 64)\n"); -} +//TEST(TestRaftIndexIVFFlat, Float32_Query_IP_64) { +// queryTest(faiss::METRIC_INNER_PRODUCT, false, 64); +// printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false, 64)\n"); +//} TEST(TestRaftIndexIVFFlat, Float32_Query_L2_128) { queryTest(faiss::METRIC_L2, false, 128); printf("Finished queryTest(faiss::METRIC_L2, false, 128)\n"); } -TEST(TestRaftIndexIVFFlat, Float32_Query_IP_128) { - queryTest(faiss::METRIC_INNER_PRODUCT, false, 128); - printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false, 128)\n"); -} +//TEST(TestRaftIndexIVFFlat, Float32_Query_IP_128) { +// queryTest(faiss::METRIC_INNER_PRODUCT, false, 128); +// printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false, 128)\n"); +//} // // Copy 
tests From 10f89b45c2869dd4feb0c8083d7fb0a6f945c2c9 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 8 Aug 2022 13:31:52 -0400 Subject: [PATCH 19/87] Copying centroids directly and adding some prints for the test outputs --- faiss/gpu/GpuIndexIVF.h | 2 +- faiss/gpu/raft/RaftIndexIVFFlat.cu | 37 +++++++++++++++++++++++++----- faiss/gpu/test/TestUtils.cpp | 29 +++++++++++++++++++++++ 3 files changed, 61 insertions(+), 7 deletions(-) diff --git a/faiss/gpu/GpuIndexIVF.h b/faiss/gpu/GpuIndexIVF.h index 834cb061ce..391d32f28f 100644 --- a/faiss/gpu/GpuIndexIVF.h +++ b/faiss/gpu/GpuIndexIVF.h @@ -97,7 +97,7 @@ class GpuIndexIVF : public GpuIndex { /// Exposing this like the CPU version for manipulation int nprobe; - /// Exposeing this like the CPU version for query + /// Exposing this like the CPU version for query GpuIndexFlat* quantizer; protected: diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index 1e29f7f473..d99f883ff3 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -110,15 +111,39 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { auto stream = raft_handle.get_stream(); auto total_elems = size_t(quantizer_ntotal) * size_t(index->quantizer->d); - rmm::device_uvector buf_dev(total_elems, stream); - { - std::vector buf_host(total_elems); - index->quantizer->reconstruct_n(0, quantizer_ntotal, buf_host.data()); - raft::copy(buf_dev.data(), buf_host.data(), total_elems, stream); + + raft::spatial::knn::ivf_flat::index_params pams; + switch (this->metric_type) { + case faiss::METRIC_L2: + pams.metric = raft::distance::DistanceType::L2Expanded; + break; + case faiss::METRIC_INNER_PRODUCT: + pams.metric = raft::distance::DistanceType::InnerProduct; + break; + default: + FAISS_THROW_MSG("Metric is not supported."); } - RaftIndexIVFFlat::rebuildRaftIndex(buf_dev.data(), 
quantizer_ntotal); + raft_knn_index.emplace(raft_handle, pams.metric, this->nlist, this->d); + + raft::copy(raft_knn_index.value().centers().data_handle(), + quantizer->getGpuData()->getVectorsRef().data(), + total_elems, + raft_handle.get_stream()); + + // TODO: Need to compute the norms, I guess +// raft::copy(raft_knn_index.value().center_norms().value().data_handle(), quantizer->getGpuData()->norms_, quantizer_ntotal, raft_handle.get_stream()); + +// { +// std::vector buf_host(total_elems); +// index->quantizer->reconstruct_n(0, quantizer_ntotal, buf_host.data()); +// raft::copy(buf_dev.data(), buf_host.data(), total_elems, stream); +// } + +// RaftIndexIVFFlat::rebuildRaftIndex(buf_dev.data(), quantizer_ntotal); +// + rmm::device_uvector buf_dev(total_elems, stream); if(index_ntotal > 0) { std::cout << "Adding " << index_ntotal << " vectors to index" << std::endl; total_elems = size_t(index_ntotal) * size_t(index->d); diff --git a/faiss/gpu/test/TestUtils.cpp b/faiss/gpu/test/TestUtils.cpp index 29fbef0335..2290cbb628 100644 --- a/faiss/gpu/test/TestUtils.cpp +++ b/faiss/gpu/test/TestUtils.cpp @@ -104,6 +104,35 @@ void compareIndices( testDistance.data(), testIndices.data()); + int idx = 4; + + int start_idx = idx * numQuery; + int stop_idx = start_idx + k; + printf("ref inds: ["); + for(int i = start_idx; i < stop_idx; i++) { + printf("%d, ", int(refIndices[i])); + } + printf("]\n"); + + printf("test inds: ["); + for(int i = start_idx; i < stop_idx; i++) { + printf("%d, ", int(testIndices[i])); + } + printf("]\n"); + + printf("ref dists: ["); + for(int i = start_idx; i < stop_idx; i++) { + printf("%f, ", float(refDistance[i])); + } + printf("]\n"); + + printf("test dists: ["); + for(int i = start_idx; i < stop_idx; i++) { + printf("%f, ", float(testDistance[i])); + } + printf("]\n"); + + faiss::gpu::compareLists( refDistance.data(), refIndices.data(), From 7c690200f490b809051a4507ed64cebe7d3e5a8e Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Mon, 8 Aug 2022 15:30:00 -0400 Subject: [PATCH 20/87] reconstructions seems to be reasonable --- faiss/gpu/GpuIndexIVF.cu | 2 + faiss/gpu/raft/RaftIndexIVFFlat.cu | 50 +++++++------------------ faiss/gpu/test/TestRaftIndexIVFFlat.cpp | 14 ++++--- 3 files changed, 23 insertions(+), 43 deletions(-) diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu index 2df20fe2e8..a3c3ec6f73 100644 --- a/faiss/gpu/GpuIndexIVF.cu +++ b/faiss/gpu/GpuIndexIVF.cu @@ -231,6 +231,8 @@ void GpuIndexIVF::trainQuantizer_(Index::idx_t n, const float* x) { clus.train(n, x, *quantizer); quantizer->is_trained = true; + + FAISS_ASSERT(quantizer->ntotal == nlist); } diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index d99f883ff3..1b5feaafce 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -59,25 +59,6 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { (size_t)getMaxKSelection(), index->nprobe); -// FAISS_ASSERT(metric_type != faiss::METRIC_L2 && -// metric_type != faiss::METRIC_INNER_PRODUCT); -// -// if (!index->is_trained) { -// // copied in GpuIndex::copyFrom -// FAISS_ASSERT(!is_trained && ntotal == 0); -// return; - -// } -// -// // copied in GpuIndex::copyFrom -// // ntotal can exceed max int, but the number of vectors per inverted -// // list cannot exceed this. We check this in the subclasses. 
-// FAISS_ASSERT(is_trained && (ntotal == index->ntotal)); -// -// // Since we're trained, the quantizer must have data -// FAISS_ASSERT(index->quantizer->ntotal > 0); -// -// /** * TODO: Copy centers and center norms from quantizer * Things to do: @@ -113,6 +94,7 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { auto total_elems = size_t(quantizer_ntotal) * size_t(index->quantizer->d); raft::spatial::knn::ivf_flat::index_params pams; + switch (this->metric_type) { case faiss::METRIC_L2: pams.metric = raft::distance::DistanceType::L2Expanded; @@ -126,34 +108,27 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { raft_knn_index.emplace(raft_handle, pams.metric, this->nlist, this->d); - raft::copy(raft_knn_index.value().centers().data_handle(), - quantizer->getGpuData()->getVectorsRef().data(), - total_elems, - raft_handle.get_stream()); - - - // TODO: Need to compute the norms, I guess -// raft::copy(raft_knn_index.value().center_norms().value().data_handle(), quantizer->getGpuData()->norms_, quantizer_ntotal, raft_handle.get_stream()); - -// { -// std::vector buf_host(total_elems); -// index->quantizer->reconstruct_n(0, quantizer_ntotal, buf_host.data()); -// raft::copy(buf_dev.data(), buf_host.data(), total_elems, stream); -// } - -// RaftIndexIVFFlat::rebuildRaftIndex(buf_dev.data(), quantizer_ntotal); -// + // Copy (reconstructed) centroids over, rather than re-training rmm::device_uvector buf_dev(total_elems, stream); + { + std::vector buf_host(total_elems); + index->quantizer->reconstruct_n(0, quantizer_ntotal, buf_host.data()); + raft::copy(raft_knn_index.value().centers().data_handle(), buf_host.data(), total_elems, stream); + } + + // Add (reconstructed) vectors to index if needed if(index_ntotal > 0) { std::cout << "Adding " << index_ntotal << " vectors to index" << std::endl; total_elems = size_t(index_ntotal) * size_t(index->d); buf_dev.resize(total_elems, stream); { std::vector buf_host(total_elems); - 
index->reconstruct_n(0, index->ntotal, buf_host.data()); + index->reconstruct_n(0, index_ntotal, buf_host.data()); raft::copy(buf_dev.data(), buf_host.data(), total_elems, stream); } + // TODO: We might want to consider moving the centroid norm computation + // outside of the incremental add on the RAFT side. RaftIndexIVFFlat::addImpl_(index_ntotal, buf_dev.data(), nullptr); } } else { @@ -212,6 +187,7 @@ void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { raft_idx_params.n_lists = nlist; raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; raft_idx_params.add_data_on_build = false; + raft_idx_params.kmeans_n_iters = 100; raft_knn_index.emplace( raft::spatial::knn::ivf_flat::build(raft_handle, raft_idx_params, diff --git a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp index cefcf6654a..c56d59442f 100644 --- a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp @@ -84,6 +84,8 @@ void queryTest( faiss::IndexIVFFlat cpuIndex( quantizer, opt.dim, opt.numCentroids, metricType); cpuIndex.train(opt.numTrain, trainVecs.data()); + + std::cout << "numTrain: " << opt.numTrain << "numCentroids: " << opt.numCentroids << std::endl; cpuIndex.add(opt.numAdd, addVecs.data()); cpuIndex.nprobe = opt.nprobe; @@ -97,17 +99,17 @@ void queryTest( faiss::gpu::RaftIndexIVFFlat gpuIndex( &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); -// gpuIndex.copyFrom(&cpuIndex); + gpuIndex.copyFrom(&cpuIndex); raft::handle_t raft_handle; - rmm::device_uvector trainVecsDev(trainVecs.size(), raft_handle.get_stream()); - raft::copy(trainVecsDev.data(), trainVecs.data(), trainVecs.size(), raft_handle.get_stream()); - +// rmm::device_uvector trainVecsDev(trainVecs.size(), raft_handle.get_stream()); +// raft::copy(trainVecsDev.data(), trainVecs.data(), trainVecs.size(), raft_handle.get_stream()); +// rmm::device_uvector addVecsDev(addVecs.size(), raft_handle.get_stream()); 
raft::copy(addVecsDev.data(), addVecs.data(), addVecs.size(), raft_handle.get_stream()); - gpuIndex.train(opt.numTrain, trainVecsDev.data()); - gpuIndex.add(opt.numAdd, addVecsDev.data()); +// gpuIndex.train(opt.numTrain, trainVecsDev.data()); +// gpuIndex.add(opt.numAdd, addVecsDev.data()); gpuIndex.setNumProbes(opt.nprobe); bool compFloat16 = useFloat16CoarseQuantizer; From 8be77467cca175cb9c684f44da664591c2c5e804 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 9 Aug 2022 17:19:02 -0400 Subject: [PATCH 21/87] iUpdates to tests to compare against brute force as ground truth --- faiss/gpu/GpuIndexIVFFlat.cu | 4 + faiss/gpu/impl/FlatIndex.cuh | 5 + faiss/gpu/raft/RaftIndexIVFFlat.cu | 2 + faiss/gpu/test/TestRaftIndexIVFFlat.cpp | 151 ++++++++++++++++++------ 4 files changed, 127 insertions(+), 35 deletions(-) diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 3b32b83f84..27fe6a8d43 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -5,6 +5,7 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include #include #include #include @@ -12,6 +13,7 @@ #include #include #include +#include #include #include @@ -91,6 +93,8 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { ivfFlatConfig_.indicesOptions, config_.memorySpace)); + raft::print_device_vector("faiss centers", quantizer->getGpuData()->vectors(), 50, std::cout); + // Copy all of the IVF data index_->copyInvertedListsFrom(index->invlists); } diff --git a/faiss/gpu/impl/FlatIndex.cuh b/faiss/gpu/impl/FlatIndex.cuh index d701f78416..21c348d477 100644 --- a/faiss/gpu/impl/FlatIndex.cuh +++ b/faiss/gpu/impl/FlatIndex.cuh @@ -60,6 +60,11 @@ class FlatIndex { int num, cudaStream_t stream); + + float *vectors() { + return vectors_.data(); + } + void query( Tensor& vecs, int k, diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index 1b5feaafce..9bc8d64d8d 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -116,6 +116,8 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { raft::copy(raft_knn_index.value().centers().data_handle(), buf_host.data(), total_elems, stream); } + raft::print_device_vector("raft centers", raft_knn_index.value().centers().data_handle(), total_elems, std::cout); + // Add (reconstructed) vectors to index if needed if(index_ntotal > 0) { std::cout << "Adding " << index_ntotal << " vectors to index" << std::endl; diff --git a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp index c56d59442f..be9edec8db 100644 --- a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp @@ -11,6 +11,9 @@ #include #include #include + +#include +#include #include #include #include @@ -63,6 +66,50 @@ struct Options { faiss::gpu::IndicesOptions indicesOpt; }; +template +void train_index(const raft::handle_t &raft_handle, Options &opt, idx_type &index, std::vector &trainVecs, std::vector &addVecs) { + index.train(opt.numTrain, 
trainVecs.data()); + index.add(opt.numAdd, addVecs.data()); + index.setNumProbes(opt.nprobe); +} + + +void invoke_bfknn(const raft::handle_t &raft_handle, Options &opt, float *dists, faiss::Index::idx_t *inds, faiss::MetricType m, + std::vector &addVecs, std::vector &queryVecs) { + + + + faiss::gpu::RmmGpuResources gpu_res; + gpu_res.setDefaultStream(opt.device, raft_handle.get_stream()); + + rmm::device_uvector addVecsDev(addVecs.size(), raft_handle.get_stream()); + raft::copy(addVecsDev.data(), addVecs.data(), addVecs.size(), raft_handle.get_stream()); + + rmm::device_uvector queryVecsDev(queryVecs.size(), raft_handle.get_stream()); + raft::copy(queryVecsDev.data(), queryVecs.data(), queryVecs.size(), raft_handle.get_stream()); + + faiss::gpu::GpuDistanceParams args; + args.metric = m; + args.k = opt.k; + args.dims = opt.dim; + args.vectors = addVecs.data(); + args.vectorsRowMajor = true; + args.numVectors = opt.numAdd; + args.queries = queryVecs.data(); + args.queriesRowMajor = true; + args.numQueries = opt.numQuery; + args.outDistances = dists; + args.outIndices = inds; + args.outIndicesType = faiss::gpu::IndicesDataType::I64; + + /** + * @todo: Until FAISS supports pluggable allocation strategies, + * we will not reap the benefits of the pool allocator for + * avoiding device-wide synchronizations from cudaMalloc/cudaFree + */ + bfKnn(&gpu_res, args); +} + void queryTest( faiss::MetricType metricType, bool useFloat16CoarseQuantizer, @@ -75,19 +122,9 @@ void queryTest( faiss::gpu::randVecs(opt.numTrain, opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - faiss::IndexFlatL2 quantizerL2(opt.dim); - faiss::IndexFlatIP quantizerIP(opt.dim); - faiss::Index* quantizer = metricType == faiss::METRIC_L2 - ? 
(faiss::Index*)&quantizerL2 - : (faiss::Index*)&quantizerIP; - - faiss::IndexIVFFlat cpuIndex( - quantizer, opt.dim, opt.numCentroids, metricType); - cpuIndex.train(opt.numTrain, trainVecs.data()); + std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); std::cout << "numTrain: " << opt.numTrain << "numCentroids: " << opt.numCentroids << std::endl; - cpuIndex.add(opt.numAdd, addVecs.data()); - cpuIndex.nprobe = opt.nprobe; faiss::gpu::RmmGpuResources res; res.noTempMemory(); @@ -97,35 +134,79 @@ void queryTest( config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::RaftIndexIVFFlat gpuIndex( - &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); - gpuIndex.copyFrom(&cpuIndex); - + // TODO: Since we are modifying the centroids when adding new vectors, + // the neighbors are no longer going to match completely between CPU + // and the RAFT indexes. We will probably want to perform a bfknn as + // ground truth and then compare the recall for both the RAFT and FAISS + // indices. 
raft::handle_t raft_handle; -// rmm::device_uvector trainVecsDev(trainVecs.size(), raft_handle.get_stream()); -// raft::copy(trainVecsDev.data(), trainVecs.data(), trainVecs.size(), raft_handle.get_stream()); -// - rmm::device_uvector addVecsDev(addVecs.size(), raft_handle.get_stream()); - raft::copy(addVecsDev.data(), addVecs.data(), addVecs.size(), raft_handle.get_stream()); -// gpuIndex.train(opt.numTrain, trainVecsDev.data()); -// gpuIndex.add(opt.numAdd, addVecsDev.data()); - gpuIndex.setNumProbes(opt.nprobe); + faiss::gpu::RaftIndexIVFFlat raftIndex( + &res, opt.dim, opt.numCentroids, metricType, config); - bool compFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, opt.dim, opt.numCentroids, metricType, config); + + std::cout << "Training raft index" << std::endl; + train_index(raft_handle, opt, raftIndex, trainVecs, addVecs); + + std::cout << "Training gpu index" << std::endl; + train_index(raft_handle, opt, gpuIndex, trainVecs, addVecs); + + std::cout << "Computing ground truth" << std::endl; + rmm::device_uvector ref_inds(opt.numQuery * opt.k, raft_handle.get_stream()); + rmm::device_uvector ref_dists(opt.numQuery * opt.k, raft_handle.get_stream()); + + invoke_bfknn(raft_handle, opt, ref_dists.data(), ref_inds.data(), metricType, addVecs, queryVecs); + + std::cout << "Done." << std::endl; + raft::print_device_vector("ref_dists", ref_dists.data(), opt.k, std::cout); + raft::print_device_vector("ref_inds", ref_inds.data(), opt.k, std::cout); + + rmm::device_uvector raft_inds(opt.numQuery * opt.k, raft_handle.get_stream()); + rmm::device_uvector raft_dists(opt.numQuery * opt.k, raft_handle.get_stream()); + + raftIndex.search( opt.numQuery, - opt.dim, + queryVecs.data(), opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - // FIXME: the fp16 bounds are - // useless when math (the accumulator) is - // in fp16. 
Figure out another way to test - compFloat16 ? 0.70f : 0.1f, - compFloat16 ? 0.65f : 0.015f); + raft_dists.data(), + raft_inds.data()); + + rmm::device_uvector gpu_inds(opt.numQuery * opt.k, raft_handle.get_stream()); + rmm::device_uvector gpu_dists(opt.numQuery * opt.k, raft_handle.get_stream()); + + gpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + gpu_dists.data(), + gpu_inds.data()); + + + // TODO: Compare recall, perhaps by adding the indices/distances to a hashmap. + + raft::print_device_vector("raft_dists", raft_dists.data(), opt.k, std::cout); + raft::print_device_vector("raft_inds", raft_inds.data(), opt.k, std::cout); + + raft::print_device_vector("gpu_dists", gpu_dists.data(), opt.k, std::cout); + raft::print_device_vector("gpu_inds", gpu_inds.data(), opt.k, std::cout); + +// +// bool compFloat16 = useFloat16CoarseQuantizer; +// faiss::gpu::compareIndices( +// cpuIndex, +// gpuIndex, +// opt.numQuery, +// opt.dim, +// opt.k, +// opt.toString(), +// compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, +// // FIXME: the fp16 bounds are +// // useless when math (the accumulator) is +// // in fp16. Figure out another way to test +// compFloat16 ? 0.70f : 0.1f, +// compFloat16 ? 0.65f : 0.015f); } } From 933582a11d49139aa08896c77d1286bdb9aab6dc Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 24 Aug 2022 18:04:54 -0400 Subject: [PATCH 22/87] Starting to look at resulting runtimes in raft ivf flat tests --- build.sh | 7 +++---- faiss/gpu/test/TestRaftIndexIVFFlat.cpp | 17 ++++++++++++++--- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/build.sh b/build.sh index 7ff0577e29..c446b9bc0c 100755 --- a/build.sh +++ b/build.sh @@ -1,6 +1,6 @@ #!/bin/bash -BUILD_TYPE=Debug +BUILD_TYPE=Release RAFT_REPO_REL="../raft" RAFT_REPO_PATH="`readlink -f \"${RAFT_REPO_REL}\"`" @@ -29,13 +29,12 @@ cmake \ -DBUILD_SHARED_LIBS=OFF \ -DFAISS_ENABLE_RAFT=ON \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ - -DCPM_raft_SOURCE="${RAFT_REPO_PATH}" \ -DFAISS_OPT_LEVEL=avx2 \ - -DCMAKE_CUDA_ARCHITECTURES="86" \ + -DCMAKE_CUDA_ARCHITECTURES="NATIVE" \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \ -DCMAKE_C_COMPILER_LAUNCHER=ccache \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ -B build . -make -C build -j +make -C build -j12 diff --git a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp index be9edec8db..b9f6bc56e2 100644 --- a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp @@ -13,7 +13,7 @@ #include #include -#include +#include #include #include #include @@ -25,13 +25,13 @@ constexpr float kF32MaxRelErr = 0.03f; struct Options { Options() { - numAdd = 2 * faiss::gpu::randVal(2000, 5000); + numAdd = 2 * faiss::gpu::randVal(20000, 50000); dim = faiss::gpu::randVal(64, 200); numCentroids = std::sqrt((float)numAdd / 2); numTrain = numCentroids * 40; nprobe = faiss::gpu::randVal(std::min(10, numCentroids), numCentroids); - numQuery = faiss::gpu::randVal(32, 100); + numQuery = numAdd / 10;//faiss::gpu::randVal(32, 100); // Due to the approximate nature of the query and of floating point // differences between GPU and CPU, to stay within our error bounds, @@ -70,6 +70,7 @@ template void train_index(const raft::handle_t &raft_handle, Options &opt, idx_type &index, 
std::vector &trainVecs, std::vector &addVecs) { index.train(opt.numTrain, trainVecs.data()); index.add(opt.numAdd, addVecs.data()); +// index.train(opt.numTrain, trainVecs.data()); index.setNumProbes(opt.nprobe); } @@ -166,6 +167,7 @@ void queryTest( rmm::device_uvector raft_inds(opt.numQuery * opt.k, raft_handle.get_stream()); rmm::device_uvector raft_dists(opt.numQuery * opt.k, raft_handle.get_stream()); + uint32_t rstart = raft::curTimeMillis(); raftIndex.search( opt.numQuery, queryVecs.data(), @@ -173,9 +175,14 @@ void queryTest( raft_dists.data(), raft_inds.data()); + raft_handle.sync_stream(); + uint32_t rstop = raft::curTimeMillis(); + std::cout << "Raft time " << rstart << " " << rstop << " " << (rstop - rstart) << std::endl; + rmm::device_uvector gpu_inds(opt.numQuery * opt.k, raft_handle.get_stream()); rmm::device_uvector gpu_dists(opt.numQuery * opt.k, raft_handle.get_stream()); + uint32_t gstart = raft::curTimeMillis(); gpuIndex.search( opt.numQuery, queryVecs.data(), @@ -183,6 +190,10 @@ void queryTest( gpu_dists.data(), gpu_inds.data()); + raft_handle.sync_stream(); + uint32_t gstop = raft::curTimeMillis(); + + std::cout << "FAISS time " << gstart << " " << gstop << " " << (gstop - gstart) << std::endl; // TODO: Compare recall, perhaps by adding the indices/distances to a hashmap. From d2a65417a6b4163a46e7421888aad697f0b840d1 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 24 Aug 2022 18:56:51 -0400 Subject: [PATCH 23/87] Adding timing info to raft test --- faiss/gpu/test/TestRaftIndexIVFFlat.cpp | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp index b9f6bc56e2..f0b6ac72d5 100644 --- a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp @@ -25,7 +25,7 @@ constexpr float kF32MaxRelErr = 0.03f; struct Options { Options() { - numAdd = 2 * faiss::gpu::randVal(20000, 50000); + numAdd = 2 * faiss::gpu::randVal(50000, 70000); dim = faiss::gpu::randVal(64, 200); numCentroids = std::sqrt((float)numAdd / 2); @@ -68,10 +68,20 @@ struct Options { template void train_index(const raft::handle_t &raft_handle, Options &opt, idx_type &index, std::vector &trainVecs, std::vector &addVecs) { + + uint32_t train_start = raft::curTimeMillis(); index.train(opt.numTrain, trainVecs.data()); + raft_handle.sync_stream(); + uint32_t train_stop = raft::curTimeMillis(); + + uint32_t add_start = raft::curTimeMillis(); index.add(opt.numAdd, addVecs.data()); + raft_handle.sync_stream(); + uint32_t add_stop = raft::curTimeMillis(); // index.train(opt.numTrain, trainVecs.data()); index.setNumProbes(opt.nprobe); + + std::cout << "train=" << (train_stop - train_start) << ", add=" << (add_stop - add_start) << std::endl; } @@ -149,10 +159,18 @@ void queryTest( &res, opt.dim, opt.numCentroids, metricType, config); std::cout << "Training raft index" << std::endl; + uint32_t r_train_start = raft::curTimeMillis(); train_index(raft_handle, opt, raftIndex, trainVecs, addVecs); + raft_handle.sync_stream(); + uint32_t r_train_stop = raft::curTimeMillis(); + std::cout << "Raft train time " << r_train_start << " " << r_train_stop << " " << (r_train_stop - r_train_start) << std::endl; std::cout << "Training gpu index" << std::endl; + uint32_t g_train_start = raft::curTimeMillis(); train_index(raft_handle, opt, 
gpuIndex, trainVecs, addVecs); + raft_handle.sync_stream(); + uint32_t g_train_stop = raft::curTimeMillis(); + std::cout << "FAISS train time " << g_train_start << " " << g_train_stop << " " << (g_train_stop - g_train_start) << std::endl; std::cout << "Computing ground truth" << std::endl; rmm::device_uvector ref_inds(opt.numQuery * opt.k, raft_handle.get_stream()); @@ -177,7 +195,7 @@ void queryTest( raft_handle.sync_stream(); uint32_t rstop = raft::curTimeMillis(); - std::cout << "Raft time " << rstart << " " << rstop << " " << (rstop - rstart) << std::endl; + std::cout << "Raft query time " << rstart << " " << rstop << " " << (rstop - rstart) << std::endl; rmm::device_uvector gpu_inds(opt.numQuery * opt.k, raft_handle.get_stream()); rmm::device_uvector gpu_dists(opt.numQuery * opt.k, raft_handle.get_stream()); @@ -193,7 +211,7 @@ void queryTest( raft_handle.sync_stream(); uint32_t gstop = raft::curTimeMillis(); - std::cout << "FAISS time " << gstart << " " << gstop << " " << (gstop - gstart) << std::endl; + std::cout << "FAISS query time " << gstart << " " << gstop << " " << (gstop - gstart) << std::endl; // TODO: Compare recall, perhaps by adding the indices/distances to a hashmap. From 986407a23ed31b9695619d94fea968347b920955 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 25 Aug 2022 19:39:49 -0400 Subject: [PATCH 24/87] Updating for rapids-cmake updates and RAFT updates --- CMakeLists.txt | 40 ++++++++------- build.sh | 9 ++-- cmake/thirdparty/get_raft.cmake | 8 +-- faiss/gpu/raft/RaftIndexIVFFlat.cu | 17 ++++++- faiss/gpu/test/TestRaftIndexIVFFlat.cpp | 65 +++++++++++++------------ fetch_rapids.cmake | 17 +++++++ 6 files changed, 94 insertions(+), 62 deletions(-) create mode 100644 fetch_rapids.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 750cba414e..52da03120e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,17 +6,16 @@ cmake_minimum_required(VERSION 3.17 FATAL_ERROR) -file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.08/RAPIDS.cmake - ${CMAKE_BINARY_DIR}/RAPIDS.cmake) -include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) -include(rapids-cmake) -include(rapids-cpm) -include(rapids-cuda) -include(rapids-export) -include(rapids-find) - -rapids_cuda_init_architectures(faiss) +# Valid values are "generic", "avx2". +option(FAISS_OPT_LEVEL "" "generic") +option(FAISS_ENABLE_GPU "Enable support for GPU indexes." ON) +option(FAISS_ENABLE_RAFT "Enable RAFT for GPU indexes." OFF) +option(FAISS_ENABLE_PYTHON "Build Python extension." ON) +option(FAISS_ENABLE_C_API "Build C API." OFF) +if(FAISS_ENABLE_RAFT) + include(fetch_rapids.cmake) +endif() project(faiss VERSION 1.6.4 @@ -29,23 +28,22 @@ set(CMAKE_CXX_STANDARD 17) list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") -# Valid values are "generic", "avx2". -option(FAISS_OPT_LEVEL "" "generic") -option(FAISS_ENABLE_GPU "Enable support for GPU indexes." ON) -option(FAISS_ENABLE_RAFT "Enable RAFT for GPU indexes." OFF) -option(FAISS_ENABLE_PYTHON "Build Python extension." ON) -option(FAISS_ENABLE_C_API "Build C API." 
OFF) +if(FAISS_ENABLE_RAFT) + include(rapids-cmake) + include(rapids-cpm) + rapids_cpm_init() + include(rapids-cuda) + include(rapids-export) + include(rapids-find) + rapids_cuda_init_architectures(faiss) + include(cmake/thirdparty/get_raft.cmake) +endif() if(FAISS_ENABLE_GPU) set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) enable_language(CUDA) endif() -if(FAISS_ENABLE_RAFT) - rapids_cpm_init() - include(cmake/thirdparty/get_raft.cmake) -endif() - add_subdirectory(faiss) if(FAISS_ENABLE_GPU) diff --git a/build.sh b/build.sh index c446b9bc0c..25000112e5 100755 --- a/build.sh +++ b/build.sh @@ -2,7 +2,7 @@ BUILD_TYPE=Release -RAFT_REPO_REL="../raft" +RAFT_REPO_REL="/share/workspace/rapids_projects/raft" RAFT_REPO_PATH="`readlink -f \"${RAFT_REPO_REL}\"`" set -e @@ -22,19 +22,22 @@ if [ "$1" == "test-raft" ]; then exit 0 fi +mkdir -p build/ && cd build/ cmake \ -DFAISS_ENABLE_GPU=ON \ -DFAISS_ENABLE_PYTHON=OFF \ -DBUILD_TESTING=ON \ -DBUILD_SHARED_LIBS=OFF \ + -DCPM_raft_SOURCE=${RAFT_REPO_REL} \ -DFAISS_ENABLE_RAFT=ON \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ -DFAISS_OPT_LEVEL=avx2 \ + -DRAFT_NVTX=ON \ -DCMAKE_CUDA_ARCHITECTURES="NATIVE" \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \ -DCMAKE_C_COMPILER_LAUNCHER=ccache \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ - -B build . + ../ -make -C build -j12 +cmake --build . 
-j12 diff --git a/cmake/thirdparty/get_raft.cmake b/cmake/thirdparty/get_raft.cmake index 782b3d71dc..c16b4ad489 100644 --- a/cmake/thirdparty/get_raft.cmake +++ b/cmake/thirdparty/get_raft.cmake @@ -15,9 +15,9 @@ #============================================================================= -set(RAFT_VERSION "22.08") +set(RAFT_VERSION "22.10") set(RAFT_FORK "achirkin") -set(RAFT_PINNED_TAG "enh-knn-ivf-flat-hide-impl") +set(RAFT_PINNED_TAG "enh-knn-kmeans-more-gpu") function(find_and_configure_raft) set(oneValueArgs VERSION FORK PINNED_TAG) @@ -29,8 +29,8 @@ function(find_and_configure_raft) #----------------------------------------------------- rapids_cpm_find(raft ${PKG_VERSION} GLOBAL_TARGETS raft::raft - BUILD_EXPORT_SET projname-exports - INSTALL_EXPORT_SET projname-exports + BUILD_EXPORT_SET faiss-exports + INSTALL_EXPORT_SET faiss-exports CPM_ARGS GIT_REPOSITORY https://github.com/${PKG_FORK}/raft.git GIT_TAG ${PKG_PINNED_TAG} diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index 9bc8d64d8d..f80a59357b 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -106,7 +107,7 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { FAISS_THROW_MSG("Metric is not supported."); } - raft_knn_index.emplace(raft_handle, pams.metric, this->nlist, this->d); + raft_knn_index.emplace(raft_handle, pams.metric, (uint32_t)this->nlist, (uint32_t)this->d); // Copy (reconstructed) centroids over, rather than re-training rmm::device_uvector buf_dev(total_elems, stream); @@ -179,7 +180,13 @@ size_t RaftIndexIVFFlat::reclaimMemory() { void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { DeviceScope scope(config_.device); + + raft::common::nvtx::range fun_scope( + "RaftIndexIVFFlat::train (%ld)", n); + std::cout << "Calling train() with " << n << " rows" << std::endl; + + uint32_t start = raft::curTimeMillis(); if 
(this->is_trained) { FAISS_ASSERT(raft_knn_index.has_value()); return; @@ -197,6 +204,9 @@ void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { n, (faiss::Index::idx_t)d)); raft_handle.sync_stream(); + uint32_t stop = raft::curTimeMillis(); + + std::cout << "train took " << (stop - start) << "ms. " << std::endl; this->is_trained = true; } @@ -293,12 +303,15 @@ void RaftIndexIVFFlat::searchImpl_( int k, float* distances, Index::idx_t* labels) const { + + raft::common::nvtx::range fun_scope( + "RaftIndexIVFFlat::searchImpl_ (%ld)", n); + // Device is already set in GpuIndex::search FAISS_ASSERT(raft_knn_index.has_value()); FAISS_ASSERT(n > 0); FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= nlist); - std::cout << "Calling searchImpl_ with " << n << " rows" << std::endl; raft::spatial::knn::ivf_flat::search_params pams; pams.n_probes = nprobe; raft::spatial::knn::ivf_flat::search( diff --git a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp index f0b6ac72d5..d56246860f 100644 --- a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -29,7 +30,7 @@ struct Options { dim = faiss::gpu::randVal(64, 200); numCentroids = std::sqrt((float)numAdd / 2); - numTrain = numCentroids * 40; + numTrain = numCentroids * 50; nprobe = faiss::gpu::randVal(std::min(10, numCentroids), numCentroids); numQuery = numAdd / 10;//faiss::gpu::randVal(32, 100); @@ -155,8 +156,8 @@ void queryTest( faiss::gpu::RaftIndexIVFFlat raftIndex( &res, opt.dim, opt.numCentroids, metricType, config); - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, opt.dim, opt.numCentroids, metricType, config); +// faiss::gpu::GpuIndexIVFFlat gpuIndex( +// &res, opt.dim, opt.numCentroids, metricType, config); std::cout << "Training raft index" << std::endl; uint32_t r_train_start = raft::curTimeMillis(); @@ -165,22 +166,22 @@ void queryTest( uint32_t r_train_stop = 
raft::curTimeMillis(); std::cout << "Raft train time " << r_train_start << " " << r_train_stop << " " << (r_train_stop - r_train_start) << std::endl; - std::cout << "Training gpu index" << std::endl; - uint32_t g_train_start = raft::curTimeMillis(); - train_index(raft_handle, opt, gpuIndex, trainVecs, addVecs); - raft_handle.sync_stream(); - uint32_t g_train_stop = raft::curTimeMillis(); - std::cout << "FAISS train time " << g_train_start << " " << g_train_stop << " " << (g_train_stop - g_train_start) << std::endl; - - std::cout << "Computing ground truth" << std::endl; - rmm::device_uvector ref_inds(opt.numQuery * opt.k, raft_handle.get_stream()); - rmm::device_uvector ref_dists(opt.numQuery * opt.k, raft_handle.get_stream()); - - invoke_bfknn(raft_handle, opt, ref_dists.data(), ref_inds.data(), metricType, addVecs, queryVecs); +// std::cout << "Training gpu index" << std::endl; +// uint32_t g_train_start = raft::curTimeMillis(); +// train_index(raft_handle, opt, gpuIndex, trainVecs, addVecs); +// raft_handle.sync_stream(); +// uint32_t g_train_stop = raft::curTimeMillis(); +// std::cout << "FAISS train time " << g_train_start << " " << g_train_stop << " " << (g_train_stop - g_train_start) << std::endl; - std::cout << "Done." << std::endl; - raft::print_device_vector("ref_dists", ref_dists.data(), opt.k, std::cout); - raft::print_device_vector("ref_inds", ref_inds.data(), opt.k, std::cout); +// std::cout << "Computing ground truth" << std::endl; +// rmm::device_uvector ref_inds(opt.numQuery * opt.k, raft_handle.get_stream()); +// rmm::device_uvector ref_dists(opt.numQuery * opt.k, raft_handle.get_stream()); +// +// invoke_bfknn(raft_handle, opt, ref_dists.data(), ref_inds.data(), metricType, addVecs, queryVecs); +// +// std::cout << "Done." 
<< std::endl; +// raft::print_device_vector("ref_dists", ref_dists.data(), opt.k, std::cout); +// raft::print_device_vector("ref_inds", ref_inds.data(), opt.k, std::cout); rmm::device_uvector raft_inds(opt.numQuery * opt.k, raft_handle.get_stream()); rmm::device_uvector raft_dists(opt.numQuery * opt.k, raft_handle.get_stream()); @@ -200,26 +201,26 @@ void queryTest( rmm::device_uvector gpu_inds(opt.numQuery * opt.k, raft_handle.get_stream()); rmm::device_uvector gpu_dists(opt.numQuery * opt.k, raft_handle.get_stream()); - uint32_t gstart = raft::curTimeMillis(); - gpuIndex.search( - opt.numQuery, - queryVecs.data(), - opt.k, - gpu_dists.data(), - gpu_inds.data()); - - raft_handle.sync_stream(); - uint32_t gstop = raft::curTimeMillis(); - - std::cout << "FAISS query time " << gstart << " " << gstop << " " << (gstop - gstart) << std::endl; +// uint32_t gstart = raft::curTimeMillis(); +// gpuIndex.search( +// opt.numQuery, +// queryVecs.data(), +// opt.k, +// gpu_dists.data(), +// gpu_inds.data()); +// +// raft_handle.sync_stream(); +// uint32_t gstop = raft::curTimeMillis(); +// +// std::cout << "FAISS query time " << gstart << " " << gstop << " " << (gstop - gstart) << std::endl; // TODO: Compare recall, perhaps by adding the indices/distances to a hashmap. 
raft::print_device_vector("raft_dists", raft_dists.data(), opt.k, std::cout); raft::print_device_vector("raft_inds", raft_inds.data(), opt.k, std::cout); - raft::print_device_vector("gpu_dists", gpu_dists.data(), opt.k, std::cout); - raft::print_device_vector("gpu_inds", gpu_inds.data(), opt.k, std::cout); +// raft::print_device_vector("gpu_dists", gpu_dists.data(), opt.k, std::cout); +// raft::print_device_vector("gpu_inds", gpu_inds.data(), opt.k, std::cout); // // bool compFloat16 = useFloat16CoarseQuantizer; diff --git a/fetch_rapids.cmake b/fetch_rapids.cmake new file mode 100644 index 0000000000..0589dc9ddb --- /dev/null +++ b/fetch_rapids.cmake @@ -0,0 +1,17 @@ +# ============================================================================= +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= +file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.10/RAPIDS.cmake + ${CMAKE_BINARY_DIR}/RAPIDS.cmake + ) +include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) From ae4ed98c275300c3f8d46046f9ba0c81d05e704c Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Mon, 10 Oct 2022 13:59:27 -0400 Subject: [PATCH 25/87] Adding RaftIndexIVFPQ --- CMakeLists.txt | 48 +-- build.sh | 2 +- cmake/thirdparty/get_raft.cmake | 4 +- faiss/gpu/CMakeLists.txt | 4 +- faiss/gpu/GpuDistance.cu | 7 +- faiss/gpu/raft/RaftIndexIVFFlat.cu | 5 +- faiss/gpu/raft/RaftIndexIVFFlat.h | 3 - faiss/gpu/raft/RaftIndexIVFPQ.cu | 396 ++++++++++++++++++++++++ faiss/gpu/raft/RaftIndexIVFPQ.h | 152 +++++++++ faiss/gpu/test/TestRaftIndexIVFFlat.cpp | 68 ++-- 10 files changed, 626 insertions(+), 63 deletions(-) create mode 100644 faiss/gpu/raft/RaftIndexIVFPQ.cu create mode 100644 faiss/gpu/raft/RaftIndexIVFPQ.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 52da03120e..d5ab7c6421 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,44 +6,46 @@ cmake_minimum_required(VERSION 3.17 FATAL_ERROR) -# Valid values are "generic", "avx2". -option(FAISS_OPT_LEVEL "" "generic") -option(FAISS_ENABLE_GPU "Enable support for GPU indexes." ON) -option(FAISS_ENABLE_RAFT "Enable RAFT for GPU indexes." OFF) -option(FAISS_ENABLE_PYTHON "Build Python extension." ON) -option(FAISS_ENABLE_C_API "Build C API." OFF) +file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.10/RAPIDS.cmake + ${CMAKE_BINARY_DIR}/RAPIDS.cmake) +include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) +include(rapids-cmake) +include(rapids-cpm) +include(rapids-cuda) +include(rapids-export) +include(rapids-find) + +rapids_cuda_init_architectures(faiss) -if(FAISS_ENABLE_RAFT) - include(fetch_rapids.cmake) -endif() project(faiss - VERSION 1.6.4 - DESCRIPTION "A library for efficient similarity search and clustering of dense vectors." - HOMEPAGE_URL "https://github.com/facebookresearch/faiss" - LANGUAGES CXX) + VERSION 1.6.4 + DESCRIPTION "A library for efficient similarity search and clustering of dense vectors." 
+ HOMEPAGE_URL "https://github.com/facebookresearch/faiss" + LANGUAGES CXX) include(GNUInstallDirs) set(CMAKE_CXX_STANDARD 17) list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") -if(FAISS_ENABLE_RAFT) - include(rapids-cmake) - include(rapids-cpm) - rapids_cpm_init() - include(rapids-cuda) - include(rapids-export) - include(rapids-find) - rapids_cuda_init_architectures(faiss) - include(cmake/thirdparty/get_raft.cmake) -endif() +# Valid values are "generic", "avx2". +option(FAISS_OPT_LEVEL "" "generic") +option(FAISS_ENABLE_GPU "Enable support for GPU indexes." ON) +option(FAISS_ENABLE_RAFT "Enable RAFT for GPU indexes." OFF) +option(FAISS_ENABLE_PYTHON "Build Python extension." ON) +option(FAISS_ENABLE_C_API "Build C API." OFF) if(FAISS_ENABLE_GPU) set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) enable_language(CUDA) endif() +if(FAISS_ENABLE_RAFT) + rapids_cpm_init() + include(cmake/thirdparty/get_raft.cmake) +endif() + add_subdirectory(faiss) if(FAISS_ENABLE_GPU) diff --git a/build.sh b/build.sh index 25000112e5..a37468d665 100755 --- a/build.sh +++ b/build.sh @@ -32,7 +32,7 @@ cmake \ -DFAISS_ENABLE_RAFT=ON \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ -DFAISS_OPT_LEVEL=avx2 \ - -DRAFT_NVTX=ON \ + -DRAFT_NVTX=OFF \ -DCMAKE_CUDA_ARCHITECTURES="NATIVE" \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \ diff --git a/cmake/thirdparty/get_raft.cmake b/cmake/thirdparty/get_raft.cmake index c16b4ad489..5a06fa1ae7 100644 --- a/cmake/thirdparty/get_raft.cmake +++ b/cmake/thirdparty/get_raft.cmake @@ -16,8 +16,8 @@ set(RAFT_VERSION "22.10") -set(RAFT_FORK "achirkin") -set(RAFT_PINNED_TAG "enh-knn-kmeans-more-gpu") +set(RAFT_FORK "rapidsai") +set(RAFT_PINNED_TAG "branch-22.10") function(find_and_configure_raft) set(oneValueArgs VERSION FORK PINNED_TAG) diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt index f157e6e7ec..4cd8160f17 100644 --- a/faiss/gpu/CMakeLists.txt +++ b/faiss/gpu/CMakeLists.txt @@ -163,8 +163,8 @@ 
set(FAISS_GPU_HEADERS ) if(FAISS_ENABLE_RAFT) - list(APPEND FAISS_GPU_HEADERS raft/RaftIndexIVFFlat.h) - list(APPEND FAISS_GPU_SRC raft/RaftIndexIVFFlat.cu) + list(APPEND FAISS_GPU_HEADERS raft/RaftIndexIVFFlat.h raft/RaftIndexIVFPQ.h) + list(APPEND FAISS_GPU_SRC raft/RaftIndexIVFFlat.cu raft/RaftIndexIVFPQ.cu) endif() # Export FAISS_GPU_HEADERS variable to parent scope. diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu index f671b9e5ec..ba1056f04a 100644 --- a/faiss/gpu/GpuDistance.cu +++ b/faiss/gpu/GpuDistance.cu @@ -14,6 +14,11 @@ #include #include +#ifdef FAISS_ENABLE_RAFT +// TODO: Expose fused_l2_knn +#include +#endif + namespace faiss { namespace gpu { @@ -103,7 +108,7 @@ void bfKnnConvert(GpuResourcesProvider* prov, const GpuDistanceParams& args) { // Since we've guaranteed that all arguments are on device, call the // implementation -#if defined FAISS_ENABLE_RAFT +#ifdef FAISS_ENABLE_RAFT // TODO: When k <= 64, invoke bfknn from RAFT if (args.k <= 64) { diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index f80a59357b..fe77aa1d1e 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -40,7 +40,10 @@ RaftIndexIVFFlat::RaftIndexIVFFlat( faiss::MetricType metric, GpuIndexIVFFlatConfig config) : GpuIndexIVFFlat(provider, dims, nlist, metric, config), - raft_handle(resources_->getDefaultStream(config_.device)) {} + raft_handle(resources_->getDefaultStream(config_.device)) { + + std::cout << "In raft index constructor" << std::endl; +} RaftIndexIVFFlat::~RaftIndexIVFFlat() { RaftIndexIVFFlat::reset(); diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.h b/faiss/gpu/raft/RaftIndexIVFFlat.h index cd97f426df..d9b6e498ad 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.h +++ b/faiss/gpu/raft/RaftIndexIVFFlat.h @@ -22,9 +22,6 @@ struct IndexIVFFlat; namespace faiss { namespace gpu { -class RaftIVFFlat; -class GpuIndexFlat; - /// Wrapper around the GPU implementation that looks like /// 
faiss::gpu::GpuIndexIVFFlat class RaftIndexIVFFlat : public GpuIndexIVFFlat { diff --git a/faiss/gpu/raft/RaftIndexIVFPQ.cu b/faiss/gpu/raft/RaftIndexIVFPQ.cu new file mode 100644 index 0000000000..8620ec8e1f --- /dev/null +++ b/faiss/gpu/raft/RaftIndexIVFPQ.cu @@ -0,0 +1,396 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace faiss { +namespace gpu { +/** + * GpuIndexIVFPQ( + GpuResourcesProvider* provider, + int dims, + int nlist, + int subQuantizers, + int bitsPerCode, + faiss::MetricType metric, + GpuIndexIVFPQConfig config = GpuIndexIVFPQConfig()) + * @param provider + * @param index + * @param config + */ +RaftIndexIVFPQ::RaftIndexIVFPQ( + GpuResourcesProvider* provider, + const faiss::IndexIVFPQ* index, + GpuIndexIVFPQConfig config) + : GpuIndexIVFPQ(provider, index, config), + pq(index->pq), + ivfpqConfig_(config), + usePrecomputedTables_(config.usePrecomputedTables), + subQuantizers_(0), + bitsPerCode_(0), + reserveMemoryVecs_(0) { + copyFrom(index); +} + +RaftIndexIVFPQ::RaftIndexIVFPQ( + GpuResourcesProvider* provider, + int dims, + int nlist, + int subQuantizers, + int bitsPerCode, + faiss::MetricType metric, + GpuIndexIVFPQConfig config) + : GpuIndexIVFPQ(provider, dims, nlist, subQuantizers, bitsPerCode, metric, config), + pq(dims, subQuantizers, bitsPerCode), + ivfpqConfig_(config), + usePrecomputedTables_(config.usePrecomputedTables), + subQuantizers_(subQuantizers), + bitsPerCode_(bitsPerCode), + reserveMemoryVecs_(0) { + verifySettings_(); + + // We haven't trained ourselves, so don't construct the PQ index yet + this->is_trained = false; +} + +RaftIndexIVFPQ::~RaftIndexIVFPQ() {} + +void RaftIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) { +// DeviceScope 
scope(config_.device); +// +// GpuIndexIVF::copyFrom(index); +// +// // Clear out our old data +// index_.reset(); +// +// pq = index->pq; +// subQuantizers_ = index->pq.M; +// bitsPerCode_ = index->pq.nbits; +// +// // We only support this +// FAISS_THROW_IF_NOT_MSG( +// ivfpqConfig_.interleavedLayout || index->pq.nbits == 8, +// "GPU: only pq.nbits == 8 is supported"); +// FAISS_THROW_IF_NOT_MSG( +// index->by_residual, "GPU: only by_residual = true is supported"); +// FAISS_THROW_IF_NOT_MSG( +// index->polysemous_ht == 0, "GPU: polysemous codes not supported"); +// +// verifySettings_(); +// +// // The other index might not be trained +// if (!index->is_trained) { +// // copied in GpuIndex::copyFrom +// FAISS_ASSERT(!is_trained); +// return; +// } +// +// // Copy our lists as well +// // The product quantizer must have data in it +// FAISS_ASSERT(index->pq.centroids.size() > 0); +// index_.reset(new IVFPQ( +// resources_.get(), +// index->metric_type, +// index->metric_arg, +// quantizer->getGpuData(), +// subQuantizers_, +// bitsPerCode_, +// ivfpqConfig_.useFloat16LookupTables, +// ivfpqConfig_.useMMCodeDistance, +// ivfpqConfig_.interleavedLayout, +// (float*)index->pq.centroids.data(), +// ivfpqConfig_.indicesOptions, +// config_.memorySpace)); +// // Doesn't make sense to reserve memory here +// index_->setPrecomputedCodes(usePrecomputedTables_); +// +// // Copy all of the IVF data +// index_->copyInvertedListsFrom(index->invlists); +} + +void RaftIndexIVFPQ::copyTo(faiss::IndexIVFPQ* index) const { +// DeviceScope scope(config_.device); +// +// // We must have the indices in order to copy to ourselves +// FAISS_THROW_IF_NOT_MSG( +// ivfpqConfig_.indicesOptions != INDICES_IVF, +// "Cannot copy to CPU as GPU index doesn't retain " +// "indices (INDICES_IVF)"); +// +// GpuIndexIVF::copyTo(index); +// +// // +// // IndexIVFPQ information +// // +// index->by_residual = true; +// index->use_precomputed_table = 0; +// index->code_size = subQuantizers_; +// 
index->pq = faiss::ProductQuantizer(this->d, subQuantizers_, bitsPerCode_); +// +// index->do_polysemous_training = false; +// index->polysemous_training = nullptr; +// +// index->scan_table_threshold = 0; +// index->max_codes = 0; +// index->polysemous_ht = 0; +// index->precomputed_table.clear(); +// +// auto ivf = new ArrayInvertedLists(nlist, index->code_size); +// index->replace_invlists(ivf, true); +// +// if (index_) { +// // Copy IVF lists +// index_->copyInvertedListsTo(ivf); +// +// // Copy PQ centroids +// auto devPQCentroids = index_->getPQCentroids(); +// index->pq.centroids.resize(devPQCentroids.numElements()); +// +// fromDevice( +// devPQCentroids, +// index->pq.centroids.data(), +// resources_->getDefaultStream(config_.device)); +// +// if (usePrecomputedTables_) { +// index->precompute_table(); +// } +// } +} + +void RaftIndexIVFPQ::reserveMemory(size_t numVecs) { + reserveMemoryVecs_ = numVecs; + if (index_) { + DeviceScope scope(config_.device); + index_->reserveMemory(numVecs); + } +} + +void RaftIndexIVFPQ::setPrecomputedCodes(bool enable) { + usePrecomputedTables_ = enable; + if (index_) { + DeviceScope scope(config_.device); + index_->setPrecomputedCodes(enable); + } + + verifySettings_(); +} + +bool RaftIndexIVFPQ::getPrecomputedCodes() const { + return usePrecomputedTables_; +} + +int RaftIndexIVFPQ::getNumSubQuantizers() const { + return subQuantizers_; +} + +int RaftIndexIVFPQ::getBitsPerCode() const { + return bitsPerCode_; +} + +int RaftIndexIVFPQ::getCentroidsPerSubQuantizer() const { + return utils::pow2(bitsPerCode_); +} + +size_t RaftIndexIVFPQ::reclaimMemory() { + if (index_) { + DeviceScope scope(config_.device); + return index_->reclaimMemory(); + } + + return 0; +} + +void RaftIndexIVFPQ::reset() { + if (raft_knn_index.has_value()) { + raft_knn_index.reset(); + this->ntotal = 0; + } else { + FAISS_ASSERT(this->ntotal == 0); + } +} + +void RaftIndexIVFPQ::train(Index::idx_t n, const float* x) { + raft::common::nvtx::range 
fun_scope( + "RaftIndexIVFFlat::train (%ld)", n); + + std::cout << "Calling train() with " << n << " rows" << std::endl; + + uint32_t start = raft::curTimeMillis(); + if (this->is_trained) { + FAISS_ASSERT(raft_knn_index.has_value()); + return; + } + + raft::spatial::knn::ivf_pq::index_params raft_idx_params; + raft_idx_params.n_lists = nlist; + raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; + raft_idx_params.add_data_on_build = false; + raft_idx_params.kmeans_n_iters = 100; + + raft_knn_index.emplace( + raft::spatial::knn::ivf_pq::build(raft_handle, raft_idx_params, + const_cast(x), + n, (faiss::Index::idx_t)d)); + + raft_handle.sync_stream(); + uint32_t stop = raft::curTimeMillis(); + + std::cout << "train took " << (stop - start) << "ms. " << std::endl; + this->is_trained = true; +} + +void RaftIndexIVFPQ::addImpl_(int n, const float* x, const Index::idx_t* xids) { + // Device is already set in GpuIndex::add + FAISS_ASSERT(is_trained); + FAISS_ASSERT(n > 0); + + // but keep the ntotal based on the total number of vectors that we + // attempted to add + std::cout << "Calling addImpl_ with " << n << " vectors." 
<< std::endl; + + raft_knn_index.emplace(raft::spatial::knn::ivf_pq::extend( + raft_handle, raft_knn_index.value(), x, xids, (Index::idx_t)n)); + this->ntotal += n; + + ntotal += n; +} + +void RaftIndexIVFPQ::searchImpl_( + int n, + const float* x, + int k, + float* distances, + Index::idx_t* labels) const { + // Device is already set in GpuIndex::search + FAISS_ASSERT(index_); + FAISS_ASSERT(n > 0); + FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= nlist); + + raft::common::nvtx::range fun_scope( + "RaftIndexIVFFlat::searchImpl_ (%ld)", n); + + // Device is already set in GpuIndex::search + FAISS_ASSERT(raft_knn_index.has_value()); + FAISS_ASSERT(n > 0); + FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= nlist); + + raft::spatial::knn::ivf_pq::search_params pams; + pams.n_probes = nprobe; + raft::spatial::knn::ivf_pq::search( + raft_handle, + pams, + *raft_knn_index, + const_cast(x), + static_cast(n), + static_cast(k), + labels, + distances); + + raft_handle.sync_stream(); +} + +int RaftIndexIVFPQ::getListLength(int listId) const { + FAISS_ASSERT(index_); + DeviceScope scope(config_.device); + + return index_->getListLength(listId); +} + +std::vector RaftIndexIVFPQ::getListVectorData( + int listId, + bool gpuFormat) const { + FAISS_ASSERT(index_); + DeviceScope scope(config_.device); + + return index_->getListVectorData(listId, gpuFormat); +} + +std::vector RaftIndexIVFPQ::getListIndices(int listId) const { + FAISS_ASSERT(index_); + DeviceScope scope(config_.device); + + return index_->getListIndices(listId); +} + +void RaftIndexIVFPQ::verifySettings_() const { + // Our implementation has these restrictions: + + // Must have some number of lists + FAISS_THROW_IF_NOT_MSG(nlist > 0, "nlist must be >0"); + + // up to a single byte per code + if (ivfpqConfig_.interleavedLayout) { + FAISS_THROW_IF_NOT_FMT( + bitsPerCode_ == 4 || bitsPerCode_ == 5 || bitsPerCode_ == 6 || + bitsPerCode_ == 8, + "Bits per code must be between 4, 5, 6 or 8 (passed %d)", + bitsPerCode_); + + } else { 
+ FAISS_THROW_IF_NOT_FMT( + bitsPerCode_ == 8, + "Bits per code must be 8 (passed %d)", + bitsPerCode_); + } + + // Sub-quantizers must evenly divide dimensions available + FAISS_THROW_IF_NOT_FMT( + this->d % subQuantizers_ == 0, + "Number of sub-quantizers (%d) must be an " + "even divisor of the number of dimensions (%d)", + subQuantizers_, + this->d); + + // The number of bytes per encoded vector must be one we support + FAISS_THROW_IF_NOT_FMT( + ivfpqConfig_.interleavedLayout || + IVFPQ::isSupportedPQCodeLength(subQuantizers_), + "Number of bytes per encoded vector / sub-quantizers (%d) " + "is not supported", + subQuantizers_); + + // We must have enough shared memory on the current device to store + // our lookup distances + int lookupTableSize = sizeof(float); + if (ivfpqConfig_.useFloat16LookupTables) { + lookupTableSize = sizeof(half); + } + + // 64 bytes per code is only supported with usage of float16, at 2^8 + // codes per subquantizer + size_t requiredSmemSize = + lookupTableSize * subQuantizers_ * utils::pow2(bitsPerCode_); + size_t smemPerBlock = getMaxSharedMemPerBlock(config_.device); + + FAISS_THROW_IF_NOT_FMT( + requiredSmemSize <= getMaxSharedMemPerBlock(config_.device), + "Device %d has %zu bytes of shared memory, while " + "%d bits per code and %d sub-quantizers requires %zu " + "bytes. Consider useFloat16LookupTables and/or " + "reduce parameters", + config_.device, + smemPerBlock, + bitsPerCode_, + subQuantizers_, + requiredSmemSize); +} + +} // namespace gpu +} // namespace faiss diff --git a/faiss/gpu/raft/RaftIndexIVFPQ.h b/faiss/gpu/raft/RaftIndexIVFPQ.h new file mode 100644 index 0000000000..a121681c2c --- /dev/null +++ b/faiss/gpu/raft/RaftIndexIVFPQ.h @@ -0,0 +1,152 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include + +#include +#include + +#include +#include + +namespace faiss { +struct IndexIVFPQ; +} + +namespace faiss { +namespace gpu { + +class GpuIndexFlat; +class IVFPQ; + +/// RAFT IVFPQ index for the GPU +class RaftIndexIVFPQ : public GpuIndexIVFPQ { + public: + /// Construct from a pre-existing faiss::IndexIVFPQ instance, copying + /// data over to the given GPU, if the input index is trained. + RaftIndexIVFPQ( + GpuResourcesProvider* provider, + const faiss::IndexIVFPQ* index, + GpuIndexIVFPQConfig config = GpuIndexIVFPQConfig()); + + /// Construct an empty index + RaftIndexIVFPQ( + GpuResourcesProvider* provider, + int dims, + int nlist, + int subQuantizers, + int bitsPerCode, + faiss::MetricType metric, + GpuIndexIVFPQConfig config = GpuIndexIVFPQConfig()); + + ~RaftIndexIVFPQ() override; + + /// Reserve space on the GPU for the inverted lists for `num` + /// vectors, assumed equally distributed among + + /// Initialize ourselves from the given CPU index; will overwrite + /// all data in ourselves + void copyFrom(const faiss::IndexIVFPQ* index); + + /// Copy ourselves to the given CPU index; will overwrite all data + /// in the index instance + void copyTo(faiss::IndexIVFPQ* index) const; + + /// Reserve GPU memory in our inverted lists for this number of vectors + void reserveMemory(size_t numVecs); + + /// Enable or disable pre-computed codes + void setPrecomputedCodes(bool enable); + + /// Are pre-computed codes enabled? + bool getPrecomputedCodes() const; + + /// Return the number of sub-quantizers we are using + int getNumSubQuantizers() const; + + /// Return the number of bits per PQ code + int getBitsPerCode() const; + + /// Return the number of centroids per PQ code (2^bits per code) + int getCentroidsPerSubQuantizer() const; + + /// After adding vectors, one can call this to reclaim device memory + /// to exactly the amount needed. 
Returns space reclaimed in bytes + size_t reclaimMemory(); + + /// Clears out all inverted lists, but retains the coarse and + /// product centroid information + void reset() override; + + /// Trains the coarse and product quantizer based on the given vector data + void train(Index::idx_t n, const float* x) override; + + /// Returns the number of vectors present in a particular inverted list + int getListLength(int listId) const override; + + /// Return the encoded vector data contained in a particular inverted list, + /// for debugging purposes. + /// If gpuFormat is true, the data is returned as it is encoded in the + /// GPU-side representation. + /// Otherwise, it is converted to the CPU format. + /// compliant format, while the native GPU format may differ. + std::vector getListVectorData(int listId, bool gpuFormat = false) + const override; + + /// Return the vector indices contained in a particular inverted list, for + /// debugging purposes. + std::vector getListIndices(int listId) const override; + + public: + /// Like the CPU version, we expose a publically-visible ProductQuantizer + /// for manipulation + ProductQuantizer pq; + + protected: + /// Called from GpuIndex for add/add_with_ids + void addImpl_(int n, const float* x, const Index::idx_t* ids) override; + + /// Called from GpuIndex for search + void searchImpl_( + int n, + const float* x, + int k, + float* distances, + Index::idx_t* labels) const override; + + /// Throws errors if configuration settings are improper + void verifySettings_() const; + + protected: + /// Our configuration options that we were initialized with + const GpuIndexIVFPQConfig ivfpqConfig_; + + /// Runtime override: whether or not we use precomputed tables + bool usePrecomputedTables_; + + /// Number of sub-quantizers per encoded vector + int subQuantizers_; + + /// Bits per sub-quantizer code + int bitsPerCode_; + + /// Desired inverted list memory reservation + size_t reserveMemoryVecs_; + + /// The product quantizer 
instance that we own; contains the + /// inverted lists + std::unique_ptr index_; + + const raft::handle_t raft_handle; + std::optional> raft_knn_index{std::nullopt}; +}; + +} // namespace gpu +} // namespace faiss diff --git a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp index d56246860f..8d784c0593 100644 --- a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp @@ -138,6 +138,7 @@ void queryTest( std::cout << "numTrain: " << opt.numTrain << "numCentroids: " << opt.numCentroids << std::endl; + printf("Creating rmm resources\n"); faiss::gpu::RmmGpuResources res; res.noTempMemory(); @@ -151,13 +152,20 @@ void queryTest( // and the RAFT indexes. We will probably want to perform a bfknn as // ground truth and then compare the recall for both the RAFT and FAISS // indices. - raft::handle_t raft_handle; + printf("Building raft index\n"); faiss::gpu::RaftIndexIVFFlat raftIndex( &res, opt.dim, opt.numCentroids, metricType, config); -// faiss::gpu::GpuIndexIVFFlat gpuIndex( -// &res, opt.dim, opt.numCentroids, metricType, config); + printf("Done.\n"); + + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, opt.dim, opt.numCentroids, metricType, config); + + + printf("Creating raft handle\n"); + raft::handle_t raft_handle; + printf("Done\n"); std::cout << "Training raft index" << std::endl; uint32_t r_train_start = raft::curTimeMillis(); @@ -166,22 +174,22 @@ void queryTest( uint32_t r_train_stop = raft::curTimeMillis(); std::cout << "Raft train time " << r_train_start << " " << r_train_stop << " " << (r_train_stop - r_train_start) << std::endl; -// std::cout << "Training gpu index" << std::endl; -// uint32_t g_train_start = raft::curTimeMillis(); -// train_index(raft_handle, opt, gpuIndex, trainVecs, addVecs); -// raft_handle.sync_stream(); -// uint32_t g_train_stop = raft::curTimeMillis(); -// std::cout << "FAISS train time " << g_train_start << " " << g_train_stop << " " << (g_train_stop - g_train_start) 
<< std::endl; + std::cout << "Training gpu index" << std::endl; + uint32_t g_train_start = raft::curTimeMillis(); + train_index(raft_handle, opt, gpuIndex, trainVecs, addVecs); + raft_handle.sync_stream(); + uint32_t g_train_stop = raft::curTimeMillis(); + std::cout << "FAISS train time " << g_train_start << " " << g_train_stop << " " << (g_train_stop - g_train_start) << std::endl; -// std::cout << "Computing ground truth" << std::endl; -// rmm::device_uvector ref_inds(opt.numQuery * opt.k, raft_handle.get_stream()); -// rmm::device_uvector ref_dists(opt.numQuery * opt.k, raft_handle.get_stream()); -// -// invoke_bfknn(raft_handle, opt, ref_dists.data(), ref_inds.data(), metricType, addVecs, queryVecs); -// -// std::cout << "Done." << std::endl; -// raft::print_device_vector("ref_dists", ref_dists.data(), opt.k, std::cout); -// raft::print_device_vector("ref_inds", ref_inds.data(), opt.k, std::cout); + std::cout << "Computing ground truth" << std::endl; + rmm::device_uvector ref_inds(opt.numQuery * opt.k, raft_handle.get_stream()); + rmm::device_uvector ref_dists(opt.numQuery * opt.k, raft_handle.get_stream()); + + invoke_bfknn(raft_handle, opt, ref_dists.data(), ref_inds.data(), metricType, addVecs, queryVecs); + + std::cout << "Done." 
<< std::endl; + raft::print_device_vector("ref_dists", ref_dists.data(), opt.k, std::cout); + raft::print_device_vector("ref_inds", ref_inds.data(), opt.k, std::cout); rmm::device_uvector raft_inds(opt.numQuery * opt.k, raft_handle.get_stream()); rmm::device_uvector raft_dists(opt.numQuery * opt.k, raft_handle.get_stream()); @@ -201,18 +209,18 @@ void queryTest( rmm::device_uvector gpu_inds(opt.numQuery * opt.k, raft_handle.get_stream()); rmm::device_uvector gpu_dists(opt.numQuery * opt.k, raft_handle.get_stream()); -// uint32_t gstart = raft::curTimeMillis(); -// gpuIndex.search( -// opt.numQuery, -// queryVecs.data(), -// opt.k, -// gpu_dists.data(), -// gpu_inds.data()); -// -// raft_handle.sync_stream(); -// uint32_t gstop = raft::curTimeMillis(); -// -// std::cout << "FAISS query time " << gstart << " " << gstop << " " << (gstop - gstart) << std::endl; + uint32_t gstart = raft::curTimeMillis(); + gpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + gpu_dists.data(), + gpu_inds.data()); + + raft_handle.sync_stream(); + uint32_t gstop = raft::curTimeMillis(); + + std::cout << "FAISS query time " << gstart << " " << gstop << " " << (gstop - gstart) << std::endl; // TODO: Compare recall, perhaps by adding the indices/distances to a hashmap. From 410b2c654e19a2ade5d314c35cadd7c108a85336 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Mon, 10 Oct 2022 16:34:18 -0400 Subject: [PATCH 26/87] Updates --- CMakeLists.txt | 9 +- .../thirdparty/fetch_rapids.cmake | 12 +- cmake/thirdparty/get_raft.cmake | 4 +- faiss/gpu/test/TestRaftIndexIVFPQ.cpp | 704 ++++++++++++++++++ 4 files changed, 718 insertions(+), 11 deletions(-) rename fetch_rapids.cmake => cmake/thirdparty/fetch_rapids.cmake (69%) create mode 100644 faiss/gpu/test/TestRaftIndexIVFPQ.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 7776821c7a..ded2d8635a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,11 +4,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -cmake_minimum_required(VERSION 3.17 FATAL_ERROR) +cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) -file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.10/RAPIDS.cmake - ${CMAKE_BINARY_DIR}/RAPIDS.cmake) -include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) +if(FAISS_ENABLE_RAFT) +include(cmake/thirdparty/fetch_rapids.cmake) include(rapids-cmake) include(rapids-cpm) include(rapids-cuda) @@ -16,7 +15,7 @@ include(rapids-export) include(rapids-find) rapids_cuda_init_architectures(faiss) - +endif() project(faiss VERSION 1.7.2 diff --git a/fetch_rapids.cmake b/cmake/thirdparty/fetch_rapids.cmake similarity index 69% rename from fetch_rapids.cmake rename to cmake/thirdparty/fetch_rapids.cmake index 0589dc9ddb..0befc2fd5d 100644 --- a/fetch_rapids.cmake +++ b/cmake/thirdparty/fetch_rapids.cmake @@ -11,7 +11,11 @@ # or implied. See the License for the specific language governing permissions and limitations under # the License. 
# ============================================================================= -file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.10/RAPIDS.cmake - ${CMAKE_BINARY_DIR}/RAPIDS.cmake - ) -include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) + +set(RAPIDS_VERSION "22.10") + +if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/FAISS_RAPIDS.cmake) + file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake + ${CMAKE_CURRENT_BINARY_DIR}/FAISS_RAPIDS.cmake) +endif() +include(${CMAKE_CURRENT_BINARY_DIR}/FAISS_RAPIDS.cmake) diff --git a/cmake/thirdparty/get_raft.cmake b/cmake/thirdparty/get_raft.cmake index 5a06fa1ae7..91f53b0f4d 100644 --- a/cmake/thirdparty/get_raft.cmake +++ b/cmake/thirdparty/get_raft.cmake @@ -15,9 +15,9 @@ #============================================================================= -set(RAFT_VERSION "22.10") +set(RAFT_VERSION "${RAPIDS_VERSION}") set(RAFT_FORK "rapidsai") -set(RAFT_PINNED_TAG "branch-22.10") +set(RAFT_PINNED_TAG "branch-${RAPIDS_VERSION}") function(find_and_configure_raft) set(oneValueArgs VERSION FORK PINNED_TAG) diff --git a/faiss/gpu/test/TestRaftIndexIVFPQ.cpp b/faiss/gpu/test/TestRaftIndexIVFPQ.cpp new file mode 100644 index 0000000000..61a3c8870e --- /dev/null +++ b/faiss/gpu/test/TestRaftIndexIVFPQ.cpp @@ -0,0 +1,704 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +// FIXME: figure out a better way to test fp16 +constexpr float kF16MaxRelErr = 0.3f; +constexpr float kF32MaxRelErr = 0.03f; + +struct Options { + Options() { + numAdd = 2 * faiss::gpu::randVal(50000, 70000); + dim = faiss::gpu::randVal(64, 200); + + numCentroids = std::sqrt((float)numAdd / 2); + numTrain = numCentroids * 50; + nprobe = faiss::gpu::randVal(std::min(10, numCentroids), numCentroids); + numQuery = numAdd / 10;//faiss::gpu::randVal(32, 100); + + // Due to the approximate nature of the query and of floating point + // differences between GPU and CPU, to stay within our error bounds, + // only use a small k + k = std::min(faiss::gpu::randVal(10, 30), numAdd / 40); + indicesOpt = faiss::gpu::randSelect( + {faiss::gpu::INDICES_CPU, + faiss::gpu::INDICES_32_BIT, + faiss::gpu::INDICES_64_BIT}); + + device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + } + + std::string toString() const { + std::stringstream str; + str << "IVFFlat device " << device << " numVecs " << numAdd << " dim " + << dim << " numCentroids " << numCentroids << " nprobe " << nprobe + << " numQuery " << numQuery << " k " << k << " indicesOpt " + << indicesOpt; + + return str.str(); + } + + int numAdd; + int dim; + int numCentroids; + int numTrain; + int nprobe; + int numQuery; + int k; + int device; + faiss::gpu::IndicesOptions indicesOpt; +}; + +template +void train_index(const raft::handle_t &raft_handle, Options &opt, idx_type &index, std::vector &trainVecs, std::vector &addVecs) { + + uint32_t train_start = raft::curTimeMillis(); + index.train(opt.numTrain, trainVecs.data()); + raft_handle.sync_stream(); + uint32_t train_stop = raft::curTimeMillis(); + + uint32_t add_start = raft::curTimeMillis(); + index.add(opt.numAdd, addVecs.data()); + raft_handle.sync_stream(); + uint32_t add_stop = raft::curTimeMillis(); +// 
index.train(opt.numTrain, trainVecs.data()); + index.setNumProbes(opt.nprobe); + + std::cout << "train=" << (train_stop - train_start) << ", add=" << (add_stop - add_start) << std::endl; +} + + +void invoke_bfknn(const raft::handle_t &raft_handle, Options &opt, float *dists, faiss::Index::idx_t *inds, faiss::MetricType m, + std::vector &addVecs, std::vector &queryVecs) { + + + + faiss::gpu::RmmGpuResources gpu_res; + gpu_res.setDefaultStream(opt.device, raft_handle.get_stream()); + + rmm::device_uvector addVecsDev(addVecs.size(), raft_handle.get_stream()); + raft::copy(addVecsDev.data(), addVecs.data(), addVecs.size(), raft_handle.get_stream()); + + rmm::device_uvector queryVecsDev(queryVecs.size(), raft_handle.get_stream()); + raft::copy(queryVecsDev.data(), queryVecs.data(), queryVecs.size(), raft_handle.get_stream()); + + faiss::gpu::GpuDistanceParams args; + args.metric = m; + args.k = opt.k; + args.dims = opt.dim; + args.vectors = addVecs.data(); + args.vectorsRowMajor = true; + args.numVectors = opt.numAdd; + args.queries = queryVecs.data(); + args.queriesRowMajor = true; + args.numQueries = opt.numQuery; + args.outDistances = dists; + args.outIndices = inds; + args.outIndicesType = faiss::gpu::IndicesDataType::I64; + + /** + * @todo: Until FAISS supports pluggable allocation strategies, + * we will not reap the benefits of the pool allocator for + * avoiding device-wide synchronizations from cudaMalloc/cudaFree + */ + bfKnn(&gpu_res, args); +} + +void queryTest( + faiss::MetricType metricType, + bool useFloat16CoarseQuantizer, + int dimOverride = -1) { + for (int tries = 0; tries < 2; ++tries) { + Options opt; + opt.dim = dimOverride != -1 ? 
dimOverride : opt.dim; + + std::vector trainVecs = + faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + + std::cout << "numTrain: " << opt.numTrain << "numCentroids: " << opt.numCentroids << std::endl; + + printf("Creating rmm resources\n"); + faiss::gpu::RmmGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFPQConfig config; + config.device = opt.device; + config.indicesOptions = opt.indicesOpt; + config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + + // TODO: Since we are modifying the centroids when adding new vectors, + // the neighbors are no longer going to match completely between CPU + // and the RAFT indexes. We will probably want to perform a bfknn as + // ground truth and then compare the recall for both the RAFT and FAISS + // indices. + + printf("Building raft index\n"); + faiss::gpu::RaftIndexIVFPQ raftIndex( + &res, opt.dim, opt.numCentroids, metricType, config); + + printf("Done.\n"); + + faiss::gpu::GpuIndexIVFPQ gpuIndex( + &res, opt.dim, opt.numCentroids, metricType, config); + + + printf("Creating raft handle\n"); + raft::handle_t raft_handle; + printf("Done\n"); + + std::cout << "Training raft index" << std::endl; + uint32_t r_train_start = raft::curTimeMillis(); + train_index(raft_handle, opt, raftIndex, trainVecs, addVecs); + raft_handle.sync_stream(); + uint32_t r_train_stop = raft::curTimeMillis(); + std::cout << "Raft train time " << r_train_start << " " << r_train_stop << " " << (r_train_stop - r_train_start) << std::endl; + + std::cout << "Training gpu index" << std::endl; + uint32_t g_train_start = raft::curTimeMillis(); + train_index(raft_handle, opt, gpuIndex, trainVecs, addVecs); + raft_handle.sync_stream(); + uint32_t g_train_stop = raft::curTimeMillis(); + std::cout << "FAISS train time " << g_train_start << " " << g_train_stop << " " << (g_train_stop - g_train_start) << 
std::endl; + + std::cout << "Computing ground truth" << std::endl; + rmm::device_uvector ref_inds(opt.numQuery * opt.k, raft_handle.get_stream()); + rmm::device_uvector ref_dists(opt.numQuery * opt.k, raft_handle.get_stream()); + + invoke_bfknn(raft_handle, opt, ref_dists.data(), ref_inds.data(), metricType, addVecs, queryVecs); + + std::cout << "Done." << std::endl; + raft::print_device_vector("ref_dists", ref_dists.data(), opt.k, std::cout); + raft::print_device_vector("ref_inds", ref_inds.data(), opt.k, std::cout); + + rmm::device_uvector raft_inds(opt.numQuery * opt.k, raft_handle.get_stream()); + rmm::device_uvector raft_dists(opt.numQuery * opt.k, raft_handle.get_stream()); + + uint32_t rstart = raft::curTimeMillis(); + raftIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + raft_dists.data(), + raft_inds.data()); + + raft_handle.sync_stream(); + uint32_t rstop = raft::curTimeMillis(); + std::cout << "Raft query time " << rstart << " " << rstop << " " << (rstop - rstart) << std::endl; + + rmm::device_uvector gpu_inds(opt.numQuery * opt.k, raft_handle.get_stream()); + rmm::device_uvector gpu_dists(opt.numQuery * opt.k, raft_handle.get_stream()); + + uint32_t gstart = raft::curTimeMillis(); + gpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + gpu_dists.data(), + gpu_inds.data()); + + raft_handle.sync_stream(); + uint32_t gstop = raft::curTimeMillis(); + + std::cout << "FAISS query time " << gstart << " " << gstop << " " << (gstop - gstart) << std::endl; + + // TODO: Compare recall, perhaps by adding the indices/distances to a hashmap. 
+ + raft::print_device_vector("raft_dists", raft_dists.data(), opt.k, std::cout); + raft::print_device_vector("raft_inds", raft_inds.data(), opt.k, std::cout); + +// raft::print_device_vector("gpu_dists", gpu_dists.data(), opt.k, std::cout); +// raft::print_device_vector("gpu_inds", gpu_inds.data(), opt.k, std::cout); + +// +// bool compFloat16 = useFloat16CoarseQuantizer; +// faiss::gpu::compareIndices( +// cpuIndex, +// gpuIndex, +// opt.numQuery, +// opt.dim, +// opt.k, +// opt.toString(), +// compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, +// // FIXME: the fp16 bounds are +// // useless when math (the accumulator) is +// // in fp16. Figure out another way to test +// compFloat16 ? 0.70f : 0.1f, +// compFloat16 ? 0.65f : 0.015f); + } +} + +void addTest(faiss::MetricType metricType, bool useFloat16CoarseQuantizer) { + for (int tries = 0; tries < 2; ++tries) { + Options opt; + + std::vector trainVecs = + faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::IndexFlatL2 quantizerL2(opt.dim); + faiss::IndexFlatIP quantizerIP(opt.dim); + faiss::Index* quantizer = metricType == faiss::METRIC_L2 + ? 
(faiss::Index*)&quantizerL2 + : (faiss::Index*)&quantizerIP; + + faiss::IndexIVFFlat cpuIndex( + quantizer, opt.dim, opt.numCentroids, metricType); + cpuIndex.train(opt.numTrain, trainVecs.data()); + cpuIndex.nprobe = opt.nprobe; + + faiss::gpu::RmmGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFPQConfig config; + config.device = opt.device; + config.indicesOptions = opt.indicesOpt; + config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + + faiss::gpu::RaftIndexIVFPQ gpuIndex( + &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); + gpuIndex.copyFrom(&cpuIndex); + gpuIndex.setNumProbes(opt.nprobe); + + cpuIndex.add(opt.numAdd, addVecs.data()); + gpuIndex.add(opt.numAdd, addVecs.data()); + + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 0.30f : 0.015f); + } +} + +void copyToTest(bool useFloat16CoarseQuantizer) { + Options opt; + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::gpu::RmmGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFPQConfig config; + config.device = opt.device; + config.indicesOptions = opt.indicesOpt; + config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + + faiss::gpu::RaftIndexIVFPQ gpuIndex( + &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); + gpuIndex.train(opt.numTrain, trainVecs.data()); + gpuIndex.add(opt.numAdd, addVecs.data()); + gpuIndex.setNumProbes(opt.nprobe); + + // use garbage values to see if we overwrite then + faiss::IndexFlatL2 cpuQuantizer(1); + faiss::IndexIVFFlat cpuIndex(&cpuQuantizer, 1, 1, faiss::METRIC_L2); + cpuIndex.nprobe = 1; + + gpuIndex.copyTo(&cpuIndex); + + EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); + EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); + + 
EXPECT_EQ(cpuIndex.d, gpuIndex.d); + EXPECT_EQ(cpuIndex.quantizer->d, gpuIndex.quantizer->d); + EXPECT_EQ(cpuIndex.d, opt.dim); + EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); + EXPECT_EQ(cpuIndex.nprobe, gpuIndex.getNumProbes()); + + testIVFEquality(cpuIndex, gpuIndex); + + // Query both objects; results should be equivalent + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 0.30f : 0.015f); +} + +void copyFromTest(bool useFloat16CoarseQuantizer) { + Options opt; + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::IndexFlatL2 cpuQuantizer(opt.dim); + faiss::IndexIVFFlat cpuIndex( + &cpuQuantizer, opt.dim, opt.numCentroids, faiss::METRIC_L2); + cpuIndex.nprobe = opt.nprobe; + cpuIndex.train(opt.numTrain, trainVecs.data()); + cpuIndex.add(opt.numAdd, addVecs.data()); + + // use garbage values to see if we overwrite then + faiss::gpu::RmmGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFPQConfig config; + config.device = opt.device; + config.indicesOptions = opt.indicesOpt; + config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + + faiss::gpu::RaftIndexIVFPQ gpuIndex(&res, 1, 1, faiss::METRIC_L2, config); + gpuIndex.setNumProbes(1); + + gpuIndex.copyFrom(&cpuIndex); + + EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); + EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); + + EXPECT_EQ(cpuIndex.d, gpuIndex.d); + EXPECT_EQ(cpuIndex.d, opt.dim); + EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); + EXPECT_EQ(cpuIndex.nprobe, gpuIndex.getNumProbes()); + + testIVFEquality(cpuIndex, gpuIndex); + + // Query both objects; results should be equivalent + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, + 
opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 0.30f : 0.015f); +} + +//TEST(TestRaftIndexIVFPQ, Float32_32_Add_L2) { +// addTest(faiss::METRIC_L2, false); +// printf("Finished addTest(faiss::METRIC_L2, false)\n"); +//} +// +//TEST(TestRaftIndexIVFPQ, Float32_32_Add_IP) { +// addTest(faiss::METRIC_INNER_PRODUCT, false); +// printf("Finished addTest(faiss::METRIC_INNER_PRODUCT, false)\n"); +//} +// +//TEST(TestRaftIndexIVFPQ, Float16_32_Add_L2) { +// addTest(faiss::METRIC_L2, true); +// printf("Finished addTest(faiss::METRIC_L2, true)\n"); +//} +// +//TEST(TestRaftIndexIVFPQ, Float16_32_Add_IP) { +// addTest(faiss::METRIC_INNER_PRODUCT, true); +// printf("Finished addTest(faiss::METRIC_INNER_PRODUCT, true)\n"); +//} + +// +// General query tests +// + +TEST(TestRaftIndexIVFPQ, Float32_Query_L2) { +queryTest(faiss::METRIC_L2, false); +printf("Finished queryTest(faiss::METRIC_L2, false);\n"); +} + +//TEST(TestRaftIndexIVFPQ, Float32_Query_IP) { +// queryTest(faiss::METRIC_INNER_PRODUCT, false); +// printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false)\n"); +//} + +// float16 coarse quantizer + +TEST(TestRaftIndexIVFPQ, Float16_32_Query_L2) { +queryTest(faiss::METRIC_L2, true); +printf("Finished queryTest(faiss::METRIC_L2, true)\n"); +} + +//TEST(TestRaftIndexIVFPQ, Float16_32_Query_IP) { +// queryTest(faiss::METRIC_INNER_PRODUCT, true); +// printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, true)\n"); +//} + +// +// There are IVF list scanning specializations for 64-d and 128-d that we +// make sure we explicitly test here +// + +TEST(TestRaftIndexIVFPQ, Float32_Query_L2_64) { +queryTest(faiss::METRIC_L2, false, 64); +printf("Finished queryTest(faiss::METRIC_L2, false, 64)\n"); +} + +//TEST(TestRaftIndexIVFPQ, Float32_Query_IP_64) { +// queryTest(faiss::METRIC_INNER_PRODUCT, false, 64); +// printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false, 64)\n"); +//} + 
+TEST(TestRaftIndexIVFPQ, Float32_Query_L2_128) { +queryTest(faiss::METRIC_L2, false, 128); +printf("Finished queryTest(faiss::METRIC_L2, false, 128)\n"); +} + +//TEST(TestRaftIndexIVFPQ, Float32_Query_IP_128) { +// queryTest(faiss::METRIC_INNER_PRODUCT, false, 128); +// printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false, 128)\n"); +//} + +// +// Copy tests +// + +/** TODO: test crashes */ +// TEST(TestRaftIndexIVFPQ, Float32_32_CopyTo) { +// copyToTest(false); +// printf("Finished copyToTest(false)\n"); +// } + +//TEST(TestRaftIndexIVFPQ, Float32_32_CopyFrom) { +// copyFromTest(false); +// printf("Finished copyFromTest(false)\n"); +//} + +//TEST(TestRaftIndexIVFPQ, Float32_negative) { +// Options opt; +// +// auto trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); +// auto addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); +// +// // Put all vecs on negative side +// for (auto& f : trainVecs) { +// f = std::abs(f) * -1.0f; +// } +// +// for (auto& f : addVecs) { +// f *= std::abs(f) * -1.0f; +// } +// +// faiss::IndexFlatIP quantizerIP(opt.dim); +// faiss::Index* quantizer = (faiss::Index*)&quantizerIP; +// +// faiss::IndexIVFFlat cpuIndex( +// quantizer, opt.dim, opt.numCentroids, faiss::METRIC_INNER_PRODUCT); +// cpuIndex.train(opt.numTrain, trainVecs.data()); +// cpuIndex.add(opt.numAdd, addVecs.data()); +// cpuIndex.nprobe = opt.nprobe; +// +// faiss::gpu::RmmGpuResources res; +// res.noTempMemory(); +// +// faiss::gpu::GpuIndexIVFPQConfig config; +// config.device = opt.device; +// config.indicesOptions = opt.indicesOpt; +// +// faiss::gpu::RaftIndexIVFPQ gpuIndex( +// &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); +// gpuIndex.copyFrom(&cpuIndex); +// gpuIndex.setNumProbes(opt.nprobe); +// +// // Construct a positive test set +// auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); +// +// // Put all vecs on positive size +// for (auto& f : queryVecs) { +// f = std::abs(f); +// } +// +// bool compFloat16 = false; 
+// faiss::gpu::compareIndices( +// queryVecs, +// cpuIndex, +// gpuIndex, +// opt.numQuery, +// opt.dim, +// opt.k, +// opt.toString(), +// compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, +// // FIXME: the fp16 bounds are +// // useless when math (the accumulator) is +// // in fp16. Figure out another way to test +// compFloat16 ? 0.99f : 0.1f, +// compFloat16 ? 0.65f : 0.015f); +//} + +// +// NaN tests +// + +/** TODO: test crashes */ +// TEST(TestRaftIndexIVFPQ, QueryNaN) { +// Options opt; + +// std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, +// opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, +// opt.dim); + +// faiss::gpu::RmmGpuResources res; +// res.noTempMemory(); + +// faiss::gpu::GpuIndexIVFPQConfig config; +// config.device = opt.device; +// config.indicesOptions = opt.indicesOpt; +// config.flatConfig.useFloat16 = faiss::gpu::randBool(); + +// faiss::gpu::RaftIndexIVFPQ gpuIndex( +// &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); +// gpuIndex.setNumProbes(opt.nprobe); + +// gpuIndex.train(opt.numTrain, trainVecs.data()); +// gpuIndex.add(opt.numAdd, addVecs.data()); + +// int numQuery = 10; +// std::vector nans( +// numQuery * opt.dim, std::numeric_limits::quiet_NaN()); + +// std::vector distances(numQuery * opt.k, 0); +// std::vector indices(numQuery * opt.k, 0); + +// gpuIndex.search( +// numQuery, nans.data(), opt.k, distances.data(), indices.data()); + +// for (int q = 0; q < numQuery; ++q) { +// for (int k = 0; k < opt.k; ++k) { +// EXPECT_EQ(indices[q * opt.k + k], -1); +// EXPECT_EQ( +// distances[q * opt.k + k], +// std::numeric_limits::max()); +// } +// } +// } + +/** TODO: test crashes */ +// TEST(TestRaftIndexIVFPQ, AddNaN) { +// Options opt; + +// faiss::gpu::RmmGpuResources res; +// res.noTempMemory(); + +// faiss::gpu::GpuIndexIVFPQConfig config; +// config.device = opt.device; +// config.indicesOptions = opt.indicesOpt; +// config.flatConfig.useFloat16 = faiss::gpu::randBool(); + +// 
faiss::gpu::RaftIndexIVFPQ gpuIndex( +// &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); +// gpuIndex.setNumProbes(opt.nprobe); + +// int numNans = 10; +// std::vector nans( +// numNans * opt.dim, std::numeric_limits::quiet_NaN()); + +// // Make one vector valid (not the first vector, in order to test offset +// // issues), which should actually add +// for (int i = 0; i < opt.dim; ++i) { +// nans[opt.dim + i] = i; +// } + +// std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, +// opt.dim); gpuIndex.train(opt.numTrain, trainVecs.data()); + +// // should not crash +// EXPECT_EQ(gpuIndex.ntotal, 0); +// gpuIndex.add(numNans, nans.data()); + +// std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, +// opt.dim); std::vector distance(opt.numQuery * opt.k, 0); +// std::vector indices(opt.numQuery * opt.k, 0); + +// // should not crash +// gpuIndex.search( +// opt.numQuery, +// queryVecs.data(), +// opt.k, +// distance.data(), +// indices.data()); +// } + +//TEST(TestRaftIndexIVFPQ, UnifiedMemory) { +// // Construct on a random device to test multi-device, if we have +// // multiple devices +// int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); +// +// if (!faiss::gpu::getFullUnifiedMemSupport(device)) { +// return; +// } +// +// int dim = 128; +// +// int numCentroids = 256; +// // Unfortunately it would take forever to add 24 GB in IVFPQ data, +// // so just perform a small test with data allocated in the unified +// // memory address space +// size_t numAdd = 10000; +// size_t numTrain = numCentroids * 40; +// int numQuery = 10; +// int k = 10; +// int nprobe = 8; +// +// std::vector trainVecs = faiss::gpu::randVecs(numTrain, dim); +// std::vector addVecs = faiss::gpu::randVecs(numAdd, dim); +// +// faiss::IndexFlatL2 quantizer(dim); +// faiss::IndexIVFFlat cpuIndex( +// &quantizer, dim, numCentroids, faiss::METRIC_L2); +// +// cpuIndex.train(numTrain, trainVecs.data()); +// cpuIndex.add(numAdd, addVecs.data()); +// 
cpuIndex.nprobe = nprobe; +// +// faiss::gpu::RmmGpuResources res; +// res.noTempMemory(); +// +// faiss::gpu::GpuIndexIVFPQConfig config; +// config.device = device; +// config.memorySpace = faiss::gpu::MemorySpace::Unified; +// +// faiss::gpu::RaftIndexIVFPQ gpuIndex( +// &res, dim, numCentroids, faiss::METRIC_L2, config); +// gpuIndex.copyFrom(&cpuIndex); +// gpuIndex.setNumProbes(nprobe); +// +// faiss::gpu::compareIndices( +// cpuIndex, +// gpuIndex, +// numQuery, +// dim, +// k, +// "Unified Memory", +// kF32MaxRelErr, +// 0.1f, +// 0.015f); +//} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + + // just run with a fixed test seed + faiss::gpu::setTestSeed(100); + + return RUN_ALL_TESTS(); +} From d7ca6b48711ce53a5edd8fffc119122d520390c0 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 10 Oct 2022 16:50:31 -0400 Subject: [PATCH 27/87] Adding FAISS_ENABLE_RAFT option to INSTALL.md --- INSTALL.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/INSTALL.md b/INSTALL.md index 9d928a4ea4..e0b221a812 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -94,6 +94,9 @@ Several options can be passed to CMake, among which: values are `ON` and `OFF`), - `-DFAISS_ENABLE_PYTHON=OFF` in order to disable building python bindings (possible values are `ON` and `OFF`), + - `-DFAISS_ENABLE_RAFT=ON` in order to enable building the RAFT implementations + of the IVF-Flat and IVF-PQ GPU-accelerated indices (default is `OFF`, possible + values are `ON` and `OFF`) - `-DBUILD_TESTING=OFF` in order to disable building C++ tests, - `-DBUILD_SHARED_LIBS=ON` in order to build a shared library (possible values are `ON` and `OFF`), From 9875dad4071c99f91006e9d23a76030350d10728 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Mon, 10 Oct 2022 17:08:26 -0400 Subject: [PATCH 28/87] Making build.sh work for quick building of proposal --- CMakeLists.txt | 8 +++++++- build.sh | 19 +++++++++++++------ 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ded2d8635a..85a5a0e46b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,6 +6,12 @@ cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) +set(FAISS_LANGUAGES CXX) + +if(FAISS_ENABLE_GPU) + list(APPEND FAISS_LANGUAGES CUDA) +endif() + if(FAISS_ENABLE_RAFT) include(cmake/thirdparty/fetch_rapids.cmake) include(rapids-cmake) @@ -21,7 +27,7 @@ project(faiss VERSION 1.7.2 DESCRIPTION "A library for efficient similarity search and clustering of dense vectors." HOMEPAGE_URL "https://github.com/facebookresearch/faiss" - LANGUAGES CXX) + LANGUAGES ${FAISS_LANGUAGES}) include(GNUInstallDirs) set(CMAKE_CXX_STANDARD 17) diff --git a/build.sh b/build.sh index a37468d665..80341ebcfd 100755 --- a/build.sh +++ b/build.sh @@ -1,12 +1,17 @@ #!/bin/bash BUILD_TYPE=Release +BUILD_DIR=build/ -RAFT_REPO_REL="/share/workspace/rapids_projects/raft" -RAFT_REPO_PATH="`readlink -f \"${RAFT_REPO_REL}\"`" - +RAFT_REPO_REL="" +EXTRA_CMAKE_ARGS="" set -e +if [[ ${RAFT_REPO_REL} != "" ]]; then + RAFT_REPO_PATH="`readlink -f \"${RAFT_REPO_REL}\"`" + EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCPM_raft_SOURCE=${RAFT_REPO_PATH}" +fi + if [ "$1" == "clean" ]; then rm -rf build exit 0 @@ -22,14 +27,15 @@ if [ "$1" == "test-raft" ]; then exit 0 fi -mkdir -p build/ && cd build/ +mkdir -p $BUILD_DIR +cd $BUILD_DIR + cmake \ -DFAISS_ENABLE_GPU=ON \ + -DFAISS_ENABLE_RAFT=ON \ -DFAISS_ENABLE_PYTHON=OFF \ -DBUILD_TESTING=ON \ -DBUILD_SHARED_LIBS=OFF \ - -DCPM_raft_SOURCE=${RAFT_REPO_REL} \ - -DFAISS_ENABLE_RAFT=ON \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ -DFAISS_OPT_LEVEL=avx2 \ -DRAFT_NVTX=OFF \ @@ -38,6 +44,7 @@ cmake \ -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \ -DCMAKE_C_COMPILER_LAUNCHER=ccache \ 
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + ${EXTRA_CMAKE_ARGS} \ ../ cmake --build . -j12 From c09d09b7d2c00d61b395c9434c645f5c05dbd7bd Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 11 Oct 2022 12:52:30 -0400 Subject: [PATCH 29/87] Merging upstream --- faiss/gpu/GpuIndexIVF.h | 8 +++---- faiss/gpu/GpuIndexIVFFlat.cu | 2 -- faiss/gpu/raft/RaftIndexIVFFlat.cu | 37 ++++++++++++++++++++++-------- faiss/gpu/raft/RaftIndexIVFFlat.h | 15 +++++++++++- faiss/gpu/raft/RaftIndexIVFPQ.cu | 27 ++++++++++++++++++++-- faiss/gpu/raft/RaftIndexIVFPQ.h | 15 +++++++++++- 6 files changed, 84 insertions(+), 20 deletions(-) diff --git a/faiss/gpu/GpuIndexIVF.h b/faiss/gpu/GpuIndexIVF.h index 4a80cdcb06..a962ebf406 100644 --- a/faiss/gpu/GpuIndexIVF.h +++ b/faiss/gpu/GpuIndexIVF.h @@ -75,10 +75,10 @@ class GpuIndexIVF : public GpuIndex { virtual void updateQuantizer() = 0; /// Returns the number of inverted lists we're managing - int getNumLists() const; + virtual int getNumLists() const; /// Returns the number of vectors present in a particular inverted list - int getListLength(int listId) const; + virtual int getListLength(int listId) const; /// Return the encoded vector data contained in a particular inverted list, /// for debugging purposes. @@ -86,12 +86,12 @@ class GpuIndexIVF : public GpuIndex { /// GPU-side representation. /// Otherwise, it is converted to the CPU format. /// compliant format, while the native GPU format may differ. - std::vector getListVectorData(int listId, bool gpuFormat = false) + virtual std::vector getListVectorData(int listId, bool gpuFormat = false) const; /// Return the vector indices contained in a particular inverted list, for /// debugging purposes. 
- std::vector getListIndices(int listId) const; + virtual std::vector getListIndices(int listId) const; /// Sets the number of list probes per query void setNumProbes(int nprobe); diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 17537e7caa..f556241839 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -134,8 +134,6 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { baseIndex_ = std::static_pointer_cast(index_); updateQuantizer(); - raft::print_device_vector("faiss centers", quantizer->getGpuData()->vectors(), 50, std::cout); - // Copy all of the IVF data index_->copyInvertedListsFrom(index->invlists); } diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/raft/RaftIndexIVFFlat.cu index fe77aa1d1e..a877e2419d 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/raft/RaftIndexIVFFlat.cu @@ -5,6 +5,7 @@ * LICENSE file in the root directory of this source tree. */ +#include // for SearchParametersIVF #include #include #include @@ -45,6 +46,21 @@ RaftIndexIVFFlat::RaftIndexIVFFlat( std::cout << "In raft index constructor" << std::endl; } + +RaftIndexIVFFlat::RaftIndexIVFFlat( + GpuResourcesProvider* provider, + Index *coarse_quantizer, + int dims, + int nlist, + faiss::MetricType metric, + GpuIndexIVFFlatConfig config) + : GpuIndexIVFFlat(provider, coarse_quantizer, dims, nlist, metric, config), + raft_handle(resources_->getDefaultStream(config_.device)) { + + std::cout << "In raft index constructor" << std::endl; +} + + RaftIndexIVFFlat::~RaftIndexIVFFlat() { RaftIndexIVFFlat::reset(); } @@ -53,15 +69,15 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { DeviceScope scope(config_.device); GpuIndex::copyFrom(index); FAISS_ASSERT(index->nlist > 0); - FAISS_THROW_IF_NOT_FMT( - index->nlist <= (Index::idx_t)std::numeric_limits::max(), - "GPU index only supports %zu inverted lists", - (size_t)std::numeric_limits::max()); - FAISS_THROW_IF_NOT_FMT( - 
index->nprobe > 0 && index->nprobe <= getMaxKSelection(), - "GPU index only supports nprobe <= %zu; passed %zu", - (size_t)getMaxKSelection(), - index->nprobe); +// FAISS_THROW_IF_NOT_FMT( +// index->nlist <= (Index::idx_t)std::numeric_limits::max(), +// "GPU index only supports %zu inverted lists", +// (size_t)std::numeric_limits::max()); +// FAISS_THROW_IF_NOT_FMT( +// index->nprobe > 0 && index->nprobe <= getMaxKSelection(), +// "GPU index only supports nprobe <= %zu; passed %zu", +// (size_t)getMaxKSelection(), +// index->nprobe); /** * TODO: Copy centers and center norms from quantizer @@ -305,7 +321,8 @@ void RaftIndexIVFFlat::searchImpl_( const float* x, int k, float* distances, - Index::idx_t* labels) const { + Index::idx_t* labels, + const SearchParameters *params) const { raft::common::nvtx::range fun_scope( "RaftIndexIVFFlat::searchImpl_ (%ld)", n); diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.h b/faiss/gpu/raft/RaftIndexIVFFlat.h index d9b6e498ad..eaeabafce6 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.h +++ b/faiss/gpu/raft/RaftIndexIVFFlat.h @@ -7,6 +7,7 @@ #pragma once +#include // for SearchParametersIVF #include #include @@ -33,6 +34,7 @@ class RaftIndexIVFFlat : public GpuIndexIVFFlat { const faiss::IndexIVFFlat* index, GpuIndexIVFFlatConfig config = GpuIndexIVFFlatConfig()); + /// Constructs a new instance with an empty flat quantizer; the user /// provides the number of lists desired. RaftIndexIVFFlat( @@ -42,6 +44,16 @@ class RaftIndexIVFFlat : public GpuIndexIVFFlat { faiss::MetricType metric, GpuIndexIVFFlatConfig config = GpuIndexIVFFlatConfig()); + /// Constructs a new instance with a provided CPU or GPU coarse quantizer; + /// the user provides the number of IVF lists desired. 
+ RaftIndexIVFFlat( + GpuResourcesProvider* provider, + Index* coarseQuantizer, + int dims, + int nlist, + faiss::MetricType metric = faiss::METRIC_L2, + GpuIndexIVFFlatConfig config = GpuIndexIVFFlatConfig()); + ~RaftIndexIVFFlat() override; /// Clears out all inverted lists, but retains the coarse centroid @@ -87,7 +99,8 @@ class RaftIndexIVFFlat : public GpuIndexIVFFlat { const float* x, int k, float* distances, - Index::idx_t* labels) const override; + Index::idx_t* labels, + const SearchParameters *params) const override; void rebuildRaftIndex(const float* x, Index::idx_t n_rows); diff --git a/faiss/gpu/raft/RaftIndexIVFPQ.cu b/faiss/gpu/raft/RaftIndexIVFPQ.cu index 8620ec8e1f..a7056ff5ae 100644 --- a/faiss/gpu/raft/RaftIndexIVFPQ.cu +++ b/faiss/gpu/raft/RaftIndexIVFPQ.cu @@ -70,6 +70,28 @@ RaftIndexIVFPQ::RaftIndexIVFPQ( this->is_trained = false; } +RaftIndexIVFPQ::RaftIndexIVFPQ( + GpuResourcesProvider* provider, + Index *coarse_quantizer, + int dims, + int nlist, + int subQuantizers, + int bitsPerCode, + faiss::MetricType metric, + GpuIndexIVFPQConfig config) + : GpuIndexIVFPQ(provider, coarse_quantizer, dims, nlist, subQuantizers, bitsPerCode, metric, config), + pq(dims, subQuantizers, bitsPerCode), + ivfpqConfig_(config), + usePrecomputedTables_(config.usePrecomputedTables), + subQuantizers_(subQuantizers), + bitsPerCode_(bitsPerCode), + reserveMemoryVecs_(0) { + verifySettings_(); + + // We haven't trained ourselves, so don't construct the PQ index yet + this->is_trained = false; +} + RaftIndexIVFPQ::~RaftIndexIVFPQ() {} void RaftIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) { @@ -186,7 +208,7 @@ void RaftIndexIVFPQ::setPrecomputedCodes(bool enable) { usePrecomputedTables_ = enable; if (index_) { DeviceScope scope(config_.device); - index_->setPrecomputedCodes(enable); + index_->setPrecomputedCodes(quantizer, enable); } verifySettings_(); @@ -277,7 +299,8 @@ void RaftIndexIVFPQ::searchImpl_( const float* x, int k, float* distances, - Index::idx_t* 
labels) const { + Index::idx_t* labels, + const SearchParameters *params) const { // Device is already set in GpuIndex::search FAISS_ASSERT(index_); FAISS_ASSERT(n > 0); diff --git a/faiss/gpu/raft/RaftIndexIVFPQ.h b/faiss/gpu/raft/RaftIndexIVFPQ.h index a121681c2c..e7f1b7515c 100644 --- a/faiss/gpu/raft/RaftIndexIVFPQ.h +++ b/faiss/gpu/raft/RaftIndexIVFPQ.h @@ -46,6 +46,18 @@ class RaftIndexIVFPQ : public GpuIndexIVFPQ { faiss::MetricType metric, GpuIndexIVFPQConfig config = GpuIndexIVFPQConfig()); + + /// Construct an empty index + RaftIndexIVFPQ( + GpuResourcesProvider* provider, + Index *coarse_quantizer, + int dims, + int nlist, + int subQuantizers, + int bitsPerCode, + faiss::MetricType metric, + GpuIndexIVFPQConfig config = GpuIndexIVFPQConfig()); + ~RaftIndexIVFPQ() override; /// Reserve space on the GPU for the inverted lists for `num` @@ -119,7 +131,8 @@ class RaftIndexIVFPQ : public GpuIndexIVFPQ { const float* x, int k, float* distances, - Index::idx_t* labels) const override; + Index::idx_t* labels, + const SearchParameters *params) const override; /// Throws errors if configuration settings are improper void verifySettings_() const; From 0081ed9ea7457b9b23107bb7ba18487320c4b528 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 19 Oct 2022 16:14:11 -0400 Subject: [PATCH 30/87] Integrating more deeply with `use_raft` option in the index config that branches out to IVFFlat or RaftIVFFlat depending on the setting --- CMakeLists.txt | 4 + faiss/gpu/CMakeLists.txt | 6 +- faiss/gpu/GpuIndex.h | 3 + faiss/gpu/GpuIndexIVF.cu | 27 ++- faiss/gpu/GpuIndexIVF.h | 2 +- faiss/gpu/GpuIndexIVFFlat.cu | 72 ++++--- faiss/gpu/GpuIndexIVFFlat.h | 14 ++ faiss/gpu/GpuResources.cpp | 4 + faiss/gpu/GpuResources.h | 8 + faiss/gpu/StandardGpuResources.cpp | 31 ++++ faiss/gpu/StandardGpuResources.h | 20 +- faiss/gpu/impl/IVFFlat.cu | 3 +- faiss/gpu/impl/IVFFlat.cuh | 13 +- faiss/gpu/impl/raft/RaftIVFFlat.cu | 175 ++++++++++++++++++ faiss/gpu/impl/raft/RaftIVFFlat.cuh | 86 +++++++++ faiss/gpu/{ => impl}/raft/RaftIndexIVFFlat.cu | 2 +- faiss/gpu/{ => impl}/raft/RaftIndexIVFFlat.h | 0 faiss/gpu/{ => impl}/raft/RaftIndexIVFPQ.cu | 2 +- faiss/gpu/{ => impl}/raft/RaftIndexIVFPQ.h | 0 faiss/gpu/{ => impl}/raft/RmmGpuResources.hpp | 20 ++ 20 files changed, 448 insertions(+), 44 deletions(-) create mode 100644 faiss/gpu/impl/raft/RaftIVFFlat.cu create mode 100644 faiss/gpu/impl/raft/RaftIVFFlat.cuh rename faiss/gpu/{ => impl}/raft/RaftIndexIVFFlat.cu (99%) rename faiss/gpu/{ => impl}/raft/RaftIndexIVFFlat.h (100%) rename faiss/gpu/{ => impl}/raft/RaftIndexIVFPQ.cu (99%) rename faiss/gpu/{ => impl}/raft/RaftIndexIVFPQ.h (100%) rename faiss/gpu/{ => impl}/raft/RmmGpuResources.hpp (97%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 85a5a0e46b..a0ff1eceb6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -30,7 +30,11 @@ project(faiss LANGUAGES ${FAISS_LANGUAGES}) include(GNUInstallDirs) +if(FAISS_ENABLE_RAFT) set(CMAKE_CXX_STANDARD 17) +else() +set(CMAKE_CXX_STANDARD 11) +endif() list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt index ce72786308..8b373aecb8 100644 --- a/faiss/gpu/CMakeLists.txt +++ b/faiss/gpu/CMakeLists.txt @@ 
-165,8 +165,10 @@ set(FAISS_GPU_HEADERS ) if(FAISS_ENABLE_RAFT) - list(APPEND FAISS_GPU_HEADERS raft/RaftIndexIVFFlat.h raft/RaftIndexIVFPQ.h) - list(APPEND FAISS_GPU_SRC raft/RaftIndexIVFFlat.cu raft/RaftIndexIVFPQ.cu) + list(APPEND FAISS_GPU_HEADERS impl/raft/RaftIndexIVFFlat.h impl/raft/RaftIndexIVFPQ.h + impl/raft/RaftIVFFlat.cuh) + list(APPEND FAISS_GPU_SRC impl/raft/RaftIndexIVFFlat.cu impl/raft/RaftIndexIVFPQ.cu + impl/raft/RaftIVFFlat.cu) endif() # Export FAISS_GPU_HEADERS variable to parent scope. diff --git a/faiss/gpu/GpuIndex.h b/faiss/gpu/GpuIndex.h index 0f50d491f0..f3b42d0f88 100644 --- a/faiss/gpu/GpuIndex.h +++ b/faiss/gpu/GpuIndex.h @@ -23,6 +23,9 @@ struct GpuIndexConfig { /// On Pascal and above (CC 6+) architectures, allows GPUs to use /// more memory than is available on the GPU. MemorySpace memorySpace; + + /// Should the index dispatch down to RAFT? + bool use_raft = false; }; class GpuIndex : public faiss::Index { diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu index 6813bdf0b8..ff4eb974b9 100644 --- a/faiss/gpu/GpuIndexIVF.cu +++ b/faiss/gpu/GpuIndexIVF.cu @@ -459,16 +459,29 @@ void GpuIndexIVF::trainQuantizer_(Index::idx_t n, const float* x) { printf("Training IVF quantizer on %ld vectors in %dD\n", n, d); } - // leverage the CPU-side k-means code, which works for the GPU - // flat index as well - quantizer->reset(); - Clustering clus(this->d, nlist, this->cp); - clus.verbose = verbose; - clus.train(n, x, *quantizer); - quantizer->is_trained = true; + if(config_.use_raft) { + /** + * TODO: Plug in clustering logic here. + * + * Essentially what we need here is to use `x` as the training data set + * to train the k-means centroids and add them to the quantizer + * implementation. 
+ */ + + + } else { + // leverage the CPU-side k-means code, which works for the GPU + // flat index as well + quantizer->reset(); + Clustering clus(this->d, nlist, this->cp); + clus.verbose = verbose; + clus.train(n, x, *quantizer); + quantizer->is_trained = true; + } + FAISS_ASSERT(quantizer->ntotal == nlist); } diff --git a/faiss/gpu/GpuIndexIVF.h b/faiss/gpu/GpuIndexIVF.h index a962ebf406..aaf5374314 100644 --- a/faiss/gpu/GpuIndexIVF.h +++ b/faiss/gpu/GpuIndexIVF.h @@ -128,7 +128,7 @@ class GpuIndexIVF : public GpuIndex { protected: void verifyIVFSettings_() const; bool addImplRequiresIDs_() const override; - void trainQuantizer_(Index::idx_t n, const float* x); + virtual void trainQuantizer_(Index::idx_t n, const float* x); /// Called from GpuIndex for add/add_with_ids void addImpl_(int n, const float* x, const Index::idx_t* ids) override; diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index f556241839..642058c4d4 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -6,6 +6,7 @@ */ #include +#include #include #include #include @@ -70,11 +71,10 @@ GpuIndexIVFFlat::GpuIndexIVFFlat( reserveMemoryVecs_(0) { // We could have been passed an already trained coarse quantizer. 
There is // no other quantizer that we need to train, so this is sufficient + if (this->is_trained) { FAISS_ASSERT(this->quantizer); - - index_.reset(new IVFFlat( - resources_.get(), + set_index_(resources_.get(), this->d, this->nlist, this->metric_type, @@ -83,7 +83,7 @@ GpuIndexIVFFlat::GpuIndexIVFFlat( nullptr, // no scalar quantizer ivfFlatConfig_.interleavedLayout, ivfFlatConfig_.indicesOptions, - config_.memorySpace)); + config_.memorySpace); baseIndex_ = std::static_pointer_cast(index_); updateQuantizer(); } @@ -91,6 +91,32 @@ GpuIndexIVFFlat::GpuIndexIVFFlat( GpuIndexIVFFlat::~GpuIndexIVFFlat() {} +void GpuIndexIVFFlat::set_index_(GpuResources* resources, + int dim, + int nlist, + faiss::MetricType metric, + float metricArg, + bool useResidual, + /// Optional ScalarQuantizer + faiss::ScalarQuantizer* scalarQ, + bool interleavedLayout, + IndicesOptions indicesOptions, + MemorySpace space) { + if(config_.use_raft) { + index_.reset(new RaftIVFFlat( + resources, dim, nlist, metric, metricArg, useResidual, + scalarQ, interleavedLayout, indicesOptions, space)); + } else { + index_.reset(new IVFFlat( + resources, dim, nlist, metric, metricArg, useResidual, + scalarQ, interleavedLayout, indicesOptions, space)); + } + + baseIndex_ = std::static_pointer_cast(index_); + updateQuantizer(); + +} + void GpuIndexIVFFlat::reserveMemory(size_t numVecs) { DeviceScope scope(config_.device); @@ -120,8 +146,7 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { FAISS_ASSERT(this->is_trained); // Copy our lists as well - index_.reset(new IVFFlat( - resources_.get(), + set_index_(resources_.get(), this->d, this->nlist, index->metric_type, @@ -130,9 +155,8 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { nullptr, // no scalar quantizer ivfFlatConfig_.interleavedLayout, ivfFlatConfig_.indicesOptions, - config_.memorySpace)); - baseIndex_ = std::static_pointer_cast(index_); - updateQuantizer(); + config_.memorySpace); + // Copy all of the IVF 
data index_->copyInvertedListsFrom(index->invlists); @@ -210,8 +234,7 @@ void GpuIndexIVFFlat::train(Index::idx_t n, const float* x) { FAISS_ASSERT(!index_); // FIXME: GPUize more of this - // First, make sure that the data is resident on the CPU, if it is not on - // the CPU, as we depend upon parts of the CPU code + // First, make sure that the data is resident on the CPU, if it is not on the CPU, as we depend upon parts of the CPU code auto hostData = toHost( (float*)x, resources_->getDefaultStream(config_.device), @@ -220,21 +243,18 @@ void GpuIndexIVFFlat::train(Index::idx_t n, const float* x) { trainQuantizer_(n, hostData.data()); // The quantizer is now trained; construct the IVF index - index_.reset(new IVFFlat( - resources_.get(), - this->d, - this->nlist, - this->metric_type, - this->metric_arg, - false, // no residual - nullptr, // no scalar quantizer - ivfFlatConfig_.interleavedLayout, - ivfFlatConfig_.indicesOptions, - config_.memorySpace)); - baseIndex_ = std::static_pointer_cast(index_); - updateQuantizer(); - - if (reserveMemoryVecs_) { + set_index_(resources_.get(), + this->d, + this->nlist, + this->metric_type, + this->metric_arg, + false, // no residual + nullptr, // no scalar quantizer + ivfFlatConfig_.interleavedLayout, + ivfFlatConfig_.indicesOptions, + config_.memorySpace); + + if (reserveMemoryVecs_) { index_->reserveMemory(reserveMemoryVecs_); } diff --git a/faiss/gpu/GpuIndexIVFFlat.h b/faiss/gpu/GpuIndexIVFFlat.h index 4ab6f88ef0..a519f11cc2 100644 --- a/faiss/gpu/GpuIndexIVFFlat.h +++ b/faiss/gpu/GpuIndexIVFFlat.h @@ -7,6 +7,7 @@ #pragma once +#include #include #include @@ -88,6 +89,19 @@ class GpuIndexIVFFlat : public GpuIndexIVF { void train(Index::idx_t n, const float* x) override; protected: + + void set_index_(GpuResources* resources, + int dim, + int nlist, + faiss::MetricType metric, + float metricArg, + bool useResidual, + /// Optional ScalarQuantizer + faiss::ScalarQuantizer* scalarQ, + bool interleavedLayout, + IndicesOptions 
indicesOptions, + MemorySpace space); + /// Our configuration options const GpuIndexIVFFlatConfig ivfFlatConfig_; diff --git a/faiss/gpu/GpuResources.cpp b/faiss/gpu/GpuResources.cpp index b3dca0895d..e964d13155 100644 --- a/faiss/gpu/GpuResources.cpp +++ b/faiss/gpu/GpuResources.cpp @@ -153,6 +153,10 @@ cudaStream_t GpuResources::getDefaultStreamCurrentDevice() { return getDefaultStream(getCurrentDevice()); } +raft::handle_t &GpuResources::getRaftHandleCurrentDevice() const { + return getRaftHandle(getCurrentDevice()); +} + std::vector GpuResources::getAlternateStreamsCurrentDevice() { return getAlternateStreams(getCurrentDevice()); } diff --git a/faiss/gpu/GpuResources.h b/faiss/gpu/GpuResources.h index 3ae2dfbe19..8824791f12 100644 --- a/faiss/gpu/GpuResources.h +++ b/faiss/gpu/GpuResources.h @@ -10,6 +10,8 @@ #include #include #include + +#include #include #include #include @@ -190,6 +192,12 @@ class GpuResources { /// given device virtual cudaStream_t getDefaultStream(int device) = 0; + /// Returns the raft handle for the given device which can be used to + /// make calls to other raft primitives. + virtual raft::handle_t &getRaftHandle(int device) const; + + raft::handle_t &getRaftHandleCurrentDevice() const; + /// Overrides the default stream for a device to the user-supplied stream. /// The resources object does not own this stream (i.e., it will not destroy /// it). diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp index 80146a2e59..6120fd0bc7 100644 --- a/faiss/gpu/StandardGpuResources.cpp +++ b/faiss/gpu/StandardGpuResources.cpp @@ -5,6 +5,10 @@ * LICENSE file in the root directory of this source tree. 
*/ +#ifdef FAISS_ENABLE_RAFT +#include +#endif + #include #include #include @@ -313,6 +317,11 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) { defaultStreams_[device] = defaultStream; +#ifdef FAISS_ENABLE_RAFT + raft::handle_t handle(defaultStream); + raftHandles_[device] = handle; +#endif + cudaStream_t asyncCopyStream = 0; CUDA_VERIFY( cudaStreamCreateWithFlags(&asyncCopyStream, cudaStreamNonBlocking)); @@ -375,6 +384,22 @@ cudaStream_t StandardGpuResourcesImpl::getDefaultStream(int device) { return defaultStreams_[device]; } +#ifdef FAISS_ENABLE_RAFT +raft::handle_t &StandardGpuResourcesImpl::getRaftHandle(int device) const { + initializeForDevice(device); + + auto it = raftHandles_.find(device); + if (it != raftHandles_.end()) { + // There is a user override stream set + return it->second; + } + + // Otherwise, our base default stream + return raftHandles_[device]; + +} +#endif + std::vector StandardGpuResourcesImpl::getAlternateStreams( int device) { initializeForDevice(device); @@ -600,6 +625,12 @@ cudaStream_t StandardGpuResources::getDefaultStream(int device) { return res_->getDefaultStream(device); } +#ifdef FAISS_ENABLE_RAFT + raft::handle_t &StandardGpuResources::getRaftHandle(int device) const { + return res_->getRaftHandle(device); +} +#endif + size_t StandardGpuResources::getTempMemoryAvailable(int device) const { return res_->getTempMemoryAvailable(device); } diff --git a/faiss/gpu/StandardGpuResources.h b/faiss/gpu/StandardGpuResources.h index d1edfb6673..408221e4b9 100644 --- a/faiss/gpu/StandardGpuResources.h +++ b/faiss/gpu/StandardGpuResources.h @@ -7,6 +7,8 @@ #pragma once +#include + #include #include #include @@ -58,6 +60,12 @@ class StandardGpuResourcesImpl : public GpuResources { /// this stream upon exit from an index or other Faiss GPU call. 
cudaStream_t getDefaultStream(int device) override; +#ifdef FAISS_ENABLE_RAFT + /// Returns the raft handle for the given device which can be used to + /// make calls to other raft primitives. + raft::handle_t &getRaftHandle(int device) const override; +#endif + /// Called to change the work ordering streams to the null stream /// for all devices void setDefaultNullStreamAllDevices(); @@ -124,6 +132,11 @@ class StandardGpuResourcesImpl : public GpuResources { /// cuBLAS handle for each device std::unordered_map blasHandles_; +#ifdef FAISS_ENABLE_RAFT + /// raft handle for each device + std::unordered_map raftHandles_; +#endif + /// Pinned memory allocation for use with this GPU void* pinnedMemAlloc_; size_t pinnedMemAllocSize_; @@ -183,10 +196,15 @@ class StandardGpuResources : public GpuResourcesProvider { /// Export a description of memory used for Python std::map>> getMemoryInfo() const; - /// Returns the current default stream cudaStream_t getDefaultStream(int device); +#ifdef FAISS_ENABLE_RAFT + /// Returns the raft handle for the given device which can be used to + /// make calls to other raft primitives. 
+ raft::handle_t &getRaftHandle(int device) const override; +#endif + /// Returns the current amount of temp memory available size_t getTempMemoryAvailable(int device) const; diff --git a/faiss/gpu/impl/IVFFlat.cu b/faiss/gpu/impl/IVFFlat.cu index 9d7637d3dd..dd8b1c86a2 100644 --- a/faiss/gpu/impl/IVFFlat.cu +++ b/faiss/gpu/impl/IVFFlat.cu @@ -112,6 +112,7 @@ std::vector IVFFlat::translateCodesFromGpu_( return packNonInterleaved(std::move(up), numVecs, dim_, bitsPerCode); } + void IVFFlat::appendVectors_( Tensor& vecs, Tensor& ivfCentroidResiduals, @@ -126,7 +127,6 @@ void IVFFlat::appendVectors_( // // Append the new encodings // - // Append indices to the IVF lists runIVFIndicesAppend( listIds, @@ -197,6 +197,7 @@ void IVFFlat::search( makeTempAlloc(AllocType::Other, stream), {queries.getSize(0), nprobe, dim_}); + searchCoarseQuantizer_( coarseQuantizer, nprobe, diff --git a/faiss/gpu/impl/IVFFlat.cuh b/faiss/gpu/impl/IVFFlat.cuh index 27e931be41..01c8f5cc6d 100644 --- a/faiss/gpu/impl/IVFFlat.cuh +++ b/faiss/gpu/impl/IVFFlat.cuh @@ -7,6 +7,11 @@ #pragma once +#ifdef FAISS_ENABLE_RAFT +#include +#include +#endif + #include #include @@ -60,17 +65,17 @@ class IVFFlat : public IVFBase { size_t getCpuVectorsEncodingSize_(int numVecs) const override; /// Translate to our preferred GPU encoding - std::vector translateCodesToGpu_( + virtual std::vector translateCodesToGpu_( std::vector codes, size_t numVecs) const override; /// Translate from our preferred GPU encoding - std::vector translateCodesFromGpu_( + virtual std::vector translateCodesFromGpu_( std::vector codes, size_t numVecs) const override; /// Encode the vectors that we're adding and append to our IVF lists - void appendVectors_( + virtual void appendVectors_( Tensor& vecs, Tensor& ivfCentroidResiduals, Tensor& indices, @@ -84,7 +89,7 @@ class IVFFlat : public IVFBase { /// Shared IVF search implementation, used by both search and /// searchPreassigned - void searchImpl_( + virtual void searchImpl_( Tensor& 
queries, Tensor& coarseDistances, Tensor& coarseIndices, diff --git a/faiss/gpu/impl/raft/RaftIVFFlat.cu b/faiss/gpu/impl/raft/RaftIVFFlat.cu new file mode 100644 index 0000000000..17c8581ea8 --- /dev/null +++ b/faiss/gpu/impl/raft/RaftIVFFlat.cu @@ -0,0 +1,175 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { +namespace gpu { + +RaftIVFFlat::RaftIVFFlat( + GpuResources* res, + int dim, + int nlist, + faiss::MetricType metric, + float metricArg, + bool useResidual, + faiss::ScalarQuantizer* scalarQ, + bool interleavedLayout, + IndicesOptions indicesOptions, + MemorySpace space) + : IVFFlat(res, + dim, + nlist, + metric, + metricArg, + useResidual, + scalarQ, + interleavedLayout, + indicesOptions, + space){} + +RaftIVFFlat::~RaftIVFFlat() {} + +size_t RaftIVFFlat::getGpuVectorsEncodingSize_(int numVecs) const { + if (interleavedLayout_) { + // bits per scalar code + int bits = scalarQ_ ? scalarQ_->bits : 32 /* float */; + + // bytes to encode a block of 32 vectors (single dimension) + int bytesPerDimBlock = bits * 32 / 8; + + // bytes to fully encode 32 vectors + int bytesPerBlock = bytesPerDimBlock * dim_; + + // number of blocks of 32 vectors we have + int numBlocks = utils::divUp(numVecs, 32); + + // total size to encode numVecs + return bytesPerBlock * numBlocks; + } else { + size_t sizePerVector = + (scalarQ_ ? scalarQ_->code_size : sizeof(float) * dim_); + + return (size_t)numVecs * sizePerVector; + } +} + +size_t RaftIVFFlat::getCpuVectorsEncodingSize_(int numVecs) const { + size_t sizePerVector = + (scalarQ_ ? 
scalarQ_->code_size : sizeof(float) * dim_); + + return (size_t)numVecs * sizePerVector; +} + +std::vector RaftIVFFlat::translateCodesToGpu_( + std::vector codes, + size_t numVecs) const { + if (!interleavedLayout_) { + // same format + return codes; + } + + int bitsPerCode = scalarQ_ ? scalarQ_->bits : 32; + + auto up = + unpackNonInterleaved(std::move(codes), numVecs, dim_, bitsPerCode); + return packInterleaved(std::move(up), numVecs, dim_, bitsPerCode); +} + +std::vector RaftIVFFlat::translateCodesFromGpu_( + std::vector codes, + size_t numVecs) const { + if (!interleavedLayout_) { + // same format + return codes; + } + + int bitsPerCode = scalarQ_ ? scalarQ_->bits : 32; + + auto up = unpackInterleaved(std::move(codes), numVecs, dim_, bitsPerCode); + return packNonInterleaved(std::move(up), numVecs, dim_, bitsPerCode); +} + + +void RaftIVFFlat::appendVectors_( + Tensor& vecs, + Tensor& ivfCentroidResiduals, + Tensor& indices, + Tensor& uniqueLists, + Tensor& vectorsByUniqueList, + Tensor& uniqueListVectorStart, + Tensor& uniqueListStartOffset, + Tensor& listIds, + Tensor& listOffset, + cudaStream_t stream) { + // + // Append the new encodings + // + + // TODO: Fill in this logic here +} + +void RaftIVFFlat::searchImpl_( + Tensor& queries, + Tensor& coarseDistances, + Tensor& coarseIndices, + Tensor& ivfCentroids, + int k, + Tensor& outDistances, + Tensor& outIndices, + bool storePairs) { + FAISS_ASSERT(storePairs == false); + + auto stream = resources_->getDefaultStreamCurrentDevice(); + + // TODO: Fill in this logic here. 
+ + // If the GPU isn't storing indices (they are on the CPU side), we + // need to perform the re-mapping here + // FIXME: we might ultimately be calling this function with inputs + // from the CPU, these are unnecessary copies + if (indicesOptions_ == INDICES_CPU) { + HostTensor hostOutIndices(outIndices, stream); + + ivfOffsetToUserIndex( + hostOutIndices.data(), + numLists_, + hostOutIndices.getSize(0), + hostOutIndices.getSize(1), + listOffsetToUserIndex_); + + // Copy back to GPU, since the input to this function is on the + // GPU + outIndices.copyFrom(hostOutIndices, stream); + } +} + +} // namespace gpu +} // namespace faiss diff --git a/faiss/gpu/impl/raft/RaftIVFFlat.cuh b/faiss/gpu/impl/raft/RaftIVFFlat.cuh new file mode 100644 index 0000000000..16754a7fe8 --- /dev/null +++ b/faiss/gpu/impl/raft/RaftIVFFlat.cuh @@ -0,0 +1,86 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include +#include + +#include + +namespace faiss { +namespace gpu { + +class RaftIVFFlat : public IVFFlat { + public: + RaftIVFFlat(GpuResources* resources, + int dim, + int nlist, + faiss::MetricType metric, + float metricArg, + bool useResidual, + /// Optional ScalarQuantizer + faiss::ScalarQuantizer* scalarQ, + bool interleavedLayout, + IndicesOptions indicesOptions, + MemorySpace space); + + ~RaftIVFFlat() override; + + protected: + /// Returns the number of bytes in which an IVF list containing numVecs + /// vectors is encoded on the device. 
Note that due to padding this is not + /// the same as the encoding size for a subset of vectors in an IVF list; + /// this is the size for an entire IVF list + size_t getGpuVectorsEncodingSize_(int numVecs) const override; + size_t getCpuVectorsEncodingSize_(int numVecs) const override; + + /// Translate to our preferred GPU encoding + std::vector translateCodesToGpu_( + std::vector codes, + size_t numVecs) const override; + + /// Translate from our preferred GPU encoding + std::vector translateCodesFromGpu_( + std::vector codes, + size_t numVecs) const override; + + /// Encode the vectors that we're adding and append to our IVF lists + void appendVectors_( + Tensor& vecs, + Tensor& ivfCentroidResiduals, + Tensor& indices, + Tensor& uniqueLists, + Tensor& vectorsByUniqueList, + Tensor& uniqueListVectorStart, + Tensor& uniqueListStartOffset, + Tensor& listIds, + Tensor& listOffset, + cudaStream_t stream) override; + + /// Shared IVF search implementation, used by both search and + /// searchPreassigned + void searchImpl_( + Tensor& queries, + Tensor& coarseDistances, + Tensor& coarseIndices, + Tensor& ivfCentroids, + int k, + Tensor& outDistances, + Tensor& outIndices, + bool storePairs); + + protected: + std::optional> raft_knn_index{std::nullopt}; + +}; + +} // namespace gpu +} // namespace faiss diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.cu b/faiss/gpu/impl/raft/RaftIndexIVFFlat.cu similarity index 99% rename from faiss/gpu/raft/RaftIndexIVFFlat.cu rename to faiss/gpu/impl/raft/RaftIndexIVFFlat.cu index a877e2419d..03df717c69 100644 --- a/faiss/gpu/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/impl/raft/RaftIndexIVFFlat.cu @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/faiss/gpu/raft/RaftIndexIVFFlat.h b/faiss/gpu/impl/raft/RaftIndexIVFFlat.h similarity index 100% rename from faiss/gpu/raft/RaftIndexIVFFlat.h rename to faiss/gpu/impl/raft/RaftIndexIVFFlat.h diff --git a/faiss/gpu/raft/RaftIndexIVFPQ.cu 
b/faiss/gpu/impl/raft/RaftIndexIVFPQ.cu similarity index 99% rename from faiss/gpu/raft/RaftIndexIVFPQ.cu rename to faiss/gpu/impl/raft/RaftIndexIVFPQ.cu index a7056ff5ae..f30f34259f 100644 --- a/faiss/gpu/raft/RaftIndexIVFPQ.cu +++ b/faiss/gpu/impl/raft/RaftIndexIVFPQ.cu @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/faiss/gpu/raft/RaftIndexIVFPQ.h b/faiss/gpu/impl/raft/RaftIndexIVFPQ.h similarity index 100% rename from faiss/gpu/raft/RaftIndexIVFPQ.h rename to faiss/gpu/impl/raft/RaftIndexIVFPQ.h diff --git a/faiss/gpu/raft/RmmGpuResources.hpp b/faiss/gpu/impl/raft/RmmGpuResources.hpp similarity index 97% rename from faiss/gpu/raft/RmmGpuResources.hpp rename to faiss/gpu/impl/raft/RmmGpuResources.hpp index e3bc306729..d9e87fb0ad 100644 --- a/faiss/gpu/raft/RmmGpuResources.hpp +++ b/faiss/gpu/impl/raft/RmmGpuResources.hpp @@ -29,6 +29,7 @@ in this file : https://github.com/facebookresearch/faiss/issues/2097 #include #include +#include #include #include #include @@ -221,6 +222,10 @@ class RmmGpuResourcesImpl : public GpuResources { } userDefaultStreams_[device] = stream; + +#ifdef FAISS_ENABLE_RAFT + raftHandles_[device] = raft::handle_t(stream); +#endif }; /// Revert the default stream to the original stream managed by this resources @@ -242,6 +247,9 @@ class RmmGpuResourcesImpl : public GpuResources { } userDefaultStreams_.erase(device); +#ifdef FAISS_ENABLE_RAFT + raftHandles_.erase(device); +#endif }; /// Returns the stream for the given device on which all Faiss GPU work is @@ -326,6 +334,8 @@ class RmmGpuResourcesImpl : public GpuResources { alternateStreams_[device] = std::move(deviceStreams); // Create cuBLAS handle + + // TODO: We need to be able to use this cublas handle within the raft handle cublasHandle_t blasHandle = 0; auto blasStatus = cublasCreate(&blasHandle); FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS); @@ -492,6 +502,12 @@ class RmmGpuResourcesImpl : public GpuResources { return 
defaultStreams_.count(device) != 0; }; + std::unique_ptr getRaftHandle(int device) const { + auto it = raftHandles_.find(device); + FAISS_ASSERT(it != raftHandles_.end()); + return it->second; + } + /// Adjust the default temporary memory allocation based on the total GPU /// memory size static size_t getDefaultTempMemForGPU(int device, size_t requested) @@ -562,6 +578,10 @@ class RmmGpuResourcesImpl : public GpuResources { // pinned_memory_resource std::unique_ptr pmr; + + /// Our raft handle that maintains additional library resources, one per each device + std::unordered_map> raftHandles_; + }; /// Default implementation of GpuResources that allocates a cuBLAS From a7e0cddcab4c513f7d9d92ebe313dee89a017dd5 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 19 Oct 2022 17:23:29 -0400 Subject: [PATCH 31/87] IVF Flat --- faiss/gpu/test/TestRaftIndexIVFFlat.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp index 8d784c0593..21ac260887 100644 --- a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp @@ -7,8 +7,8 @@ #include #include -#include -#include +#include +#include #include #include From fbf7e3425448bdf9a72316dcd943769c6ec7c39e Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 19 Oct 2022 17:57:18 -0400 Subject: [PATCH 32/87] More updates --- faiss/gpu/GpuResources.h | 1 - faiss/gpu/StandardGpuResources.cpp | 8 -------- faiss/gpu/StandardGpuResources.h | 8 +------- faiss/gpu/impl/raft/RmmGpuResources.hpp | 7 ------- 4 files changed, 1 insertion(+), 23 deletions(-) diff --git a/faiss/gpu/GpuResources.h b/faiss/gpu/GpuResources.h index 8824791f12..5d627fcb09 100644 --- a/faiss/gpu/GpuResources.h +++ b/faiss/gpu/GpuResources.h @@ -195,7 +195,6 @@ class GpuResources { /// Returns the raft handle for the given device which can be used to /// make calls to other raft primitives. 
virtual raft::handle_t &getRaftHandle(int device) const; - raft::handle_t &getRaftHandleCurrentDevice() const; /// Overrides the default stream for a device to the user-supplied stream. diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp index 6120fd0bc7..fd99074eda 100644 --- a/faiss/gpu/StandardGpuResources.cpp +++ b/faiss/gpu/StandardGpuResources.cpp @@ -5,9 +5,7 @@ * LICENSE file in the root directory of this source tree. */ -#ifdef FAISS_ENABLE_RAFT #include -#endif #include #include @@ -317,10 +315,8 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) { defaultStreams_[device] = defaultStream; -#ifdef FAISS_ENABLE_RAFT raft::handle_t handle(defaultStream); raftHandles_[device] = handle; -#endif cudaStream_t asyncCopyStream = 0; CUDA_VERIFY( @@ -384,7 +380,6 @@ cudaStream_t StandardGpuResourcesImpl::getDefaultStream(int device) { return defaultStreams_[device]; } -#ifdef FAISS_ENABLE_RAFT raft::handle_t &StandardGpuResourcesImpl::getRaftHandle(int device) const { initializeForDevice(device); @@ -398,7 +393,6 @@ raft::handle_t &StandardGpuResourcesImpl::getRaftHandle(int device) const { return raftHandles_[device]; } -#endif std::vector StandardGpuResourcesImpl::getAlternateStreams( int device) { @@ -625,11 +619,9 @@ cudaStream_t StandardGpuResources::getDefaultStream(int device) { return res_->getDefaultStream(device); } -#ifdef FAISS_ENABLE_RAFT raft::handle_t &StandardGpuResources::getRaftHandle(int device) const { return res_->getRaftHandle(device); } -#endif size_t StandardGpuResources::getTempMemoryAvailable(int device) const { return res_->getTempMemoryAvailable(device); diff --git a/faiss/gpu/StandardGpuResources.h b/faiss/gpu/StandardGpuResources.h index 408221e4b9..115b34a2fd 100644 --- a/faiss/gpu/StandardGpuResources.h +++ b/faiss/gpu/StandardGpuResources.h @@ -60,11 +60,9 @@ class StandardGpuResourcesImpl : public GpuResources { /// this stream upon exit from an index or other Faiss GPU call. 
cudaStream_t getDefaultStream(int device) override; -#ifdef FAISS_ENABLE_RAFT /// Returns the raft handle for the given device which can be used to /// make calls to other raft primitives. raft::handle_t &getRaftHandle(int device) const override; -#endif /// Called to change the work ordering streams to the null stream /// for all devices @@ -132,10 +130,8 @@ class StandardGpuResourcesImpl : public GpuResources { /// cuBLAS handle for each device std::unordered_map blasHandles_; -#ifdef FAISS_ENABLE_RAFT /// raft handle for each device std::unordered_map raftHandles_; -#endif /// Pinned memory allocation for use with this GPU void* pinnedMemAlloc_; @@ -199,11 +195,9 @@ class StandardGpuResources : public GpuResourcesProvider { /// Returns the current default stream cudaStream_t getDefaultStream(int device); -#ifdef FAISS_ENABLE_RAFT /// Returns the raft handle for the given device which can be used to /// make calls to other raft primitives. - raft::handle_t &getRaftHandle(int device) const override; -#endif + raft::handle_t &getRaftHandle(int device) const; /// Returns the current amount of temp memory available size_t getTempMemoryAvailable(int device) const; diff --git a/faiss/gpu/impl/raft/RmmGpuResources.hpp b/faiss/gpu/impl/raft/RmmGpuResources.hpp index d9e87fb0ad..409e33f402 100644 --- a/faiss/gpu/impl/raft/RmmGpuResources.hpp +++ b/faiss/gpu/impl/raft/RmmGpuResources.hpp @@ -222,10 +222,6 @@ class RmmGpuResourcesImpl : public GpuResources { } userDefaultStreams_[device] = stream; - -#ifdef FAISS_ENABLE_RAFT - raftHandles_[device] = raft::handle_t(stream); -#endif }; /// Revert the default stream to the original stream managed by this resources @@ -247,9 +243,6 @@ class RmmGpuResourcesImpl : public GpuResources { } userDefaultStreams_.erase(device); -#ifdef FAISS_ENABLE_RAFT - raftHandles_.erase(device); -#endif }; /// Returns the stream for the given device on which all Faiss GPU work is From a9b69638c05a0b175e16983f58284f4ae9fc2e2c Mon Sep 17 00:00:00 
2001 From: "Corey J. Nolet" Date: Thu, 20 Oct 2022 18:50:03 -0400 Subject: [PATCH 33/87] Getting things building again. Adding raft handle to gpu resources. --- faiss/gpu/GpuResources.cpp | 2 +- faiss/gpu/GpuResources.h | 4 ++-- faiss/gpu/StandardGpuResources.cpp | 11 +++++------ faiss/gpu/StandardGpuResources.h | 4 ++-- faiss/gpu/impl/raft/RaftIVFFlat.cu | 20 ++++++++++++++++++++ faiss/gpu/impl/raft/RmmGpuResources.hpp | 15 +++++++++++---- 6 files changed, 41 insertions(+), 15 deletions(-) diff --git a/faiss/gpu/GpuResources.cpp b/faiss/gpu/GpuResources.cpp index e964d13155..0129ddafd4 100644 --- a/faiss/gpu/GpuResources.cpp +++ b/faiss/gpu/GpuResources.cpp @@ -153,7 +153,7 @@ cudaStream_t GpuResources::getDefaultStreamCurrentDevice() { return getDefaultStream(getCurrentDevice()); } -raft::handle_t &GpuResources::getRaftHandleCurrentDevice() const { +raft::handle_t &GpuResources::getRaftHandleCurrentDevice() { return getRaftHandle(getCurrentDevice()); } diff --git a/faiss/gpu/GpuResources.h b/faiss/gpu/GpuResources.h index 5d627fcb09..c286fbae82 100644 --- a/faiss/gpu/GpuResources.h +++ b/faiss/gpu/GpuResources.h @@ -194,8 +194,8 @@ class GpuResources { /// Returns the raft handle for the given device which can be used to /// make calls to other raft primitives. - virtual raft::handle_t &getRaftHandle(int device) const; - raft::handle_t &getRaftHandleCurrentDevice() const; + virtual raft::handle_t &getRaftHandle(int device) = 0; + raft::handle_t &getRaftHandleCurrentDevice(); /// Overrides the default stream for a device to the user-supplied stream. 
/// The resources object does not own this stream (i.e., it will not destroy diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp index fd99074eda..c593264ab0 100644 --- a/faiss/gpu/StandardGpuResources.cpp +++ b/faiss/gpu/StandardGpuResources.cpp @@ -315,8 +315,7 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) { defaultStreams_[device] = defaultStream; - raft::handle_t handle(defaultStream); - raftHandles_[device] = handle; + raftHandles_.emplace(std::make_pair(device, defaultStream)); cudaStream_t asyncCopyStream = 0; CUDA_VERIFY( @@ -380,16 +379,16 @@ cudaStream_t StandardGpuResourcesImpl::getDefaultStream(int device) { return defaultStreams_[device]; } -raft::handle_t &StandardGpuResourcesImpl::getRaftHandle(int device) const { +raft::handle_t &StandardGpuResourcesImpl::getRaftHandle(int device) { initializeForDevice(device); auto it = raftHandles_.find(device); if (it != raftHandles_.end()) { - // There is a user override stream set + // There is a user override handle set return it->second; } - // Otherwise, our base default stream + // Otherwise, our base default handle return raftHandles_[device]; } @@ -619,7 +618,7 @@ cudaStream_t StandardGpuResources::getDefaultStream(int device) { return res_->getDefaultStream(device); } - raft::handle_t &StandardGpuResources::getRaftHandle(int device) const { + raft::handle_t &StandardGpuResources::getRaftHandle(int device) { return res_->getRaftHandle(device); } diff --git a/faiss/gpu/StandardGpuResources.h b/faiss/gpu/StandardGpuResources.h index 115b34a2fd..e28d89a492 100644 --- a/faiss/gpu/StandardGpuResources.h +++ b/faiss/gpu/StandardGpuResources.h @@ -62,7 +62,7 @@ class StandardGpuResourcesImpl : public GpuResources { /// Returns the raft handle for the given device which can be used to /// make calls to other raft primitives. 
- raft::handle_t &getRaftHandle(int device) const override; + raft::handle_t &getRaftHandle(int device) override; /// Called to change the work ordering streams to the null stream /// for all devices @@ -197,7 +197,7 @@ class StandardGpuResources : public GpuResourcesProvider { /// Returns the raft handle for the given device which can be used to /// make calls to other raft primitives. - raft::handle_t &getRaftHandle(int device) const; + raft::handle_t &getRaftHandle(int device); /// Returns the current amount of temp memory available size_t getTempMemoryAvailable(int device) const; diff --git a/faiss/gpu/impl/raft/RaftIVFFlat.cu b/faiss/gpu/impl/raft/RaftIVFFlat.cu index 17c8581ea8..9f24a244f6 100644 --- a/faiss/gpu/impl/raft/RaftIVFFlat.cu +++ b/faiss/gpu/impl/raft/RaftIVFFlat.cu @@ -151,6 +151,26 @@ void RaftIVFFlat::searchImpl_( // TODO: Fill in this logic here. +// // Device is already set in GpuIndex::search +// FAISS_ASSERT(raft_knn_index.has_value()); +// FAISS_ASSERT(n > 0); +// FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= nlist); +// +// raft::spatial::knn::ivf_flat::search_params pams; +// pams.n_probes = nprobe; +// raft::spatial::knn::ivf_flat::search( +// raft_handle, +// pams, +// *raft_knn_index, +// const_cast(x), +// static_cast(n), +// static_cast(k), +// labels, +// distances); +// +// raft_handle.sync_stream(); + + // If the GPU isn't storing indices (they are on the CPU side), we // need to perform the re-mapping here // FIXME: we might ultimately be calling this function with inputs diff --git a/faiss/gpu/impl/raft/RmmGpuResources.hpp b/faiss/gpu/impl/raft/RmmGpuResources.hpp index 409e33f402..c22c722a35 100644 --- a/faiss/gpu/impl/raft/RmmGpuResources.hpp +++ b/faiss/gpu/impl/raft/RmmGpuResources.hpp @@ -495,10 +495,17 @@ class RmmGpuResourcesImpl : public GpuResources { return defaultStreams_.count(device) != 0; }; - std::unique_ptr getRaftHandle(int device) const { + raft::handle_t &getRaftHandle(int device) { + initializeForDevice(device); 
+ auto it = raftHandles_.find(device); - FAISS_ASSERT(it != raftHandles_.end()); - return it->second; + if (it != raftHandles_.end()) { + // There is a user override handle set + return it->second; + } + + // Otherwise, our base default handle + return raftHandles_[device]; } /// Adjust the default temporary memory allocation based on the total GPU @@ -573,7 +580,7 @@ class RmmGpuResourcesImpl : public GpuResources { std::unique_ptr pmr; /// Our raft handle that maintains additional library resources, one per each device - std::unordered_map> raftHandles_; + std::unordered_map raftHandles_; }; From b640ba8c74b46f1c12b290908311a1f29eade33e Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 26 Oct 2022 17:10:22 -0400 Subject: [PATCH 34/87] Getting FAISS building again w/ RaftIVFFlat --- CMakeLists.txt | 2 +- cmake/thirdparty/get_raft.cmake | 4 +- faiss/gpu/GpuIndexIVFFlat.h | 1 + faiss/gpu/impl/IVFFlat.cu | 8 ++ faiss/gpu/impl/raft/RaftIVFFlat.cu | 122 ++++++++++++++++++---------- faiss/gpu/impl/raft/RaftIVFFlat.cuh | 34 +++++--- 6 files changed, 116 insertions(+), 55 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a0ff1eceb6..5273da200a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,7 +33,7 @@ include(GNUInstallDirs) if(FAISS_ENABLE_RAFT) set(CMAKE_CXX_STANDARD 17) else() -set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD 17) endif() list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") diff --git a/cmake/thirdparty/get_raft.cmake b/cmake/thirdparty/get_raft.cmake index 91f53b0f4d..2b7825d193 100644 --- a/cmake/thirdparty/get_raft.cmake +++ b/cmake/thirdparty/get_raft.cmake @@ -16,8 +16,8 @@ set(RAFT_VERSION "${RAPIDS_VERSION}") -set(RAFT_FORK "rapidsai") -set(RAFT_PINNED_TAG "branch-${RAPIDS_VERSION}") +set(RAFT_FORK "cjnolet") +set(RAFT_PINNED_TAG "bug-2212-ivf_flat_apis") function(find_and_configure_raft) set(oneValueArgs VERSION FORK PINNED_TAG) diff --git a/faiss/gpu/GpuIndexIVFFlat.h b/faiss/gpu/GpuIndexIVFFlat.h index 
a519f11cc2..c8ab0068da 100644 --- a/faiss/gpu/GpuIndexIVFFlat.h +++ b/faiss/gpu/GpuIndexIVFFlat.h @@ -9,6 +9,7 @@ #include #include + #include namespace faiss { diff --git a/faiss/gpu/impl/IVFFlat.cu b/faiss/gpu/impl/IVFFlat.cu index dd8b1c86a2..a42e06cde3 100644 --- a/faiss/gpu/impl/IVFFlat.cu +++ b/faiss/gpu/impl/IVFFlat.cu @@ -286,8 +286,16 @@ void IVFFlat::searchPreassigned( void IVFFlat::searchImpl_( Tensor& queries, + + /** + * + */ Tensor& coarseDistances, Tensor& coarseIndices, + + /** + * This is raft::neighbors::ivf_flat::index::centers_ + */ Tensor& ivfCentroids, int k, Tensor& outDistances, diff --git a/faiss/gpu/impl/raft/RaftIVFFlat.cu b/faiss/gpu/impl/raft/RaftIVFFlat.cu index 9f24a244f6..1f01f74242 100644 --- a/faiss/gpu/impl/raft/RaftIVFFlat.cu +++ b/faiss/gpu/impl/raft/RaftIVFFlat.cu @@ -5,8 +5,9 @@ * LICENSE file in the root directory of this source tree. */ - +#include #include +#include #include #include @@ -57,6 +58,45 @@ RaftIVFFlat::RaftIVFFlat( RaftIVFFlat::~RaftIVFFlat() {} + +/// Find the approximate k nearest neigbors for `queries` against +/// our database +void RaftIVFFlat::search( + Index* coarseQuantizer, + Tensor& queries, + int nprobe, + int k, + Tensor& outDistances, + Tensor& outIndices) { + + // TODO: We probably don't want to ignore the coarse quantizer here... 
+ + std::uint32_t n = queries.getSize(0); + std::uint32_t cols = queries.getSize(1); + std::uint32_t k_ = k; + + // Device is already set in GpuIndex::search + FAISS_ASSERT(raft_knn_index.has_value()); + FAISS_ASSERT(n > 0); + FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= numLists_); + + const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); + raft::spatial::knn::ivf_flat::search_params pams; + pams.n_probes = nprobe; + + // TODO: + + auto queries_view = raft::make_device_matrix_view(queries.data(), n, cols); + auto out_inds_view = raft::make_device_matrix_view(outIndices.data(), n, k_); + auto out_dists_view = raft::make_device_matrix_view(outDistances.data(), n, k_); + raft::spatial::knn::ivf_flat::search( + raft_handle, *raft_knn_index, queries_view, + out_inds_view, out_dists_view, pams, k_); + + raft_handle.sync_stream(); +} + + size_t RaftIVFFlat::getGpuVectorsEncodingSize_(int numVecs) const { if (interleavedLayout_) { // bits per scalar code @@ -136,26 +176,26 @@ void RaftIVFFlat::appendVectors_( // TODO: Fill in this logic here } -void RaftIVFFlat::searchImpl_( - Tensor& queries, - Tensor& coarseDistances, - Tensor& coarseIndices, - Tensor& ivfCentroids, - int k, - Tensor& outDistances, - Tensor& outIndices, - bool storePairs) { - FAISS_ASSERT(storePairs == false); - - auto stream = resources_->getDefaultStreamCurrentDevice(); - - // TODO: Fill in this logic here. - -// // Device is already set in GpuIndex::search -// FAISS_ASSERT(raft_knn_index.has_value()); -// FAISS_ASSERT(n > 0); -// FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= nlist); +//void RaftIVFFlat::searchImpl_( +// Tensor& queries, +// Tensor& coarseDistances, +// Tensor& coarseIndices, +// Tensor& ivfCentroids, +// int k, +// Tensor& outDistances, +// Tensor& outIndices, +// bool storePairs) { +// FAISS_ASSERT(storePairs == false); // +// auto stream = resources_->getDefaultStreamCurrentDevice(); +// +// // TODO: Fill in this logic here. 
+// +//// // Device is already set in GpuIndex::search +//// FAISS_ASSERT(raft_knn_index.has_value()); +//// FAISS_ASSERT(n > 0); +//// FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= nlist); +//// // raft::spatial::knn::ivf_flat::search_params pams; // pams.n_probes = nprobe; // raft::spatial::knn::ivf_flat::search( @@ -169,27 +209,27 @@ void RaftIVFFlat::searchImpl_( // distances); // // raft_handle.sync_stream(); - - - // If the GPU isn't storing indices (they are on the CPU side), we - // need to perform the re-mapping here - // FIXME: we might ultimately be calling this function with inputs - // from the CPU, these are unnecessary copies - if (indicesOptions_ == INDICES_CPU) { - HostTensor hostOutIndices(outIndices, stream); - - ivfOffsetToUserIndex( - hostOutIndices.data(), - numLists_, - hostOutIndices.getSize(0), - hostOutIndices.getSize(1), - listOffsetToUserIndex_); - - // Copy back to GPU, since the input to this function is on the - // GPU - outIndices.copyFrom(hostOutIndices, stream); - } -} +// +// +// // If the GPU isn't storing indices (they are on the CPU side), we +// // need to perform the re-mapping here +// // FIXME: we might ultimately be calling this function with inputs +// // from the CPU, these are unnecessary copies +// if (indicesOptions_ == INDICES_CPU) { +// HostTensor hostOutIndices(outIndices, stream); +// +// ivfOffsetToUserIndex( +// hostOutIndices.data(), +// numLists_, +// hostOutIndices.getSize(0), +// hostOutIndices.getSize(1), +// listOffsetToUserIndex_); +// +// // Copy back to GPU, since the input to this function is on the +// // GPU +// outIndices.copyFrom(hostOutIndices, stream); +// } +//} } // namespace gpu } // namespace faiss diff --git a/faiss/gpu/impl/raft/RaftIVFFlat.cuh b/faiss/gpu/impl/raft/RaftIVFFlat.cuh index 16754a7fe8..05ca705588 100644 --- a/faiss/gpu/impl/raft/RaftIVFFlat.cuh +++ b/faiss/gpu/impl/raft/RaftIVFFlat.cuh @@ -34,6 +34,18 @@ class RaftIVFFlat : public IVFFlat { ~RaftIVFFlat() override; + + /// Find 
the approximate k nearest neigbors for `queries` against + /// our database + void search( + Index* coarseQuantizer, + Tensor& queries, + int nprobe, + int k, + Tensor& outDistances, + Tensor& outIndices) override; + + protected: /// Returns the number of bytes in which an IVF list containing numVecs /// vectors is encoded on the device. Note that due to padding this is not @@ -65,17 +77,17 @@ class RaftIVFFlat : public IVFFlat { Tensor& listOffset, cudaStream_t stream) override; - /// Shared IVF search implementation, used by both search and - /// searchPreassigned - void searchImpl_( - Tensor& queries, - Tensor& coarseDistances, - Tensor& coarseIndices, - Tensor& ivfCentroids, - int k, - Tensor& outDistances, - Tensor& outIndices, - bool storePairs); +// /// Shared IVF search implementation, used by both search and +// /// searchPreassigned +// void searchImpl_( +// Tensor& queries, +// Tensor& coarseDistances, +// Tensor& coarseIndices, +// Tensor& ivfCentroids, +// int k, +// Tensor& outDistances, +// Tensor& outIndices, +// bool storePairs); protected: std::optional> raft_knn_index{std::nullopt}; From af6d1e9c998883eb60d7d0efba90a739255fde9f Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 26 Oct 2022 18:46:33 -0400 Subject: [PATCH 35/87] Adding the append vectors to raft index IVF flat. --- faiss/gpu/GpuIndexIVF.cu | 28 ++++-- faiss/gpu/impl/IVFBase.cu | 1 + faiss/gpu/impl/raft/RaftIVFFlat.cu | 148 ++++------------------------ faiss/gpu/impl/raft/RaftIVFFlat.cuh | 52 ++-------- 4 files changed, 47 insertions(+), 182 deletions(-) diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu index ff4eb974b9..bfd5f16c8d 100644 --- a/faiss/gpu/GpuIndexIVF.cu +++ b/faiss/gpu/GpuIndexIVF.cu @@ -5,6 +5,11 @@ * LICENSE file in the root directory of this source tree. 
*/ +#include +#include +#include +#include + #include #include #include @@ -460,17 +465,24 @@ void GpuIndexIVF::trainQuantizer_(Index::idx_t n, const float* x) { } if(config_.use_raft) { - /** - * TODO: Plug in clustering logic here. - * - * Essentially what we need here is to use `x` as the training data set - * to train the k-means centroids and add them to the quantizer - * implementation. - */ + const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); + + raft::neighbors::ivf_flat::index_params raft_idx_params; + raft_idx_params.n_lists = nlist; + raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; + raft_idx_params.add_data_on_build = false; + raft_idx_params.kmeans_n_iters = 100; + auto raft_index = raft::neighbors::ivf_flat::build( + raft_handle, raft_idx_params, x, n, (Index::idx_t)d); + raft_handle.sync_stream(); + // TODO: Validate this is all we need to do + quantizer->reset(); + quantizer->train(nlist, raft_index.centers().data_handle()); + quantizer->add(nlist, raft_index.centers().data_handle()); } else { // leverage the CPU-side k-means code, which works for the GPU @@ -479,9 +491,9 @@ void GpuIndexIVF::trainQuantizer_(Index::idx_t n, const float* x) { Clustering clus(this->d, nlist, this->cp); clus.verbose = verbose; clus.train(n, x, *quantizer); - quantizer->is_trained = true; } + quantizer->is_trained = true; FAISS_ASSERT(quantizer->ntotal == nlist); } diff --git a/faiss/gpu/impl/IVFBase.cu b/faiss/gpu/impl/IVFBase.cu index b3995e106d..0e6cbc85f9 100644 --- a/faiss/gpu/impl/IVFBase.cu +++ b/faiss/gpu/impl/IVFBase.cu @@ -582,6 +582,7 @@ void IVFBase::searchCoarseQuantizer_( } } +// TODO: Is it best to plug in here? 
int IVFBase::addVectors( Index* coarseQuantizer, Tensor& vecs, diff --git a/faiss/gpu/impl/raft/RaftIVFFlat.cu b/faiss/gpu/impl/raft/RaftIVFFlat.cu index 1f01f74242..f12d7a7c7d 100644 --- a/faiss/gpu/impl/raft/RaftIVFFlat.cu +++ b/faiss/gpu/impl/raft/RaftIVFFlat.cu @@ -84,8 +84,6 @@ void RaftIVFFlat::search( raft::spatial::knn::ivf_flat::search_params pams; pams.n_probes = nprobe; - // TODO: - auto queries_view = raft::make_device_matrix_view(queries.data(), n, cols); auto out_inds_view = raft::make_device_matrix_view(outIndices.data(), n, k_); auto out_dists_view = raft::make_device_matrix_view(outDistances.data(), n, k_); @@ -96,140 +94,28 @@ void RaftIVFFlat::search( raft_handle.sync_stream(); } +/// Classify and encode/add vectors to our IVF lists. +/// The input data must be on our current device. +/// Returns the number of vectors successfully added. Vectors may +/// not be able to be added because they contain NaNs. +int RaftIVFFlat::addVectors( + Index* coarseQuantizer, + Tensor& vecs, + Tensor& indices) { -size_t RaftIVFFlat::getGpuVectorsEncodingSize_(int numVecs) const { - if (interleavedLayout_) { - // bits per scalar code - int bits = scalarQ_ ? scalarQ_->bits : 32 /* float */; - - // bytes to encode a block of 32 vectors (single dimension) - int bytesPerDimBlock = bits * 32 / 8; - - // bytes to fully encode 32 vectors - int bytesPerBlock = bytesPerDimBlock * dim_; - - // number of blocks of 32 vectors we have - int numBlocks = utils::divUp(numVecs, 32); - - // total size to encode numVecs - return bytesPerBlock * numBlocks; - } else { - size_t sizePerVector = - (scalarQ_ ? scalarQ_->code_size : sizeof(float) * dim_); - - return (size_t)numVecs * sizePerVector; - } -} - -size_t RaftIVFFlat::getCpuVectorsEncodingSize_(int numVecs) const { - size_t sizePerVector = - (scalarQ_ ? 
scalarQ_->code_size : sizeof(float) * dim_); - - return (size_t)numVecs * sizePerVector; -} - -std::vector RaftIVFFlat::translateCodesToGpu_( - std::vector codes, - size_t numVecs) const { - if (!interleavedLayout_) { - // same format - return codes; - } - - int bitsPerCode = scalarQ_ ? scalarQ_->bits : 32; - - auto up = - unpackNonInterleaved(std::move(codes), numVecs, dim_, bitsPerCode); - return packInterleaved(std::move(up), numVecs, dim_, bitsPerCode); -} - -std::vector RaftIVFFlat::translateCodesFromGpu_( - std::vector codes, - size_t numVecs) const { - if (!interleavedLayout_) { - // same format - return codes; - } - - int bitsPerCode = scalarQ_ ? scalarQ_->bits : 32; - - auto up = unpackInterleaved(std::move(codes), numVecs, dim_, bitsPerCode); - return packNonInterleaved(std::move(up), numVecs, dim_, bitsPerCode); -} + auto vecs_view = raft::make_device_matrix_view(vecs.data(), vecs.getSize(0), dim_); + auto inds_view = raft::make_device_vector_view(indices.data(), (Index::idx_t )indices.getSize(0)); + const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); -void RaftIVFFlat::appendVectors_( - Tensor& vecs, - Tensor& ivfCentroidResiduals, - Tensor& indices, - Tensor& uniqueLists, - Tensor& vectorsByUniqueList, - Tensor& uniqueListVectorStart, - Tensor& uniqueListStartOffset, - Tensor& listIds, - Tensor& listOffset, - cudaStream_t stream) { - // - // Append the new encodings - // - - // TODO: Fill in this logic here + // TODO: We probably don't want to ignore the coarse quantizer here + raft_knn_index.emplace(raft::neighbors::ivf_flat::extend( + raft_handle, + raft_knn_index.value(), + vecs_view, + std::make_optional>(inds_view))); } -//void RaftIVFFlat::searchImpl_( -// Tensor& queries, -// Tensor& coarseDistances, -// Tensor& coarseIndices, -// Tensor& ivfCentroids, -// int k, -// Tensor& outDistances, -// Tensor& outIndices, -// bool storePairs) { -// FAISS_ASSERT(storePairs == false); -// -// auto stream = 
resources_->getDefaultStreamCurrentDevice(); -// -// // TODO: Fill in this logic here. -// -//// // Device is already set in GpuIndex::search -//// FAISS_ASSERT(raft_knn_index.has_value()); -//// FAISS_ASSERT(n > 0); -//// FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= nlist); -//// -// raft::spatial::knn::ivf_flat::search_params pams; -// pams.n_probes = nprobe; -// raft::spatial::knn::ivf_flat::search( -// raft_handle, -// pams, -// *raft_knn_index, -// const_cast(x), -// static_cast(n), -// static_cast(k), -// labels, -// distances); -// -// raft_handle.sync_stream(); -// -// -// // If the GPU isn't storing indices (they are on the CPU side), we -// // need to perform the re-mapping here -// // FIXME: we might ultimately be calling this function with inputs -// // from the CPU, these are unnecessary copies -// if (indicesOptions_ == INDICES_CPU) { -// HostTensor hostOutIndices(outIndices, stream); -// -// ivfOffsetToUserIndex( -// hostOutIndices.data(), -// numLists_, -// hostOutIndices.getSize(0), -// hostOutIndices.getSize(1), -// listOffsetToUserIndex_); -// -// // Copy back to GPU, since the input to this function is on the -// // GPU -// outIndices.copyFrom(hostOutIndices, stream); -// } -//} } // namespace gpu } // namespace faiss diff --git a/faiss/gpu/impl/raft/RaftIVFFlat.cuh b/faiss/gpu/impl/raft/RaftIVFFlat.cuh index 05ca705588..c2556c448f 100644 --- a/faiss/gpu/impl/raft/RaftIVFFlat.cuh +++ b/faiss/gpu/impl/raft/RaftIVFFlat.cuh @@ -8,7 +8,7 @@ #pragma once #include -#include +#include #include #include @@ -45,52 +45,18 @@ class RaftIVFFlat : public IVFFlat { Tensor& outDistances, Tensor& outIndices) override; - - protected: - /// Returns the number of bytes in which an IVF list containing numVecs - /// vectors is encoded on the device. 
Note that due to padding this is not - /// the same as the encoding size for a subset of vectors in an IVF list; - /// this is the size for an entire IVF list - size_t getGpuVectorsEncodingSize_(int numVecs) const override; - size_t getCpuVectorsEncodingSize_(int numVecs) const override; - - /// Translate to our preferred GPU encoding - std::vector translateCodesToGpu_( - std::vector codes, - size_t numVecs) const override; - - /// Translate from our preferred GPU encoding - std::vector translateCodesFromGpu_( - std::vector codes, - size_t numVecs) const override; - - /// Encode the vectors that we're adding and append to our IVF lists - void appendVectors_( + /// Classify and encode/add vectors to our IVF lists. + /// The input data must be on our current device. + /// Returns the number of vectors successfully added. Vectors may + /// not be able to be added because they contain NaNs. + int addVectors( + Index* coarseQuantizer, Tensor& vecs, - Tensor& ivfCentroidResiduals, - Tensor& indices, - Tensor& uniqueLists, - Tensor& vectorsByUniqueList, - Tensor& uniqueListVectorStart, - Tensor& uniqueListStartOffset, - Tensor& listIds, - Tensor& listOffset, - cudaStream_t stream) override; + Tensor& indices); -// /// Shared IVF search implementation, used by both search and -// /// searchPreassigned -// void searchImpl_( -// Tensor& queries, -// Tensor& coarseDistances, -// Tensor& coarseIndices, -// Tensor& ivfCentroids, -// int k, -// Tensor& outDistances, -// Tensor& outIndices, -// bool storePairs); protected: - std::optional> raft_knn_index{std::nullopt}; + std::optional> raft_knn_index{std::nullopt}; }; From 545b3d22f712921a3b43756cee35754a7883cbda Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 27 Oct 2022 11:56:59 -0400 Subject: [PATCH 36/87] Adding flatindex for the fused l2 knn --- faiss/gpu/CMakeLists.txt | 12 +- faiss/gpu/GpuIndexFlat.cu | 38 +- faiss/gpu/GpuIndexFlat.h | 3 + faiss/gpu/GpuIndexIVFFlat.cu | 2 +- faiss/gpu/StandardGpuResources.h | 4 +- faiss/gpu/impl/FlatIndex.cuh | 2 +- faiss/gpu/impl/IVFBase.cuh | 14 +- faiss/gpu/impl/RaftFlatIndex.cu | 70 ++ faiss/gpu/impl/RaftFlatIndex.cuh | 42 ++ faiss/gpu/impl/RaftIVFFlat.cu | 232 ++++++ faiss/gpu/impl/{raft => }/RaftIVFFlat.cuh | 33 +- faiss/gpu/impl/{raft => }/RaftIndexIVFFlat.cu | 2 +- faiss/gpu/impl/{raft => }/RaftIndexIVFFlat.h | 0 faiss/gpu/impl/raft/RaftIVFFlat.cu | 121 --- faiss/gpu/impl/raft/RaftIndexIVFPQ.cu | 419 ----------- faiss/gpu/impl/raft/RaftIndexIVFPQ.h | 165 ---- faiss/gpu/impl/raft/RmmGpuResources.hpp | 656 ---------------- faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 1 + faiss/gpu/test/TestRaftIndexIVFFlat.cpp | 22 +- faiss/gpu/test/TestRaftIndexIVFPQ.cpp | 704 ------------------ 20 files changed, 434 insertions(+), 2108 deletions(-) create mode 100644 faiss/gpu/impl/RaftFlatIndex.cu create mode 100644 faiss/gpu/impl/RaftFlatIndex.cuh create mode 100644 faiss/gpu/impl/RaftIVFFlat.cu rename faiss/gpu/impl/{raft => }/RaftIVFFlat.cuh (55%) rename faiss/gpu/impl/{raft => }/RaftIndexIVFFlat.cu (99%) rename faiss/gpu/impl/{raft => }/RaftIndexIVFFlat.h (100%) delete mode 100644 faiss/gpu/impl/raft/RaftIVFFlat.cu delete mode 100644 faiss/gpu/impl/raft/RaftIndexIVFPQ.cu delete mode 100644 faiss/gpu/impl/raft/RaftIndexIVFPQ.h delete mode 100644 faiss/gpu/impl/raft/RmmGpuResources.hpp delete mode 100644 faiss/gpu/test/TestRaftIndexIVFPQ.cpp diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt index 8b373aecb8..0e82af813c 100644 --- a/faiss/gpu/CMakeLists.txt +++ b/faiss/gpu/CMakeLists.txt @@ -165,10 +165,14 @@ set(FAISS_GPU_HEADERS ) if(FAISS_ENABLE_RAFT) - list(APPEND FAISS_GPU_HEADERS impl/raft/RaftIndexIVFFlat.h impl/raft/RaftIndexIVFPQ.h -
impl/raft/RaftIVFFlat.cuh) - list(APPEND FAISS_GPU_SRC impl/raft/RaftIndexIVFFlat.cu impl/raft/RaftIndexIVFPQ.cu - impl/raft/RaftIVFFlat.cu) + list(APPEND FAISS_GPU_HEADERS + impl/RaftIndexIVFFlat.h + impl/RaftFlatIndex.cuh + impl/RaftIVFFlat.cuh) + list(APPEND FAISS_GPU_SRC + impl/RaftIndexIVFFlat.cu + impl/RaftFlatIndex.cu + impl/RaftIVFFlat.cu) endif() # Export FAISS_GPU_HEADERS variable to parent scope. diff --git a/faiss/gpu/GpuIndexFlat.cu b/faiss/gpu/GpuIndexFlat.cu index b005f0eaf4..429eb64db7 100644 --- a/faiss/gpu/GpuIndexFlat.cu +++ b/faiss/gpu/GpuIndexFlat.cu @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -67,11 +68,7 @@ GpuIndexFlat::GpuIndexFlat( this->is_trained = true; // Construct index - data_.reset(new FlatIndex( - resources_.get(), - dims, - flatConfig_.useFloat16, - config_.memorySpace)); + resetIndex_(dims); } GpuIndexFlat::GpuIndexFlat( @@ -86,15 +83,30 @@ GpuIndexFlat::GpuIndexFlat( this->is_trained = true; // Construct index - data_.reset(new FlatIndex( - resources_.get(), - dims, - flatConfig_.useFloat16, - config_.memorySpace)); + resetIndex_(dims); } GpuIndexFlat::~GpuIndexFlat() {} +void GpuIndexFlat::resetIndex_(int dims) { + + if(config_.use_raft) { + data_.reset(new RaftFlatIndex( + resources_.get(), + dims, + flatConfig_.useFloat16, + config_.memorySpace)); + + } else { + data_.reset(new FlatIndex( + resources_.get(), + dims, + flatConfig_.useFloat16, + config_.memorySpace)); + } +} + + void GpuIndexFlat::copyFrom(const faiss::IndexFlat* index) { DeviceScope scope(config_.device); @@ -109,11 +121,7 @@ void GpuIndexFlat::copyFrom(const faiss::IndexFlat* index) { (size_t)index->ntotal); data_.reset(); - data_.reset(new FlatIndex( - resources_.get(), - this->d, - flatConfig_.useFloat16, - config_.memorySpace)); + resetIndex_(this->d); // The index could be empty if (index->ntotal > 0) { diff --git a/faiss/gpu/GpuIndexFlat.h b/faiss/gpu/GpuIndexFlat.h index 3cc56bc905..4982646159 100644 --- 
a/faiss/gpu/GpuIndexFlat.h +++ b/faiss/gpu/GpuIndexFlat.h @@ -116,6 +116,9 @@ class GpuIndexFlat : public GpuIndex { } protected: + + void resetIndex_(int dims); + /// Flat index does not require IDs as there is no storage available for /// them bool addImplRequiresIDs_() const override; diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 642058c4d4..033c7189c9 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -6,7 +6,7 @@ */ #include -#include +#include #include #include #include diff --git a/faiss/gpu/StandardGpuResources.h b/faiss/gpu/StandardGpuResources.h index e28d89a492..672f1b8339 100644 --- a/faiss/gpu/StandardGpuResources.h +++ b/faiss/gpu/StandardGpuResources.h @@ -98,7 +98,7 @@ class StandardGpuResourcesImpl : public GpuResources { cudaStream_t getAsyncCopyStream(int device) override; - private: + protected: /// Have GPU resources been initialized for this device yet? bool isInitialized(int device) const; @@ -106,7 +106,7 @@ class StandardGpuResourcesImpl : public GpuResources { /// memory size static size_t getDefaultTempMemForGPU(int device, size_t requested); - private: + protected: /// Set of currently outstanding memory allocations per device /// device -> (alloc request, allocated ptr) std::unordered_map> allocs_; diff --git a/faiss/gpu/impl/FlatIndex.cuh b/faiss/gpu/impl/FlatIndex.cuh index bb66cd4b2b..56fbe609b9 100644 --- a/faiss/gpu/impl/FlatIndex.cuh +++ b/faiss/gpu/impl/FlatIndex.cuh @@ -80,7 +80,7 @@ class FlatIndex { /// Free all storage void reset(); - private: + protected: /// Collection of GPU resources that we use GpuResources* resources_; diff --git a/faiss/gpu/impl/IVFBase.cuh b/faiss/gpu/impl/IVFBase.cuh index 5fadfe7f70..e15c4c958d 100644 --- a/faiss/gpu/impl/IVFBase.cuh +++ b/faiss/gpu/impl/IVFBase.cuh @@ -45,7 +45,7 @@ class IVFBase { /// Clear out all inverted lists, but retain the coarse quantizer /// and the product quantizer info - void reset(); + virtual void reset(); 
/// Return the number of dimensions we are indexing int getDim() const; @@ -59,19 +59,19 @@ class IVFBase { /// For debugging purposes, return the list length of a particular /// list - int getListLength(int listId) const; + virtual int getListLength(int listId) const; /// Return the list indices of a particular list back to the CPU - std::vector getListIndices(int listId) const; + virtual std::vector getListIndices(int listId) const; /// Return the encoded vectors of a particular list back to the CPU - std::vector getListVectorData(int listId, bool gpuFormat) const; + virtual std::vector getListVectorData(int listId, bool gpuFormat) const; /// Copy all inverted lists from a CPU representation to ourselves - void copyInvertedListsFrom(const InvertedLists* ivf); + virtual void copyInvertedListsFrom(const InvertedLists* ivf); /// Copy all inverted lists from ourselves to a CPU representation - void copyInvertedListsTo(InvertedLists* ivf); + virtual void copyInvertedListsTo(InvertedLists* ivf); /// Update our coarse quantizer with this quantizer instance; may be a CPU /// or GPU quantizer @@ -81,7 +81,7 @@ class IVFBase { /// The input data must be on our current device. /// Returns the number of vectors successfully added. Vectors may /// not be able to be added because they contain NaNs. - int addVectors( + virtual int addVectors( Index* coarseQuantizer, Tensor& vecs, Tensor& indices); diff --git a/faiss/gpu/impl/RaftFlatIndex.cu b/faiss/gpu/impl/RaftFlatIndex.cu new file mode 100644 index 0000000000..1a8369a1c3 --- /dev/null +++ b/faiss/gpu/impl/RaftFlatIndex.cu @@ -0,0 +1,70 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +#include +#include +//#include +#include + +namespace faiss { +namespace gpu { + +RaftFlatIndex::RaftFlatIndex( + GpuResources* res, + int dim, + bool useFloat16, + MemorySpace space) + : FlatIndex(res, dim, useFloat16, space) {} + +void RaftFlatIndex::query( + Tensor& input, + int k, + faiss::MetricType metric, + float metricArg, + Tensor& outDistances, + Tensor& outIndices, + bool exactDistance) { + + // For now, use RAFT's fused KNN when k <= 64 and L2 metric is used + if(k <= 64 && metric == MetricType::METRIC_L2 && + input.getStride(0) == 0 && vectors_.getStride(0) == 0) { + raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); + + auto distance = exactDistance ? raft::distance::DistanceType::L2Unexpanded : + raft::distance::DistanceType::L2Expanded; + + auto index = raft::make_device_matrix_view(vectors_.data(), vectors_.getSize(0), vectors_.getSize(1)); + auto search = raft::make_device_matrix_view(input.data(), input.getSize(0), input.getSize(1)); + auto inds = raft::make_device_matrix_view(outIndices.data(), outIndices.getSize(0), outIndices.getSize(1)); + auto dists = raft::make_device_matrix_view(outDistances.data(), outDistances.getSize(0), outDistances.getSize(1)); + +// raft::neighbors::brute_force::knn(raft_handle, index, search, inds, dists, k, distance); + + // TODO: Expose the fused L2KNN through RAFT's public APIs + raft::spatial::knn::detail::fusedL2Knn(dim_, + inds.data_handle(), + dists.data_handle(), + index.data_handle(), + search.data_handle(), + index.extent(0), + search.extent(0), + k, + true, + true, + raft_handle.get_stream(), + distance); + + } else { + FlatIndex::query(input, k, metric, metricArg, outDistances, outIndices, exactDistance); + } +} +} // namespace gpu +} // namespace faiss diff --git a/faiss/gpu/impl/RaftFlatIndex.cuh b/faiss/gpu/impl/RaftFlatIndex.cuh new file mode 100644 index 0000000000..8a18053449 --- /dev/null +++ b/faiss/gpu/impl/RaftFlatIndex.cuh @@ -0,0 
+1,42 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace faiss { +namespace gpu { + +class GpuResources; + +/// Holder of GPU resources for a particular flat index +/// Can be in either float16 or float32 mode. If float32, we only store +/// the vectors in float32. +/// If float16, we store the vectors in both float16 and float32, where float32 +/// data is possibly needed for certain residual operations +class RaftFlatIndex : public FlatIndex { + public: + RaftFlatIndex(GpuResources* res, int dim, bool useFloat16, MemorySpace space); + + void query( + Tensor& vecs, + int k, + faiss::MetricType metric, + float metricArg, + Tensor& outDistances, + Tensor& outIndices, + bool exactDistance); + +}; + +} // namespace gpu +} // namespace faiss diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu new file mode 100644 index 0000000000..5563ca2eae --- /dev/null +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -0,0 +1,232 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { +namespace gpu { + +RaftIVFFlat::RaftIVFFlat( + GpuResources* res, + int dim, + int nlist, + faiss::MetricType metric, + float metricArg, + bool useResidual, + faiss::ScalarQuantizer* scalarQ, + bool interleavedLayout, + IndicesOptions indicesOptions, + MemorySpace space) + : IVFFlat(res, + dim, + nlist, + metric, + metricArg, + useResidual, + scalarQ, + interleavedLayout, + indicesOptions, + space){} + +RaftIVFFlat::~RaftIVFFlat() {} + + +/// Find the approximate k nearest neighbors for `queries` against +/// our database +void RaftIVFFlat::search( + Index* coarseQuantizer, + Tensor& queries, + int nprobe, + int k, + Tensor& outDistances, + Tensor& outIndices) { + printf("Inside RaftIVFFlat search()\n"); + + // TODO: We probably don't want to ignore the coarse quantizer here... + + std::uint32_t n = queries.getSize(0); + std::uint32_t cols = queries.getSize(1); + std::uint32_t k_ = k; + + // Device is already set in GpuIndex::search + FAISS_ASSERT(raft_knn_index.has_value()); + FAISS_ASSERT(n > 0); + FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= numLists_); + + const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); + raft::neighbors::ivf_flat::search_params pams; + pams.n_probes = nprobe; + + auto queries_view = raft::make_device_matrix_view(queries.data(), n, cols); + auto out_inds_view = raft::make_device_matrix_view(outIndices.data(), n, k_); + auto out_dists_view = raft::make_device_matrix_view(outDistances.data(), n, k_); + raft::neighbors::ivf_flat::search( + raft_handle, *raft_knn_index, queries_view, + out_inds_view, out_dists_view, pams, k_); + + raft_handle.sync_stream(); +} + +/// Classify and encode/add vectors to our IVF lists. 
+/// The input data must be on our current device. +/// Returns the number of vectors successfully added. Vectors may +/// not be able to be added because they contain NaNs. +int RaftIVFFlat::addVectors( + Index* coarseQuantizer, + Tensor& vecs, + Tensor& indices) { + printf("Inside RaftIVFFlat addVectors()\n"); + + auto vecs_view = raft::make_device_matrix_view(vecs.data(), vecs.getSize(0), dim_); + auto inds_view = raft::make_device_vector_view(indices.data(), (Index::idx_t )indices.getSize(0)); + + const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); + + // TODO: We probably don't want to ignore the coarse quantizer here + raft_knn_index.emplace(raft::neighbors::ivf_flat::extend( + raft_handle, + raft_knn_index.value(), + vecs_view, + std::make_optional>(inds_view))); + return vecs.getSize(0); +} + +void RaftIVFFlat::reset() { + printf("Inside RaftIVFFlat reset()\n"); + raft_knn_index.reset(); +} + +int RaftIVFFlat::getListLength(int listId) const { + printf("Inside RaftIVFFlat getListLength\n"); + + FAISS_ASSERT(raft_knn_index.has_value()); + const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); + + uint32_t size; + raft::copy(&size, raft_knn_index.value().list_sizes().data_handle() + listId, + 1, raft_handle.get_stream()); + raft_handle.sync_stream(); + return int(size); +} + +/// Return the list indices of a par +/// ticular list back to the CPU +std::vector RaftIVFFlat::getListIndices(int listId) const { + + printf("Inside RaftIVFFlat getListIndices\n"); + + FAISS_ASSERT(raft_knn_index.has_value()); + const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); + + Index::idx_t offset; + uint32_t size; + + raft::copy(&offset, raft_knn_index.value().list_offsets().data_handle() + listId, 1, raft_handle.get_stream()); + raft::copy(&size, raft_knn_index.value().list_sizes().data_handle() + listId, 1, raft_handle.get_stream()); + raft_handle.sync_stream(); + + std::vector vec(size); + raft::copy( 
+ vec.data(), + raft_knn_index.value().indices().data_handle() + offset, + size, + raft_handle.get_stream()); + return vec; +} + +/// Return the encoded vectors of a particular list back to the CPU +std::vector RaftIVFFlat::getListVectorData(int listId, bool gpuFormat) const { + + printf("Inside RaftIVFFlat getListVectorData\n"); + + FAISS_ASSERT(raft_knn_index.has_value()); + const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); + + std::cout << "Calling getListVectorData for " << listId << std::endl; + + using elem_t = decltype(raft_knn_index.value().data())::element_type; + size_t dim = raft_knn_index.value().dim(); + Index::idx_t offsets[2]; + raft::copy(offsets, raft_knn_index.value().list_offsets().data_handle() + listId, 2, raft_handle.get_stream()); + + raft_handle.sync_stream(); + size_t byte_offset = offsets[0] * sizeof(elem_t) * dim; + // the interleaved block can be slightly larger than the list size (it's + // rounded up) + size_t byte_size = size_t(offsets[1]) * + sizeof(elem_t) * dim - + byte_offset; + std::vector vec(byte_size); + raft::copy( + vec.data(), + reinterpret_cast(raft_knn_index.value().data().data_handle()) + + byte_offset, + byte_size, + raft_handle.get_stream()); + return vec; +} + +/// Performs search when we are already given the IVF cells to look at +/// (GpuIndexIVF::search_preassigned implementation) +void RaftIVFFlat::searchPreassigned( + Index* coarseQuantizer, + Tensor& vecs, + Tensor& ivfDistances, + Tensor& ivfAssignments, + int k, + Tensor& outDistances, + Tensor& outIndices, + bool storePairs) { + printf("Inside RaftIVFFlat searchPreassigned\n"); + + // TODO: Fill this in! +} + +/// Copy all inverted lists from a CPU representation to ourselves +void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { + printf("Inside RaftIVFFlat copyInvertedListsFrom\n"); + + ivf->print_stats(); + + // TODO: Need to replicate copyInvertedListsFrom() in IVFBase.cu + // but populate a RAFT index. 
+} + +/// Copy all inverted lists from ourselves to a CPU representation +void RaftIVFFlat::copyInvertedListsTo(InvertedLists* ivf) { + printf("Inside RaftIVFFlat copyInvertedListsTo\n"); + + // TODO: Need to replicate copyInvertedListsTo() in IVFBase.cu +} + + +} // namespace gpu +} // namespace faiss diff --git a/faiss/gpu/impl/raft/RaftIVFFlat.cuh b/faiss/gpu/impl/RaftIVFFlat.cuh similarity index 55% rename from faiss/gpu/impl/raft/RaftIVFFlat.cuh rename to faiss/gpu/impl/RaftIVFFlat.cuh index c2556c448f..1078204270 100644 --- a/faiss/gpu/impl/raft/RaftIVFFlat.cuh +++ b/faiss/gpu/impl/RaftIVFFlat.cuh @@ -45,6 +45,18 @@ class RaftIVFFlat : public IVFFlat { Tensor& outDistances, Tensor& outIndices) override; + /// Performs search when we are already given the IVF cells to look at + /// (GpuIndexIVF::search_preassigned implementation) + void searchPreassigned( + Index* coarseQuantizer, + Tensor& vecs, + Tensor& ivfDistances, + Tensor& ivfAssignments, + int k, + Tensor& outDistances, + Tensor& outIndices, + bool storePairs) override; + /// Classify and encode/add vectors to our IVF lists. /// The input data must be on our current device. /// Returns the number of vectors successfully added. 
Vectors may @@ -52,8 +64,27 @@ class RaftIVFFlat : public IVFFlat { int addVectors( Index* coarseQuantizer, Tensor& vecs, - Tensor& indices); + Tensor& indices) override; + + /// Clear out all inverted lists, but retain the coarse quantizer + /// and the product quantizer info + void reset() override; + + /// For debugging purposes, return the list length of a particular + /// list + int getListLength(int listId) const override; + + /// Return the list indices of a particular list back to the CPU + std::vector getListIndices(int listId) const override; + + /// Return the encoded vectors of a particular list back to the CPU + std::vector getListVectorData(int listId, bool gpuFormat) const override; + + /// Copy all inverted lists from a CPU representation to ourselves + void copyInvertedListsFrom(const InvertedLists* ivf) override; + /// Copy all inverted lists from ourselves to a CPU representation + void copyInvertedListsTo(InvertedLists* ivf) override; protected: std::optional> raft_knn_index{std::nullopt}; diff --git a/faiss/gpu/impl/raft/RaftIndexIVFFlat.cu b/faiss/gpu/impl/RaftIndexIVFFlat.cu similarity index 99% rename from faiss/gpu/impl/raft/RaftIndexIVFFlat.cu rename to faiss/gpu/impl/RaftIndexIVFFlat.cu index 03df717c69..88c5629e71 100644 --- a/faiss/gpu/impl/raft/RaftIndexIVFFlat.cu +++ b/faiss/gpu/impl/RaftIndexIVFFlat.cu @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/faiss/gpu/impl/raft/RaftIndexIVFFlat.h b/faiss/gpu/impl/RaftIndexIVFFlat.h similarity index 100% rename from faiss/gpu/impl/raft/RaftIndexIVFFlat.h rename to faiss/gpu/impl/RaftIndexIVFFlat.h diff --git a/faiss/gpu/impl/raft/RaftIVFFlat.cu b/faiss/gpu/impl/raft/RaftIVFFlat.cu deleted file mode 100644 index f12d7a7c7d..0000000000 --- a/faiss/gpu/impl/raft/RaftIVFFlat.cu +++ /dev/null @@ -1,121 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. 
- * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace faiss { -namespace gpu { - -RaftIVFFlat::RaftIVFFlat( - GpuResources* res, - int dim, - int nlist, - faiss::MetricType metric, - float metricArg, - bool useResidual, - faiss::ScalarQuantizer* scalarQ, - bool interleavedLayout, - IndicesOptions indicesOptions, - MemorySpace space) - : IVFFlat(res, - dim, - nlist, - metric, - metricArg, - useResidual, - scalarQ, - interleavedLayout, - indicesOptions, - space){} - -RaftIVFFlat::~RaftIVFFlat() {} - - -/// Find the approximate k nearest neigbors for `queries` against -/// our database -void RaftIVFFlat::search( - Index* coarseQuantizer, - Tensor& queries, - int nprobe, - int k, - Tensor& outDistances, - Tensor& outIndices) { - - // TODO: We probably don't want to ignore the coarse quantizer here... 
- - std::uint32_t n = queries.getSize(0); - std::uint32_t cols = queries.getSize(1); - std::uint32_t k_ = k; - - // Device is already set in GpuIndex::search - FAISS_ASSERT(raft_knn_index.has_value()); - FAISS_ASSERT(n > 0); - FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= numLists_); - - const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); - raft::spatial::knn::ivf_flat::search_params pams; - pams.n_probes = nprobe; - - auto queries_view = raft::make_device_matrix_view(queries.data(), n, cols); - auto out_inds_view = raft::make_device_matrix_view(outIndices.data(), n, k_); - auto out_dists_view = raft::make_device_matrix_view(outDistances.data(), n, k_); - raft::spatial::knn::ivf_flat::search( - raft_handle, *raft_knn_index, queries_view, - out_inds_view, out_dists_view, pams, k_); - - raft_handle.sync_stream(); -} - -/// Classify and encode/add vectors to our IVF lists. -/// The input data must be on our current device. -/// Returns the number of vectors successfully added. Vectors may -/// not be able to be added because they contain NaNs. -int RaftIVFFlat::addVectors( - Index* coarseQuantizer, - Tensor& vecs, - Tensor& indices) { - - auto vecs_view = raft::make_device_matrix_view(vecs.data(), vecs.getSize(0), dim_); - auto inds_view = raft::make_device_vector_view(indices.data(), (Index::idx_t )indices.getSize(0)); - - const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); - - // TODO: We probably don't want to ignore the coarse quantizer here - raft_knn_index.emplace(raft::neighbors::ivf_flat::extend( - raft_handle, - raft_knn_index.value(), - vecs_view, - std::make_optional>(inds_view))); -} - - -} // namespace gpu -} // namespace faiss diff --git a/faiss/gpu/impl/raft/RaftIndexIVFPQ.cu b/faiss/gpu/impl/raft/RaftIndexIVFPQ.cu deleted file mode 100644 index f30f34259f..0000000000 --- a/faiss/gpu/impl/raft/RaftIndexIVFPQ.cu +++ /dev/null @@ -1,419 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. 
- * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -namespace faiss { -namespace gpu { -/** - * GpuIndexIVFPQ( - GpuResourcesProvider* provider, - int dims, - int nlist, - int subQuantizers, - int bitsPerCode, - faiss::MetricType metric, - GpuIndexIVFPQConfig config = GpuIndexIVFPQConfig()) - * @param provider - * @param index - * @param config - */ -RaftIndexIVFPQ::RaftIndexIVFPQ( - GpuResourcesProvider* provider, - const faiss::IndexIVFPQ* index, - GpuIndexIVFPQConfig config) - : GpuIndexIVFPQ(provider, index, config), - pq(index->pq), - ivfpqConfig_(config), - usePrecomputedTables_(config.usePrecomputedTables), - subQuantizers_(0), - bitsPerCode_(0), - reserveMemoryVecs_(0) { - copyFrom(index); -} - -RaftIndexIVFPQ::RaftIndexIVFPQ( - GpuResourcesProvider* provider, - int dims, - int nlist, - int subQuantizers, - int bitsPerCode, - faiss::MetricType metric, - GpuIndexIVFPQConfig config) - : GpuIndexIVFPQ(provider, dims, nlist, subQuantizers, bitsPerCode, metric, config), - pq(dims, subQuantizers, bitsPerCode), - ivfpqConfig_(config), - usePrecomputedTables_(config.usePrecomputedTables), - subQuantizers_(subQuantizers), - bitsPerCode_(bitsPerCode), - reserveMemoryVecs_(0) { - verifySettings_(); - - // We haven't trained ourselves, so don't construct the PQ index yet - this->is_trained = false; -} - -RaftIndexIVFPQ::RaftIndexIVFPQ( - GpuResourcesProvider* provider, - Index *coarse_quantizer, - int dims, - int nlist, - int subQuantizers, - int bitsPerCode, - faiss::MetricType metric, - GpuIndexIVFPQConfig config) - : GpuIndexIVFPQ(provider, coarse_quantizer, dims, nlist, subQuantizers, bitsPerCode, metric, config), - pq(dims, subQuantizers, bitsPerCode), - ivfpqConfig_(config), - usePrecomputedTables_(config.usePrecomputedTables), - 
subQuantizers_(subQuantizers), - bitsPerCode_(bitsPerCode), - reserveMemoryVecs_(0) { - verifySettings_(); - - // We haven't trained ourselves, so don't construct the PQ index yet - this->is_trained = false; -} - -RaftIndexIVFPQ::~RaftIndexIVFPQ() {} - -void RaftIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) { -// DeviceScope scope(config_.device); -// -// GpuIndexIVF::copyFrom(index); -// -// // Clear out our old data -// index_.reset(); -// -// pq = index->pq; -// subQuantizers_ = index->pq.M; -// bitsPerCode_ = index->pq.nbits; -// -// // We only support this -// FAISS_THROW_IF_NOT_MSG( -// ivfpqConfig_.interleavedLayout || index->pq.nbits == 8, -// "GPU: only pq.nbits == 8 is supported"); -// FAISS_THROW_IF_NOT_MSG( -// index->by_residual, "GPU: only by_residual = true is supported"); -// FAISS_THROW_IF_NOT_MSG( -// index->polysemous_ht == 0, "GPU: polysemous codes not supported"); -// -// verifySettings_(); -// -// // The other index might not be trained -// if (!index->is_trained) { -// // copied in GpuIndex::copyFrom -// FAISS_ASSERT(!is_trained); -// return; -// } -// -// // Copy our lists as well -// // The product quantizer must have data in it -// FAISS_ASSERT(index->pq.centroids.size() > 0); -// index_.reset(new IVFPQ( -// resources_.get(), -// index->metric_type, -// index->metric_arg, -// quantizer->getGpuData(), -// subQuantizers_, -// bitsPerCode_, -// ivfpqConfig_.useFloat16LookupTables, -// ivfpqConfig_.useMMCodeDistance, -// ivfpqConfig_.interleavedLayout, -// (float*)index->pq.centroids.data(), -// ivfpqConfig_.indicesOptions, -// config_.memorySpace)); -// // Doesn't make sense to reserve memory here -// index_->setPrecomputedCodes(usePrecomputedTables_); -// -// // Copy all of the IVF data -// index_->copyInvertedListsFrom(index->invlists); -} - -void RaftIndexIVFPQ::copyTo(faiss::IndexIVFPQ* index) const { -// DeviceScope scope(config_.device); -// -// // We must have the indices in order to copy to ourselves -// FAISS_THROW_IF_NOT_MSG( 
-// ivfpqConfig_.indicesOptions != INDICES_IVF, -// "Cannot copy to CPU as GPU index doesn't retain " -// "indices (INDICES_IVF)"); -// -// GpuIndexIVF::copyTo(index); -// -// // -// // IndexIVFPQ information -// // -// index->by_residual = true; -// index->use_precomputed_table = 0; -// index->code_size = subQuantizers_; -// index->pq = faiss::ProductQuantizer(this->d, subQuantizers_, bitsPerCode_); -// -// index->do_polysemous_training = false; -// index->polysemous_training = nullptr; -// -// index->scan_table_threshold = 0; -// index->max_codes = 0; -// index->polysemous_ht = 0; -// index->precomputed_table.clear(); -// -// auto ivf = new ArrayInvertedLists(nlist, index->code_size); -// index->replace_invlists(ivf, true); -// -// if (index_) { -// // Copy IVF lists -// index_->copyInvertedListsTo(ivf); -// -// // Copy PQ centroids -// auto devPQCentroids = index_->getPQCentroids(); -// index->pq.centroids.resize(devPQCentroids.numElements()); -// -// fromDevice( -// devPQCentroids, -// index->pq.centroids.data(), -// resources_->getDefaultStream(config_.device)); -// -// if (usePrecomputedTables_) { -// index->precompute_table(); -// } -// } -} - -void RaftIndexIVFPQ::reserveMemory(size_t numVecs) { - reserveMemoryVecs_ = numVecs; - if (index_) { - DeviceScope scope(config_.device); - index_->reserveMemory(numVecs); - } -} - -void RaftIndexIVFPQ::setPrecomputedCodes(bool enable) { - usePrecomputedTables_ = enable; - if (index_) { - DeviceScope scope(config_.device); - index_->setPrecomputedCodes(quantizer, enable); - } - - verifySettings_(); -} - -bool RaftIndexIVFPQ::getPrecomputedCodes() const { - return usePrecomputedTables_; -} - -int RaftIndexIVFPQ::getNumSubQuantizers() const { - return subQuantizers_; -} - -int RaftIndexIVFPQ::getBitsPerCode() const { - return bitsPerCode_; -} - -int RaftIndexIVFPQ::getCentroidsPerSubQuantizer() const { - return utils::pow2(bitsPerCode_); -} - -size_t RaftIndexIVFPQ::reclaimMemory() { - if (index_) { - DeviceScope 
scope(config_.device); - return index_->reclaimMemory(); - } - - return 0; -} - -void RaftIndexIVFPQ::reset() { - if (raft_knn_index.has_value()) { - raft_knn_index.reset(); - this->ntotal = 0; - } else { - FAISS_ASSERT(this->ntotal == 0); - } -} - -void RaftIndexIVFPQ::train(Index::idx_t n, const float* x) { - raft::common::nvtx::range fun_scope( - "RaftIndexIVFFlat::train (%ld)", n); - - std::cout << "Calling train() with " << n << " rows" << std::endl; - - uint32_t start = raft::curTimeMillis(); - if (this->is_trained) { - FAISS_ASSERT(raft_knn_index.has_value()); - return; - } - - raft::spatial::knn::ivf_pq::index_params raft_idx_params; - raft_idx_params.n_lists = nlist; - raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; - raft_idx_params.add_data_on_build = false; - raft_idx_params.kmeans_n_iters = 100; - - raft_knn_index.emplace( - raft::spatial::knn::ivf_pq::build(raft_handle, raft_idx_params, - const_cast(x), - n, (faiss::Index::idx_t)d)); - - raft_handle.sync_stream(); - uint32_t stop = raft::curTimeMillis(); - - std::cout << "train took " << (stop - start) << "ms. " << std::endl; - this->is_trained = true; -} - -void RaftIndexIVFPQ::addImpl_(int n, const float* x, const Index::idx_t* xids) { - // Device is already set in GpuIndex::add - FAISS_ASSERT(is_trained); - FAISS_ASSERT(n > 0); - - // but keep the ntotal based on the total number of vectors that we - // attempted to add - std::cout << "Calling addImpl_ with " << n << " vectors." 
<< std::endl; - - raft_knn_index.emplace(raft::spatial::knn::ivf_pq::extend( - raft_handle, raft_knn_index.value(), x, xids, (Index::idx_t)n)); - this->ntotal += n; - - ntotal += n; -} - -void RaftIndexIVFPQ::searchImpl_( - int n, - const float* x, - int k, - float* distances, - Index::idx_t* labels, - const SearchParameters *params) const { - // Device is already set in GpuIndex::search - FAISS_ASSERT(index_); - FAISS_ASSERT(n > 0); - FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= nlist); - - raft::common::nvtx::range fun_scope( - "RaftIndexIVFFlat::searchImpl_ (%ld)", n); - - // Device is already set in GpuIndex::search - FAISS_ASSERT(raft_knn_index.has_value()); - FAISS_ASSERT(n > 0); - FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= nlist); - - raft::spatial::knn::ivf_pq::search_params pams; - pams.n_probes = nprobe; - raft::spatial::knn::ivf_pq::search( - raft_handle, - pams, - *raft_knn_index, - const_cast(x), - static_cast(n), - static_cast(k), - labels, - distances); - - raft_handle.sync_stream(); -} - -int RaftIndexIVFPQ::getListLength(int listId) const { - FAISS_ASSERT(index_); - DeviceScope scope(config_.device); - - return index_->getListLength(listId); -} - -std::vector RaftIndexIVFPQ::getListVectorData( - int listId, - bool gpuFormat) const { - FAISS_ASSERT(index_); - DeviceScope scope(config_.device); - - return index_->getListVectorData(listId, gpuFormat); -} - -std::vector RaftIndexIVFPQ::getListIndices(int listId) const { - FAISS_ASSERT(index_); - DeviceScope scope(config_.device); - - return index_->getListIndices(listId); -} - -void RaftIndexIVFPQ::verifySettings_() const { - // Our implementation has these restrictions: - - // Must have some number of lists - FAISS_THROW_IF_NOT_MSG(nlist > 0, "nlist must be >0"); - - // up to a single byte per code - if (ivfpqConfig_.interleavedLayout) { - FAISS_THROW_IF_NOT_FMT( - bitsPerCode_ == 4 || bitsPerCode_ == 5 || bitsPerCode_ == 6 || - bitsPerCode_ == 8, - "Bits per code must be between 4, 5, 6 or 8 (passed 
%d)", - bitsPerCode_); - - } else { - FAISS_THROW_IF_NOT_FMT( - bitsPerCode_ == 8, - "Bits per code must be 8 (passed %d)", - bitsPerCode_); - } - - // Sub-quantizers must evenly divide dimensions available - FAISS_THROW_IF_NOT_FMT( - this->d % subQuantizers_ == 0, - "Number of sub-quantizers (%d) must be an " - "even divisor of the number of dimensions (%d)", - subQuantizers_, - this->d); - - // The number of bytes per encoded vector must be one we support - FAISS_THROW_IF_NOT_FMT( - ivfpqConfig_.interleavedLayout || - IVFPQ::isSupportedPQCodeLength(subQuantizers_), - "Number of bytes per encoded vector / sub-quantizers (%d) " - "is not supported", - subQuantizers_); - - // We must have enough shared memory on the current device to store - // our lookup distances - int lookupTableSize = sizeof(float); - if (ivfpqConfig_.useFloat16LookupTables) { - lookupTableSize = sizeof(half); - } - - // 64 bytes per code is only supported with usage of float16, at 2^8 - // codes per subquantizer - size_t requiredSmemSize = - lookupTableSize * subQuantizers_ * utils::pow2(bitsPerCode_); - size_t smemPerBlock = getMaxSharedMemPerBlock(config_.device); - - FAISS_THROW_IF_NOT_FMT( - requiredSmemSize <= getMaxSharedMemPerBlock(config_.device), - "Device %d has %zu bytes of shared memory, while " - "%d bits per code and %d sub-quantizers requires %zu " - "bytes. Consider useFloat16LookupTables and/or " - "reduce parameters", - config_.device, - smemPerBlock, - bitsPerCode_, - subQuantizers_, - requiredSmemSize); -} - -} // namespace gpu -} // namespace faiss diff --git a/faiss/gpu/impl/raft/RaftIndexIVFPQ.h b/faiss/gpu/impl/raft/RaftIndexIVFPQ.h deleted file mode 100644 index e7f1b7515c..0000000000 --- a/faiss/gpu/impl/raft/RaftIndexIVFPQ.h +++ /dev/null @@ -1,165 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -#include -#include - -#include -#include - -#include -#include - -namespace faiss { -struct IndexIVFPQ; -} - -namespace faiss { -namespace gpu { - -class GpuIndexFlat; -class IVFPQ; - -/// RAFT IVFPQ index for the GPU -class RaftIndexIVFPQ : public GpuIndexIVFPQ { - public: - /// Construct from a pre-existing faiss::IndexIVFPQ instance, copying - /// data over to the given GPU, if the input index is trained. - RaftIndexIVFPQ( - GpuResourcesProvider* provider, - const faiss::IndexIVFPQ* index, - GpuIndexIVFPQConfig config = GpuIndexIVFPQConfig()); - - /// Construct an empty index - RaftIndexIVFPQ( - GpuResourcesProvider* provider, - int dims, - int nlist, - int subQuantizers, - int bitsPerCode, - faiss::MetricType metric, - GpuIndexIVFPQConfig config = GpuIndexIVFPQConfig()); - - - /// Construct an empty index - RaftIndexIVFPQ( - GpuResourcesProvider* provider, - Index *coarse_quantizer, - int dims, - int nlist, - int subQuantizers, - int bitsPerCode, - faiss::MetricType metric, - GpuIndexIVFPQConfig config = GpuIndexIVFPQConfig()); - - ~RaftIndexIVFPQ() override; - - /// Reserve space on the GPU for the inverted lists for `num` - /// vectors, assumed equally distributed among - - /// Initialize ourselves from the given CPU index; will overwrite - /// all data in ourselves - void copyFrom(const faiss::IndexIVFPQ* index); - - /// Copy ourselves to the given CPU index; will overwrite all data - /// in the index instance - void copyTo(faiss::IndexIVFPQ* index) const; - - /// Reserve GPU memory in our inverted lists for this number of vectors - void reserveMemory(size_t numVecs); - - /// Enable or disable pre-computed codes - void setPrecomputedCodes(bool enable); - - /// Are pre-computed codes enabled? 
- bool getPrecomputedCodes() const; - - /// Return the number of sub-quantizers we are using - int getNumSubQuantizers() const; - - /// Return the number of bits per PQ code - int getBitsPerCode() const; - - /// Return the number of centroids per PQ code (2^bits per code) - int getCentroidsPerSubQuantizer() const; - - /// After adding vectors, one can call this to reclaim device memory - /// to exactly the amount needed. Returns space reclaimed in bytes - size_t reclaimMemory(); - - /// Clears out all inverted lists, but retains the coarse and - /// product centroid information - void reset() override; - - /// Trains the coarse and product quantizer based on the given vector data - void train(Index::idx_t n, const float* x) override; - - /// Returns the number of vectors present in a particular inverted list - int getListLength(int listId) const override; - - /// Return the encoded vector data contained in a particular inverted list, - /// for debugging purposes. - /// If gpuFormat is true, the data is returned as it is encoded in the - /// GPU-side representation. - /// Otherwise, it is converted to the CPU format. - /// compliant format, while the native GPU format may differ. - std::vector getListVectorData(int listId, bool gpuFormat = false) - const override; - - /// Return the vector indices contained in a particular inverted list, for - /// debugging purposes. 
- std::vector getListIndices(int listId) const override; - - public: - /// Like the CPU version, we expose a publically-visible ProductQuantizer - /// for manipulation - ProductQuantizer pq; - - protected: - /// Called from GpuIndex for add/add_with_ids - void addImpl_(int n, const float* x, const Index::idx_t* ids) override; - - /// Called from GpuIndex for search - void searchImpl_( - int n, - const float* x, - int k, - float* distances, - Index::idx_t* labels, - const SearchParameters *params) const override; - - /// Throws errors if configuration settings are improper - void verifySettings_() const; - - protected: - /// Our configuration options that we were initialized with - const GpuIndexIVFPQConfig ivfpqConfig_; - - /// Runtime override: whether or not we use precomputed tables - bool usePrecomputedTables_; - - /// Number of sub-quantizers per encoded vector - int subQuantizers_; - - /// Bits per sub-quantizer code - int bitsPerCode_; - - /// Desired inverted list memory reservation - size_t reserveMemoryVecs_; - - /// The product quantizer instance that we own; contains the - /// inverted lists - std::unique_ptr index_; - - const raft::handle_t raft_handle; - std::optional> raft_knn_index{std::nullopt}; -}; - -} // namespace gpu -} // namespace faiss diff --git a/faiss/gpu/impl/raft/RmmGpuResources.hpp b/faiss/gpu/impl/raft/RmmGpuResources.hpp deleted file mode 100644 index c22c722a35..0000000000 --- a/faiss/gpu/impl/raft/RmmGpuResources.hpp +++ /dev/null @@ -1,656 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* -This code contains unnecessary code duplication. These could be deleted -once the relevant changes would be made on the FAISS side. Indeed most of -the logic in the below code is similar to FAISS's standard implementation -and should thus be inherited instead of duplicated. 
This FAISS's issue -once solved should allow the removal of the unnecessary duplicates -in this file : https://github.com/facebookresearch/faiss/issues/2097 -*/ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -namespace faiss { -namespace gpu { - -namespace { - -// How many streams per device we allocate by default (for multi-streaming) -constexpr int kNumStreams = 2; - -// Use 256 MiB of pinned memory for async CPU <-> GPU copies by default -constexpr size_t kDefaultPinnedMemoryAllocation = (size_t)256 * 1024 * 1024; - -// Default temporary memory allocation for <= 4 GiB memory GPUs -constexpr size_t k4GiBTempMem = (size_t)512 * 1024 * 1024; - -// Default temporary memory allocation for <= 8 GiB memory GPUs -constexpr size_t k8GiBTempMem = (size_t)1024 * 1024 * 1024; - -// Maximum temporary memory allocation for all GPUs -constexpr size_t kMaxTempMem = (size_t)1536 * 1024 * 1024; - -std::string allocsToString(const std::unordered_map& map) -{ - // Produce a sorted list of all outstanding allocations by type - std::unordered_map> stats; - - for (auto& entry : map) { - auto& a = entry.second; - - auto it = stats.find(a.type); - if (it != stats.end()) { - stats[a.type].first++; - stats[a.type].second += a.size; - } else { - stats[a.type] = std::make_pair(1, a.size); - } - } - - std::stringstream ss; - for (auto& entry : stats) { - ss << "Alloc type " << allocTypeToString(entry.first) << ": " << entry.second.first - << " allocations, " << entry.second.second << " bytes\n"; - } - - return ss.str(); -} - -} // namespace - -/// RMM implementation of the GpuResources object that provides for a -/// temporary memory manager -class RmmGpuResourcesImpl : public GpuResources { - public: - RmmGpuResourcesImpl() - : pinnedMemAlloc_(nullptr), - pinnedMemAllocSize_(0), - // let the adjustment function determine the memory size for us by passing - 
// in a huge value that will then be adjusted - tempMemSize_(getDefaultTempMemForGPU(-1, std::numeric_limits::max())), - pinnedMemSize_(kDefaultPinnedMemoryAllocation), - allocLogging_(false), - cmr(new rmm::mr::cuda_memory_resource), - mmr(new rmm::mr::managed_memory_resource), - pmr(new rmm::mr::pinned_memory_resource){}; - - ~RmmGpuResourcesImpl() - { - // The temporary memory allocator has allocated memory through us, so clean - // that up before we finish fully de-initializing ourselves - tempMemory_.clear(); - - // Make sure all allocations have been freed - bool allocError = false; - - for (auto& entry : allocs_) { - auto& map = entry.second; - - if (!map.empty()) { - std::cerr << "RmmGpuResources destroyed with allocations outstanding:\n" - << "Device " << entry.first << " outstanding allocations:\n"; - std::cerr << allocsToString(map); - allocError = true; - } - } - - FAISS_ASSERT_MSG(!allocError, "GPU memory allocations not properly cleaned up"); - - for (auto& entry : defaultStreams_) { - DeviceScope scope(entry.first); - - // We created these streams, so are responsible for destroying them - CUDA_VERIFY(cudaStreamDestroy(entry.second)); - } - - for (auto& entry : alternateStreams_) { - DeviceScope scope(entry.first); - - for (auto stream : entry.second) { - CUDA_VERIFY(cudaStreamDestroy(stream)); - } - } - - for (auto& entry : asyncCopyStreams_) { - DeviceScope scope(entry.first); - - CUDA_VERIFY(cudaStreamDestroy(entry.second)); - } - - for (auto& entry : blasHandles_) { - DeviceScope scope(entry.first); - - auto blasStatus = cublasDestroy(entry.second); - FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS); - } - - if (pinnedMemAlloc_) { pmr->deallocate(pinnedMemAlloc_, pinnedMemAllocSize_); } - }; - - /// Disable allocation of temporary memory; all temporary memory - /// requests will call cudaMalloc / cudaFree at the point of use - void noTempMemory() { setTempMemory(0); }; - - /// Specify that we wish to use a certain fixed size of memory on - /// 
all devices as temporary memory. This is the upper bound for the GPU - /// memory that we will reserve. We will never go above 1.5 GiB on any GPU; - /// smaller GPUs (with <= 4 GiB or <= 8 GiB) will use less memory than that. - /// To avoid any temporary memory allocation, pass 0. - void setTempMemory(size_t size) - { - if (tempMemSize_ != size) { - // adjust based on general limits - tempMemSize_ = getDefaultTempMemForGPU(-1, size); - - // We need to re-initialize memory resources for all current devices that - // have been initialized. - // This should be safe to do, even if we are currently running work, because - // the cudaFree call that this implies will force-synchronize all GPUs with - // the CPU - for (auto& p : tempMemory_) { - int device = p.first; - // Free the existing memory first - p.second.reset(); - - // Allocate new - p.second = std::unique_ptr( - new StackDeviceMemory(this, - p.first, - // adjust for this specific device - getDefaultTempMemForGPU(device, tempMemSize_))); - } - } - }; - - /// Set amount of pinned memory to allocate, for async GPU <-> CPU - /// transfers - void setPinnedMemory(size_t size) - { - // Should not call this after devices have been initialized - FAISS_ASSERT(defaultStreams_.size() == 0); - FAISS_ASSERT(!pinnedMemAlloc_); - - pinnedMemSize_ = size; - }; - - /// Called to change the stream for work ordering. We do not own `stream`; - /// i.e., it will not be destroyed when the GpuResources object gets cleaned - /// up. - /// We are guaranteed that all Faiss GPU work is ordered with respect to - /// this stream upon exit from an index or other Faiss GPU call. - void setDefaultStream(int device, cudaStream_t stream) - { - if (isInitialized(device)) { - // A new series of calls may not be ordered with what was the previous - // stream, so if the stream being specified is different, then we need to - // ensure ordering between the two (new stream waits on old). 
- auto it = userDefaultStreams_.find(device); - cudaStream_t prevStream = nullptr; - - if (it != userDefaultStreams_.end()) { - prevStream = it->second; - } else { - FAISS_ASSERT(defaultStreams_.count(device)); - prevStream = defaultStreams_[device]; - } - - if (prevStream != stream) { streamWait({stream}, {prevStream}); } - } - - userDefaultStreams_[device] = stream; - }; - - /// Revert the default stream to the original stream managed by this resources - /// object, in case someone called `setDefaultStream`. - void revertDefaultStream(int device) - { - if (isInitialized(device)) { - auto it = userDefaultStreams_.find(device); - - if (it != userDefaultStreams_.end()) { - // There was a user stream set that we need to synchronize against - cudaStream_t prevStream = userDefaultStreams_[device]; - - FAISS_ASSERT(defaultStreams_.count(device)); - cudaStream_t newStream = defaultStreams_[device]; - - streamWait({newStream}, {prevStream}); - } - } - - userDefaultStreams_.erase(device); - }; - - /// Returns the stream for the given device on which all Faiss GPU work is - /// ordered. - /// We are guaranteed that all Faiss GPU work is ordered with respect to - /// this stream upon exit from an index or other Faiss GPU call. 
- cudaStream_t getDefaultStream(int device) - { - initializeForDevice(device); - - auto it = userDefaultStreams_.find(device); - if (it != userDefaultStreams_.end()) { - // There is a user override stream set - return it->second; - } - - // Otherwise, our base default stream - return defaultStreams_[device]; - }; - - /// Called to change the work ordering streams to the null stream - /// for all devices - void setDefaultNullStreamAllDevices() - { - for (int dev = 0; dev < getNumDevices(); ++dev) { - setDefaultStream(dev, nullptr); - } - }; - - /// If enabled, will print every GPU memory allocation and deallocation to - /// standard output - void setLogMemoryAllocations(bool enable) { allocLogging_ = enable; }; - - public: - /// Internal system calls - - /// Initialize resources for this device - void initializeForDevice(int device) - { - if (isInitialized(device)) { return; } - - // If this is the first device that we're initializing, create our - // pinned memory allocation - if (defaultStreams_.empty() && pinnedMemSize_ > 0) { - pinnedMemAlloc_ = pmr->allocate(pinnedMemSize_); - pinnedMemAllocSize_ = pinnedMemSize_; - } - - FAISS_ASSERT(device < getNumDevices()); - DeviceScope scope(device); - - // Make sure that device properties for all devices are cached - auto& prop = getDeviceProperties(device); - - // Also check to make sure we meet our minimum compute capability (3.0) - FAISS_ASSERT_FMT(prop.major >= 3, - "Device id %d with CC %d.%d not supported, " - "need 3.0+ compute capability", - device, - prop.major, - prop.minor); - - // Create streams - cudaStream_t defaultStream = 0; - CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream, cudaStreamNonBlocking)); - - defaultStreams_[device] = defaultStream; - - cudaStream_t asyncCopyStream = 0; - CUDA_VERIFY(cudaStreamCreateWithFlags(&asyncCopyStream, cudaStreamNonBlocking)); - - asyncCopyStreams_[device] = asyncCopyStream; - - std::vector deviceStreams; - for (int j = 0; j < kNumStreams; ++j) { - cudaStream_t 
stream = 0; - CUDA_VERIFY(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - - deviceStreams.push_back(stream); - } - - alternateStreams_[device] = std::move(deviceStreams); - - // Create cuBLAS handle - - // TODO: We need to be able to use this cublas handle within the raft handle - cublasHandle_t blasHandle = 0; - auto blasStatus = cublasCreate(&blasHandle); - FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS); - blasHandles_[device] = blasHandle; - - // For CUDA 10 on V100, enabling tensor core usage would enable automatic - // rounding down of inputs to f16 (though accumulate in f32) which results in - // unacceptable loss of precision in general. - // For CUDA 11 / A100, only enable tensor core support if it doesn't result in - // a loss of precision. -#if CUDA_VERSION >= 11000 - cublasSetMathMode(blasHandle, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); -#endif - - FAISS_ASSERT(allocs_.count(device) == 0); - allocs_[device] = std::unordered_map(); - - FAISS_ASSERT(tempMemory_.count(device) == 0); - auto mem = std::unique_ptr( - new StackDeviceMemory(this, - device, - // adjust for this specific device - getDefaultTempMemForGPU(device, tempMemSize_))); - - tempMemory_.emplace(device, std::move(mem)); - }; - - cublasHandle_t getBlasHandle(int device) - { - initializeForDevice(device); - return blasHandles_[device]; - }; - - std::vector getAlternateStreams(int device) - { - initializeForDevice(device); - return alternateStreams_[device]; - }; - - /// Allocate non-temporary GPU memory - void* allocMemory(const AllocRequest& req) - { - initializeForDevice(req.device); - - // We don't allocate a placeholder for zero-sized allocations - if (req.size == 0) { return nullptr; } - - // Make sure that the allocation is a multiple of 16 bytes for alignment - // purposes - auto adjReq = req; - adjReq.size = utils::roundUp(adjReq.size, (size_t)16); - - void* p = nullptr; - - if (allocLogging_) { std::cout << "RmmGpuResources: alloc " << adjReq.toString() << 
"\n"; } - - if (adjReq.space == MemorySpace::Temporary) { - // If we don't have enough space in our temporary memory manager, we need - // to allocate this request separately - auto& tempMem = tempMemory_[adjReq.device]; - - if (adjReq.size > tempMem->getSizeAvailable()) { - // We need to allocate this ourselves - AllocRequest newReq = adjReq; - newReq.space = MemorySpace::Device; - newReq.type = AllocType::TemporaryMemoryOverflow; - - return allocMemory(newReq); - } - - // Otherwise, we can handle this locally - p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size); - - } else if (adjReq.space == MemorySpace::Device) { - p = cmr->allocate(adjReq.size, adjReq.stream); - } else if (adjReq.space == MemorySpace::Unified) { - p = mmr->allocate(adjReq.size, adjReq.stream); - } else { - FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)adjReq.space); - } - - allocs_[adjReq.device][p] = adjReq; - - return p; - }; - - /// Returns a previous allocation - void deallocMemory(int device, void* p) - { - FAISS_ASSERT(isInitialized(device)); - - if (!p) { return; } - - auto& a = allocs_[device]; - auto it = a.find(p); - FAISS_ASSERT(it != a.end()); - - auto& req = it->second; - - if (allocLogging_) { std::cout << "RmmGpuResources: dealloc " << req.toString() << "\n"; } - - if (req.space == MemorySpace::Temporary) { - tempMemory_[device]->deallocMemory(device, req.stream, req.size, p); - } else if (req.space == MemorySpace::Device) { - cmr->deallocate(p, req.size, req.stream); - } else if (req.space == MemorySpace::Unified) { - mmr->deallocate(p, req.size, req.stream); - } else { - FAISS_ASSERT_FMT(false, "unknown MemorySpace %d", (int)req.space); - } - - a.erase(it); - }; - - size_t getTempMemoryAvailable(int device) const - { - FAISS_ASSERT(isInitialized(device)); - - auto it = tempMemory_.find(device); - FAISS_ASSERT(it != tempMemory_.end()); - - return it->second->getSizeAvailable(); - }; - - /// Export a description of memory used for Python - 
std::map>> getMemoryInfo() const - { - using AT = std::map>; - - std::map out; - - for (auto& entry : allocs_) { - AT outDevice; - - for (auto& a : entry.second) { - auto& v = outDevice[allocTypeToString(a.second.type)]; - v.first++; - v.second += a.second.size; - } - - out[entry.first] = std::move(outDevice); - } - - return out; - }; - - std::pair getPinnedMemory() - { - return std::make_pair(pinnedMemAlloc_, pinnedMemAllocSize_); - }; - - cudaStream_t getAsyncCopyStream(int device) - { - initializeForDevice(device); - return asyncCopyStreams_[device]; - }; - - private: - /// Have GPU resources been initialized for this device yet? - bool isInitialized(int device) const - { - // Use default streams as a marker for whether or not a certain - // device has been initialized - return defaultStreams_.count(device) != 0; - }; - - raft::handle_t &getRaftHandle(int device) { - initializeForDevice(device); - - auto it = raftHandles_.find(device); - if (it != raftHandles_.end()) { - // There is a user override handle set - return it->second; - } - - // Otherwise, our base default handle - return raftHandles_[device]; - } - - /// Adjust the default temporary memory allocation based on the total GPU - /// memory size - static size_t getDefaultTempMemForGPU(int device, size_t requested) - { - auto totalMem = device != -1 ? 
getDeviceProperties(device).totalGlobalMem - : std::numeric_limits::max(); - - if (totalMem <= (size_t)4 * 1024 * 1024 * 1024) { - // If the GPU has <= 4 GiB of memory, reserve 512 MiB - - if (requested > k4GiBTempMem) { return k4GiBTempMem; } - } else if (totalMem <= (size_t)8 * 1024 * 1024 * 1024) { - // If the GPU has <= 8 GiB of memory, reserve 1 GiB - - if (requested > k8GiBTempMem) { return k8GiBTempMem; } - } else { - // Never use more than 1.5 GiB - if (requested > kMaxTempMem) { return kMaxTempMem; } - } - - // use whatever lower limit the user requested - return requested; - }; - - private: - /// Set of currently outstanding memory allocations per device - /// device -> (alloc request, allocated ptr) - std::unordered_map> allocs_; - - /// Temporary memory provider, per each device - std::unordered_map> tempMemory_; - - /// Our default stream that work is ordered on, one per each device - std::unordered_map defaultStreams_; - - /// This contains particular streams as set by the user for - /// ordering, if any - std::unordered_map userDefaultStreams_; - - /// Other streams we can use, per each device - std::unordered_map> alternateStreams_; - - /// Async copy stream to use for GPU <-> CPU pinned memory copies - std::unordered_map asyncCopyStreams_; - - /// cuBLAS handle for each device - std::unordered_map blasHandles_; - - /// Pinned memory allocation for use with this GPU - void* pinnedMemAlloc_; - size_t pinnedMemAllocSize_; - - /// Another option is to use a specified amount of memory on all - /// devices - size_t tempMemSize_; - - /// Amount of pinned memory we should allocate - size_t pinnedMemSize_; - - /// Whether or not we log every GPU memory allocation and deallocation - bool allocLogging_; - - // cuda_memory_resource - std::unique_ptr cmr; - - // managed_memory_resource - std::unique_ptr mmr; - - // pinned_memory_resource - std::unique_ptr pmr; - - /// Our raft handle that maintains additional library resources, one per each device - 
std::unordered_map raftHandles_; - -}; - -/// Default implementation of GpuResources that allocates a cuBLAS -/// stream and 2 streams for use, as well as temporary memory. -/// Internally, the Faiss GPU code uses the instance managed by getResources, -/// but this is the user-facing object that is internally reference counted. -class RmmGpuResources : public GpuResourcesProvider { - public: - RmmGpuResources() : res_(new RmmGpuResourcesImpl){}; - - ~RmmGpuResources(){}; - - std::shared_ptr getResources() { return res_; }; - - /// Disable allocation of temporary memory; all temporary memory - /// requests will call cudaMalloc / cudaFree at the point of use - void noTempMemory() { res_->noTempMemory(); }; - - /// Specify that we wish to use a certain fixed size of memory on - /// all devices as temporary memory. This is the upper bound for the GPU - /// memory that we will reserve. We will never go above 1.5 GiB on any GPU; - /// smaller GPUs (with <= 4 GiB or <= 8 GiB) will use less memory than that. - /// To avoid any temporary memory allocation, pass 0. - void setTempMemory(size_t size) { res_->setTempMemory(size); }; - - /// Set amount of pinned memory to allocate, for async GPU <-> CPU - /// transfers - void setPinnedMemory(size_t size) { res_->setPinnedMemory(size); }; - - /// Called to change the stream for work ordering. We do not own `stream`; - /// i.e., it will not be destroyed when the GpuResources object gets cleaned - /// up. - /// We are guaranteed that all Faiss GPU work is ordered with respect to - /// this stream upon exit from an index or other Faiss GPU call. - void setDefaultStream(int device, cudaStream_t stream) - { - res_->setDefaultStream(device, stream); - }; - - /// Revert the default stream to the original stream managed by this resources - /// object, in case someone called `setDefaultStream`. 
- void revertDefaultStream(int device) { res_->revertDefaultStream(device); }; - - /// Called to change the work ordering streams to the null stream - /// for all devices - void setDefaultNullStreamAllDevices() { res_->setDefaultNullStreamAllDevices(); }; - - /// Export a description of memory used for Python - std::map>> getMemoryInfo() const - { - return res_->getMemoryInfo(); - }; - - /// Returns the current default stream - cudaStream_t getDefaultStream(int device) { return res_->getDefaultStream(device); }; - - /// Returns the current amount of temp memory available - size_t getTempMemoryAvailable(int device) const { return res_->getTempMemoryAvailable(device); }; - - /// Synchronize our default stream with the CPU - void syncDefaultStreamCurrentDevice() { res_->syncDefaultStreamCurrentDevice(); }; - - /// If enabled, will print every GPU memory allocation and deallocation to - /// standard output - void setLogMemoryAllocations(bool enable) { res_->setLogMemoryAllocations(enable); }; - - private: - std::shared_ptr res_; -}; - -} // namespace gpu -} // namespace faiss \ No newline at end of file diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index f67f037082..a1ea05d64d 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -143,6 +143,7 @@ void addTest(faiss::MetricType metricType, bool useFloat16CoarseQuantizer) { config.device = opt.device; config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + config.use_raft = true; faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); diff --git a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp index 21ac260887..615ac01fe4 100644 --- a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp @@ -7,8 +7,8 @@ #include #include -#include -#include +#include +#include #include 
#include @@ -91,7 +91,7 @@ void invoke_bfknn(const raft::handle_t &raft_handle, Options &opt, float *dists, - faiss::gpu::RmmGpuResources gpu_res; + faiss::gpu::StandardGpuResources gpu_res; gpu_res.setDefaultStream(opt.device, raft_handle.get_stream()); rmm::device_uvector addVecsDev(addVecs.size(), raft_handle.get_stream()); @@ -139,7 +139,7 @@ void queryTest( std::cout << "numTrain: " << opt.numTrain << "numCentroids: " << opt.numCentroids << std::endl; printf("Creating rmm resources\n"); - faiss::gpu::RmmGpuResources res; + faiss::gpu::StandardGpuResources res; res.noTempMemory(); faiss::gpu::GpuIndexIVFFlatConfig config; @@ -267,7 +267,7 @@ void addTest(faiss::MetricType metricType, bool useFloat16CoarseQuantizer) { cpuIndex.train(opt.numTrain, trainVecs.data()); cpuIndex.nprobe = opt.nprobe; - faiss::gpu::RmmGpuResources res; + faiss::gpu::StandardGpuResources res; res.noTempMemory(); faiss::gpu::GpuIndexIVFFlatConfig config; @@ -302,7 +302,7 @@ void copyToTest(bool useFloat16CoarseQuantizer) { std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - faiss::gpu::RmmGpuResources res; + faiss::gpu::StandardGpuResources res; res.noTempMemory(); faiss::gpu::GpuIndexIVFFlatConfig config; @@ -361,7 +361,7 @@ void copyFromTest(bool useFloat16CoarseQuantizer) { cpuIndex.add(opt.numAdd, addVecs.data()); // use garbage values to see if we overwrite then - faiss::gpu::RmmGpuResources res; + faiss::gpu::StandardGpuResources res; res.noTempMemory(); faiss::gpu::GpuIndexIVFFlatConfig config; @@ -508,7 +508,7 @@ TEST(TestRaftIndexIVFFlat, Float32_Query_L2_128) { // cpuIndex.add(opt.numAdd, addVecs.data()); // cpuIndex.nprobe = opt.nprobe; // -// faiss::gpu::RmmGpuResources res; +// faiss::gpu::StandardGpuResources res; // res.noTempMemory(); // // faiss::gpu::GpuIndexIVFFlatConfig config; @@ -557,7 +557,7 @@ TEST(TestRaftIndexIVFFlat, Float32_Query_L2_128) { // opt.dim); std::vector addVecs = 
faiss::gpu::randVecs(opt.numAdd, // opt.dim); -// faiss::gpu::RmmGpuResources res; +// faiss::gpu::StandardGpuResources res; // res.noTempMemory(); // faiss::gpu::GpuIndexIVFFlatConfig config; @@ -596,7 +596,7 @@ TEST(TestRaftIndexIVFFlat, Float32_Query_L2_128) { // TEST(TestRaftIndexIVFFlat, AddNaN) { // Options opt; -// faiss::gpu::RmmGpuResources res; +// faiss::gpu::StandardGpuResources res; // res.noTempMemory(); // faiss::gpu::GpuIndexIVFFlatConfig config; @@ -670,7 +670,7 @@ TEST(TestRaftIndexIVFFlat, Float32_Query_L2_128) { // cpuIndex.add(numAdd, addVecs.data()); // cpuIndex.nprobe = nprobe; // -// faiss::gpu::RmmGpuResources res; +// faiss::gpu::StandardGpuResources res; // res.noTempMemory(); // // faiss::gpu::GpuIndexIVFFlatConfig config; diff --git a/faiss/gpu/test/TestRaftIndexIVFPQ.cpp b/faiss/gpu/test/TestRaftIndexIVFPQ.cpp deleted file mode 100644 index 61a3c8870e..0000000000 --- a/faiss/gpu/test/TestRaftIndexIVFPQ.cpp +++ /dev/null @@ -1,704 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -// FIXME: figure out a better way to test fp16 -constexpr float kF16MaxRelErr = 0.3f; -constexpr float kF32MaxRelErr = 0.03f; - -struct Options { - Options() { - numAdd = 2 * faiss::gpu::randVal(50000, 70000); - dim = faiss::gpu::randVal(64, 200); - - numCentroids = std::sqrt((float)numAdd / 2); - numTrain = numCentroids * 50; - nprobe = faiss::gpu::randVal(std::min(10, numCentroids), numCentroids); - numQuery = numAdd / 10;//faiss::gpu::randVal(32, 100); - - // Due to the approximate nature of the query and of floating point - // differences between GPU and CPU, to stay within our error bounds, - // only use a small k - k = std::min(faiss::gpu::randVal(10, 30), numAdd / 40); - indicesOpt = faiss::gpu::randSelect( - {faiss::gpu::INDICES_CPU, - faiss::gpu::INDICES_32_BIT, - faiss::gpu::INDICES_64_BIT}); - - device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - } - - std::string toString() const { - std::stringstream str; - str << "IVFFlat device " << device << " numVecs " << numAdd << " dim " - << dim << " numCentroids " << numCentroids << " nprobe " << nprobe - << " numQuery " << numQuery << " k " << k << " indicesOpt " - << indicesOpt; - - return str.str(); - } - - int numAdd; - int dim; - int numCentroids; - int numTrain; - int nprobe; - int numQuery; - int k; - int device; - faiss::gpu::IndicesOptions indicesOpt; -}; - -template -void train_index(const raft::handle_t &raft_handle, Options &opt, idx_type &index, std::vector &trainVecs, std::vector &addVecs) { - - uint32_t train_start = raft::curTimeMillis(); - index.train(opt.numTrain, trainVecs.data()); - raft_handle.sync_stream(); - uint32_t train_stop = raft::curTimeMillis(); - - uint32_t add_start = raft::curTimeMillis(); - index.add(opt.numAdd, addVecs.data()); - raft_handle.sync_stream(); - uint32_t add_stop = raft::curTimeMillis(); -// 
index.train(opt.numTrain, trainVecs.data()); - index.setNumProbes(opt.nprobe); - - std::cout << "train=" << (train_stop - train_start) << ", add=" << (add_stop - add_start) << std::endl; -} - - -void invoke_bfknn(const raft::handle_t &raft_handle, Options &opt, float *dists, faiss::Index::idx_t *inds, faiss::MetricType m, - std::vector &addVecs, std::vector &queryVecs) { - - - - faiss::gpu::RmmGpuResources gpu_res; - gpu_res.setDefaultStream(opt.device, raft_handle.get_stream()); - - rmm::device_uvector addVecsDev(addVecs.size(), raft_handle.get_stream()); - raft::copy(addVecsDev.data(), addVecs.data(), addVecs.size(), raft_handle.get_stream()); - - rmm::device_uvector queryVecsDev(queryVecs.size(), raft_handle.get_stream()); - raft::copy(queryVecsDev.data(), queryVecs.data(), queryVecs.size(), raft_handle.get_stream()); - - faiss::gpu::GpuDistanceParams args; - args.metric = m; - args.k = opt.k; - args.dims = opt.dim; - args.vectors = addVecs.data(); - args.vectorsRowMajor = true; - args.numVectors = opt.numAdd; - args.queries = queryVecs.data(); - args.queriesRowMajor = true; - args.numQueries = opt.numQuery; - args.outDistances = dists; - args.outIndices = inds; - args.outIndicesType = faiss::gpu::IndicesDataType::I64; - - /** - * @todo: Until FAISS supports pluggable allocation strategies, - * we will not reap the benefits of the pool allocator for - * avoiding device-wide synchronizations from cudaMalloc/cudaFree - */ - bfKnn(&gpu_res, args); -} - -void queryTest( - faiss::MetricType metricType, - bool useFloat16CoarseQuantizer, - int dimOverride = -1) { - for (int tries = 0; tries < 2; ++tries) { - Options opt; - opt.dim = dimOverride != -1 ? 
dimOverride : opt.dim; - - std::vector trainVecs = - faiss::gpu::randVecs(opt.numTrain, opt.dim); - std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - - std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); - - std::cout << "numTrain: " << opt.numTrain << "numCentroids: " << opt.numCentroids << std::endl; - - printf("Creating rmm resources\n"); - faiss::gpu::RmmGpuResources res; - res.noTempMemory(); - - faiss::gpu::GpuIndexIVFPQConfig config; - config.device = opt.device; - config.indicesOptions = opt.indicesOpt; - config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - - // TODO: Since we are modifying the centroids when adding new vectors, - // the neighbors are no longer going to match completely between CPU - // and the RAFT indexes. We will probably want to perform a bfknn as - // ground truth and then compare the recall for both the RAFT and FAISS - // indices. - - printf("Building raft index\n"); - faiss::gpu::RaftIndexIVFPQ raftIndex( - &res, opt.dim, opt.numCentroids, metricType, config); - - printf("Done.\n"); - - faiss::gpu::GpuIndexIVFPQ gpuIndex( - &res, opt.dim, opt.numCentroids, metricType, config); - - - printf("Creating raft handle\n"); - raft::handle_t raft_handle; - printf("Done\n"); - - std::cout << "Training raft index" << std::endl; - uint32_t r_train_start = raft::curTimeMillis(); - train_index(raft_handle, opt, raftIndex, trainVecs, addVecs); - raft_handle.sync_stream(); - uint32_t r_train_stop = raft::curTimeMillis(); - std::cout << "Raft train time " << r_train_start << " " << r_train_stop << " " << (r_train_stop - r_train_start) << std::endl; - - std::cout << "Training gpu index" << std::endl; - uint32_t g_train_start = raft::curTimeMillis(); - train_index(raft_handle, opt, gpuIndex, trainVecs, addVecs); - raft_handle.sync_stream(); - uint32_t g_train_stop = raft::curTimeMillis(); - std::cout << "FAISS train time " << g_train_start << " " << g_train_stop << " " << (g_train_stop - g_train_start) << 
std::endl; - - std::cout << "Computing ground truth" << std::endl; - rmm::device_uvector ref_inds(opt.numQuery * opt.k, raft_handle.get_stream()); - rmm::device_uvector ref_dists(opt.numQuery * opt.k, raft_handle.get_stream()); - - invoke_bfknn(raft_handle, opt, ref_dists.data(), ref_inds.data(), metricType, addVecs, queryVecs); - - std::cout << "Done." << std::endl; - raft::print_device_vector("ref_dists", ref_dists.data(), opt.k, std::cout); - raft::print_device_vector("ref_inds", ref_inds.data(), opt.k, std::cout); - - rmm::device_uvector raft_inds(opt.numQuery * opt.k, raft_handle.get_stream()); - rmm::device_uvector raft_dists(opt.numQuery * opt.k, raft_handle.get_stream()); - - uint32_t rstart = raft::curTimeMillis(); - raftIndex.search( - opt.numQuery, - queryVecs.data(), - opt.k, - raft_dists.data(), - raft_inds.data()); - - raft_handle.sync_stream(); - uint32_t rstop = raft::curTimeMillis(); - std::cout << "Raft query time " << rstart << " " << rstop << " " << (rstop - rstart) << std::endl; - - rmm::device_uvector gpu_inds(opt.numQuery * opt.k, raft_handle.get_stream()); - rmm::device_uvector gpu_dists(opt.numQuery * opt.k, raft_handle.get_stream()); - - uint32_t gstart = raft::curTimeMillis(); - gpuIndex.search( - opt.numQuery, - queryVecs.data(), - opt.k, - gpu_dists.data(), - gpu_inds.data()); - - raft_handle.sync_stream(); - uint32_t gstop = raft::curTimeMillis(); - - std::cout << "FAISS query time " << gstart << " " << gstop << " " << (gstop - gstart) << std::endl; - - // TODO: Compare recall, perhaps by adding the indices/distances to a hashmap. 
- - raft::print_device_vector("raft_dists", raft_dists.data(), opt.k, std::cout); - raft::print_device_vector("raft_inds", raft_inds.data(), opt.k, std::cout); - -// raft::print_device_vector("gpu_dists", gpu_dists.data(), opt.k, std::cout); -// raft::print_device_vector("gpu_inds", gpu_inds.data(), opt.k, std::cout); - -// -// bool compFloat16 = useFloat16CoarseQuantizer; -// faiss::gpu::compareIndices( -// cpuIndex, -// gpuIndex, -// opt.numQuery, -// opt.dim, -// opt.k, -// opt.toString(), -// compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, -// // FIXME: the fp16 bounds are -// // useless when math (the accumulator) is -// // in fp16. Figure out another way to test -// compFloat16 ? 0.70f : 0.1f, -// compFloat16 ? 0.65f : 0.015f); - } -} - -void addTest(faiss::MetricType metricType, bool useFloat16CoarseQuantizer) { - for (int tries = 0; tries < 2; ++tries) { - Options opt; - - std::vector trainVecs = - faiss::gpu::randVecs(opt.numTrain, opt.dim); - std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - - faiss::IndexFlatL2 quantizerL2(opt.dim); - faiss::IndexFlatIP quantizerIP(opt.dim); - faiss::Index* quantizer = metricType == faiss::METRIC_L2 - ? 
(faiss::Index*)&quantizerL2 - : (faiss::Index*)&quantizerIP; - - faiss::IndexIVFFlat cpuIndex( - quantizer, opt.dim, opt.numCentroids, metricType); - cpuIndex.train(opt.numTrain, trainVecs.data()); - cpuIndex.nprobe = opt.nprobe; - - faiss::gpu::RmmGpuResources res; - res.noTempMemory(); - - faiss::gpu::GpuIndexIVFPQConfig config; - config.device = opt.device; - config.indicesOptions = opt.indicesOpt; - config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - - faiss::gpu::RaftIndexIVFPQ gpuIndex( - &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); - gpuIndex.copyFrom(&cpuIndex); - gpuIndex.setNumProbes(opt.nprobe); - - cpuIndex.add(opt.numAdd, addVecs.data()); - gpuIndex.add(opt.numAdd, addVecs.data()); - - bool compFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - compFloat16 ? 0.70f : 0.1f, - compFloat16 ? 0.30f : 0.015f); - } -} - -void copyToTest(bool useFloat16CoarseQuantizer) { - Options opt; - std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); - std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - - faiss::gpu::RmmGpuResources res; - res.noTempMemory(); - - faiss::gpu::GpuIndexIVFPQConfig config; - config.device = opt.device; - config.indicesOptions = opt.indicesOpt; - config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - - faiss::gpu::RaftIndexIVFPQ gpuIndex( - &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); - gpuIndex.train(opt.numTrain, trainVecs.data()); - gpuIndex.add(opt.numAdd, addVecs.data()); - gpuIndex.setNumProbes(opt.nprobe); - - // use garbage values to see if we overwrite then - faiss::IndexFlatL2 cpuQuantizer(1); - faiss::IndexIVFFlat cpuIndex(&cpuQuantizer, 1, 1, faiss::METRIC_L2); - cpuIndex.nprobe = 1; - - gpuIndex.copyTo(&cpuIndex); - - EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); - EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); - - 
EXPECT_EQ(cpuIndex.d, gpuIndex.d); - EXPECT_EQ(cpuIndex.quantizer->d, gpuIndex.quantizer->d); - EXPECT_EQ(cpuIndex.d, opt.dim); - EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); - EXPECT_EQ(cpuIndex.nprobe, gpuIndex.getNumProbes()); - - testIVFEquality(cpuIndex, gpuIndex); - - // Query both objects; results should be equivalent - bool compFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - compFloat16 ? 0.70f : 0.1f, - compFloat16 ? 0.30f : 0.015f); -} - -void copyFromTest(bool useFloat16CoarseQuantizer) { - Options opt; - std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); - std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - - faiss::IndexFlatL2 cpuQuantizer(opt.dim); - faiss::IndexIVFFlat cpuIndex( - &cpuQuantizer, opt.dim, opt.numCentroids, faiss::METRIC_L2); - cpuIndex.nprobe = opt.nprobe; - cpuIndex.train(opt.numTrain, trainVecs.data()); - cpuIndex.add(opt.numAdd, addVecs.data()); - - // use garbage values to see if we overwrite then - faiss::gpu::RmmGpuResources res; - res.noTempMemory(); - - faiss::gpu::GpuIndexIVFPQConfig config; - config.device = opt.device; - config.indicesOptions = opt.indicesOpt; - config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - - faiss::gpu::RaftIndexIVFPQ gpuIndex(&res, 1, 1, faiss::METRIC_L2, config); - gpuIndex.setNumProbes(1); - - gpuIndex.copyFrom(&cpuIndex); - - EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); - EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); - - EXPECT_EQ(cpuIndex.d, gpuIndex.d); - EXPECT_EQ(cpuIndex.d, opt.dim); - EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); - EXPECT_EQ(cpuIndex.nprobe, gpuIndex.getNumProbes()); - - testIVFEquality(cpuIndex, gpuIndex); - - // Query both objects; results should be equivalent - bool compFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - 
opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - compFloat16 ? 0.70f : 0.1f, - compFloat16 ? 0.30f : 0.015f); -} - -//TEST(TestRaftIndexIVFPQ, Float32_32_Add_L2) { -// addTest(faiss::METRIC_L2, false); -// printf("Finished addTest(faiss::METRIC_L2, false)\n"); -//} -// -//TEST(TestRaftIndexIVFPQ, Float32_32_Add_IP) { -// addTest(faiss::METRIC_INNER_PRODUCT, false); -// printf("Finished addTest(faiss::METRIC_INNER_PRODUCT, false)\n"); -//} -// -//TEST(TestRaftIndexIVFPQ, Float16_32_Add_L2) { -// addTest(faiss::METRIC_L2, true); -// printf("Finished addTest(faiss::METRIC_L2, true)\n"); -//} -// -//TEST(TestRaftIndexIVFPQ, Float16_32_Add_IP) { -// addTest(faiss::METRIC_INNER_PRODUCT, true); -// printf("Finished addTest(faiss::METRIC_INNER_PRODUCT, true)\n"); -//} - -// -// General query tests -// - -TEST(TestRaftIndexIVFPQ, Float32_Query_L2) { -queryTest(faiss::METRIC_L2, false); -printf("Finished queryTest(faiss::METRIC_L2, false);\n"); -} - -//TEST(TestRaftIndexIVFPQ, Float32_Query_IP) { -// queryTest(faiss::METRIC_INNER_PRODUCT, false); -// printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false)\n"); -//} - -// float16 coarse quantizer - -TEST(TestRaftIndexIVFPQ, Float16_32_Query_L2) { -queryTest(faiss::METRIC_L2, true); -printf("Finished queryTest(faiss::METRIC_L2, true)\n"); -} - -//TEST(TestRaftIndexIVFPQ, Float16_32_Query_IP) { -// queryTest(faiss::METRIC_INNER_PRODUCT, true); -// printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, true)\n"); -//} - -// -// There are IVF list scanning specializations for 64-d and 128-d that we -// make sure we explicitly test here -// - -TEST(TestRaftIndexIVFPQ, Float32_Query_L2_64) { -queryTest(faiss::METRIC_L2, false, 64); -printf("Finished queryTest(faiss::METRIC_L2, false, 64)\n"); -} - -//TEST(TestRaftIndexIVFPQ, Float32_Query_IP_64) { -// queryTest(faiss::METRIC_INNER_PRODUCT, false, 64); -// printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false, 64)\n"); -//} - 
-TEST(TestRaftIndexIVFPQ, Float32_Query_L2_128) { -queryTest(faiss::METRIC_L2, false, 128); -printf("Finished queryTest(faiss::METRIC_L2, false, 128)\n"); -} - -//TEST(TestRaftIndexIVFPQ, Float32_Query_IP_128) { -// queryTest(faiss::METRIC_INNER_PRODUCT, false, 128); -// printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false, 128)\n"); -//} - -// -// Copy tests -// - -/** TODO: test crashes */ -// TEST(TestRaftIndexIVFPQ, Float32_32_CopyTo) { -// copyToTest(false); -// printf("Finished copyToTest(false)\n"); -// } - -//TEST(TestRaftIndexIVFPQ, Float32_32_CopyFrom) { -// copyFromTest(false); -// printf("Finished copyFromTest(false)\n"); -//} - -//TEST(TestRaftIndexIVFPQ, Float32_negative) { -// Options opt; -// -// auto trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); -// auto addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); -// -// // Put all vecs on negative side -// for (auto& f : trainVecs) { -// f = std::abs(f) * -1.0f; -// } -// -// for (auto& f : addVecs) { -// f *= std::abs(f) * -1.0f; -// } -// -// faiss::IndexFlatIP quantizerIP(opt.dim); -// faiss::Index* quantizer = (faiss::Index*)&quantizerIP; -// -// faiss::IndexIVFFlat cpuIndex( -// quantizer, opt.dim, opt.numCentroids, faiss::METRIC_INNER_PRODUCT); -// cpuIndex.train(opt.numTrain, trainVecs.data()); -// cpuIndex.add(opt.numAdd, addVecs.data()); -// cpuIndex.nprobe = opt.nprobe; -// -// faiss::gpu::RmmGpuResources res; -// res.noTempMemory(); -// -// faiss::gpu::GpuIndexIVFPQConfig config; -// config.device = opt.device; -// config.indicesOptions = opt.indicesOpt; -// -// faiss::gpu::RaftIndexIVFPQ gpuIndex( -// &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); -// gpuIndex.copyFrom(&cpuIndex); -// gpuIndex.setNumProbes(opt.nprobe); -// -// // Construct a positive test set -// auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); -// -// // Put all vecs on positive size -// for (auto& f : queryVecs) { -// f = std::abs(f); -// } -// -// bool compFloat16 = false; 
-// faiss::gpu::compareIndices( -// queryVecs, -// cpuIndex, -// gpuIndex, -// opt.numQuery, -// opt.dim, -// opt.k, -// opt.toString(), -// compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, -// // FIXME: the fp16 bounds are -// // useless when math (the accumulator) is -// // in fp16. Figure out another way to test -// compFloat16 ? 0.99f : 0.1f, -// compFloat16 ? 0.65f : 0.015f); -//} - -// -// NaN tests -// - -/** TODO: test crashes */ -// TEST(TestRaftIndexIVFPQ, QueryNaN) { -// Options opt; - -// std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, -// opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, -// opt.dim); - -// faiss::gpu::RmmGpuResources res; -// res.noTempMemory(); - -// faiss::gpu::GpuIndexIVFPQConfig config; -// config.device = opt.device; -// config.indicesOptions = opt.indicesOpt; -// config.flatConfig.useFloat16 = faiss::gpu::randBool(); - -// faiss::gpu::RaftIndexIVFPQ gpuIndex( -// &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); -// gpuIndex.setNumProbes(opt.nprobe); - -// gpuIndex.train(opt.numTrain, trainVecs.data()); -// gpuIndex.add(opt.numAdd, addVecs.data()); - -// int numQuery = 10; -// std::vector nans( -// numQuery * opt.dim, std::numeric_limits::quiet_NaN()); - -// std::vector distances(numQuery * opt.k, 0); -// std::vector indices(numQuery * opt.k, 0); - -// gpuIndex.search( -// numQuery, nans.data(), opt.k, distances.data(), indices.data()); - -// for (int q = 0; q < numQuery; ++q) { -// for (int k = 0; k < opt.k; ++k) { -// EXPECT_EQ(indices[q * opt.k + k], -1); -// EXPECT_EQ( -// distances[q * opt.k + k], -// std::numeric_limits::max()); -// } -// } -// } - -/** TODO: test crashes */ -// TEST(TestRaftIndexIVFPQ, AddNaN) { -// Options opt; - -// faiss::gpu::RmmGpuResources res; -// res.noTempMemory(); - -// faiss::gpu::GpuIndexIVFPQConfig config; -// config.device = opt.device; -// config.indicesOptions = opt.indicesOpt; -// config.flatConfig.useFloat16 = faiss::gpu::randBool(); - -// 
faiss::gpu::RaftIndexIVFPQ gpuIndex( -// &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); -// gpuIndex.setNumProbes(opt.nprobe); - -// int numNans = 10; -// std::vector nans( -// numNans * opt.dim, std::numeric_limits::quiet_NaN()); - -// // Make one vector valid (not the first vector, in order to test offset -// // issues), which should actually add -// for (int i = 0; i < opt.dim; ++i) { -// nans[opt.dim + i] = i; -// } - -// std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, -// opt.dim); gpuIndex.train(opt.numTrain, trainVecs.data()); - -// // should not crash -// EXPECT_EQ(gpuIndex.ntotal, 0); -// gpuIndex.add(numNans, nans.data()); - -// std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, -// opt.dim); std::vector distance(opt.numQuery * opt.k, 0); -// std::vector indices(opt.numQuery * opt.k, 0); - -// // should not crash -// gpuIndex.search( -// opt.numQuery, -// queryVecs.data(), -// opt.k, -// distance.data(), -// indices.data()); -// } - -//TEST(TestRaftIndexIVFPQ, UnifiedMemory) { -// // Construct on a random device to test multi-device, if we have -// // multiple devices -// int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); -// -// if (!faiss::gpu::getFullUnifiedMemSupport(device)) { -// return; -// } -// -// int dim = 128; -// -// int numCentroids = 256; -// // Unfortunately it would take forever to add 24 GB in IVFPQ data, -// // so just perform a small test with data allocated in the unified -// // memory address space -// size_t numAdd = 10000; -// size_t numTrain = numCentroids * 40; -// int numQuery = 10; -// int k = 10; -// int nprobe = 8; -// -// std::vector trainVecs = faiss::gpu::randVecs(numTrain, dim); -// std::vector addVecs = faiss::gpu::randVecs(numAdd, dim); -// -// faiss::IndexFlatL2 quantizer(dim); -// faiss::IndexIVFFlat cpuIndex( -// &quantizer, dim, numCentroids, faiss::METRIC_L2); -// -// cpuIndex.train(numTrain, trainVecs.data()); -// cpuIndex.add(numAdd, addVecs.data()); -// 
cpuIndex.nprobe = nprobe; -// -// faiss::gpu::RmmGpuResources res; -// res.noTempMemory(); -// -// faiss::gpu::GpuIndexIVFPQConfig config; -// config.device = device; -// config.memorySpace = faiss::gpu::MemorySpace::Unified; -// -// faiss::gpu::RaftIndexIVFPQ gpuIndex( -// &res, dim, numCentroids, faiss::METRIC_L2, config); -// gpuIndex.copyFrom(&cpuIndex); -// gpuIndex.setNumProbes(nprobe); -// -// faiss::gpu::compareIndices( -// cpuIndex, -// gpuIndex, -// numQuery, -// dim, -// k, -// "Unified Memory", -// kF32MaxRelErr, -// 0.1f, -// 0.015f); -//} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - - // just run with a fixed test seed - faiss::gpu::setTestSeed(100); - - return RUN_ALL_TESTS(); -} From 2ac5a5b565c4120b008b66e6559d269007c3634a Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 27 Oct 2022 20:34:39 -0400 Subject: [PATCH 37/87] Validating dispatch of flatindex --- faiss/gpu/GpuIndexFlat.cu | 2 ++ faiss/gpu/impl/FlatIndex.cuh | 2 +- faiss/gpu/impl/RaftFlatIndex.cu | 3 +++ faiss/gpu/impl/RaftFlatIndex.cuh | 2 +- faiss/gpu/test/TestGpuIndexFlat.cpp | 6 ++++++ 5 files changed, 13 insertions(+), 2 deletions(-) diff --git a/faiss/gpu/GpuIndexFlat.cu b/faiss/gpu/GpuIndexFlat.cu index 429eb64db7..3f2f1b2960 100644 --- a/faiss/gpu/GpuIndexFlat.cu +++ b/faiss/gpu/GpuIndexFlat.cu @@ -91,6 +91,7 @@ GpuIndexFlat::~GpuIndexFlat() {} void GpuIndexFlat::resetIndex_(int dims) { if(config_.use_raft) { + printf("Should use raft!\n"); data_.reset(new RaftFlatIndex( resources_.get(), dims, @@ -98,6 +99,7 @@ void GpuIndexFlat::resetIndex_(int dims) { config_.memorySpace)); } else { + printf("Not using raft :-(\n"); data_.reset(new FlatIndex( resources_.get(), dims, diff --git a/faiss/gpu/impl/FlatIndex.cuh b/faiss/gpu/impl/FlatIndex.cuh index 56fbe609b9..5e0979d07a 100644 --- a/faiss/gpu/impl/FlatIndex.cuh +++ b/faiss/gpu/impl/FlatIndex.cuh @@ -44,7 +44,7 @@ class FlatIndex { /// Returns a reference to our vectors currently in use (if 
useFloat16 mode) Tensor& getVectorsFloat16Ref(); - void query( + virtual void query( Tensor& vecs, int k, faiss::MetricType metric, diff --git a/faiss/gpu/impl/RaftFlatIndex.cu b/faiss/gpu/impl/RaftFlatIndex.cu index 1a8369a1c3..93fecdbed9 100644 --- a/faiss/gpu/impl/RaftFlatIndex.cu +++ b/faiss/gpu/impl/RaftFlatIndex.cu @@ -48,6 +48,7 @@ void RaftFlatIndex::query( // raft::neighbors::brute_force::knn(raft_handle, index, search, inds, dists, k, distance); + printf("Using RAFT for FLAT!!!!\n"); // TODO: Expose the fused L2KNN through RAFT's public APIs raft::spatial::knn::detail::fusedL2Knn(dim_, inds.data_handle(), @@ -63,6 +64,8 @@ void RaftFlatIndex::query( distance); } else { + + printf("Dispathing to FAISS for FLAT!!!!\n"); FlatIndex::query(input, k, metric, metricArg, outDistances, outIndices, exactDistance); } } diff --git a/faiss/gpu/impl/RaftFlatIndex.cuh b/faiss/gpu/impl/RaftFlatIndex.cuh index 8a18053449..ad48102254 100644 --- a/faiss/gpu/impl/RaftFlatIndex.cuh +++ b/faiss/gpu/impl/RaftFlatIndex.cuh @@ -34,7 +34,7 @@ class RaftFlatIndex : public FlatIndex { float metricArg, Tensor& outDistances, Tensor& outIndices, - bool exactDistance); + bool exactDistance) override; }; diff --git a/faiss/gpu/test/TestGpuIndexFlat.cpp b/faiss/gpu/test/TestGpuIndexFlat.cpp index 50a445092c..18c9c81b80 100644 --- a/faiss/gpu/test/TestGpuIndexFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexFlat.cpp @@ -71,6 +71,7 @@ void testFlat(const TestFlatOptions& opt) { faiss::gpu::GpuIndexFlatConfig config; config.device = device; + config.use_raft = true; config.useFloat16 = opt.useFloat16; faiss::gpu::GpuIndexFlat gpuIndex(&res, dim, opt.metric, config); @@ -207,6 +208,7 @@ TEST(TestGpuIndexFlat, QueryEmpty) { faiss::gpu::GpuIndexFlatConfig config; config.device = 0; + config.use_raft = true; config.useFloat16 = false; int dim = 128; @@ -249,6 +251,7 @@ TEST(TestGpuIndexFlat, CopyFrom) { faiss::gpu::GpuIndexFlatConfig config; config.device = device; + config.use_raft = true; 
config.useFloat16 = false; faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, 2000, config); @@ -282,6 +285,7 @@ TEST(TestGpuIndexFlat, CopyTo) { faiss::gpu::GpuIndexFlatConfig config; config.device = device; + config.use_raft = true; config.useFloat16 = false; faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config); @@ -335,6 +339,7 @@ TEST(TestGpuIndexFlat, UnifiedMemory) { faiss::gpu::GpuIndexFlatConfig config; config.device = device; + config.use_raft = true; config.memorySpace = faiss::gpu::MemorySpace::Unified; faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config); @@ -376,6 +381,7 @@ TEST(TestGpuIndexFlat, Residual) { faiss::gpu::GpuIndexFlatConfig config; config.device = device; + config.use_raft = true; int dim = 32; faiss::IndexFlat cpuIndex(dim, faiss::MetricType::METRIC_L2); From 68944a5c25611337925ba7ea07c1d7d5270785cb Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 15 Nov 2022 12:25:53 -0500 Subject: [PATCH 38/87] 1. Verified FlatIndex tests are passing (and using RAFT for k<=64 L2 distance) 2. 
Verified IVF-flat addTests are returning expected results (though failing assertion as a result of the centroids being modified) Todo: Need to fill in addEncodedVectorsToList_ in RaftIVFFlat.cu in order to verify the remaining gtests --- faiss/gpu/GpuIndexIVFFlat.cu | 2 + faiss/gpu/impl/IVFBase.cuh | 2 +- faiss/gpu/impl/IVFFlat.cu | 4 +- faiss/gpu/impl/RaftFlatIndex.cu | 5 +- faiss/gpu/impl/RaftIVFFlat.cu | 102 +++++++++++++++++++++++-- faiss/gpu/impl/RaftIVFFlat.cuh | 12 +++ faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 5 ++ 7 files changed, 121 insertions(+), 11 deletions(-) diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 033c7189c9..59f6b58330 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -127,6 +127,8 @@ void GpuIndexIVFFlat::reserveMemory(size_t numVecs) { } void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { + + printf("Inside copyFrom\n"); DeviceScope scope(config_.device); // This will copy GpuIndexIVF data such as the coarse quantizer diff --git a/faiss/gpu/impl/IVFBase.cuh b/faiss/gpu/impl/IVFBase.cuh index e15c4c958d..a7d58178fb 100644 --- a/faiss/gpu/impl/IVFBase.cuh +++ b/faiss/gpu/impl/IVFBase.cuh @@ -111,7 +111,7 @@ class IVFBase { protected: /// Adds a set of codes and indices to a list, with the representation /// coming from the CPU equivalent - void addEncodedVectorsToList_( + virtual void addEncodedVectorsToList_( int listId, // resident on the host const void* codes, diff --git a/faiss/gpu/impl/IVFFlat.cu b/faiss/gpu/impl/IVFFlat.cu index a42e06cde3..b32047dc75 100644 --- a/faiss/gpu/impl/IVFFlat.cu +++ b/faiss/gpu/impl/IVFFlat.cu @@ -58,7 +58,7 @@ size_t IVFFlat::getGpuVectorsEncodingSize_(int numVecs) const { int bits = scalarQ_ ? 
scalarQ_->bits : 32 /* float */; // bytes to encode a block of 32 vectors (single dimension) - int bytesPerDimBlock = bits * 32 / 8; + int bytesPerDimBlock = bits * 32 / 8; // = 128 if bits == 32 // bytes to fully encode 32 vectors int bytesPerBlock = bytesPerDimBlock * dim_; @@ -91,7 +91,9 @@ std::vector IVFFlat::translateCodesToGpu_( return codes; } + bool sc = scalarQ_ ? true : false; int bitsPerCode = scalarQ_ ? scalarQ_->bits : 32; + std::cout << "dim_=" << dim_ << ", scalarQ_=" << sc << ", bitsPerCode=" << bitsPerCode << ", interleavedLayout_=" << interleavedLayout_ << std::endl; auto up = unpackNonInterleaved(std::move(codes), numVecs, dim_, bitsPerCode); diff --git a/faiss/gpu/impl/RaftFlatIndex.cu b/faiss/gpu/impl/RaftFlatIndex.cu index 93fecdbed9..f0283e2a00 100644 --- a/faiss/gpu/impl/RaftFlatIndex.cu +++ b/faiss/gpu/impl/RaftFlatIndex.cu @@ -34,8 +34,7 @@ void RaftFlatIndex::query( bool exactDistance) { // For now, use RAFT's fused KNN when k <= 64 and L2 metric is used - if(k <= 64 && metric == MetricType::METRIC_L2 && - input.getStride(0) == 0 && vectors_.getStride(0) == 0) { + if(k <= 64 && metric == MetricType::METRIC_L2 && vectors_.getSize(0) > 0) { raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); auto distance = exactDistance ? 
raft::distance::DistanceType::L2Unexpanded : @@ -48,7 +47,6 @@ void RaftFlatIndex::query( // raft::neighbors::brute_force::knn(raft_handle, index, search, inds, dists, k, distance); - printf("Using RAFT for FLAT!!!!\n"); // TODO: Expose the fused L2KNN through RAFT's public APIs raft::spatial::knn::detail::fusedL2Knn(dim_, inds.data_handle(), @@ -65,7 +63,6 @@ void RaftFlatIndex::query( } else { - printf("Dispathing to FAISS for FLAT!!!!\n"); FlatIndex::query(input, k, metric, metricArg, outDistances, outIndices, exactDistance); } } diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 5563ca2eae..f619ca7f45 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -210,16 +210,108 @@ void RaftIVFFlat::searchPreassigned( // TODO: Fill this in! } -/// Copy all inverted lists from a CPU representation to ourselves + + void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { - printf("Inside RaftIVFFlat copyInvertedListsFrom\n"); + size_t nlist = ivf ? ivf->nlist : 0; + size_t ntotal = ivf ? 
ivf->compute_ntotal() : 0; + + printf("Inside RAFT copyInvertedListsFrom\n"); + raft::handle_t &handle = resources_->getRaftHandleCurrentDevice(); + // We need to allocate the IVF + printf("nlist=%ld, ntotal=%ld\n", nlist, ntotal); + + std::vector list_sizes_(nlist); + std::vector list_offsets_(nlist+1); + std::vector indices_(ntotal); + + raft::neighbors::ivf_flat::index_params raft_idx_params; + raft_idx_params.n_lists = nlist; + raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; + raft_idx_params.add_data_on_build = false; + raft_idx_params.kmeans_n_iters = 100; + + raft_knn_index.emplace(handle, raft_idx_params, dim_); + raft_knn_index.value().allocate(handle, ntotal, true); - ivf->print_stats(); + for (size_t i = 0; i < nlist; ++i) { + size_t listSize = ivf->list_size(i); + + // GPU index can only support max int entries per list + FAISS_THROW_IF_NOT_FMT( + listSize <= (size_t)std::numeric_limits::max(), + "GPU inverted list can only support " + "%zu entries; %zu found", + (size_t)std::numeric_limits::max(), + listSize); + + addEncodedVectorsToList_( + i, ivf->get_codes(i), ivf->get_ids(i), listSize); + } + + raft::update_device(raft_knn_index.value().list_sizes().data_handle(), list_sizes_.data(), nlist, handle.get_stream()); + raft::update_device(raft_knn_index.value().list_offsets().data_handle(), list_offsets_.data(), nlist+1, handle.get_stream()); - // TODO: Need to replicate copyInvertedListsFrom() in IVFBase.cu - // but populate a RAFT index. 
} +void RaftIVFFlat::addEncodedVectorsToList_( + int listId, + const void* codes, + const Index::idx_t* indices, + size_t numVecs) { + auto stream = resources_->getDefaultStreamCurrentDevice(); + + // This list must already exist +// FAISS_ASSERT(listId < deviceListData_.size()); + + // This list must currently be empty +// auto& listCodes = deviceListData_[listId]; +// FAISS_ASSERT(listCodes->data.size() == 0); +// FAISS_ASSERT(listCodes->numVecs == 0); + + // If there's nothing to add, then there's nothing we have to do + if (numVecs == 0) { + return; + } + + // The GPU might have a different layout of the memory + auto gpuListSizeInBytes = getGpuVectorsEncodingSize_(numVecs); + auto cpuListSizeInBytes = getCpuVectorsEncodingSize_(numVecs); + + // We only have int32 length representations on the GPU per each + // list; the length is in sizeof(char) + FAISS_ASSERT(gpuListSizeInBytes <= (size_t)std::numeric_limits::max()); + + // Translate the codes as needed to our preferred form + std::vector codesV(cpuListSizeInBytes); + std::memcpy(codesV.data(), codes, cpuListSizeInBytes); + auto translatedCodes = translateCodesToGpu_(std::move(codesV), numVecs); + + std::cout << "numVecs=" << numVecs << "gpuListSizeInBytes=" << gpuListSizeInBytes << std::endl; + +// RAFT_CUDA_TRY(cudaMemcpyAsync(raft_knn_index.value().data().data_handle()+(), translatedCodes.data(), )) + +// listCodes->data.append( +// translatedCodes.data(), +// gpuListSizeInBytes, +// stream, +// true /* exact reserved size */); +// listCodes->numVecs = numVecs; +// +// // Handle the indices as well +// addIndicesFromCpu_(listId, indices, numVecs); +// + + // We should problay consider using this... 
+// deviceListDataPointers_.setAt( +// listId, (void*)listCodes->data.data(), stream); +// deviceListLengths_.setAt(listId, (int)numVecs, stream); +// +// // We update this as well, since the multi-pass algorithm uses it +// maxListLength_ = std::max(maxListLength_, (int)numVecs); +} + + /// Copy all inverted lists from ourselves to a CPU representation void RaftIVFFlat::copyInvertedListsTo(InvertedLists* ivf) { printf("Inside RaftIVFFlat copyInvertedListsTo\n"); diff --git a/faiss/gpu/impl/RaftIVFFlat.cuh b/faiss/gpu/impl/RaftIVFFlat.cuh index 1078204270..0bee282d26 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cuh +++ b/faiss/gpu/impl/RaftIVFFlat.cuh @@ -87,6 +87,18 @@ class RaftIVFFlat : public IVFFlat { void copyInvertedListsTo(InvertedLists* ivf) override; protected: + + /// Adds a set of codes and indices to a list, with the representation + /// coming from the CPU equivalent + void addEncodedVectorsToList_( + int listId, + // resident on the host + const void* codes, + // resident on the host + const Index::idx_t* indices, + size_t numVecs) override; + + std::optional> raft_knn_index{std::nullopt}; }; diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index a1ea05d64d..503a655edd 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -93,11 +93,14 @@ void queryTest( faiss::gpu::GpuIndexIVFFlatConfig config; config.device = opt.device; config.indicesOptions = opt.indicesOpt; + config.use_raft = true; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); gpuIndex.copyFrom(&cpuIndex); + + gpuIndex.setNumProbes(opt.nprobe); bool compFloat16 = useFloat16CoarseQuantizer; @@ -180,6 +183,7 @@ void copyToTest(bool useFloat16CoarseQuantizer) { config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::GpuIndexIVFFlat gpuIndex( 
&res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); gpuIndex.train(opt.numTrain, trainVecs.data()); @@ -238,6 +242,7 @@ void copyFromTest(bool useFloat16CoarseQuantizer) { config.device = opt.device; config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + config.use_raft = true; faiss::gpu::GpuIndexIVFFlat gpuIndex(&res, 1, 1, faiss::METRIC_L2, config); gpuIndex.setNumProbes(1); From 3a37031d2415573a544991ac0797efc3865360de Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 15 Nov 2022 13:58:47 -0500 Subject: [PATCH 39/87] Calling train() on copyFrom() with reconstructed vectors and filling in logic for updateQuantizer() --- faiss/gpu/GpuIndexIVF.cu | 1 + faiss/gpu/GpuIndexIVFFlat.cu | 29 +++- faiss/gpu/impl/IVFBase.cuh | 2 +- faiss/gpu/impl/RaftIVFFlat.cu | 241 +++++++++++++++++++-------------- faiss/gpu/impl/RaftIVFFlat.cuh | 29 ++-- 5 files changed, 187 insertions(+), 115 deletions(-) diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu index bfd5f16c8d..a5706c7954 100644 --- a/faiss/gpu/GpuIndexIVF.cu +++ b/faiss/gpu/GpuIndexIVF.cu @@ -466,6 +466,7 @@ void GpuIndexIVF::trainQuantizer_(Index::idx_t n, const float* x) { if(config_.use_raft) { + printf("Using raft to train quantizer for %d vectors\n", n); const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); raft::neighbors::ivf_flat::index_params raft_idx_params; diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 59f6b58330..676d4e376d 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -160,8 +160,33 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { config_.memorySpace); - // Copy all of the IVF data - index_->copyInvertedListsFrom(index->invlists); + if(config_.use_raft) { + + if(index->quantizer->ntotal > 0) { + auto stream = resources_->getRaftHandleCurrentDevice().get_stream(); + auto total_elems = size_t(index->quantizer->ntotal) * 
size_t(index->quantizer->d); + +// raft_knn_index.emplace(raft_handle, pams.metric, (uint32_t)this->nlist, (uint32_t)this->d); + + // Copy (reconstructed) centroids over, rather than re-training + std::vector buf_host(total_elems); + rmm::device_uvector buf_device(total_elems, stream); + index->quantizer->reconstruct_n(0, index->quantizer->ntotal, buf_host.data()); + raft::copy(buf_device.data(), buf_host.data(), total_elems, stream); + + printf("Calling train!\n"); + train(total_elems, buf_device.data()); + } + + if(index->ntotal > 0) { + std::vector buf_host(index->ntotal); + index->reconstruct_n(0, index->ntotal, buf_host.data()); + printf("Done reconstructing... %d\n", index->ntotal); + } + } else { + // Copy all of the IVF data + index_->copyInvertedListsFrom(index->invlists); + } } void GpuIndexIVFFlat::copyTo(faiss::IndexIVFFlat* index) const { diff --git a/faiss/gpu/impl/IVFBase.cuh b/faiss/gpu/impl/IVFBase.cuh index a7d58178fb..60c69c1f8d 100644 --- a/faiss/gpu/impl/IVFBase.cuh +++ b/faiss/gpu/impl/IVFBase.cuh @@ -75,7 +75,7 @@ class IVFBase { /// Update our coarse quantizer with this quantizer instance; may be a CPU /// or GPU quantizer - void updateQuantizer(Index* quantizer); + virtual void updateQuantizer(Index* quantizer); /// Classify and encode/add vectors to our IVF lists. /// The input data must be on our current device. 
diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index f619ca7f45..103f56bc4c 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -110,12 +110,20 @@ int RaftIVFFlat::addVectors( const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); + printf("About to call extend on index\n"); // TODO: We probably don't want to ignore the coarse quantizer here - raft_knn_index.emplace(raft::neighbors::ivf_flat::extend( - raft_handle, - raft_knn_index.value(), - vecs_view, - std::make_optional>(inds_view))); + + if(raft_knn_index.has_value()) { + raft_knn_index.emplace(raft::neighbors::ivf_flat::extend( + raft_handle, + raft_knn_index.value(), + vecs_view, + std::make_optional>(inds_view))); + + } else { + printf("Index has not been trained!\n"); + } + printf("Done.\n"); return vecs.getSize(0); } @@ -210,114 +218,149 @@ void RaftIVFFlat::searchPreassigned( // TODO: Fill this in! } +void RaftIVFFlat::updateQuantizer(Index* quantizer) { + Index::idx_t quantizer_ntotal = quantizer->ntotal; + std::cout << "Calling updateQuantizer with trained index with " << quantizer_ntotal << " items" << std::endl; + auto stream = resources_->getRaftHandleCurrentDevice().get_stream(); -void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { - size_t nlist = ivf ? ivf->nlist : 0; - size_t ntotal = ivf ? 
ivf->compute_ntotal() : 0; - - printf("Inside RAFT copyInvertedListsFrom\n"); - raft::handle_t &handle = resources_->getRaftHandleCurrentDevice(); - // We need to allocate the IVF - printf("nlist=%ld, ntotal=%ld\n", nlist, ntotal); - - std::vector list_sizes_(nlist); - std::vector list_offsets_(nlist+1); - std::vector indices_(ntotal); - - raft::neighbors::ivf_flat::index_params raft_idx_params; - raft_idx_params.n_lists = nlist; - raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; - raft_idx_params.add_data_on_build = false; - raft_idx_params.kmeans_n_iters = 100; + auto total_elems = size_t(quantizer_ntotal) * size_t(quantizer->d); - raft_knn_index.emplace(handle, raft_idx_params, dim_); - raft_knn_index.value().allocate(handle, ntotal, true); + raft::spatial::knn::ivf_flat::index_params pams; - for (size_t i = 0; i < nlist; ++i) { - size_t listSize = ivf->list_size(i); - - // GPU index can only support max int entries per list - FAISS_THROW_IF_NOT_FMT( - listSize <= (size_t)std::numeric_limits::max(), - "GPU inverted list can only support " - "%zu entries; %zu found", - (size_t)std::numeric_limits::max(), - listSize); - - addEncodedVectorsToList_( - i, ivf->get_codes(i), ivf->get_ids(i), listSize); + switch (this->metric_) { + case faiss::METRIC_L2: + pams.metric = raft::distance::DistanceType::L2Expanded; + break; + case faiss::METRIC_INNER_PRODUCT: + pams.metric = raft::distance::DistanceType::InnerProduct; + break; + default: + FAISS_THROW_MSG("Metric is not supported."); } - raft::update_device(raft_knn_index.value().list_sizes().data_handle(), list_sizes_.data(), nlist, handle.get_stream()); - raft::update_device(raft_knn_index.value().list_offsets().data_handle(), list_offsets_.data(), nlist+1, handle.get_stream()); - -} - -void RaftIVFFlat::addEncodedVectorsToList_( - int listId, - const void* codes, - const Index::idx_t* indices, - size_t numVecs) { - auto stream = resources_->getDefaultStreamCurrentDevice(); + 
raft_knn_index.emplace(resources_->getRaftHandleCurrentDevice(), pams.metric, (uint32_t)this->numLists_, (uint32_t)this->dim_); - // This list must already exist -// FAISS_ASSERT(listId < deviceListData_.size()); - - // This list must currently be empty -// auto& listCodes = deviceListData_[listId]; -// FAISS_ASSERT(listCodes->data.size() == 0); -// FAISS_ASSERT(listCodes->numVecs == 0); - - // If there's nothing to add, then there's nothing we have to do - if (numVecs == 0) { - return; + // Copy (reconstructed) centroids over, rather than re-training + rmm::device_uvector buf_dev(total_elems, stream); + { + std::vector buf_host(total_elems); + quantizer->reconstruct_n(0, quantizer_ntotal, buf_host.data()); + raft::copy(raft_knn_index.value().centers().data_handle(), buf_host.data(), total_elems, stream); } - // The GPU might have a different layout of the memory - auto gpuListSizeInBytes = getGpuVectorsEncodingSize_(numVecs); - auto cpuListSizeInBytes = getCpuVectorsEncodingSize_(numVecs); - - // We only have int32 length representations on the GPU per each - // list; the length is in sizeof(char) - FAISS_ASSERT(gpuListSizeInBytes <= (size_t)std::numeric_limits::max()); - - // Translate the codes as needed to our preferred form - std::vector codesV(cpuListSizeInBytes); - std::memcpy(codesV.data(), codes, cpuListSizeInBytes); - auto translatedCodes = translateCodesToGpu_(std::move(codesV), numVecs); - - std::cout << "numVecs=" << numVecs << "gpuListSizeInBytes=" << gpuListSizeInBytes << std::endl; + raft::print_device_vector("raft centers", raft_knn_index.value().centers().data_handle(), total_elems, std::cout); +} -// RAFT_CUDA_TRY(cudaMemcpyAsync(raft_knn_index.value().data().data_handle()+(), translatedCodes.data(), )) -// listCodes->data.append( -// translatedCodes.data(), -// gpuListSizeInBytes, -// stream, -// true /* exact reserved size */); -// listCodes->numVecs = numVecs; // -// // Handle the indices as well -// addIndicesFromCpu_(listId, indices, 
numVecs); // - - // We should problay consider using this... -// deviceListDataPointers_.setAt( -// listId, (void*)listCodes->data.data(), stream); -// deviceListLengths_.setAt(listId, (int)numVecs, stream); +//void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { +// size_t nlist = ivf ? ivf->nlist : 0; +// size_t ntotal = ivf ? ivf->compute_ntotal() : 0; // -// // We update this as well, since the multi-pass algorithm uses it -// maxListLength_ = std::max(maxListLength_, (int)numVecs); -} - - -/// Copy all inverted lists from ourselves to a CPU representation -void RaftIVFFlat::copyInvertedListsTo(InvertedLists* ivf) { - printf("Inside RaftIVFFlat copyInvertedListsTo\n"); - - // TODO: Need to replicate copyInvertedListsTo() in IVFBase.cu -} +// printf("Inside RAFT copyInvertedListsFrom\n"); +// raft::handle_t &handle = resources_->getRaftHandleCurrentDevice(); +// // We need to allocate the IVF +// printf("nlist=%ld, ntotal=%ld\n", nlist, ntotal); +// +// std::vector list_sizes_(nlist); +// std::vector list_offsets_(nlist+1); +// std::vector indices_(ntotal); +// +// raft::neighbors::ivf_flat::index_params raft_idx_params; +// raft_idx_params.n_lists = nlist; +// raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; +// raft_idx_params.add_data_on_build = false; +// raft_idx_params.kmeans_n_iters = 100; +// +// raft_knn_index.emplace(handle, raft_idx_params, dim_); +// raft_knn_index.value().allocate(handle, ntotal, true); +// +// for (size_t i = 0; i < nlist; ++i) { +// size_t listSize = ivf->list_size(i); +// +// // GPU index can only support max int entries per list +// FAISS_THROW_IF_NOT_FMT( +// listSize <= (size_t)std::numeric_limits::max(), +// "GPU inverted list can only support " +// "%zu entries; %zu found", +// (size_t)std::numeric_limits::max(), +// listSize); +// +// addEncodedVectorsToList_( +// i, ivf->get_codes(i), ivf->get_ids(i), listSize); +// } +// +// 
raft::update_device(raft_knn_index.value().list_sizes().data_handle(), list_sizes_.data(), nlist, handle.get_stream()); +// raft::update_device(raft_knn_index.value().list_offsets().data_handle(), list_offsets_.data(), nlist+1, handle.get_stream()); +// +//} + +//void RaftIVFFlat::addEncodedVectorsToList_( +// int listId, +// const void* codes, +// const Index::idx_t* indices, +// size_t numVecs) { +// auto stream = resources_->getDefaultStreamCurrentDevice(); +// +// // This list must already exist +//// FAISS_ASSERT(listId < deviceListData_.size()); +// +// // This list must currently be empty +//// auto& listCodes = deviceListData_[listId]; +//// FAISS_ASSERT(listCodes->data.size() == 0); +//// FAISS_ASSERT(listCodes->numVecs == 0); +// +// // If there's nothing to add, then there's nothing we have to do +// if (numVecs == 0) { +// return; +// } +// +// // The GPU might have a different layout of the memory +// auto gpuListSizeInBytes = getGpuVectorsEncodingSize_(numVecs); +// auto cpuListSizeInBytes = getCpuVectorsEncodingSize_(numVecs); +// +// // We only have int32 length representations on the GPU per each +// // list; the length is in sizeof(char) +// FAISS_ASSERT(gpuListSizeInBytes <= (size_t)std::numeric_limits::max()); +// +// // Translate the codes as needed to our preferred form +// std::vector codesV(cpuListSizeInBytes); +// std::memcpy(codesV.data(), codes, cpuListSizeInBytes); +// auto translatedCodes = translateCodesToGpu_(std::move(codesV), numVecs); +// +// std::cout << "numVecs=" << numVecs << "gpuListSizeInBytes=" << gpuListSizeInBytes << std::endl; +// +//// RAFT_CUDA_TRY(cudaMemcpyAsync(raft_knn_index.value().data().data_handle()+(), translatedCodes.data(), )) +// +//// listCodes->data.append( +//// translatedCodes.data(), +//// gpuListSizeInBytes, +//// stream, +//// true /* exact reserved size */); +//// listCodes->numVecs = numVecs; +//// +//// // Handle the indices as well +//// addIndicesFromCpu_(listId, indices, numVecs); +//// +// +// 
// We should problay consider using this... +//// deviceListDataPointers_.setAt( +//// listId, (void*)listCodes->data.data(), stream); +//// deviceListLengths_.setAt(listId, (int)numVecs, stream); +//// +//// // We update this as well, since the multi-pass algorithm uses it +//// maxListLength_ = std::max(maxListLength_, (int)numVecs); +//} + + +///// Copy all inverted lists from ourselves to a CPU representation +//void RaftIVFFlat::copyInvertedListsTo(InvertedLists* ivf) { +// printf("Inside RaftIVFFlat copyInvertedListsTo\n"); +// +// // TODO: Need to replicate copyInvertedListsTo() in IVFBase.cu +//} } // namespace gpu diff --git a/faiss/gpu/impl/RaftIVFFlat.cuh b/faiss/gpu/impl/RaftIVFFlat.cuh index 0bee282d26..298a9370c9 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cuh +++ b/faiss/gpu/impl/RaftIVFFlat.cuh @@ -80,23 +80,26 @@ class RaftIVFFlat : public IVFFlat { /// Return the encoded vectors of a particular list back to the CPU std::vector getListVectorData(int listId, bool gpuFormat) const override; - /// Copy all inverted lists from a CPU representation to ourselves - void copyInvertedListsFrom(const InvertedLists* ivf) override; + void updateQuantizer(Index* quantizer) override; - /// Copy all inverted lists from ourselves to a CPU representation - void copyInvertedListsTo(InvertedLists* ivf) override; +// +// /// Copy all inverted lists from a CPU representation to ourselves +// void copyInvertedListsFrom(const InvertedLists* ivf) override; +// +// /// Copy all inverted lists from ourselves to a CPU representation +// void copyInvertedListsTo(InvertedLists* ivf) override; protected: - /// Adds a set of codes and indices to a list, with the representation - /// coming from the CPU equivalent - void addEncodedVectorsToList_( - int listId, - // resident on the host - const void* codes, - // resident on the host - const Index::idx_t* indices, - size_t numVecs) override; +// /// Adds a set of codes and indices to a list, with the representation +// /// coming from 
the CPU equivalent +// void addEncodedVectorsToList_( +// int listId, +// // resident on the host +// const void* codes, +// // resident on the host +// const Index::idx_t* indices, +// size_t numVecs) override; std::optional> raft_knn_index{std::nullopt}; From 3f51425926c866bcb16c45c2ffa0faf500027b4b Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 15 Nov 2022 14:11:17 -0500 Subject: [PATCH 40/87] IVFFlat gtests run through to completion without crash. Distances look very similar. It's possible indexes are being assigned incorrectly. --- faiss/gpu/GpuIndexIVFFlat.cu | 33 ++++++++++++++++----------------- faiss/gpu/impl/RaftIVFFlat.cu | 10 ++++------ 2 files changed, 20 insertions(+), 23 deletions(-) diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 676d4e376d..7d65ef5827 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -114,7 +114,6 @@ void GpuIndexIVFFlat::set_index_(GpuResources* resources, baseIndex_ = std::static_pointer_cast(index_); updateQuantizer(); - } void GpuIndexIVFFlat::reserveMemory(size_t numVecs) { @@ -162,26 +161,26 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { if(config_.use_raft) { - if(index->quantizer->ntotal > 0) { - auto stream = resources_->getRaftHandleCurrentDevice().get_stream(); - auto total_elems = size_t(index->quantizer->ntotal) * size_t(index->quantizer->d); - -// raft_knn_index.emplace(raft_handle, pams.metric, (uint32_t)this->nlist, (uint32_t)this->d); - - // Copy (reconstructed) centroids over, rather than re-training - std::vector buf_host(total_elems); - rmm::device_uvector buf_device(total_elems, stream); - index->quantizer->reconstruct_n(0, index->quantizer->ntotal, buf_host.data()); - raft::copy(buf_device.data(), buf_host.data(), total_elems, stream); - - printf("Calling train!\n"); - train(total_elems, buf_device.data()); - } +// if(index->quantizer->ntotal > 0) { +// auto stream = 
resources_->getRaftHandleCurrentDevice().get_stream(); +// auto total_elems = size_t(index->quantizer->ntotal) * size_t(index->quantizer->d); +// +// // Copy (reconstructed) centroids over, rather than re-training +// std::vector buf_host(total_elems); +// rmm::device_uvector buf_device(total_elems, stream); +// index->quantizer->reconstruct_n(0, index->quantizer->ntotal, buf_host.data()); +// raft::copy(buf_device.data(), buf_host.data(), total_elems, stream); +// +// printf("Calling train!\n"); +// train(total_elems, buf_device.data()); +// } if(index->ntotal > 0) { - std::vector buf_host(index->ntotal); + printf("Reconstructing %d original vectors and adding to GPU index\n", index->ntotal); + std::vector buf_host(index->ntotal * index->d); index->reconstruct_n(0, index->ntotal, buf_host.data()); printf("Done reconstructing... %d\n", index->ntotal); + add(index->ntotal, buf_host.data()); } } else { // Copy all of the IVF data diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 103f56bc4c..02450ea441 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -243,13 +243,11 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { // Copy (reconstructed) centroids over, rather than re-training rmm::device_uvector buf_dev(total_elems, stream); - { - std::vector buf_host(total_elems); - quantizer->reconstruct_n(0, quantizer_ntotal, buf_host.data()); - raft::copy(raft_knn_index.value().centers().data_handle(), buf_host.data(), total_elems, stream); - } + std::vector buf_host(total_elems); + quantizer->reconstruct_n(0, quantizer_ntotal, buf_host.data()); + raft::copy(raft_knn_index.value().centers().data_handle(), buf_host.data(), total_elems, stream); - raft::print_device_vector("raft centers", raft_knn_index.value().centers().data_handle(), total_elems, std::cout); + raft::print_device_vector("raft centers", raft_knn_index.value().centers().data_handle(), this->dim_, std::cout); } From 
db1801e50f995d90ce49eb636d453c5a78818262 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 15 Nov 2022 17:09:02 -0500 Subject: [PATCH 41/87] Some of the IVFFlat tests are passing. --- cmake/thirdparty/get_raft.cmake | 4 ++-- faiss/gpu/GpuIndexIVFFlat.cu | 16 +--------------- faiss/gpu/impl/RaftIVFFlat.cu | 8 ++++++-- faiss/gpu/impl/RaftIndexIVFFlat.cu | 2 +- 4 files changed, 10 insertions(+), 20 deletions(-) diff --git a/cmake/thirdparty/get_raft.cmake b/cmake/thirdparty/get_raft.cmake index 2b7825d193..a7ef8410da 100644 --- a/cmake/thirdparty/get_raft.cmake +++ b/cmake/thirdparty/get_raft.cmake @@ -16,8 +16,8 @@ set(RAFT_VERSION "${RAPIDS_VERSION}") -set(RAFT_FORK "cjnolet") -set(RAFT_PINNED_TAG "bug-2212-ivf_flat_apis") +set(RAFT_FORK "achirkin") +set(RAFT_PINNED_TAG "fea-ivf-flat-optional-adaptive-centers") function(find_and_configure_raft) set(oneValueArgs VERSION FORK PINNED_TAG) diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 7d65ef5827..fb9849edb9 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -161,25 +161,11 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { if(config_.use_raft) { -// if(index->quantizer->ntotal > 0) { -// auto stream = resources_->getRaftHandleCurrentDevice().get_stream(); -// auto total_elems = size_t(index->quantizer->ntotal) * size_t(index->quantizer->d); -// -// // Copy (reconstructed) centroids over, rather than re-training -// std::vector buf_host(total_elems); -// rmm::device_uvector buf_device(total_elems, stream); -// index->quantizer->reconstruct_n(0, index->quantizer->ntotal, buf_host.data()); -// raft::copy(buf_device.data(), buf_host.data(), total_elems, stream); -// -// printf("Calling train!\n"); -// train(total_elems, buf_device.data()); -// } - + // Quantizer should already have been updated above. 
Add reconstructed vectors to raft index if(index->ntotal > 0) { printf("Reconstructing %d original vectors and adding to GPU index\n", index->ntotal); std::vector buf_host(index->ntotal * index->d); index->reconstruct_n(0, index->ntotal, buf_host.data()); - printf("Done reconstructing... %d\n", index->ntotal); add(index->ntotal, buf_host.data()); } } else { diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 02450ea441..be40a65ca6 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -239,13 +239,17 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { FAISS_THROW_MSG("Metric is not supported."); } - raft_knn_index.emplace(resources_->getRaftHandleCurrentDevice(), pams.metric, (uint32_t)this->numLists_, (uint32_t)this->dim_); + raft_knn_index.emplace(resources_->getRaftHandleCurrentDevice(), pams.metric, (uint32_t)this->numLists_, false, (uint32_t)this->dim_); + printf("Reconstructing\n"); // Copy (reconstructed) centroids over, rather than re-training rmm::device_uvector buf_dev(total_elems, stream); std::vector buf_host(total_elems); quantizer->reconstruct_n(0, quantizer_ntotal, buf_host.data()); - raft::copy(raft_knn_index.value().centers().data_handle(), buf_host.data(), total_elems, stream); + + printf("Copying...\n"); + + raft::update_device(raft_knn_index.value().centers().data_handle(), buf_host.data(), total_elems, stream); raft::print_device_vector("raft centers", raft_knn_index.value().centers().data_handle(), this->dim_, std::cout); } diff --git a/faiss/gpu/impl/RaftIndexIVFFlat.cu b/faiss/gpu/impl/RaftIndexIVFFlat.cu index 88c5629e71..58d10fcc63 100644 --- a/faiss/gpu/impl/RaftIndexIVFFlat.cu +++ b/faiss/gpu/impl/RaftIndexIVFFlat.cu @@ -126,7 +126,7 @@ void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { FAISS_THROW_MSG("Metric is not supported."); } - raft_knn_index.emplace(raft_handle, pams.metric, (uint32_t)this->nlist, (uint32_t)this->d); + 
raft_knn_index.emplace(raft_handle, pams.metric, false, (uint32_t)this->nlist, (uint32_t)this->d); // Copy (reconstructed) centroids over, rather than re-training rmm::device_uvector buf_dev(total_elems, stream); From f0bbd41fc25d09a8298a397c360cf1c1808bf6ae Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 16 Nov 2022 10:39:11 -0500 Subject: [PATCH 42/87] CLeaning up the diff a bit --- faiss/gpu/CMakeLists.txt | 2 - faiss/gpu/GpuDistance.cu | 85 ++- faiss/gpu/test/CMakeLists.txt | 5 - faiss/gpu/test/TestRaftIndexIVFFlat.cpp | 704 ------------------------ 4 files changed, 32 insertions(+), 764 deletions(-) delete mode 100644 faiss/gpu/test/TestRaftIndexIVFFlat.cpp diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt index 0e82af813c..1d81848317 100644 --- a/faiss/gpu/CMakeLists.txt +++ b/faiss/gpu/CMakeLists.txt @@ -166,11 +166,9 @@ set(FAISS_GPU_HEADERS if(FAISS_ENABLE_RAFT) list(APPEND FAISS_GPU_HEADERS - impl/RaftIndexIVFFlat.h impl/RaftFlatIndex.cuh impl/RaftIVFFlat.cuh) list(APPEND FAISS_GPU_SRC - impl/RaftIndexIVFFlat.cu impl/RaftFlatIndex.cu impl/RaftIVFFlat.cu) endif() diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu index ba1056f04a..8dca4043a6 100644 --- a/faiss/gpu/GpuDistance.cu +++ b/faiss/gpu/GpuDistance.cu @@ -14,11 +14,6 @@ #include #include -#ifdef FAISS_ENABLE_RAFT -// TODO: Expose fused_l2_knn -#include -#endif - namespace faiss { namespace gpu { @@ -107,31 +102,21 @@ void bfKnnConvert(GpuResourcesProvider* prov, const GpuDistanceParams& args) { // Since we've guaranteed that all arguments are on device, call the // implementation - -#ifdef FAISS_ENABLE_RAFT - // TODO: When k <= 64, invoke bfknn from RAFT - if (args.k <= 64) { - - } else -#endif - - { - bfKnnOnDevice( - res, - device, - stream, - tVectors, - args.vectorsRowMajor, - args.vectorNorms ? 
&tVectorNorms : nullptr, - tQueries, - args.queriesRowMajor, - args.k, - args.metric, - args.metricArg, - tOutDistances, - tOutIntIndices, - args.ignoreOutDistances); - } + bfKnnOnDevice( + res, + device, + stream, + tVectors, + args.vectorsRowMajor, + args.vectorNorms ? &tVectorNorms : nullptr, + tQueries, + args.queriesRowMajor, + args.k, + args.metric, + args.metricArg, + tOutDistances, + tOutIntIndices, + args.ignoreOutDistances); // Convert and copy int indices out auto tOutIndices = toDeviceTemporary( res, @@ -160,29 +145,23 @@ void bfKnnConvert(GpuResourcesProvider* prov, const GpuDistanceParams& args) { stream, {args.numQueries, args.k}); -#if defined FAISS_ENABLE_RAFT - if (args.k <= 64) { - } else -#endif - { - // Since we've guaranteed that all arguments are on device, call the - // implementation - bfKnnOnDevice( - res, - device, - stream, - tVectors, - args.vectorsRowMajor, - args.vectorNorms ? &tVectorNorms : nullptr, - tQueries, - args.queriesRowMajor, - args.k, - args.metric, - args.metricArg, - tOutDistances, - tOutIntIndices, - args.ignoreOutDistances); - } + // Since we've guaranteed that all arguments are on device, call the + // implementation + bfKnnOnDevice( + res, + device, + stream, + tVectors, + args.vectorsRowMajor, + args.vectorNorms ? 
&tVectorNorms : nullptr, + tQueries, + args.queriesRowMajor, + args.k, + args.metric, + args.metricArg, + tOutDistances, + tOutIntIndices, + args.ignoreOutDistances); // Copy back if necessary fromDevice(tOutIntIndices, (int*)args.outIndices, stream); diff --git a/faiss/gpu/test/CMakeLists.txt b/faiss/gpu/test/CMakeLists.txt index 3eb454c95f..251c501bea 100644 --- a/faiss/gpu/test/CMakeLists.txt +++ b/faiss/gpu/test/CMakeLists.txt @@ -29,11 +29,6 @@ faiss_gpu_test(TestGpuIndexIVFScalarQuantizer.cpp) faiss_gpu_test(TestGpuDistance.cu) faiss_gpu_test(TestGpuSelect.cu) - -if(FAISS_ENABLE_RAFT) - faiss_gpu_test(TestRaftIndexIVFFlat.cpp) -endif() - add_executable(demo_ivfpq_indexing_gpu EXCLUDE_FROM_ALL demo_ivfpq_indexing_gpu.cpp) diff --git a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp b/faiss/gpu/test/TestRaftIndexIVFFlat.cpp deleted file mode 100644 index 615ac01fe4..0000000000 --- a/faiss/gpu/test/TestRaftIndexIVFFlat.cpp +++ /dev/null @@ -1,704 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -// FIXME: figure out a better way to test fp16 -constexpr float kF16MaxRelErr = 0.3f; -constexpr float kF32MaxRelErr = 0.03f; - -struct Options { - Options() { - numAdd = 2 * faiss::gpu::randVal(50000, 70000); - dim = faiss::gpu::randVal(64, 200); - - numCentroids = std::sqrt((float)numAdd / 2); - numTrain = numCentroids * 50; - nprobe = faiss::gpu::randVal(std::min(10, numCentroids), numCentroids); - numQuery = numAdd / 10;//faiss::gpu::randVal(32, 100); - - // Due to the approximate nature of the query and of floating point - // differences between GPU and CPU, to stay within our error bounds, - // only use a small k - k = std::min(faiss::gpu::randVal(10, 30), numAdd / 40); - indicesOpt = faiss::gpu::randSelect( - {faiss::gpu::INDICES_CPU, - faiss::gpu::INDICES_32_BIT, - faiss::gpu::INDICES_64_BIT}); - - device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - } - - std::string toString() const { - std::stringstream str; - str << "IVFFlat device " << device << " numVecs " << numAdd << " dim " - << dim << " numCentroids " << numCentroids << " nprobe " << nprobe - << " numQuery " << numQuery << " k " << k << " indicesOpt " - << indicesOpt; - - return str.str(); - } - - int numAdd; - int dim; - int numCentroids; - int numTrain; - int nprobe; - int numQuery; - int k; - int device; - faiss::gpu::IndicesOptions indicesOpt; -}; - -template -void train_index(const raft::handle_t &raft_handle, Options &opt, idx_type &index, std::vector &trainVecs, std::vector &addVecs) { - - uint32_t train_start = raft::curTimeMillis(); - index.train(opt.numTrain, trainVecs.data()); - raft_handle.sync_stream(); - uint32_t train_stop = raft::curTimeMillis(); - - uint32_t add_start = raft::curTimeMillis(); - index.add(opt.numAdd, addVecs.data()); - raft_handle.sync_stream(); - uint32_t add_stop = raft::curTimeMillis(); -// 
index.train(opt.numTrain, trainVecs.data()); - index.setNumProbes(opt.nprobe); - - std::cout << "train=" << (train_stop - train_start) << ", add=" << (add_stop - add_start) << std::endl; -} - - -void invoke_bfknn(const raft::handle_t &raft_handle, Options &opt, float *dists, faiss::Index::idx_t *inds, faiss::MetricType m, - std::vector &addVecs, std::vector &queryVecs) { - - - - faiss::gpu::StandardGpuResources gpu_res; - gpu_res.setDefaultStream(opt.device, raft_handle.get_stream()); - - rmm::device_uvector addVecsDev(addVecs.size(), raft_handle.get_stream()); - raft::copy(addVecsDev.data(), addVecs.data(), addVecs.size(), raft_handle.get_stream()); - - rmm::device_uvector queryVecsDev(queryVecs.size(), raft_handle.get_stream()); - raft::copy(queryVecsDev.data(), queryVecs.data(), queryVecs.size(), raft_handle.get_stream()); - - faiss::gpu::GpuDistanceParams args; - args.metric = m; - args.k = opt.k; - args.dims = opt.dim; - args.vectors = addVecs.data(); - args.vectorsRowMajor = true; - args.numVectors = opt.numAdd; - args.queries = queryVecs.data(); - args.queriesRowMajor = true; - args.numQueries = opt.numQuery; - args.outDistances = dists; - args.outIndices = inds; - args.outIndicesType = faiss::gpu::IndicesDataType::I64; - - /** - * @todo: Until FAISS supports pluggable allocation strategies, - * we will not reap the benefits of the pool allocator for - * avoiding device-wide synchronizations from cudaMalloc/cudaFree - */ - bfKnn(&gpu_res, args); -} - -void queryTest( - faiss::MetricType metricType, - bool useFloat16CoarseQuantizer, - int dimOverride = -1) { - for (int tries = 0; tries < 2; ++tries) { - Options opt; - opt.dim = dimOverride != -1 ? 
dimOverride : opt.dim; - - std::vector trainVecs = - faiss::gpu::randVecs(opt.numTrain, opt.dim); - std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - - std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); - - std::cout << "numTrain: " << opt.numTrain << "numCentroids: " << opt.numCentroids << std::endl; - - printf("Creating rmm resources\n"); - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); - - faiss::gpu::GpuIndexIVFFlatConfig config; - config.device = opt.device; - config.indicesOptions = opt.indicesOpt; - config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - - // TODO: Since we are modifying the centroids when adding new vectors, - // the neighbors are no longer going to match completely between CPU - // and the RAFT indexes. We will probably want to perform a bfknn as - // ground truth and then compare the recall for both the RAFT and FAISS - // indices. - - printf("Building raft index\n"); - faiss::gpu::RaftIndexIVFFlat raftIndex( - &res, opt.dim, opt.numCentroids, metricType, config); - - printf("Done.\n"); - - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, opt.dim, opt.numCentroids, metricType, config); - - - printf("Creating raft handle\n"); - raft::handle_t raft_handle; - printf("Done\n"); - - std::cout << "Training raft index" << std::endl; - uint32_t r_train_start = raft::curTimeMillis(); - train_index(raft_handle, opt, raftIndex, trainVecs, addVecs); - raft_handle.sync_stream(); - uint32_t r_train_stop = raft::curTimeMillis(); - std::cout << "Raft train time " << r_train_start << " " << r_train_stop << " " << (r_train_stop - r_train_start) << std::endl; - - std::cout << "Training gpu index" << std::endl; - uint32_t g_train_start = raft::curTimeMillis(); - train_index(raft_handle, opt, gpuIndex, trainVecs, addVecs); - raft_handle.sync_stream(); - uint32_t g_train_stop = raft::curTimeMillis(); - std::cout << "FAISS train time " << g_train_start << " " << g_train_stop << " " << (g_train_stop - 
g_train_start) << std::endl; - - std::cout << "Computing ground truth" << std::endl; - rmm::device_uvector ref_inds(opt.numQuery * opt.k, raft_handle.get_stream()); - rmm::device_uvector ref_dists(opt.numQuery * opt.k, raft_handle.get_stream()); - - invoke_bfknn(raft_handle, opt, ref_dists.data(), ref_inds.data(), metricType, addVecs, queryVecs); - - std::cout << "Done." << std::endl; - raft::print_device_vector("ref_dists", ref_dists.data(), opt.k, std::cout); - raft::print_device_vector("ref_inds", ref_inds.data(), opt.k, std::cout); - - rmm::device_uvector raft_inds(opt.numQuery * opt.k, raft_handle.get_stream()); - rmm::device_uvector raft_dists(opt.numQuery * opt.k, raft_handle.get_stream()); - - uint32_t rstart = raft::curTimeMillis(); - raftIndex.search( - opt.numQuery, - queryVecs.data(), - opt.k, - raft_dists.data(), - raft_inds.data()); - - raft_handle.sync_stream(); - uint32_t rstop = raft::curTimeMillis(); - std::cout << "Raft query time " << rstart << " " << rstop << " " << (rstop - rstart) << std::endl; - - rmm::device_uvector gpu_inds(opt.numQuery * opt.k, raft_handle.get_stream()); - rmm::device_uvector gpu_dists(opt.numQuery * opt.k, raft_handle.get_stream()); - - uint32_t gstart = raft::curTimeMillis(); - gpuIndex.search( - opt.numQuery, - queryVecs.data(), - opt.k, - gpu_dists.data(), - gpu_inds.data()); - - raft_handle.sync_stream(); - uint32_t gstop = raft::curTimeMillis(); - - std::cout << "FAISS query time " << gstart << " " << gstop << " " << (gstop - gstart) << std::endl; - - // TODO: Compare recall, perhaps by adding the indices/distances to a hashmap. 
- - raft::print_device_vector("raft_dists", raft_dists.data(), opt.k, std::cout); - raft::print_device_vector("raft_inds", raft_inds.data(), opt.k, std::cout); - -// raft::print_device_vector("gpu_dists", gpu_dists.data(), opt.k, std::cout); -// raft::print_device_vector("gpu_inds", gpu_inds.data(), opt.k, std::cout); - -// -// bool compFloat16 = useFloat16CoarseQuantizer; -// faiss::gpu::compareIndices( -// cpuIndex, -// gpuIndex, -// opt.numQuery, -// opt.dim, -// opt.k, -// opt.toString(), -// compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, -// // FIXME: the fp16 bounds are -// // useless when math (the accumulator) is -// // in fp16. Figure out another way to test -// compFloat16 ? 0.70f : 0.1f, -// compFloat16 ? 0.65f : 0.015f); - } -} - -void addTest(faiss::MetricType metricType, bool useFloat16CoarseQuantizer) { - for (int tries = 0; tries < 2; ++tries) { - Options opt; - - std::vector trainVecs = - faiss::gpu::randVecs(opt.numTrain, opt.dim); - std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - - faiss::IndexFlatL2 quantizerL2(opt.dim); - faiss::IndexFlatIP quantizerIP(opt.dim); - faiss::Index* quantizer = metricType == faiss::METRIC_L2 - ? 
(faiss::Index*)&quantizerL2 - : (faiss::Index*)&quantizerIP; - - faiss::IndexIVFFlat cpuIndex( - quantizer, opt.dim, opt.numCentroids, metricType); - cpuIndex.train(opt.numTrain, trainVecs.data()); - cpuIndex.nprobe = opt.nprobe; - - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); - - faiss::gpu::GpuIndexIVFFlatConfig config; - config.device = opt.device; - config.indicesOptions = opt.indicesOpt; - config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - - faiss::gpu::RaftIndexIVFFlat gpuIndex( - &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); - gpuIndex.copyFrom(&cpuIndex); - gpuIndex.setNumProbes(opt.nprobe); - - cpuIndex.add(opt.numAdd, addVecs.data()); - gpuIndex.add(opt.numAdd, addVecs.data()); - - bool compFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - compFloat16 ? 0.70f : 0.1f, - compFloat16 ? 0.30f : 0.015f); - } -} - -void copyToTest(bool useFloat16CoarseQuantizer) { - Options opt; - std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); - std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); - - faiss::gpu::GpuIndexIVFFlatConfig config; - config.device = opt.device; - config.indicesOptions = opt.indicesOpt; - config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - - faiss::gpu::RaftIndexIVFFlat gpuIndex( - &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); - gpuIndex.train(opt.numTrain, trainVecs.data()); - gpuIndex.add(opt.numAdd, addVecs.data()); - gpuIndex.setNumProbes(opt.nprobe); - - // use garbage values to see if we overwrite then - faiss::IndexFlatL2 cpuQuantizer(1); - faiss::IndexIVFFlat cpuIndex(&cpuQuantizer, 1, 1, faiss::METRIC_L2); - cpuIndex.nprobe = 1; - - gpuIndex.copyTo(&cpuIndex); - - EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); - EXPECT_EQ(gpuIndex.ntotal, 
opt.numAdd); - - EXPECT_EQ(cpuIndex.d, gpuIndex.d); - EXPECT_EQ(cpuIndex.quantizer->d, gpuIndex.quantizer->d); - EXPECT_EQ(cpuIndex.d, opt.dim); - EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); - EXPECT_EQ(cpuIndex.nprobe, gpuIndex.getNumProbes()); - - testIVFEquality(cpuIndex, gpuIndex); - - // Query both objects; results should be equivalent - bool compFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - compFloat16 ? 0.70f : 0.1f, - compFloat16 ? 0.30f : 0.015f); -} - -void copyFromTest(bool useFloat16CoarseQuantizer) { - Options opt; - std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); - std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - - faiss::IndexFlatL2 cpuQuantizer(opt.dim); - faiss::IndexIVFFlat cpuIndex( - &cpuQuantizer, opt.dim, opt.numCentroids, faiss::METRIC_L2); - cpuIndex.nprobe = opt.nprobe; - cpuIndex.train(opt.numTrain, trainVecs.data()); - cpuIndex.add(opt.numAdd, addVecs.data()); - - // use garbage values to see if we overwrite then - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); - - faiss::gpu::GpuIndexIVFFlatConfig config; - config.device = opt.device; - config.indicesOptions = opt.indicesOpt; - config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - - faiss::gpu::RaftIndexIVFFlat gpuIndex(&res, 1, 1, faiss::METRIC_L2, config); - gpuIndex.setNumProbes(1); - - gpuIndex.copyFrom(&cpuIndex); - - EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); - EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); - - EXPECT_EQ(cpuIndex.d, gpuIndex.d); - EXPECT_EQ(cpuIndex.d, opt.dim); - EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); - EXPECT_EQ(cpuIndex.nprobe, gpuIndex.getNumProbes()); - - testIVFEquality(cpuIndex, gpuIndex); - - // Query both objects; results should be equivalent - bool compFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::compareIndices( - cpuIndex, - 
gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - compFloat16 ? 0.70f : 0.1f, - compFloat16 ? 0.30f : 0.015f); -} - -//TEST(TestRaftIndexIVFFlat, Float32_32_Add_L2) { -// addTest(faiss::METRIC_L2, false); -// printf("Finished addTest(faiss::METRIC_L2, false)\n"); -//} -// -//TEST(TestRaftIndexIVFFlat, Float32_32_Add_IP) { -// addTest(faiss::METRIC_INNER_PRODUCT, false); -// printf("Finished addTest(faiss::METRIC_INNER_PRODUCT, false)\n"); -//} -// -//TEST(TestRaftIndexIVFFlat, Float16_32_Add_L2) { -// addTest(faiss::METRIC_L2, true); -// printf("Finished addTest(faiss::METRIC_L2, true)\n"); -//} -// -//TEST(TestRaftIndexIVFFlat, Float16_32_Add_IP) { -// addTest(faiss::METRIC_INNER_PRODUCT, true); -// printf("Finished addTest(faiss::METRIC_INNER_PRODUCT, true)\n"); -//} - -// -// General query tests -// - -TEST(TestRaftIndexIVFFlat, Float32_Query_L2) { - queryTest(faiss::METRIC_L2, false); - printf("Finished queryTest(faiss::METRIC_L2, false);\n"); -} - -//TEST(TestRaftIndexIVFFlat, Float32_Query_IP) { -// queryTest(faiss::METRIC_INNER_PRODUCT, false); -// printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false)\n"); -//} - -// float16 coarse quantizer - -TEST(TestRaftIndexIVFFlat, Float16_32_Query_L2) { - queryTest(faiss::METRIC_L2, true); - printf("Finished queryTest(faiss::METRIC_L2, true)\n"); -} - -//TEST(TestRaftIndexIVFFlat, Float16_32_Query_IP) { -// queryTest(faiss::METRIC_INNER_PRODUCT, true); -// printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, true)\n"); -//} - -// -// There are IVF list scanning specializations for 64-d and 128-d that we -// make sure we explicitly test here -// - -TEST(TestRaftIndexIVFFlat, Float32_Query_L2_64) { - queryTest(faiss::METRIC_L2, false, 64); - printf("Finished queryTest(faiss::METRIC_L2, false, 64)\n"); -} - -//TEST(TestRaftIndexIVFFlat, Float32_Query_IP_64) { -// queryTest(faiss::METRIC_INNER_PRODUCT, false, 64); -// printf("Finished 
queryTest(faiss::METRIC_INNER_PRODUCT, false, 64)\n"); -//} - -TEST(TestRaftIndexIVFFlat, Float32_Query_L2_128) { - queryTest(faiss::METRIC_L2, false, 128); - printf("Finished queryTest(faiss::METRIC_L2, false, 128)\n"); -} - -//TEST(TestRaftIndexIVFFlat, Float32_Query_IP_128) { -// queryTest(faiss::METRIC_INNER_PRODUCT, false, 128); -// printf("Finished queryTest(faiss::METRIC_INNER_PRODUCT, false, 128)\n"); -//} - -// -// Copy tests -// - -/** TODO: test crashes */ -// TEST(TestRaftIndexIVFFlat, Float32_32_CopyTo) { -// copyToTest(false); -// printf("Finished copyToTest(false)\n"); -// } - -//TEST(TestRaftIndexIVFFlat, Float32_32_CopyFrom) { -// copyFromTest(false); -// printf("Finished copyFromTest(false)\n"); -//} - -//TEST(TestRaftIndexIVFFlat, Float32_negative) { -// Options opt; -// -// auto trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); -// auto addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); -// -// // Put all vecs on negative side -// for (auto& f : trainVecs) { -// f = std::abs(f) * -1.0f; -// } -// -// for (auto& f : addVecs) { -// f *= std::abs(f) * -1.0f; -// } -// -// faiss::IndexFlatIP quantizerIP(opt.dim); -// faiss::Index* quantizer = (faiss::Index*)&quantizerIP; -// -// faiss::IndexIVFFlat cpuIndex( -// quantizer, opt.dim, opt.numCentroids, faiss::METRIC_INNER_PRODUCT); -// cpuIndex.train(opt.numTrain, trainVecs.data()); -// cpuIndex.add(opt.numAdd, addVecs.data()); -// cpuIndex.nprobe = opt.nprobe; -// -// faiss::gpu::StandardGpuResources res; -// res.noTempMemory(); -// -// faiss::gpu::GpuIndexIVFFlatConfig config; -// config.device = opt.device; -// config.indicesOptions = opt.indicesOpt; -// -// faiss::gpu::RaftIndexIVFFlat gpuIndex( -// &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); -// gpuIndex.copyFrom(&cpuIndex); -// gpuIndex.setNumProbes(opt.nprobe); -// -// // Construct a positive test set -// auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); -// -// // Put all vecs on positive size -// for 
(auto& f : queryVecs) { -// f = std::abs(f); -// } -// -// bool compFloat16 = false; -// faiss::gpu::compareIndices( -// queryVecs, -// cpuIndex, -// gpuIndex, -// opt.numQuery, -// opt.dim, -// opt.k, -// opt.toString(), -// compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, -// // FIXME: the fp16 bounds are -// // useless when math (the accumulator) is -// // in fp16. Figure out another way to test -// compFloat16 ? 0.99f : 0.1f, -// compFloat16 ? 0.65f : 0.015f); -//} - -// -// NaN tests -// - -/** TODO: test crashes */ -// TEST(TestRaftIndexIVFFlat, QueryNaN) { -// Options opt; - -// std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, -// opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, -// opt.dim); - -// faiss::gpu::StandardGpuResources res; -// res.noTempMemory(); - -// faiss::gpu::GpuIndexIVFFlatConfig config; -// config.device = opt.device; -// config.indicesOptions = opt.indicesOpt; -// config.flatConfig.useFloat16 = faiss::gpu::randBool(); - -// faiss::gpu::RaftIndexIVFFlat gpuIndex( -// &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); -// gpuIndex.setNumProbes(opt.nprobe); - -// gpuIndex.train(opt.numTrain, trainVecs.data()); -// gpuIndex.add(opt.numAdd, addVecs.data()); - -// int numQuery = 10; -// std::vector nans( -// numQuery * opt.dim, std::numeric_limits::quiet_NaN()); - -// std::vector distances(numQuery * opt.k, 0); -// std::vector indices(numQuery * opt.k, 0); - -// gpuIndex.search( -// numQuery, nans.data(), opt.k, distances.data(), indices.data()); - -// for (int q = 0; q < numQuery; ++q) { -// for (int k = 0; k < opt.k; ++k) { -// EXPECT_EQ(indices[q * opt.k + k], -1); -// EXPECT_EQ( -// distances[q * opt.k + k], -// std::numeric_limits::max()); -// } -// } -// } - -/** TODO: test crashes */ -// TEST(TestRaftIndexIVFFlat, AddNaN) { -// Options opt; - -// faiss::gpu::StandardGpuResources res; -// res.noTempMemory(); - -// faiss::gpu::GpuIndexIVFFlatConfig config; -// config.device = opt.device; -// 
config.indicesOptions = opt.indicesOpt; -// config.flatConfig.useFloat16 = faiss::gpu::randBool(); - -// faiss::gpu::RaftIndexIVFFlat gpuIndex( -// &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); -// gpuIndex.setNumProbes(opt.nprobe); - -// int numNans = 10; -// std::vector nans( -// numNans * opt.dim, std::numeric_limits::quiet_NaN()); - -// // Make one vector valid (not the first vector, in order to test offset -// // issues), which should actually add -// for (int i = 0; i < opt.dim; ++i) { -// nans[opt.dim + i] = i; -// } - -// std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, -// opt.dim); gpuIndex.train(opt.numTrain, trainVecs.data()); - -// // should not crash -// EXPECT_EQ(gpuIndex.ntotal, 0); -// gpuIndex.add(numNans, nans.data()); - -// std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, -// opt.dim); std::vector distance(opt.numQuery * opt.k, 0); -// std::vector indices(opt.numQuery * opt.k, 0); - -// // should not crash -// gpuIndex.search( -// opt.numQuery, -// queryVecs.data(), -// opt.k, -// distance.data(), -// indices.data()); -// } - -//TEST(TestRaftIndexIVFFlat, UnifiedMemory) { -// // Construct on a random device to test multi-device, if we have -// // multiple devices -// int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); -// -// if (!faiss::gpu::getFullUnifiedMemSupport(device)) { -// return; -// } -// -// int dim = 128; -// -// int numCentroids = 256; -// // Unfortunately it would take forever to add 24 GB in IVFPQ data, -// // so just perform a small test with data allocated in the unified -// // memory address space -// size_t numAdd = 10000; -// size_t numTrain = numCentroids * 40; -// int numQuery = 10; -// int k = 10; -// int nprobe = 8; -// -// std::vector trainVecs = faiss::gpu::randVecs(numTrain, dim); -// std::vector addVecs = faiss::gpu::randVecs(numAdd, dim); -// -// faiss::IndexFlatL2 quantizer(dim); -// faiss::IndexIVFFlat cpuIndex( -// &quantizer, dim, numCentroids, 
faiss::METRIC_L2); -// -// cpuIndex.train(numTrain, trainVecs.data()); -// cpuIndex.add(numAdd, addVecs.data()); -// cpuIndex.nprobe = nprobe; -// -// faiss::gpu::StandardGpuResources res; -// res.noTempMemory(); -// -// faiss::gpu::GpuIndexIVFFlatConfig config; -// config.device = device; -// config.memorySpace = faiss::gpu::MemorySpace::Unified; -// -// faiss::gpu::RaftIndexIVFFlat gpuIndex( -// &res, dim, numCentroids, faiss::METRIC_L2, config); -// gpuIndex.copyFrom(&cpuIndex); -// gpuIndex.setNumProbes(nprobe); -// -// faiss::gpu::compareIndices( -// cpuIndex, -// gpuIndex, -// numQuery, -// dim, -// k, -// "Unified Memory", -// kF32MaxRelErr, -// 0.1f, -// 0.015f); -//} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - - // just run with a fixed test seed - faiss::gpu::setTestSeed(100); - - return RUN_ALL_TESTS(); -} From f7da008bda856d50537472ee8a35223e0644d873 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 16 Nov 2022 12:27:26 -0500 Subject: [PATCH 43/87] Removing the RaftIndex* files. --- faiss/gpu/impl/RaftIndexIVFFlat.cu | 378 ----------------------------- faiss/gpu/impl/RaftIndexIVFFlat.h | 112 --------- 2 files changed, 490 deletions(-) delete mode 100644 faiss/gpu/impl/RaftIndexIVFFlat.cu delete mode 100644 faiss/gpu/impl/RaftIndexIVFFlat.h diff --git a/faiss/gpu/impl/RaftIndexIVFFlat.cu b/faiss/gpu/impl/RaftIndexIVFFlat.cu deleted file mode 100644 index 58d10fcc63..0000000000 --- a/faiss/gpu/impl/RaftIndexIVFFlat.cu +++ /dev/null @@ -1,378 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include // for SearchParametersIVF -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -namespace faiss { -namespace gpu { - -RaftIndexIVFFlat::RaftIndexIVFFlat( - GpuResourcesProvider* provider, - const faiss::IndexIVFFlat* index, - GpuIndexIVFFlatConfig config) - : GpuIndexIVFFlat(provider, index, config), - raft_handle(resources_->getDefaultStream(config_.device)) { - copyFrom(index); -} - -RaftIndexIVFFlat::RaftIndexIVFFlat( - GpuResourcesProvider* provider, - int dims, - int nlist, - faiss::MetricType metric, - GpuIndexIVFFlatConfig config) - : GpuIndexIVFFlat(provider, dims, nlist, metric, config), - raft_handle(resources_->getDefaultStream(config_.device)) { - - std::cout << "In raft index constructor" << std::endl; -} - - -RaftIndexIVFFlat::RaftIndexIVFFlat( - GpuResourcesProvider* provider, - Index *coarse_quantizer, - int dims, - int nlist, - faiss::MetricType metric, - GpuIndexIVFFlatConfig config) - : GpuIndexIVFFlat(provider, coarse_quantizer, dims, nlist, metric, config), - raft_handle(resources_->getDefaultStream(config_.device)) { - - std::cout << "In raft index constructor" << std::endl; -} - - -RaftIndexIVFFlat::~RaftIndexIVFFlat() { - RaftIndexIVFFlat::reset(); -} - -void RaftIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { - DeviceScope scope(config_.device); - GpuIndex::copyFrom(index); - FAISS_ASSERT(index->nlist > 0); -// FAISS_THROW_IF_NOT_FMT( -// index->nlist <= (Index::idx_t)std::numeric_limits::max(), -// "GPU index only supports %zu inverted lists", -// (size_t)std::numeric_limits::max()); -// FAISS_THROW_IF_NOT_FMT( -// index->nprobe > 0 && index->nprobe <= getMaxKSelection(), -// "GPU index only supports nprobe <= %zu; passed %zu", -// (size_t)getMaxKSelection(), -// index->nprobe); - - /** - * TODO: Copy centers and center norms from quantizer - * Things to do: - * 1. Copy index_->quantizer->vectors_ to raft_index->centers - * 2. 
Copy index_->quantizer->norms_ to raft_index->center_norms - */ - /** - * TODO: Copy IVF data, indices, list_sizes, list_offsets from index->invlists - * - * Things to do: - * 1. index->ivflists->data() is going to need to be translated over to our format - * (even the interleaved format is a little different) - * - * The GpuIndexIVFFlat has a function translateCodesToGpu_() for this - * - * 2. We will need to copy list_sizes, indices, and list_offsets - */ - if (index->is_trained) { - // TODO: A proper copy of the index without retraining - // For now, just get all the data from the index, and train our index - // anew. - FAISS_ASSERT(index->d == this->d); - FAISS_ASSERT(index->metric_arg == this->metric_arg); - FAISS_ASSERT(index->metric_type == this->metric_type); - FAISS_ASSERT(index->nlist == this->nlist); - - Index::idx_t quantizer_ntotal = index->quantizer->ntotal; - Index::idx_t index_ntotal = index->ntotal; - - std::cout << "Calling copyFrom with trained index with " << quantizer_ntotal << " items" << std::endl; - auto stream = raft_handle.get_stream(); - - auto total_elems = size_t(quantizer_ntotal) * size_t(index->quantizer->d); - - raft::spatial::knn::ivf_flat::index_params pams; - - switch (this->metric_type) { - case faiss::METRIC_L2: - pams.metric = raft::distance::DistanceType::L2Expanded; - break; - case faiss::METRIC_INNER_PRODUCT: - pams.metric = raft::distance::DistanceType::InnerProduct; - break; - default: - FAISS_THROW_MSG("Metric is not supported."); - } - - raft_knn_index.emplace(raft_handle, pams.metric, false, (uint32_t)this->nlist, (uint32_t)this->d); - - // Copy (reconstructed) centroids over, rather than re-training - rmm::device_uvector buf_dev(total_elems, stream); - { - std::vector buf_host(total_elems); - index->quantizer->reconstruct_n(0, quantizer_ntotal, buf_host.data()); - raft::copy(raft_knn_index.value().centers().data_handle(), buf_host.data(), total_elems, stream); - } - - raft::print_device_vector("raft centers", 
raft_knn_index.value().centers().data_handle(), total_elems, std::cout); - - // Add (reconstructed) vectors to index if needed - if(index_ntotal > 0) { - std::cout << "Adding " << index_ntotal << " vectors to index" << std::endl; - total_elems = size_t(index_ntotal) * size_t(index->d); - buf_dev.resize(total_elems, stream); - { - std::vector buf_host(total_elems); - index->reconstruct_n(0, index_ntotal, buf_host.data()); - raft::copy(buf_dev.data(), buf_host.data(), total_elems, stream); - } - - // TODO: We might want to consider moving the centroid norm computation - // outside of the incremental add on the RAFT side. - RaftIndexIVFFlat::addImpl_(index_ntotal, buf_dev.data(), nullptr); - } - } else { - // index is not trained, so we can remove ours as well (if there was - // any) - std::cout << "Calling copyFrom with index that hasn't been trained" << std::endl; - raft_knn_index.reset(); - } - this->is_trained = index->is_trained; -} - -void RaftIndexIVFFlat::reserveMemory(size_t numVecs) { - std::cout << "Reserving memory for " << numVecs << " vectors." << std::endl; - reserveMemoryVecs_ = numVecs; - if (raft_knn_index.has_value()) { - DeviceScope scope(config_.device); - - // TODO: Need to figure out if this is absolutely necessary. 
- - /** - * For example: - * raft::spatial::knn::ivf_flat::allocate_ivf_lists( - * raft_handle, *raft_knn_index, numVecs); - * - * raft::spatial::knn::ivf_flat::populate( - * raft_handle, *raft_knn_index, - * n_centroids, centroids, - * n_vectors, ivf); - * - */ - } -} - -size_t RaftIndexIVFFlat::reclaimMemory() { - std::cout << "Reclaiming memory" << std::endl; - - // TODO: Need to figure out if this is absolutely necessary - /** - * For example: - * raft::spatial::knn::ivf_flat::reclaim_ivf_lists( - * raft_handle, *raft_knn_index, numVecs); - */ - return 0; -} - -void RaftIndexIVFFlat::train(Index::idx_t n, const float* x) { - DeviceScope scope(config_.device); - - - raft::common::nvtx::range fun_scope( - "RaftIndexIVFFlat::train (%ld)", n); - - std::cout << "Calling train() with " << n << " rows" << std::endl; - - uint32_t start = raft::curTimeMillis(); - if (this->is_trained) { - FAISS_ASSERT(raft_knn_index.has_value()); - return; - } - - raft::spatial::knn::ivf_flat::index_params raft_idx_params; - raft_idx_params.n_lists = nlist; - raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; - raft_idx_params.add_data_on_build = false; - raft_idx_params.kmeans_n_iters = 100; - - raft_knn_index.emplace( - raft::spatial::knn::ivf_flat::build(raft_handle, raft_idx_params, - const_cast(x), - n, (faiss::Index::idx_t)d)); - - raft_handle.sync_stream(); - uint32_t stop = raft::curTimeMillis(); - - std::cout << "train took " << (stop - start) << "ms. 
" << std::endl; - this->is_trained = true; -} - -int RaftIndexIVFFlat::getListLength(int listId) const { - FAISS_ASSERT(raft_knn_index.has_value()); - DeviceScope scope(config_.device); - - uint32_t size; - raft::copy(&size, raft_knn_index.value().list_sizes().data_handle() + listId, - 1, raft_handle.get_stream()); - raft_handle.sync_stream(); - return int(size); -} - -std::vector RaftIndexIVFFlat::getListVectorData( - int listId, - bool gpuFormat) const { - FAISS_ASSERT(raft_knn_index.has_value()); - DeviceScope scope(config_.device); - - std::cout << "Calling getListVectorData for " << listId << std::endl; - - using elem_t = decltype(raft_knn_index.value().data())::element_type; - size_t dim = raft_knn_index.value().dim(); - Index::idx_t offsets[2]; - raft::copy(offsets, raft_knn_index.value().list_offsets().data_handle() + listId, 2, raft_handle.get_stream()); - - raft_handle.sync_stream(); - size_t byte_offset = offsets[0] * sizeof(elem_t) * dim; - // the interleaved block can be slightly larger than the list size (it's - // rounded up) - size_t byte_size = size_t(offsets[1]) * - sizeof(elem_t) * dim - - byte_offset; - std::vector vec(byte_size); - raft::copy( - vec.data(), - reinterpret_cast(raft_knn_index.value().data().data_handle()) + - byte_offset, - byte_size, - raft_handle.get_stream()); - return vec; -} - -void RaftIndexIVFFlat::reset() { - raft_knn_index.reset(); - this->ntotal = 0; -} - -std::vector RaftIndexIVFFlat::getListIndices(int listId) const { - FAISS_ASSERT(raft_knn_index.has_value()); - DeviceScope scope(config_.device); - - Index::idx_t offset; - uint32_t size; - - raft::copy(&offset, raft_knn_index.value().list_offsets().data_handle() + listId, 1, raft_handle.get_stream()); - raft::copy(&size, raft_knn_index.value().list_sizes().data_handle() + listId, 1, raft_handle.get_stream()); - raft_handle.sync_stream(); - - std::vector vec(size); - raft::copy( - vec.data(), - raft_knn_index.value().indices().data_handle() + offset, - size, - 
raft_handle.get_stream()); - return vec; -} - -void RaftIndexIVFFlat::addImpl_( - int n, - const float* x, - const Index::idx_t* xids) { - // Device is already set in GpuIndex::add - FAISS_ASSERT(is_trained); - FAISS_ASSERT(n > 0); - -// // Not all vectors may be able to be added (some may contain NaNs etc) -// index_->addVectors(data, labels); - - // but keep the ntotal based on the total number of vectors that we - // attempted to add - - std::cout << "Calling addImpl_ with " << n << " vectors." << std::endl; - - raft_knn_index.emplace(raft::spatial::knn::ivf_flat::extend( - raft_handle, raft_knn_index.value(), x, xids, (Index::idx_t)n)); - this->ntotal += n; -} - -void RaftIndexIVFFlat::searchImpl_( - int n, - const float* x, - int k, - float* distances, - Index::idx_t* labels, - const SearchParameters *params) const { - - raft::common::nvtx::range fun_scope( - "RaftIndexIVFFlat::searchImpl_ (%ld)", n); - - // Device is already set in GpuIndex::search - FAISS_ASSERT(raft_knn_index.has_value()); - FAISS_ASSERT(n > 0); - FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= nlist); - - raft::spatial::knn::ivf_flat::search_params pams; - pams.n_probes = nprobe; - raft::spatial::knn::ivf_flat::search( - raft_handle, - pams, - *raft_knn_index, - const_cast(x), - static_cast(n), - static_cast(k), - labels, - distances); - - raft_handle.sync_stream(); -} - -void RaftIndexIVFFlat::rebuildRaftIndex(const float* x, Index::idx_t n_rows) { - raft::spatial::knn::ivf_flat::index_params pams; - - std::cout << "Calling rebuildRaftIndex with " << n_rows << " rows" << std::endl; - pams.n_lists = this->nlist; - switch (this->metric_type) { - case faiss::METRIC_L2: - pams.metric = raft::distance::DistanceType::L2Expanded; - break; - case faiss::METRIC_INNER_PRODUCT: - pams.metric = raft::distance::DistanceType::InnerProduct; - break; - default: - FAISS_THROW_MSG("Metric is not supported."); - } - pams.metric_arg = this->metric_arg; - pams.kmeans_trainset_fraction = 1.0; - 
pams.add_data_on_build = false; - - raft_knn_index.emplace(raft::spatial::knn::ivf_flat::build( - this->raft_handle, pams, x, n_rows, uint32_t(this->d))); - - this->raft_handle.sync_stream(); - this->is_trained = true; - this->ntotal = n_rows; -} - -} // namespace gpu -} // namespace faiss diff --git a/faiss/gpu/impl/RaftIndexIVFFlat.h b/faiss/gpu/impl/RaftIndexIVFFlat.h deleted file mode 100644 index eaeabafce6..0000000000 --- a/faiss/gpu/impl/RaftIndexIVFFlat.h +++ /dev/null @@ -1,112 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include // for SearchParametersIVF -#include -#include - -#include -#include - -#include - -namespace faiss { -struct IndexIVFFlat; -} - -namespace faiss { -namespace gpu { - -/// Wrapper around the GPU implementation that looks like -/// faiss::gpu::GpuIndexIVFFlat -class RaftIndexIVFFlat : public GpuIndexIVFFlat { - public: - /// Construct from a pre-existing faiss::IndexIVFFlat instance, copying - /// data over to the given GPU, if the input index is trained. - RaftIndexIVFFlat( - GpuResourcesProvider* provider, - const faiss::IndexIVFFlat* index, - GpuIndexIVFFlatConfig config = GpuIndexIVFFlatConfig()); - - - /// Constructs a new instance with an empty flat quantizer; the user - /// provides the number of lists desired. - RaftIndexIVFFlat( - GpuResourcesProvider* provider, - int dims, - int nlist, - faiss::MetricType metric, - GpuIndexIVFFlatConfig config = GpuIndexIVFFlatConfig()); - - /// Constructs a new instance with a provided CPU or GPU coarse quantizer; - /// the user provides the number of IVF lists desired. 
- RaftIndexIVFFlat( - GpuResourcesProvider* provider, - Index* coarseQuantizer, - int dims, - int nlist, - faiss::MetricType metric = faiss::METRIC_L2, - GpuIndexIVFFlatConfig config = GpuIndexIVFFlatConfig()); - - ~RaftIndexIVFFlat() override; - - /// Clears out all inverted lists, but retains the coarse centroid - /// information - void reset() override; - - /// Trains the coarse quantizer based on the given vector data - void train(Index::idx_t n, const float* x) override; - - /// Returns the number of vectors present in a particular inverted list - int getListLength(int listId) const override; - - /// Reserve GPU memory in our inverted lists for this number of vectors - void reserveMemory(size_t numVecs); - - /// After adding vectors, one can call this to reclaim device memory - /// to exactly the amount needed. Returns space reclaimed in bytes - size_t reclaimMemory(); - - void copyFrom(const faiss::IndexIVFFlat* index); - - /// Return the encoded vector data contained in a particular inverted list, - /// for debugging purposes. - /// If gpuFormat is true, the data is returned as it is encoded in the - /// GPU-side representation. - /// Otherwise, it is converted to the CPU format. - /// compliant format, while the native GPU format may differ. - std::vector getListVectorData(int listId, bool gpuFormat = false) - const override; - - /// Return the vector indices contained in a particular inverted list, for - /// debugging purposes. 
- std::vector getListIndices(int listId) const override; - - protected: - /// Called from GpuIndex for add/add_with_ids - void addImpl_(int n, const float* x, const Index::idx_t* ids) override; - - - /// Called from GpuIndex for search - void searchImpl_( - int n, - const float* x, - int k, - float* distances, - Index::idx_t* labels, - const SearchParameters *params) const override; - - void rebuildRaftIndex(const float* x, Index::idx_t n_rows); - - const raft::handle_t raft_handle; - std::optional> raft_knn_index{std::nullopt}; -}; - -} // namespace gpu -} // namespace faiss From 5ab762bfa81dec4b908b8d772580bae522903bf5 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 17 Nov 2022 16:26:51 -0500 Subject: [PATCH 44/87] Using current raft 22.12 --- cmake/thirdparty/fetch_rapids.cmake | 2 +- cmake/thirdparty/get_raft.cmake | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/thirdparty/fetch_rapids.cmake b/cmake/thirdparty/fetch_rapids.cmake index 0befc2fd5d..69460abe4a 100644 --- a/cmake/thirdparty/fetch_rapids.cmake +++ b/cmake/thirdparty/fetch_rapids.cmake @@ -12,7 +12,7 @@ # the License. 
# ============================================================================= -set(RAPIDS_VERSION "22.10") +set(RAPIDS_VERSION "22.12") if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/FAISS_RAPIDS.cmake) file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake diff --git a/cmake/thirdparty/get_raft.cmake b/cmake/thirdparty/get_raft.cmake index a7ef8410da..91f53b0f4d 100644 --- a/cmake/thirdparty/get_raft.cmake +++ b/cmake/thirdparty/get_raft.cmake @@ -16,8 +16,8 @@ set(RAFT_VERSION "${RAPIDS_VERSION}") -set(RAFT_FORK "achirkin") -set(RAFT_PINNED_TAG "fea-ivf-flat-optional-adaptive-centers") +set(RAFT_FORK "rapidsai") +set(RAFT_PINNED_TAG "branch-${RAPIDS_VERSION}") function(find_and_configure_raft) set(oneValueArgs VERSION FORK PINNED_TAG) From 3684cd327d33b11a40281b46acb89dd8328ff40e Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 30 Nov 2022 16:12:01 -0500 Subject: [PATCH 45/87] Checking in a little cleanup --- faiss/gpu/impl/RaftIVFFlat.cu | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index be40a65ca6..f20600dc73 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -145,8 +145,7 @@ int RaftIVFFlat::getListLength(int listId) const { return int(size); } -/// Return the list indices of a par -/// ticular list back to the CPU +/// Return the list indices of a particular list back to the CPU std::vector RaftIVFFlat::getListIndices(int listId) const { printf("Inside RaftIVFFlat getListIndices\n"); @@ -221,8 +220,9 @@ void RaftIVFFlat::searchPreassigned( void RaftIVFFlat::updateQuantizer(Index* quantizer) { Index::idx_t quantizer_ntotal = quantizer->ntotal; - std::cout << "Calling updateQuantizer with trained index with " << quantizer_ntotal << " items" << std::endl; - auto stream = resources_->getRaftHandleCurrentDevice().get_stream(); + std::cout << "Calling RAFT updateQuantizer 
with trained index with " << quantizer_ntotal << " items" << std::endl; + const raft::handle_t &handle = resources->getRaftHandleCurrentDevice(); + auto stream = handle.get_stream(); auto total_elems = size_t(quantizer_ntotal) * size_t(quantizer->d); @@ -230,16 +230,18 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { switch (this->metric_) { case faiss::METRIC_L2: + printf("Using L2!\n"); pams.metric = raft::distance::DistanceType::L2Expanded; break; case faiss::METRIC_INNER_PRODUCT: + printf("Using Inner product!\n"); pams.metric = raft::distance::DistanceType::InnerProduct; break; default: FAISS_THROW_MSG("Metric is not supported."); } - raft_knn_index.emplace(resources_->getRaftHandleCurrentDevice(), pams.metric, (uint32_t)this->numLists_, false, (uint32_t)this->dim_); + raft_knn_index.emplace(handle, pams.metric, (uint32_t)this->numLists_, false, (uint32_t)this->dim_); printf("Reconstructing\n"); // Copy (reconstructed) centroids over, rather than re-training @@ -249,9 +251,11 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { printf("Copying...\n"); - raft::update_device(raft_knn_index.value().centers().data_handle(), buf_host.data(), total_elems, stream); + auto knn_index = raft_knn_index.value(); - raft::print_device_vector("raft centers", raft_knn_index.value().centers().data_handle(), this->dim_, std::cout); + raft::update_device(knn_index.centers().data_handle(), buf_host.data(), total_elems, stream); + + raft::print_device_vector("raft centers", knn_index.centers().data_handle(), this->dim_, std::cout); } From 35a46b29b7bf29653a2862e0b95dbc53e168e388 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 9 Jan 2023 15:59:22 -0500 Subject: [PATCH 46/87] Disabling raft from pulling in nn dependencies (e.g. 
faiss) --- cmake/thirdparty/get_raft.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/thirdparty/get_raft.cmake b/cmake/thirdparty/get_raft.cmake index 91f53b0f4d..1286aee10f 100644 --- a/cmake/thirdparty/get_raft.cmake +++ b/cmake/thirdparty/get_raft.cmake @@ -39,6 +39,7 @@ function(find_and_configure_raft) "BUILD_TESTS OFF" "BUILD_BENCH OFF" "RAFT_COMPILE_LIBRARIES OFF" + "RAFT_ENABLE_NN_DEPENDENCIES OFF" ) endfunction() From e7bf2e5119496045868c31254c6d6124326d4a8c Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 11 Jan 2023 18:44:13 -0500 Subject: [PATCH 47/87] Updating raft for 23.02. Still working on failing tests. --- cmake/thirdparty/fetch_rapids.cmake | 2 +- faiss/gpu/GpuIndexIVF.cu | 2 +- faiss/gpu/impl/RaftIVFFlat.cu | 36 ++++++++++++++--------------- faiss/gpu/impl/RaftIVFFlat.cuh | 12 +++++----- 4 files changed, 25 insertions(+), 27 deletions(-) diff --git a/cmake/thirdparty/fetch_rapids.cmake b/cmake/thirdparty/fetch_rapids.cmake index 69460abe4a..2e14ceac5b 100644 --- a/cmake/thirdparty/fetch_rapids.cmake +++ b/cmake/thirdparty/fetch_rapids.cmake @@ -12,7 +12,7 @@ # the License. 
# ============================================================================= -set(RAPIDS_VERSION "22.12") +set(RAPIDS_VERSION "23.02") if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/FAISS_RAPIDS.cmake) file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu index 116a6844ef..c35667e4a9 100644 --- a/faiss/gpu/GpuIndexIVF.cu +++ b/faiss/gpu/GpuIndexIVF.cu @@ -475,7 +475,7 @@ void GpuIndexIVF::trainQuantizer_(idx_t n, const float* x) { raft_idx_params.kmeans_n_iters = 100; auto raft_index = raft::neighbors::ivf_flat::build( - raft_handle, raft_idx_params, x, n, (Index::idx_t)d); + raft_handle, raft_idx_params, x, n, (idx_t)d); raft_handle.sync_stream(); diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index f20600dc73..483af26521 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -67,7 +67,7 @@ void RaftIVFFlat::search( int nprobe, int k, Tensor& outDistances, - Tensor& outIndices) { + Tensor& outIndices) { printf("Inside RaftIVFFlat search()\n"); // TODO: We probably don't want to ignore the coarse quantizer here... 
@@ -86,9 +86,9 @@ void RaftIVFFlat::search( pams.n_probes = nprobe; auto queries_view = raft::make_device_matrix_view(queries.data(), n, cols); - auto out_inds_view = raft::make_device_matrix_view(outIndices.data(), n, k_); + auto out_inds_view = raft::make_device_matrix_view(outIndices.data(), n, k_); auto out_dists_view = raft::make_device_matrix_view(outDistances.data(), n, k_); - raft::neighbors::ivf_flat::search( + raft::neighbors::ivf_flat::search( raft_handle, *raft_knn_index, queries_view, out_inds_view, out_dists_view, pams, k_); @@ -102,11 +102,11 @@ void RaftIVFFlat::search( int RaftIVFFlat::addVectors( Index* coarseQuantizer, Tensor& vecs, - Tensor& indices) { + Tensor& indices) { printf("Inside RaftIVFFlat addVectors()\n"); - auto vecs_view = raft::make_device_matrix_view(vecs.data(), vecs.getSize(0), dim_); - auto inds_view = raft::make_device_vector_view(indices.data(), (Index::idx_t )indices.getSize(0)); + auto vecs_view = raft::make_device_matrix_view(vecs.data(), vecs.getSize(0), dim_); + auto inds_view = raft::make_device_vector_view(indices.data(), (idx_t )indices.getSize(0)); const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); @@ -118,7 +118,7 @@ int RaftIVFFlat::addVectors( raft_handle, raft_knn_index.value(), vecs_view, - std::make_optional>(inds_view))); + std::make_optional>(inds_view))); } else { printf("Index has not been trained!\n"); @@ -146,21 +146,21 @@ int RaftIVFFlat::getListLength(int listId) const { } /// Return the list indices of a particular list back to the CPU -std::vector RaftIVFFlat::getListIndices(int listId) const { +std::vector RaftIVFFlat::getListIndices(int listId) const { printf("Inside RaftIVFFlat getListIndices\n"); FAISS_ASSERT(raft_knn_index.has_value()); const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); - Index::idx_t offset; + idx_t offset; uint32_t size; raft::copy(&offset, raft_knn_index.value().list_offsets().data_handle() + listId, 1, 
raft_handle.get_stream()); raft::copy(&size, raft_knn_index.value().list_sizes().data_handle() + listId, 1, raft_handle.get_stream()); raft_handle.sync_stream(); - std::vector vec(size); + std::vector vec(size); raft::copy( vec.data(), raft_knn_index.value().indices().data_handle() + offset, @@ -181,7 +181,7 @@ std::vector RaftIVFFlat::getListVectorData(int listId, bool gpuFormat) using elem_t = decltype(raft_knn_index.value().data())::element_type; size_t dim = raft_knn_index.value().dim(); - Index::idx_t offsets[2]; + idx_t offsets[2]; raft::copy(offsets, raft_knn_index.value().list_offsets().data_handle() + listId, 2, raft_handle.get_stream()); raft_handle.sync_stream(); @@ -207,10 +207,10 @@ void RaftIVFFlat::searchPreassigned( Index* coarseQuantizer, Tensor& vecs, Tensor& ivfDistances, - Tensor& ivfAssignments, + Tensor& ivfAssignments, int k, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, bool storePairs) { printf("Inside RaftIVFFlat searchPreassigned\n"); @@ -218,10 +218,10 @@ void RaftIVFFlat::searchPreassigned( } void RaftIVFFlat::updateQuantizer(Index* quantizer) { - Index::idx_t quantizer_ntotal = quantizer->ntotal; + idx_t quantizer_ntotal = quantizer->ntotal; std::cout << "Calling RAFT updateQuantizer with trained index with " << quantizer_ntotal << " items" << std::endl; - const raft::handle_t &handle = resources->getRaftHandleCurrentDevice(); + const raft::handle_t &handle = resources_->getRaftHandleCurrentDevice(); auto stream = handle.get_stream(); auto total_elems = size_t(quantizer_ntotal) * size_t(quantizer->d); @@ -251,11 +251,9 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { printf("Copying...\n"); - auto knn_index = raft_knn_index.value(); + raft::update_device(raft_knn_index.value().centers().data_handle(), buf_host.data(), total_elems, stream); - raft::update_device(knn_index.centers().data_handle(), buf_host.data(), total_elems, stream); - - raft::print_device_vector("raft centers", 
knn_index.centers().data_handle(), this->dim_, std::cout); + raft::print_device_vector("raft centers", raft_knn_index.value().centers().data_handle(), this->dim_, std::cout); } diff --git a/faiss/gpu/impl/RaftIVFFlat.cuh b/faiss/gpu/impl/RaftIVFFlat.cuh index 298a9370c9..968f8fd727 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cuh +++ b/faiss/gpu/impl/RaftIVFFlat.cuh @@ -43,7 +43,7 @@ class RaftIVFFlat : public IVFFlat { int nprobe, int k, Tensor& outDistances, - Tensor& outIndices) override; + Tensor& outIndices) override; /// Performs search when we are already given the IVF cells to look at /// (GpuIndexIVF::search_preassigned implementation) @@ -51,10 +51,10 @@ class RaftIVFFlat : public IVFFlat { Index* coarseQuantizer, Tensor& vecs, Tensor& ivfDistances, - Tensor& ivfAssignments, + Tensor& ivfAssignments, int k, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, bool storePairs) override; /// Classify and encode/add vectors to our IVF lists. @@ -64,7 +64,7 @@ class RaftIVFFlat : public IVFFlat { int addVectors( Index* coarseQuantizer, Tensor& vecs, - Tensor& indices) override; + Tensor& indices) override; /// Clear out all inverted lists, but retain the coarse quantizer /// and the product quantizer info @@ -75,7 +75,7 @@ class RaftIVFFlat : public IVFFlat { int getListLength(int listId) const override; /// Return the list indices of a particular list back to the CPU - std::vector getListIndices(int listId) const override; + std::vector getListIndices(int listId) const override; /// Return the encoded vectors of a particular list back to the CPU std::vector getListVectorData(int listId, bool gpuFormat) const override; @@ -102,7 +102,7 @@ class RaftIVFFlat : public IVFFlat { // size_t numVecs) override; - std::optional> raft_knn_index{std::nullopt}; + std::optional> raft_knn_index{std::nullopt}; }; From a8e2ad06a3ba54da32972c09793c0353b6550743 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 11 Jan 2023 20:34:05 -0500 Subject: [PATCH 48/87] Isolating differences in results- it looks like it's related to the selection of the probes --- faiss/gpu/GpuIndexIVFFlat.cu | 7 +++++++ faiss/gpu/impl/RaftIVFFlat.cu | 7 +++++++ faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 11 ++++++++--- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index b99377cd7e..620a409660 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -166,6 +166,13 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { printf("Reconstructing %d original vectors and adding to GPU index\n", index->ntotal); std::vector buf_host(index->ntotal * index->d); index->reconstruct_n(0, index->ntotal, buf_host.data()); + + printf("reconstructed vectors: ["); + for(int i = 0; i < 50; ++i) { + printf("%f, ", buf_host[i]); + } + printf("]\n"); + add(index->ntotal, buf_host.data()); } } else { diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 483af26521..0b94b6e5db 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -31,6 +31,8 @@ #include #include +#include + namespace faiss { namespace gpu { @@ -105,6 +107,8 @@ int RaftIVFFlat::addVectors( Tensor& indices) { printf("Inside RaftIVFFlat addVectors()\n"); + raft::print_device_vector("add_vectors", vecs.data(), 50, std::cout); + auto vecs_view = raft::make_device_matrix_view(vecs.data(), vecs.getSize(0), dim_); auto inds_view = raft::make_device_vector_view(indices.data(), (idx_t )indices.getSize(0)); @@ -226,7 +230,10 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { auto total_elems = size_t(quantizer_ntotal) * size_t(quantizer->d); + raft::logger::get().set_level(RAFT_LEVEL_TRACE); + raft::spatial::knn::ivf_flat::index_params pams; + pams.add_data_on_build = false; switch (this->metric_) { case faiss::METRIC_L2: diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp 
b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index e026abd85a..6ffd2cceca 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -27,7 +27,7 @@ struct Options { numCentroids = std::sqrt((float)numAdd / 2); numTrain = numCentroids * 40; - nprobe = faiss::gpu::randVal(std::min(10, numCentroids), numCentroids); + nprobe = std::min(numCentroids, int(numCentroids / 2) + 10);//faiss::gpu::randVal(std::min(50, numCentroids), numCentroids); numQuery = faiss::gpu::randVal(32, 100); // Due to the approximate nature of the query and of floating point @@ -68,6 +68,7 @@ void queryTest( faiss::MetricType metricType, bool useFloat16CoarseQuantizer) { for (int tries = 0; tries < 2; ++tries) { + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); @@ -96,8 +97,6 @@ void queryTest( faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); gpuIndex.copyFrom(&cpuIndex); - - gpuIndex.setNumProbes(opt.nprobe); bool compFloat16 = useFloat16CoarseQuantizer; @@ -125,6 +124,12 @@ void addTest(faiss::MetricType metricType, bool useFloat16CoarseQuantizer) { faiss::gpu::randVecs(opt.numTrain, opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + printf("original add vectors: ["); + for(int i = 0; i < 50; ++i) { + printf("%f, ", addVecs[i]); + } + printf("]\n"); + faiss::IndexFlatL2 quantizerL2(opt.dim); faiss::IndexFlatIP quantizerIP(opt.dim); faiss::Index* quantizer = metricType == faiss::METRIC_L2 From f19fd00cc7700ebf4027c01851c0b6e469841149 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 19 Jan 2023 12:24:12 -0500 Subject: [PATCH 49/87] Add and query results appear to match well. LargeBatch tests are failing but RAFT seems to have better recall. Still investigating. 
--- faiss/gpu/GpuIndexIVFFlat.cu | 32 ++++++++++++-------------- faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 2 ++ faiss/gpu/test/TestUtils.cpp | 4 +--- 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 620a409660..45019bb5aa 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -103,6 +103,7 @@ void GpuIndexIVFFlat::set_index_(GpuResources* resources, IndicesOptions indicesOptions, MemorySpace space) { if(config_.use_raft) { + printf("Setting RaftIVFFlat index\n"); index_.reset(new RaftIVFFlat( resources, dim, nlist, metric, metricArg, useResidual, scalarQ, interleavedLayout, indicesOptions, space)); @@ -139,17 +140,17 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { // The other index might not be trained if (!index->is_trained) { - FAISS_ASSERT(!this->is_trained); + FAISS_ASSERT(!is_trained); return; } // Otherwise, we can populate ourselves from the other index - FAISS_ASSERT(this->is_trained); + FAISS_ASSERT(is_trained); // Copy our lists as well set_index_(resources_.get(), - this->d, - this->nlist, + d, + nlist, index->metric_type, index->metric_arg, false, // no residual @@ -158,25 +159,22 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { ivfFlatConfig_.indicesOptions, config_.memorySpace); - if(config_.use_raft) { + printf("Reconstructing %d original vectors and adding to GPU index\n", ntotal); + // Quantizer should already have been updated above. 
Add reconstructed vectors to raft index - if(index->ntotal > 0) { - printf("Reconstructing %d original vectors and adding to GPU index\n", index->ntotal); - std::vector buf_host(index->ntotal * index->d); - index->reconstruct_n(0, index->ntotal, buf_host.data()); - - printf("reconstructed vectors: ["); - for(int i = 0; i < 50; ++i) { - printf("%f, ", buf_host[i]); - } - printf("]\n"); - - add(index->ntotal, buf_host.data()); + if(ntotal > 0) { + std::vector buf_host(ntotal * d); + std::vector ids(ntotal); + std::iota(ids.begin(), ids.end(), 0); + index->reconstruct_n(0, ntotal, buf_host.data()); + add_with_ids(ntotal, buf_host.data(), ids.data()); } } else { + // Copy all of the IVF data + printf("Copying inverted lists from cpu index to FAISS gpu index flat\n"); index_->copyInvertedListsFrom(index->invlists); } } diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index 6ffd2cceca..a7d1e8f18f 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -100,6 +100,8 @@ void queryTest( gpuIndex.setNumProbes(opt.nprobe); bool compFloat16 = useFloat16CoarseQuantizer; + + printf("Use float16: %d\n", compFloat16); faiss::gpu::compareIndices( cpuIndex, gpuIndex, diff --git a/faiss/gpu/test/TestUtils.cpp b/faiss/gpu/test/TestUtils.cpp index 65e36dcc31..a02618c4e4 100644 --- a/faiss/gpu/test/TestUtils.cpp +++ b/faiss/gpu/test/TestUtils.cpp @@ -114,9 +114,7 @@ void compareIndices( testDistance.data(), testIndices.data()); - int idx = 4; - - int start_idx = idx * numQuery; + int start_idx = 17 * k; int stop_idx = start_idx + k; printf("ref inds: ["); for(int i = start_idx; i < stop_idx; i++) { From 6269ed1c63a782a4fe657b473331dd9c9d8be423 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 24 Jan 2023 14:58:05 -0500 Subject: [PATCH 50/87] Using facebook for licenses in cmake files --- cmake/thirdparty/fetch_rapids.cmake | 15 +++------------ cmake/thirdparty/get_raft.cmake | 18 +++--------------- 2 files changed, 6 insertions(+), 27 deletions(-) diff --git a/cmake/thirdparty/fetch_rapids.cmake b/cmake/thirdparty/fetch_rapids.cmake index 2e14ceac5b..f9405b9f22 100644 --- a/cmake/thirdparty/fetch_rapids.cmake +++ b/cmake/thirdparty/fetch_rapids.cmake @@ -1,16 +1,7 @@ -# ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) Facebook, Inc. and its affiliates. # -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. set(RAPIDS_VERSION "23.02") diff --git a/cmake/thirdparty/get_raft.cmake b/cmake/thirdparty/get_raft.cmake index 1286aee10f..0f12db58ac 100644 --- a/cmake/thirdparty/get_raft.cmake +++ b/cmake/thirdparty/get_raft.cmake @@ -1,19 +1,7 @@ -#============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) Facebook, Inc. and its affiliates. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= - +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. set(RAFT_VERSION "${RAPIDS_VERSION}") set(RAFT_FORK "rapidsai") From b13593af5aa1fa38cc5089b2de7205ebc0c2ba71 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 24 Jan 2023 14:59:36 -0500 Subject: [PATCH 51/87] Adding small note to build.sh that the file is temporary. --- build.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/build.sh b/build.sh index 80341ebcfd..5a0c3c58da 100755 --- a/build.sh +++ b/build.sh @@ -1,5 +1,7 @@ #!/bin/bash +# NOTE: This file is temporary for the proof-of-concept branch and will be removed before this PR is merged + BUILD_TYPE=Release BUILD_DIR=build/ From bc8885dd33405901e2abb3739e874e3820d9d78b Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 31 Jan 2023 12:56:46 -0500 Subject: [PATCH 52/87] Fixing style --- faiss/gpu/GpuIndexFlat.cu | 4 +- faiss/gpu/GpuIndexIVF.h | 5 +- faiss/gpu/GpuIndexIVFFlat.h | 26 ++--- faiss/gpu/GpuResources.cpp | 2 +- faiss/gpu/GpuResources.h | 4 +- faiss/gpu/StandardGpuResources.h | 4 +- faiss/gpu/impl/RaftFlatIndex.cu | 66 +++++++----- faiss/gpu/impl/RaftFlatIndex.cuh | 7 +- faiss/gpu/impl/RaftIVFFlat.cu | 144 ++++++++++++++++--------- faiss/gpu/impl/RaftIVFFlat.cuh | 45 ++++---- faiss/gpu/test/TestGpuIndexFlat.cpp | 4 +- faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 9 +- faiss/gpu/test/TestUtils.cpp | 9 +- 13 files changed, 196 insertions(+), 133 deletions(-) diff --git a/faiss/gpu/GpuIndexFlat.cu b/faiss/gpu/GpuIndexFlat.cu index 44ffbe6fce..174005d7e1 100644 --- a/faiss/gpu/GpuIndexFlat.cu +++ b/faiss/gpu/GpuIndexFlat.cu @@ -89,8 +89,7 @@ GpuIndexFlat::GpuIndexFlat( GpuIndexFlat::~GpuIndexFlat() {} void GpuIndexFlat::resetIndex_(int dims) { - - if(config_.use_raft) { + if (config_.use_raft) { printf("Should use raft!\n"); data_.reset(new RaftFlatIndex( resources_.get(), @@ -108,7 +107,6 @@ void GpuIndexFlat::resetIndex_(int dims) { } } - void GpuIndexFlat::copyFrom(const faiss::IndexFlat* index) { DeviceScope scope(config_.device); diff --git a/faiss/gpu/GpuIndexIVF.h b/faiss/gpu/GpuIndexIVF.h index 6c6efbb888..fa4e2e0845 100644 --- a/faiss/gpu/GpuIndexIVF.h +++ b/faiss/gpu/GpuIndexIVF.h @@ -86,8 +86,9 @@ class GpuIndexIVF : public GpuIndex { /// GPU-side representation. /// Otherwise, it is converted to the CPU format. /// compliant format, while the native GPU format may differ. - virtual std::vector getListVectorData(int listId, bool gpuFormat = false) - const; + virtual std::vector getListVectorData( + int listId, + bool gpuFormat = false) const; /// Return the vector indices contained in a particular inverted list, for /// debugging purposes. 
diff --git a/faiss/gpu/GpuIndexIVFFlat.h b/faiss/gpu/GpuIndexIVFFlat.h index a0ca24ec97..5bfdf17a83 100644 --- a/faiss/gpu/GpuIndexIVFFlat.h +++ b/faiss/gpu/GpuIndexIVFFlat.h @@ -7,8 +7,8 @@ #pragma once -#include #include +#include #include @@ -90,18 +90,18 @@ class GpuIndexIVFFlat : public GpuIndexIVF { void train(idx_t n, const float* x) override; protected: - - void set_index_(GpuResources* resources, - int dim, - int nlist, - faiss::MetricType metric, - float metricArg, - bool useResidual, - /// Optional ScalarQuantizer - faiss::ScalarQuantizer* scalarQ, - bool interleavedLayout, - IndicesOptions indicesOptions, - MemorySpace space); + void set_index_( + GpuResources* resources, + int dim, + int nlist, + faiss::MetricType metric, + float metricArg, + bool useResidual, + /// Optional ScalarQuantizer + faiss::ScalarQuantizer* scalarQ, + bool interleavedLayout, + IndicesOptions indicesOptions, + MemorySpace space); /// Our configuration options const GpuIndexIVFFlatConfig ivfFlatConfig_; diff --git a/faiss/gpu/GpuResources.cpp b/faiss/gpu/GpuResources.cpp index 0129ddafd4..970407e828 100644 --- a/faiss/gpu/GpuResources.cpp +++ b/faiss/gpu/GpuResources.cpp @@ -153,7 +153,7 @@ cudaStream_t GpuResources::getDefaultStreamCurrentDevice() { return getDefaultStream(getCurrentDevice()); } -raft::handle_t &GpuResources::getRaftHandleCurrentDevice() { +raft::handle_t& GpuResources::getRaftHandleCurrentDevice() { return getRaftHandle(getCurrentDevice()); } diff --git a/faiss/gpu/GpuResources.h b/faiss/gpu/GpuResources.h index c286fbae82..d5a4939136 100644 --- a/faiss/gpu/GpuResources.h +++ b/faiss/gpu/GpuResources.h @@ -194,8 +194,8 @@ class GpuResources { /// Returns the raft handle for the given device which can be used to /// make calls to other raft primitives. 
- virtual raft::handle_t &getRaftHandle(int device) = 0; - raft::handle_t &getRaftHandleCurrentDevice(); + virtual raft::handle_t& getRaftHandle(int device) = 0; + raft::handle_t& getRaftHandleCurrentDevice(); /// Overrides the default stream for a device to the user-supplied stream. /// The resources object does not own this stream (i.e., it will not destroy diff --git a/faiss/gpu/StandardGpuResources.h b/faiss/gpu/StandardGpuResources.h index 672f1b8339..ad2b371476 100644 --- a/faiss/gpu/StandardGpuResources.h +++ b/faiss/gpu/StandardGpuResources.h @@ -62,7 +62,7 @@ class StandardGpuResourcesImpl : public GpuResources { /// Returns the raft handle for the given device which can be used to /// make calls to other raft primitives. - raft::handle_t &getRaftHandle(int device) override; + raft::handle_t& getRaftHandle(int device) override; /// Called to change the work ordering streams to the null stream /// for all devices @@ -197,7 +197,7 @@ class StandardGpuResources : public GpuResourcesProvider { /// Returns the raft handle for the given device which can be used to /// make calls to other raft primitives. 
- raft::handle_t &getRaftHandle(int device); + raft::handle_t& getRaftHandle(int device); /// Returns the current amount of temp memory available size_t getTempMemoryAvailable(int device) const; diff --git a/faiss/gpu/impl/RaftFlatIndex.cu b/faiss/gpu/impl/RaftFlatIndex.cu index f0283e2a00..b1254a96a3 100644 --- a/faiss/gpu/impl/RaftFlatIndex.cu +++ b/faiss/gpu/impl/RaftFlatIndex.cu @@ -32,38 +32,54 @@ void RaftFlatIndex::query( Tensor& outDistances, Tensor& outIndices, bool exactDistance) { - // For now, use RAFT's fused KNN when k <= 64 and L2 metric is used - if(k <= 64 && metric == MetricType::METRIC_L2 && vectors_.getSize(0) > 0) { - raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); + if (k <= 64 && metric == MetricType::METRIC_L2 && vectors_.getSize(0) > 0) { + raft::handle_t& raft_handle = resources_->getRaftHandleCurrentDevice(); - auto distance = exactDistance ? raft::distance::DistanceType::L2Unexpanded : - raft::distance::DistanceType::L2Expanded; + auto distance = exactDistance + ? 
raft::distance::DistanceType::L2Unexpanded + : raft::distance::DistanceType::L2Expanded; - auto index = raft::make_device_matrix_view(vectors_.data(), vectors_.getSize(0), vectors_.getSize(1)); - auto search = raft::make_device_matrix_view(input.data(), input.getSize(0), input.getSize(1)); - auto inds = raft::make_device_matrix_view(outIndices.data(), outIndices.getSize(0), outIndices.getSize(1)); - auto dists = raft::make_device_matrix_view(outDistances.data(), outDistances.getSize(0), outDistances.getSize(1)); + auto index = raft::make_device_matrix_view( + vectors_.data(), vectors_.getSize(0), vectors_.getSize(1)); + auto search = raft::make_device_matrix_view( + input.data(), input.getSize(0), input.getSize(1)); + auto inds = raft::make_device_matrix_view( + outIndices.data(), + outIndices.getSize(0), + outIndices.getSize(1)); + auto dists = raft::make_device_matrix_view( + outDistances.data(), + outDistances.getSize(0), + outDistances.getSize(1)); -// raft::neighbors::brute_force::knn(raft_handle, index, search, inds, dists, k, distance); + // raft::neighbors::brute_force::knn(raft_handle, index, search, + // inds, dists, k, distance); // TODO: Expose the fused L2KNN through RAFT's public APIs - raft::spatial::knn::detail::fusedL2Knn(dim_, - inds.data_handle(), - dists.data_handle(), - index.data_handle(), - search.data_handle(), - index.extent(0), - search.extent(0), - k, - true, - true, - raft_handle.get_stream(), - distance); - - } else { + raft::spatial::knn::detail::fusedL2Knn( + dim_, + inds.data_handle(), + dists.data_handle(), + index.data_handle(), + search.data_handle(), + index.extent(0), + search.extent(0), + k, + true, + true, + raft_handle.get_stream(), + distance); - FlatIndex::query(input, k, metric, metricArg, outDistances, outIndices, exactDistance); + } else { + FlatIndex::query( + input, + k, + metric, + metricArg, + outDistances, + outIndices, + exactDistance); } } } // namespace gpu diff --git a/faiss/gpu/impl/RaftFlatIndex.cuh 
b/faiss/gpu/impl/RaftFlatIndex.cuh index ad48102254..ed4f2572e0 100644 --- a/faiss/gpu/impl/RaftFlatIndex.cuh +++ b/faiss/gpu/impl/RaftFlatIndex.cuh @@ -25,7 +25,11 @@ class GpuResources; /// data is possibly needed for certain residual operations class RaftFlatIndex : public FlatIndex { public: - RaftFlatIndex(GpuResources* res, int dim, bool useFloat16, MemorySpace space); + RaftFlatIndex( + GpuResources* res, + int dim, + bool useFloat16, + MemorySpace space); void query( Tensor& vecs, @@ -35,7 +39,6 @@ class RaftFlatIndex : public FlatIndex { Tensor& outDistances, Tensor& outIndices, bool exactDistance) override; - }; } // namespace gpu diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 0b94b6e5db..cbe3e58a7c 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -5,9 +5,9 @@ * LICENSE file in the root directory of this source tree. */ -#include -#include #include +#include +#include #include #include @@ -16,12 +16,12 @@ #include #include #include -#include #include #include #include #include #include +#include #include #include #include @@ -56,11 +56,10 @@ RaftIVFFlat::RaftIVFFlat( scalarQ, interleavedLayout, indicesOptions, - space){} + space) {} RaftIVFFlat::~RaftIVFFlat() {} - /// Find the approximate k nearest neighbors for `queries` against /// our database void RaftIVFFlat::search( @@ -83,16 +82,25 @@ void RaftIVFFlat::search( FAISS_ASSERT(n > 0); FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= numLists_); - const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); + const raft::handle_t& raft_handle = + resources_->getRaftHandleCurrentDevice(); raft::neighbors::ivf_flat::search_params pams; pams.n_probes = nprobe; - auto queries_view = raft::make_device_matrix_view(queries.data(), n, cols); - auto out_inds_view = raft::make_device_matrix_view(outIndices.data(), n, k_); - auto out_dists_view = raft::make_device_matrix_view(outDistances.data(), n, k_); + auto queries_view = + 
raft::make_device_matrix_view(queries.data(), n, cols); + auto out_inds_view = + raft::make_device_matrix_view(outIndices.data(), n, k_); + auto out_dists_view = + raft::make_device_matrix_view(outDistances.data(), n, k_); raft::neighbors::ivf_flat::search( - raft_handle, *raft_knn_index, queries_view, - out_inds_view, out_dists_view, pams, k_); + raft_handle, + *raft_knn_index, + queries_view, + out_inds_view, + out_dists_view, + pams, + k_); raft_handle.sync_stream(); } @@ -109,20 +117,25 @@ int RaftIVFFlat::addVectors( raft::print_device_vector("add_vectors", vecs.data(), 50, std::cout); - auto vecs_view = raft::make_device_matrix_view(vecs.data(), vecs.getSize(0), dim_); - auto inds_view = raft::make_device_vector_view(indices.data(), (idx_t )indices.getSize(0)); + auto vecs_view = raft::make_device_matrix_view( + vecs.data(), vecs.getSize(0), dim_); + auto inds_view = raft::make_device_vector_view( + indices.data(), (idx_t)indices.getSize(0)); - const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); + const raft::handle_t& raft_handle = + resources_->getRaftHandleCurrentDevice(); printf("About to call extend on index\n"); // TODO: We probably don't want to ignore the coarse quantizer here - if(raft_knn_index.has_value()) { + if (raft_knn_index.has_value()) { raft_knn_index.emplace(raft::neighbors::ivf_flat::extend( raft_handle, raft_knn_index.value(), vecs_view, - std::make_optional>(inds_view))); + std::make_optional< + raft::device_vector_view>( + inds_view))); } else { printf("Index has not been trained!\n"); @@ -140,28 +153,40 @@ int RaftIVFFlat::getListLength(int listId) const { printf("Inside RaftIVFFlat getListLength\n"); FAISS_ASSERT(raft_knn_index.has_value()); - const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); + const raft::handle_t& raft_handle = + resources_->getRaftHandleCurrentDevice(); uint32_t size; - raft::copy(&size, raft_knn_index.value().list_sizes().data_handle() + listId, - 1, 
raft_handle.get_stream()); + raft::copy( + &size, + raft_knn_index.value().list_sizes().data_handle() + listId, + 1, + raft_handle.get_stream()); raft_handle.sync_stream(); return int(size); } /// Return the list indices of a particular list back to the CPU std::vector RaftIVFFlat::getListIndices(int listId) const { - printf("Inside RaftIVFFlat getListIndices\n"); FAISS_ASSERT(raft_knn_index.has_value()); - const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); + const raft::handle_t& raft_handle = + resources_->getRaftHandleCurrentDevice(); idx_t offset; uint32_t size; - raft::copy(&offset, raft_knn_index.value().list_offsets().data_handle() + listId, 1, raft_handle.get_stream()); - raft::copy(&size, raft_knn_index.value().list_sizes().data_handle() + listId, 1, raft_handle.get_stream()); + raft::copy( + &offset, + raft_knn_index.value().list_offsets().data_handle() + listId, + 1, + raft_handle.get_stream()); + raft::copy( + &size, + raft_knn_index.value().list_sizes().data_handle() + listId, + 1, + raft_handle.get_stream()); raft_handle.sync_stream(); std::vector vec(size); @@ -174,32 +199,36 @@ std::vector RaftIVFFlat::getListIndices(int listId) const { } /// Return the encoded vectors of a particular list back to the CPU -std::vector RaftIVFFlat::getListVectorData(int listId, bool gpuFormat) const { - +std::vector RaftIVFFlat::getListVectorData(int listId, bool gpuFormat) + const { printf("Inside RaftIVFFlat getListVectorData\n"); FAISS_ASSERT(raft_knn_index.has_value()); - const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); + const raft::handle_t& raft_handle = + resources_->getRaftHandleCurrentDevice(); std::cout << "Calling getListVectorData for " << listId << std::endl; using elem_t = decltype(raft_knn_index.value().data())::element_type; size_t dim = raft_knn_index.value().dim(); idx_t offsets[2]; - raft::copy(offsets, raft_knn_index.value().list_offsets().data_handle() + listId, 2, 
raft_handle.get_stream()); + raft::copy( + offsets, + raft_knn_index.value().list_offsets().data_handle() + listId, + 2, + raft_handle.get_stream()); raft_handle.sync_stream(); size_t byte_offset = offsets[0] * sizeof(elem_t) * dim; // the interleaved block can be slightly larger than the list size (it's // rounded up) - size_t byte_size = size_t(offsets[1]) * - sizeof(elem_t) * dim - - byte_offset; + size_t byte_size = size_t(offsets[1]) * sizeof(elem_t) * dim - byte_offset; std::vector vec(byte_size); raft::copy( vec.data(), - reinterpret_cast(raft_knn_index.value().data().data_handle()) + - byte_offset, + reinterpret_cast( + raft_knn_index.value().data().data_handle()) + + byte_offset, byte_size, raft_handle.get_stream()); return vec; @@ -224,8 +253,9 @@ void RaftIVFFlat::searchPreassigned( void RaftIVFFlat::updateQuantizer(Index* quantizer) { idx_t quantizer_ntotal = quantizer->ntotal; - std::cout << "Calling RAFT updateQuantizer with trained index with " << quantizer_ntotal << " items" << std::endl; - const raft::handle_t &handle = resources_->getRaftHandleCurrentDevice(); + std::cout << "Calling RAFT updateQuantizer with trained index with " + << quantizer_ntotal << " items" << std::endl; + const raft::handle_t& handle = resources_->getRaftHandleCurrentDevice(); auto stream = handle.get_stream(); auto total_elems = size_t(quantizer_ntotal) * size_t(quantizer->d); @@ -248,7 +278,12 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { FAISS_THROW_MSG("Metric is not supported."); } - raft_knn_index.emplace(handle, pams.metric, (uint32_t)this->numLists_, false, (uint32_t)this->dim_); + raft_knn_index.emplace( + handle, + pams.metric, + (uint32_t)this->numLists_, + false, + (uint32_t)this->dim_); printf("Reconstructing\n"); // Copy (reconstructed) centroids over, rather than re-training @@ -258,15 +293,22 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { printf("Copying...\n"); - raft::update_device(raft_knn_index.value().centers().data_handle(), 
buf_host.data(), total_elems, stream); - - raft::print_device_vector("raft centers", raft_knn_index.value().centers().data_handle(), this->dim_, std::cout); + raft::update_device( + raft_knn_index.value().centers().data_handle(), + buf_host.data(), + total_elems, + stream); + + raft::print_device_vector( + "raft centers", + raft_knn_index.value().centers().data_handle(), + this->dim_, + std::cout); } - // // -//void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { +// void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { // size_t nlist = ivf ? ivf->nlist : 0; // size_t ntotal = ivf ? ivf->compute_ntotal() : 0; // @@ -303,12 +345,14 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { // i, ivf->get_codes(i), ivf->get_ids(i), listSize); // } // -// raft::update_device(raft_knn_index.value().list_sizes().data_handle(), list_sizes_.data(), nlist, handle.get_stream()); -// raft::update_device(raft_knn_index.value().list_offsets().data_handle(), list_offsets_.data(), nlist+1, handle.get_stream()); +// raft::update_device(raft_knn_index.value().list_sizes().data_handle(), +// list_sizes_.data(), nlist, handle.get_stream()); +// raft::update_device(raft_knn_index.value().list_offsets().data_handle(), +// list_offsets_.data(), nlist+1, handle.get_stream()); // //} -//void RaftIVFFlat::addEncodedVectorsToList_( +// void RaftIVFFlat::addEncodedVectorsToList_( // int listId, // const void* codes, // const Index::idx_t* indices, @@ -334,16 +378,20 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { // // // We only have int32 length representations on the GPU per each // // list; the length is in sizeof(char) -// FAISS_ASSERT(gpuListSizeInBytes <= (size_t)std::numeric_limits::max()); +// FAISS_ASSERT(gpuListSizeInBytes <= +// (size_t)std::numeric_limits::max()); // // // Translate the codes as needed to our preferred form // std::vector codesV(cpuListSizeInBytes); // std::memcpy(codesV.data(), codes, cpuListSizeInBytes); // auto 
translatedCodes = translateCodesToGpu_(std::move(codesV), numVecs); // -// std::cout << "numVecs=" << numVecs << "gpuListSizeInBytes=" << gpuListSizeInBytes << std::endl; +// std::cout << "numVecs=" << numVecs << "gpuListSizeInBytes=" << +// gpuListSizeInBytes << std::endl; // -//// RAFT_CUDA_TRY(cudaMemcpyAsync(raft_knn_index.value().data().data_handle()+(), translatedCodes.data(), )) +//// +///RAFT_CUDA_TRY(cudaMemcpyAsync(raft_knn_index.value().data().data_handle()+(), +///translatedCodes.data(), )) // //// listCodes->data.append( //// translatedCodes.data(), @@ -365,14 +413,12 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { //// maxListLength_ = std::max(maxListLength_, (int)numVecs); //} - ///// Copy all inverted lists from ourselves to a CPU representation -//void RaftIVFFlat::copyInvertedListsTo(InvertedLists* ivf) { +// void RaftIVFFlat::copyInvertedListsTo(InvertedLists* ivf) { // printf("Inside RaftIVFFlat copyInvertedListsTo\n"); // // // TODO: Need to replicate copyInvertedListsTo() in IVFBase.cu //} - } // namespace gpu } // namespace faiss diff --git a/faiss/gpu/impl/RaftIVFFlat.cuh b/faiss/gpu/impl/RaftIVFFlat.cuh index 968f8fd727..13bd0544e7 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cuh +++ b/faiss/gpu/impl/RaftIVFFlat.cuh @@ -20,7 +20,8 @@ namespace gpu { class RaftIVFFlat : public IVFFlat { public: - RaftIVFFlat(GpuResources* resources, + RaftIVFFlat( + GpuResources* resources, int dim, int nlist, faiss::MetricType metric, @@ -34,7 +35,6 @@ class RaftIVFFlat : public IVFFlat { ~RaftIVFFlat() override; - /// Find the approximate k nearest neigbors for `queries` against /// our database void search( @@ -78,32 +78,31 @@ class RaftIVFFlat : public IVFFlat { std::vector getListIndices(int listId) const override; /// Return the encoded vectors of a particular list back to the CPU - std::vector getListVectorData(int listId, bool gpuFormat) const override; + std::vector getListVectorData(int listId, bool gpuFormat) + const override; void 
updateQuantizer(Index* quantizer) override; -// -// /// Copy all inverted lists from a CPU representation to ourselves -// void copyInvertedListsFrom(const InvertedLists* ivf) override; -// -// /// Copy all inverted lists from ourselves to a CPU representation -// void copyInvertedListsTo(InvertedLists* ivf) override; + // + // /// Copy all inverted lists from a CPU representation to ourselves + // void copyInvertedListsFrom(const InvertedLists* ivf) override; + // + // /// Copy all inverted lists from ourselves to a CPU representation + // void copyInvertedListsTo(InvertedLists* ivf) override; protected: - -// /// Adds a set of codes and indices to a list, with the representation -// /// coming from the CPU equivalent -// void addEncodedVectorsToList_( -// int listId, -// // resident on the host -// const void* codes, -// // resident on the host -// const Index::idx_t* indices, -// size_t numVecs) override; - - - std::optional> raft_knn_index{std::nullopt}; - + // /// Adds a set of codes and indices to a list, with the representation + // /// coming from the CPU equivalent + // void addEncodedVectorsToList_( + // int listId, + // // resident on the host + // const void* codes, + // // resident on the host + // const Index::idx_t* indices, + // size_t numVecs) override; + + std::optional> + raft_knn_index{std::nullopt}; }; } // namespace gpu diff --git a/faiss/gpu/test/TestGpuIndexFlat.cpp b/faiss/gpu/test/TestGpuIndexFlat.cpp index 6c806c7881..7e760551e7 100644 --- a/faiss/gpu/test/TestGpuIndexFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexFlat.cpp @@ -266,7 +266,7 @@ TEST(TestGpuIndexFlat, CopyFrom) { for (bool useFloat16 : {false, true}) { faiss::gpu::GpuIndexFlatConfig config; config.device = device; - config.use_raft = true; + config.use_raft = true; config.useFloat16 = useFloat16; // Fill with garbage values @@ -309,7 +309,7 @@ TEST(TestGpuIndexFlat, CopyTo) { for (bool useFloat16 : {false, true}) { faiss::gpu::GpuIndexFlatConfig config; config.device = device; - 
config.use_raft = true; + config.use_raft = true; config.useFloat16 = useFloat16; faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config); diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index a7d1e8f18f..6b98743d90 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -27,7 +27,10 @@ struct Options { numCentroids = std::sqrt((float)numAdd / 2); numTrain = numCentroids * 40; - nprobe = std::min(numCentroids, int(numCentroids / 2) + 10);//faiss::gpu::randVal(std::min(50, numCentroids), numCentroids); + nprobe = std::min( + numCentroids, + int(numCentroids / 2) + 10); // faiss::gpu::randVal(std::min(50, + // numCentroids), numCentroids); numQuery = faiss::gpu::randVal(32, 100); // Due to the approximate nature of the query and of floating point @@ -68,7 +71,6 @@ void queryTest( faiss::MetricType metricType, bool useFloat16CoarseQuantizer) { for (int tries = 0; tries < 2; ++tries) { - std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); @@ -127,7 +129,7 @@ void addTest(faiss::MetricType metricType, bool useFloat16CoarseQuantizer) { std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); printf("original add vectors: ["); - for(int i = 0; i < 50; ++i) { + for (int i = 0; i < 50; ++i) { printf("%f, ", addVecs[i]); } printf("]\n"); @@ -187,7 +189,6 @@ void copyToTest(bool useFloat16CoarseQuantizer) { config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); gpuIndex.train(opt.numTrain, trainVecs.data()); diff --git a/faiss/gpu/test/TestUtils.cpp b/faiss/gpu/test/TestUtils.cpp index a02618c4e4..04f136782c 100644 --- a/faiss/gpu/test/TestUtils.cpp +++ b/faiss/gpu/test/TestUtils.cpp @@ -117,30 +117,29 @@ void compareIndices( int start_idx = 17 * k; int 
stop_idx = start_idx + k; printf("ref inds: ["); - for(int i = start_idx; i < stop_idx; i++) { + for (int i = start_idx; i < stop_idx; i++) { printf("%d, ", int(refIndices[i])); } printf("]\n"); printf("test inds: ["); - for(int i = start_idx; i < stop_idx; i++) { + for (int i = start_idx; i < stop_idx; i++) { printf("%d, ", int(testIndices[i])); } printf("]\n"); printf("ref dists: ["); - for(int i = start_idx; i < stop_idx; i++) { + for (int i = start_idx; i < stop_idx; i++) { printf("%f, ", float(refDistance[i])); } printf("]\n"); printf("test dists: ["); - for(int i = start_idx; i < stop_idx; i++) { + for (int i = start_idx; i < stop_idx; i++) { printf("%f, ", float(testDistance[i])); } printf("]\n"); - faiss::gpu::compareLists( refDistance.data(), refIndices.data(), From 19f38d4557b264a3f7856eed5b97d486afe5c746 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 31 Jan 2023 13:04:29 -0500 Subject: [PATCH 53/87] Second pass of fixing formatting --- faiss/gpu/GpuIndexFlat.h | 1 - faiss/gpu/GpuIndexIVF.cu | 8 +-- faiss/gpu/GpuIndexIVFFlat.cu | 102 +++++++++++++++++------------ faiss/gpu/StandardGpuResources.cpp | 5 +- faiss/gpu/impl/IVFBase.cuh | 3 +- faiss/gpu/impl/RaftIVFFlat.cu | 4 +- 6 files changed, 71 insertions(+), 52 deletions(-) diff --git a/faiss/gpu/GpuIndexFlat.h b/faiss/gpu/GpuIndexFlat.h index 084bacd4b6..a3c177040b 100644 --- a/faiss/gpu/GpuIndexFlat.h +++ b/faiss/gpu/GpuIndexFlat.h @@ -115,7 +115,6 @@ class GpuIndexFlat : public GpuIndex { } protected: - void resetIndex_(int dims); /// Flat index does not require IDs as there is no storage available for diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu index c35667e4a9..6181ba5401 100644 --- a/faiss/gpu/GpuIndexIVF.cu +++ b/faiss/gpu/GpuIndexIVF.cu @@ -463,10 +463,10 @@ void GpuIndexIVF::trainQuantizer_(idx_t n, const float* x) { printf("Training IVF quantizer on %ld vectors in %dD\n", n, d); } - if(config_.use_raft) { - + if (config_.use_raft) { printf("Using raft to train 
quantizer for %d vectors\n", n); - const raft::handle_t &raft_handle = resources_->getRaftHandleCurrentDevice(); + const raft::handle_t& raft_handle = + resources_->getRaftHandleCurrentDevice(); raft::neighbors::ivf_flat::index_params raft_idx_params; raft_idx_params.n_lists = nlist; @@ -475,7 +475,7 @@ void GpuIndexIVF::trainQuantizer_(idx_t n, const float* x) { raft_idx_params.kmeans_n_iters = 100; auto raft_index = raft::neighbors::ivf_flat::build( - raft_handle, raft_idx_params, x, n, (idx_t)d); + raft_handle, raft_idx_params, x, n, (idx_t)d); raft_handle.sync_stream(); diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 45019bb5aa..7c152cc420 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -5,16 +5,16 @@ * LICENSE file in the root directory of this source tree. */ -#include -#include #include #include #include #include #include #include -#include +#include #include +#include +#include #include #include @@ -74,7 +74,8 @@ GpuIndexIVFFlat::GpuIndexIVFFlat( if (this->is_trained) { FAISS_ASSERT(this->quantizer); - set_index_(resources_.get(), + set_index_( + resources_.get(), this->d, this->nlist, this->metric_type, @@ -91,26 +92,43 @@ GpuIndexIVFFlat::GpuIndexIVFFlat( GpuIndexIVFFlat::~GpuIndexIVFFlat() {} -void GpuIndexIVFFlat::set_index_(GpuResources* resources, - int dim, - int nlist, - faiss::MetricType metric, - float metricArg, - bool useResidual, - /// Optional ScalarQuantizer - faiss::ScalarQuantizer* scalarQ, - bool interleavedLayout, - IndicesOptions indicesOptions, - MemorySpace space) { - if(config_.use_raft) { +void GpuIndexIVFFlat::set_index_( + GpuResources* resources, + int dim, + int nlist, + faiss::MetricType metric, + float metricArg, + bool useResidual, + /// Optional ScalarQuantizer + faiss::ScalarQuantizer* scalarQ, + bool interleavedLayout, + IndicesOptions indicesOptions, + MemorySpace space) { + if (config_.use_raft) { printf("Setting RaftIVFFlat index\n"); index_.reset(new 
RaftIVFFlat( - resources, dim, nlist, metric, metricArg, useResidual, - scalarQ, interleavedLayout, indicesOptions, space)); + resources, + dim, + nlist, + metric, + metricArg, + useResidual, + scalarQ, + interleavedLayout, + indicesOptions, + space)); } else { index_.reset(new IVFFlat( - resources, dim, nlist, metric, metricArg, useResidual, - scalarQ, interleavedLayout, indicesOptions, space)); + resources, + dim, + nlist, + metric, + metricArg, + useResidual, + scalarQ, + interleavedLayout, + indicesOptions, + space)); } baseIndex_ = std::static_pointer_cast(index_); @@ -127,7 +145,6 @@ void GpuIndexIVFFlat::reserveMemory(size_t numVecs) { } void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { - printf("Inside copyFrom\n"); DeviceScope scope(config_.device); @@ -148,7 +165,8 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { FAISS_ASSERT(is_trained); // Copy our lists as well - set_index_(resources_.get(), + set_index_( + resources_.get(), d, nlist, index->metric_type, @@ -159,12 +177,13 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { ivfFlatConfig_.indicesOptions, config_.memorySpace); - if(config_.use_raft) { + if (config_.use_raft) { + printf("Reconstructing %d original vectors and adding to GPU index\n", + ntotal); - printf("Reconstructing %d original vectors and adding to GPU index\n", ntotal); - - // Quantizer should already have been updated above. Add reconstructed vectors to raft index - if(ntotal > 0) { + // Quantizer should already have been updated above. 
Add reconstructed + // vectors to raft index + if (ntotal > 0) { std::vector buf_host(ntotal * d); std::vector ids(ntotal); std::iota(ids.begin(), ids.end(), 0); @@ -172,7 +191,6 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { add_with_ids(ntotal, buf_host.data(), ids.data()); } } else { - // Copy all of the IVF data printf("Copying inverted lists from cpu index to FAISS gpu index flat\n"); index_->copyInvertedListsFrom(index->invlists); @@ -251,7 +269,8 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) { FAISS_ASSERT(!index_); // FIXME: GPUize more of this - // First, make sure that the data is resident on the CPU, if it is not on the CPU, as we depend upon parts of the CPU code + // First, make sure that the data is resident on the CPU, if it is not on + // the CPU, as we depend upon parts of the CPU code auto hostData = toHost( (float*)x, resources_->getDefaultStream(config_.device), @@ -260,18 +279,19 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) { trainQuantizer_(n, hostData.data()); // The quantizer is now trained; construct the IVF index - set_index_(resources_.get(), - this->d, - this->nlist, - this->metric_type, - this->metric_arg, - false, // no residual - nullptr, // no scalar quantizer - ivfFlatConfig_.interleavedLayout, - ivfFlatConfig_.indicesOptions, - config_.memorySpace); - - if (reserveMemoryVecs_) { + set_index_( + resources_.get(), + this->d, + this->nlist, + this->metric_type, + this->metric_arg, + false, // no residual + nullptr, // no scalar quantizer + ivfFlatConfig_.interleavedLayout, + ivfFlatConfig_.indicesOptions, + config_.memorySpace); + + if (reserveMemoryVecs_) { index_->reserveMemory(reserveMemoryVecs_); } diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp index c593264ab0..af6c9579b9 100644 --- a/faiss/gpu/StandardGpuResources.cpp +++ b/faiss/gpu/StandardGpuResources.cpp @@ -379,7 +379,7 @@ cudaStream_t StandardGpuResourcesImpl::getDefaultStream(int device) { 
return defaultStreams_[device]; } -raft::handle_t &StandardGpuResourcesImpl::getRaftHandle(int device) { +raft::handle_t& StandardGpuResourcesImpl::getRaftHandle(int device) { initializeForDevice(device); auto it = raftHandles_.find(device); @@ -390,7 +390,6 @@ raft::handle_t &StandardGpuResourcesImpl::getRaftHandle(int device) { // Otherwise, our base default handle return raftHandles_[device]; - } std::vector StandardGpuResourcesImpl::getAlternateStreams( @@ -618,7 +617,7 @@ cudaStream_t StandardGpuResources::getDefaultStream(int device) { return res_->getDefaultStream(device); } - raft::handle_t &StandardGpuResources::getRaftHandle(int device) { +raft::handle_t& StandardGpuResources::getRaftHandle(int device) { return res_->getRaftHandle(device); } diff --git a/faiss/gpu/impl/IVFBase.cuh b/faiss/gpu/impl/IVFBase.cuh index 537120f769..24e6fb708c 100644 --- a/faiss/gpu/impl/IVFBase.cuh +++ b/faiss/gpu/impl/IVFBase.cuh @@ -65,7 +65,8 @@ class IVFBase { virtual std::vector getListIndices(int listId) const; /// Return the encoded vectors of a particular list back to the CPU - virtual std::vector getListVectorData(int listId, bool gpuFormat) const; + virtual std::vector getListVectorData(int listId, bool gpuFormat) + const; /// Copy all inverted lists from a CPU representation to ourselves virtual void copyInvertedListsFrom(const InvertedLists* ivf); diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index cbe3e58a7c..5d7268497b 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -390,8 +390,8 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { // gpuListSizeInBytes << std::endl; // //// -///RAFT_CUDA_TRY(cudaMemcpyAsync(raft_knn_index.value().data().data_handle()+(), -///translatedCodes.data(), )) +/// RAFT_CUDA_TRY(cudaMemcpyAsync(raft_knn_index.value().data().data_handle()+(), +/// translatedCodes.data(), )) // //// listCodes->data.append( //// translatedCodes.data(), From 
17df7989221aa24a4c97f730f0bdd8edb480e45b Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 31 Jan 2023 13:06:08 -0500 Subject: [PATCH 54/87] Third pass at fixing format style --- faiss/gpu/impl/IVFFlat.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/faiss/gpu/impl/IVFFlat.cu b/faiss/gpu/impl/IVFFlat.cu index 4c8ca19552..8997039580 100644 --- a/faiss/gpu/impl/IVFFlat.cu +++ b/faiss/gpu/impl/IVFFlat.cu @@ -58,7 +58,7 @@ size_t IVFFlat::getGpuVectorsEncodingSize_(int numVecs) const { int bits = scalarQ_ ? scalarQ_->bits : 32 /* float */; // bytes to encode a block of 32 vectors (single dimension) - int bytesPerDimBlock = bits * 32 / 8; // = 128 if bits == 32 + int bytesPerDimBlock = bits * 32 / 8; // = 128 if bits == 32 // bytes to fully encode 32 vectors int bytesPerBlock = bytesPerDimBlock * dim_; @@ -93,7 +93,9 @@ std::vector IVFFlat::translateCodesToGpu_( bool sc = scalarQ_ ? true : false; int bitsPerCode = scalarQ_ ? scalarQ_->bits : 32; - std::cout << "dim_=" << dim_ << ", scalarQ_=" << sc << ", bitsPerCode=" << bitsPerCode << ", interleavedLayout_=" << interleavedLayout_ << std::endl; + std::cout << "dim_=" << dim_ << ", scalarQ_=" << sc + << ", bitsPerCode=" << bitsPerCode + << ", interleavedLayout_=" << interleavedLayout_ << std::endl; auto up = unpackNonInterleaved(std::move(codes), numVecs, dim_, bitsPerCode); @@ -114,7 +116,6 @@ std::vector IVFFlat::translateCodesFromGpu_( return packNonInterleaved(std::move(up), numVecs, dim_, bitsPerCode); } - void IVFFlat::appendVectors_( Tensor& vecs, Tensor& ivfCentroidResiduals, @@ -199,7 +200,6 @@ void IVFFlat::search( makeTempAlloc(AllocType::Other, stream), {queries.getSize(0), nprobe, dim_}); - searchCoarseQuantizer_( coarseQuantizer, nprobe, From 29934410464ecc0d38a8c2e19ea044dcf3a88cb4 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 31 Jan 2023 13:23:37 -0500 Subject: [PATCH 55/87] Adding nvidia license for traceability --- faiss/gpu/GpuIndexIVFFlat.cu | 15 +++++++++++++++ faiss/gpu/GpuIndexIVFFlat.h | 15 +++++++++++++++ faiss/gpu/GpuResources.cpp | 15 +++++++++++++++ faiss/gpu/GpuResources.h | 15 +++++++++++++++ faiss/gpu/StandardGpuResources.cpp | 15 +++++++++++++++ faiss/gpu/StandardGpuResources.h | 15 +++++++++++++++ faiss/gpu/impl/RaftFlatIndex.cu | 15 +++++++++++++++ faiss/gpu/impl/RaftFlatIndex.cuh | 15 +++++++++++++++ faiss/gpu/impl/RaftIVFFlat.cu | 16 +++++++++++++++- faiss/gpu/impl/RaftIVFFlat.cuh | 15 +++++++++++++++ 10 files changed, 150 insertions(+), 1 deletion(-) diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 7c152cc420..f2122d58bf 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -4,6 +4,21 @@ * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #include #include diff --git a/faiss/gpu/GpuIndexIVFFlat.h b/faiss/gpu/GpuIndexIVFFlat.h index 5bfdf17a83..7b5adf58f7 100644 --- a/faiss/gpu/GpuIndexIVFFlat.h +++ b/faiss/gpu/GpuIndexIVFFlat.h @@ -4,6 +4,21 @@ * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. 
*/ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #pragma once diff --git a/faiss/gpu/GpuResources.cpp b/faiss/gpu/GpuResources.cpp index 970407e828..cc74f2df10 100644 --- a/faiss/gpu/GpuResources.cpp +++ b/faiss/gpu/GpuResources.cpp @@ -4,6 +4,21 @@ * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #include #include diff --git a/faiss/gpu/GpuResources.h b/faiss/gpu/GpuResources.h index d5a4939136..dfec38796d 100644 --- a/faiss/gpu/GpuResources.h +++ b/faiss/gpu/GpuResources.h @@ -4,6 +4,21 @@ * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #pragma once diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp index af6c9579b9..7e620c7aec 100644 --- a/faiss/gpu/StandardGpuResources.cpp +++ b/faiss/gpu/StandardGpuResources.cpp @@ -4,6 +4,21 @@ * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #include diff --git a/faiss/gpu/StandardGpuResources.h b/faiss/gpu/StandardGpuResources.h index ad2b371476..490734addc 100644 --- a/faiss/gpu/StandardGpuResources.h +++ b/faiss/gpu/StandardGpuResources.h @@ -4,6 +4,21 @@ * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #pragma once diff --git a/faiss/gpu/impl/RaftFlatIndex.cu b/faiss/gpu/impl/RaftFlatIndex.cu index b1254a96a3..ac9594bdb7 100644 --- a/faiss/gpu/impl/RaftFlatIndex.cu +++ b/faiss/gpu/impl/RaftFlatIndex.cu @@ -4,6 +4,21 @@ * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #include #include diff --git a/faiss/gpu/impl/RaftFlatIndex.cuh b/faiss/gpu/impl/RaftFlatIndex.cuh index ed4f2572e0..d947f9b21d 100644 --- a/faiss/gpu/impl/RaftFlatIndex.cuh +++ b/faiss/gpu/impl/RaftFlatIndex.cuh @@ -4,6 +4,21 @@ * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #pragma once diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 5d7268497b..3820beb2b1 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -4,7 +4,21 @@ * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ - +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #include #include #include diff --git a/faiss/gpu/impl/RaftIVFFlat.cuh b/faiss/gpu/impl/RaftIVFFlat.cuh index 13bd0544e7..d0c17fd2e9 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cuh +++ b/faiss/gpu/impl/RaftIVFFlat.cuh @@ -4,6 +4,21 @@ * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #pragma once From 5e7eb6dccf2b505721d1be1f8bf162984b1d3b95 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 6 Feb 2023 12:07:09 -0500 Subject: [PATCH 56/87] Updates --- faiss/gpu/GpuIndexIVF.cu | 5 ++--- faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 8 ++------ 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu index 6181ba5401..4b4197d556 100644 --- a/faiss/gpu/GpuIndexIVF.cu +++ b/faiss/gpu/GpuIndexIVF.cu @@ -459,9 +459,7 @@ void GpuIndexIVF::trainQuantizer_(idx_t n, const float* x) { return; } - if (this->verbose) { - printf("Training IVF quantizer on %ld vectors in %dD\n", n, d); - } + printf("Training IVF quantizer on %ld vectors in %dD\n", n, d); if (config_.use_raft) { printf("Using raft to train quantizer for %d vectors\n", n); @@ -472,6 +470,7 @@ void GpuIndexIVF::trainQuantizer_(idx_t n, const float* x) { raft_idx_params.n_lists = nlist; raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; raft_idx_params.add_data_on_build = false; + raft_idx_params.kmeans_trainset_fraction = 1.0; raft_idx_params.kmeans_n_iters = 100; auto raft_index = raft::neighbors::ivf_flat::build( diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index 6b98743d90..d03a7947b2 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -20,6 +20,7 @@ constexpr 
float kF16MaxRelErr = 0.3f; constexpr float kF32MaxRelErr = 0.03f; + struct Options { Options() { numAdd = 2 * faiss::gpu::randVal(2000, 5000); @@ -27,10 +28,7 @@ struct Options { numCentroids = std::sqrt((float)numAdd / 2); numTrain = numCentroids * 40; - nprobe = std::min( - numCentroids, - int(numCentroids / 2) + 10); // faiss::gpu::randVal(std::min(50, - // numCentroids), numCentroids); + nprobe = faiss::gpu::randVal(std::min(50, numCentroids), numCentroids); numQuery = faiss::gpu::randVal(32, 100); // Due to the approximate nature of the query and of floating point @@ -102,8 +100,6 @@ void queryTest( gpuIndex.setNumProbes(opt.nprobe); bool compFloat16 = useFloat16CoarseQuantizer; - - printf("Use float16: %d\n", compFloat16); faiss::gpu::compareIndices( cpuIndex, gpuIndex, From ddc75ac77c89a4a3cd6f5e4198d4068dc2a7775a Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 7 Feb 2023 16:16:24 -0500 Subject: [PATCH 57/87] Merging --- faiss/gpu/impl/IVFBase.cuh | 2 +- faiss/gpu/impl/RaftFlatIndex.cu | 4 ++-- faiss/gpu/impl/RaftFlatIndex.cuh | 2 +- faiss/gpu/impl/RaftIVFFlat.cu | 8 ++++---- faiss/gpu/impl/RaftIVFFlat.cuh | 8 ++++---- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/faiss/gpu/impl/IVFBase.cuh b/faiss/gpu/impl/IVFBase.cuh index 9fdaace261..2bb319d002 100644 --- a/faiss/gpu/impl/IVFBase.cuh +++ b/faiss/gpu/impl/IVFBase.cuh @@ -59,7 +59,7 @@ class IVFBase { /// For debugging purposes, return the list length of a particular /// list - virtual idx_t getListLength(int listId) const; + virtual idx_t getListLength(idx_t listId) const; /// Return the list indices of a particular list back to the CPU virtual std::vector getListIndices(idx_t listId) const; diff --git a/faiss/gpu/impl/RaftFlatIndex.cu b/faiss/gpu/impl/RaftFlatIndex.cu index ac9594bdb7..a506086423 100644 --- a/faiss/gpu/impl/RaftFlatIndex.cu +++ b/faiss/gpu/impl/RaftFlatIndex.cu @@ -45,7 +45,7 @@ void RaftFlatIndex::query( faiss::MetricType metric, float metricArg, Tensor& 
outDistances, - Tensor& outIndices, + Tensor& outIndices, bool exactDistance) { // For now, use RAFT's fused KNN when k <= 64 and L2 metric is used if (k <= 64 && metric == MetricType::METRIC_L2 && vectors_.getSize(0) > 0) { @@ -59,7 +59,7 @@ void RaftFlatIndex::query( vectors_.data(), vectors_.getSize(0), vectors_.getSize(1)); auto search = raft::make_device_matrix_view( input.data(), input.getSize(0), input.getSize(1)); - auto inds = raft::make_device_matrix_view( + auto inds = raft::make_device_matrix_view( outIndices.data(), outIndices.getSize(0), outIndices.getSize(1)); diff --git a/faiss/gpu/impl/RaftFlatIndex.cuh b/faiss/gpu/impl/RaftFlatIndex.cuh index d947f9b21d..6befc8744e 100644 --- a/faiss/gpu/impl/RaftFlatIndex.cuh +++ b/faiss/gpu/impl/RaftFlatIndex.cuh @@ -52,7 +52,7 @@ class RaftFlatIndex : public FlatIndex { faiss::MetricType metric, float metricArg, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, bool exactDistance) override; }; diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 3820beb2b1..56deaa7865 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -123,7 +123,7 @@ void RaftIVFFlat::search( /// The input data must be on our current device. /// Returns the number of vectors successfully added. Vectors may /// not be able to be added because they contain NaNs. 
-int RaftIVFFlat::addVectors( +idx_t RaftIVFFlat::addVectors( Index* coarseQuantizer, Tensor& vecs, Tensor& indices) { @@ -163,7 +163,7 @@ void RaftIVFFlat::reset() { raft_knn_index.reset(); } -int RaftIVFFlat::getListLength(int listId) const { +idx_t RaftIVFFlat::getListLength(idx_t listId) const { printf("Inside RaftIVFFlat getListLength\n"); FAISS_ASSERT(raft_knn_index.has_value()); @@ -181,7 +181,7 @@ int RaftIVFFlat::getListLength(int listId) const { } /// Return the list indices of a particular list back to the CPU -std::vector RaftIVFFlat::getListIndices(int listId) const { +std::vector RaftIVFFlat::getListIndices(idx_t listId) const { printf("Inside RaftIVFFlat getListIndices\n"); FAISS_ASSERT(raft_knn_index.has_value()); @@ -213,7 +213,7 @@ std::vector RaftIVFFlat::getListIndices(int listId) const { } /// Return the encoded vectors of a particular list back to the CPU -std::vector RaftIVFFlat::getListVectorData(int listId, bool gpuFormat) +std::vector RaftIVFFlat::getListVectorData(idx_t listId, bool gpuFormat) const { printf("Inside RaftIVFFlat getListVectorData\n"); diff --git a/faiss/gpu/impl/RaftIVFFlat.cuh b/faiss/gpu/impl/RaftIVFFlat.cuh index d0c17fd2e9..199e649eeb 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cuh +++ b/faiss/gpu/impl/RaftIVFFlat.cuh @@ -76,7 +76,7 @@ class RaftIVFFlat : public IVFFlat { /// The input data must be on our current device. /// Returns the number of vectors successfully added. Vectors may /// not be able to be added because they contain NaNs. 
- int addVectors( + idx_t addVectors( Index* coarseQuantizer, Tensor& vecs, Tensor& indices) override; @@ -87,13 +87,13 @@ class RaftIVFFlat : public IVFFlat { /// For debugging purposes, return the list length of a particular /// list - int getListLength(int listId) const override; + idx_t getListLength(idx_t listId) const override; /// Return the list indices of a particular list back to the CPU - std::vector getListIndices(int listId) const override; + std::vector getListIndices(idx_t listId) const override; /// Return the encoded vectors of a particular list back to the CPU - std::vector getListVectorData(int listId, bool gpuFormat) + std::vector getListVectorData(idx_t listId, bool gpuFormat) const override; void updateQuantizer(Index* quantizer) override; From 37ec2fa752c6bda2bdcdc9f4fa9605ee45ae0256 Mon Sep 17 00:00:00 2001 From: Alexandr Guzhva Date: Thu, 4 May 2023 06:28:00 -0700 Subject: [PATCH 58/87] Fix PR problems (#2839) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/2839 Reviewed By: algoriddle Differential Revision: D45054275 fbshipit-source-id: 12eba11f5fb09eb80a8620bd60d5bb74df9b9ceb --- faiss/gpu/GpuDistance.cu | 2 -- faiss/gpu/StandardGpuResources.cpp | 17 ++++++++--------- faiss/gpu/impl/RaftFlatIndex.cu | 1 - faiss/gpu/impl/RaftUtils.h | 5 ++++- 4 files changed, 12 insertions(+), 13 deletions(-) diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu index daf4710aec..130c2454ef 100644 --- a/faiss/gpu/GpuDistance.cu +++ b/faiss/gpu/GpuDistance.cu @@ -293,7 +293,6 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) { search.view(), inds.view(), dists.view(), - k, distance, metric_arg); } @@ -328,7 +327,6 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) { search.view(), inds.view(), dists.view(), - k, distance, metric_arg); } diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp index af0f24c51e..c1be2fbf37 100644 --- 
a/faiss/gpu/StandardGpuResources.cpp +++ b/faiss/gpu/StandardGpuResources.cpp @@ -89,7 +89,13 @@ std::string allocsToString(const std::unordered_map& map) { // StandardGpuResourcesImpl::StandardGpuResourcesImpl() - : pinnedMemAlloc_(nullptr), + : +#if defined USE_NVIDIA_RAFT + cmr(new rmm::mr::cuda_memory_resource), + mmr(new rmm::mr::managed_memory_resource), + pmr(new rmm::mr::pinned_memory_resource), +#endif + pinnedMemAlloc_(nullptr), pinnedMemAllocSize_(0), // let the adjustment function determine the memory size for us by // passing in a huge value that will then be adjusted @@ -97,14 +103,7 @@ StandardGpuResourcesImpl::StandardGpuResourcesImpl() -1, std::numeric_limits::max())), pinnedMemSize_(kDefaultPinnedMemoryAllocation), - allocLogging_(false) -#if defined USE_NVIDIA_RAFT - , - cmr(new rmm::mr::cuda_memory_resource), - mmr(new rmm::mr::managed_memory_resource), - pmr(new rmm::mr::pinned_memory_resource) -#endif -{ + allocLogging_(false) { } StandardGpuResourcesImpl::~StandardGpuResourcesImpl() { diff --git a/faiss/gpu/impl/RaftFlatIndex.cu b/faiss/gpu/impl/RaftFlatIndex.cu index d407c68680..fb0f815368 100644 --- a/faiss/gpu/impl/RaftFlatIndex.cu +++ b/faiss/gpu/impl/RaftFlatIndex.cu @@ -109,7 +109,6 @@ void RaftFlatIndex::query( search, inds, dists, - k, distance, metricArg); } diff --git a/faiss/gpu/impl/RaftUtils.h b/faiss/gpu/impl/RaftUtils.h index 77c47999a5..6c744051ae 100644 --- a/faiss/gpu/impl/RaftUtils.h +++ b/faiss/gpu/impl/RaftUtils.h @@ -20,13 +20,16 @@ * limitations under the License. */ +#pragma once + #include +#include #include namespace faiss { namespace gpu { -raft::distance::DistanceType faiss_to_raft( +inline raft::distance::DistanceType faiss_to_raft( MetricType metric, bool exactDistance) { switch (metric) { From d3a98cc31228d15e796c4fef4aa431cbef22fcf5 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 4 May 2023 13:56:06 -0400 Subject: [PATCH 59/87] Fixing cmakelists --- faiss/gpu/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt index c742f8c49d..ff5d08878d 100644 --- a/faiss/gpu/CMakeLists.txt +++ b/faiss/gpu/CMakeLists.txt @@ -178,7 +178,7 @@ set(FAISS_GPU_HEADERS if(FAISS_ENABLE_RAFT) list(APPEND FAISS_GPU_HEADERS - impl/RaftIVFFlat.cuh) + impl/RaftIVFFlat.cuh impl/RaftFlatIndex.cuh) list(APPEND FAISS_GPU_SRC impl/RaftFlatIndex.cu From 0af95a471135b88449128754cf194122403cb691 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 4 May 2023 15:08:41 -0400 Subject: [PATCH 60/87] Updates --- faiss/gpu/GpuIndexIVF.cu | 2 +- faiss/gpu/impl/RaftIVFFlat.cu | 53 ++++++++++++++--------------------- 2 files changed, 22 insertions(+), 33 deletions(-) diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu index 99a778e882..dfc9631f7e 100644 --- a/faiss/gpu/GpuIndexIVF.cu +++ b/faiss/gpu/GpuIndexIVF.cu @@ -449,7 +449,7 @@ void GpuIndexIVF::trainQuantizer_(idx_t n, const float* x) { if (config_.use_raft) { printf("Using raft to train quantizer for %d vectors\n", n); - const raft::handle_t& raft_handle = + const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); raft::neighbors::ivf_flat::index_params raft_idx_params; diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 56deaa7865..7b92f1df12 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -96,7 +96,7 @@ void RaftIVFFlat::search( FAISS_ASSERT(n > 0); FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= numLists_); - const raft::handle_t& raft_handle = + const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); raft::neighbors::ivf_flat::search_params pams; pams.n_probes = nprobe; @@ -109,12 +109,11 @@ void RaftIVFFlat::search( raft::make_device_matrix_view(outDistances.data(), n, k_); 
raft::neighbors::ivf_flat::search( raft_handle, + pams, *raft_knn_index, queries_view, out_inds_view, - out_dists_view, - pams, - k_); + out_dists_view); raft_handle.sync_stream(); } @@ -136,7 +135,7 @@ idx_t RaftIVFFlat::addVectors( auto inds_view = raft::make_device_vector_view( indices.data(), (idx_t)indices.getSize(0)); - const raft::handle_t& raft_handle = + const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); printf("About to call extend on index\n"); @@ -145,11 +144,11 @@ idx_t RaftIVFFlat::addVectors( if (raft_knn_index.has_value()) { raft_knn_index.emplace(raft::neighbors::ivf_flat::extend( raft_handle, - raft_knn_index.value(), vecs_view, std::make_optional< raft::device_vector_view>( - inds_view))); + inds_view), + raft_knn_index.value())); } else { printf("Index has not been trained!\n"); @@ -167,7 +166,7 @@ idx_t RaftIVFFlat::getListLength(idx_t listId) const { printf("Inside RaftIVFFlat getListLength\n"); FAISS_ASSERT(raft_knn_index.has_value()); - const raft::handle_t& raft_handle = + const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); uint32_t size; @@ -185,17 +184,11 @@ std::vector RaftIVFFlat::getListIndices(idx_t listId) const { printf("Inside RaftIVFFlat getListIndices\n"); FAISS_ASSERT(raft_knn_index.has_value()); - const raft::handle_t& raft_handle = + const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); - idx_t offset; uint32_t size; - raft::copy( - &offset, - raft_knn_index.value().list_offsets().data_handle() + listId, - 1, - raft_handle.get_stream()); raft::copy( &size, raft_knn_index.value().list_sizes().data_handle() + listId, @@ -206,7 +199,7 @@ std::vector RaftIVFFlat::getListIndices(idx_t listId) const { std::vector vec(size); raft::copy( vec.data(), - raft_knn_index.value().indices().data_handle() + offset, + *(raft_knn_index.value().inds_ptrs().data_handle() + listId), size, raft_handle.get_stream()); return vec; @@ -218,31 +211,26 @@ 
std::vector RaftIVFFlat::getListVectorData(idx_t listId, bool gpuFormat printf("Inside RaftIVFFlat getListVectorData\n"); FAISS_ASSERT(raft_knn_index.has_value()); - const raft::handle_t& raft_handle = + const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); std::cout << "Calling getListVectorData for " << listId << std::endl; - using elem_t = decltype(raft_knn_index.value().data())::element_type; + using elem_t = decltype(raft_knn_index.value().data_ptrs())::element_type; size_t dim = raft_knn_index.value().dim(); - idx_t offsets[2]; - raft::copy( - offsets, - raft_knn_index.value().list_offsets().data_handle() + listId, - 2, - raft_handle.get_stream()); + uint32_t list_size; + + raft::copy(&list_size, raft_knn_index.value().list_sizes().data_handle() + listId, 1, raft_handle.get_stream()); + - raft_handle.sync_stream(); - size_t byte_offset = offsets[0] * sizeof(elem_t) * dim; // the interleaved block can be slightly larger than the list size (it's // rounded up) - size_t byte_size = size_t(offsets[1]) * sizeof(elem_t) * dim - byte_offset; + size_t byte_size = size_t(list_size) * sizeof(elem_t) * dim; std::vector vec(byte_size); raft::copy( vec.data(), reinterpret_cast( - raft_knn_index.value().data().data_handle()) + - byte_offset, + raft_knn_index.value().data_ptrs().data_handle()+listId), byte_size, raft_handle.get_stream()); return vec; @@ -269,14 +257,14 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { std::cout << "Calling RAFT updateQuantizer with trained index with " << quantizer_ntotal << " items" << std::endl; - const raft::handle_t& handle = resources_->getRaftHandleCurrentDevice(); + const raft::device_resources& handle = resources_->getRaftHandleCurrentDevice(); auto stream = handle.get_stream(); auto total_elems = size_t(quantizer_ntotal) * size_t(quantizer->d); raft::logger::get().set_level(RAFT_LEVEL_TRACE); - raft::spatial::knn::ivf_flat::index_params pams; + raft::neighbors::ivf_flat::index_params pams; 
pams.add_data_on_build = false; switch (this->metric_) { @@ -297,6 +285,7 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { pams.metric, (uint32_t)this->numLists_, false, + false, (uint32_t)this->dim_); printf("Reconstructing\n"); @@ -327,7 +316,7 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { // size_t ntotal = ivf ? ivf->compute_ntotal() : 0; // // printf("Inside RAFT copyInvertedListsFrom\n"); -// raft::handle_t &handle = resources_->getRaftHandleCurrentDevice(); +// raft::device_resources &handle = resources_->getRaftHandleCurrentDevice(); // // We need to allocate the IVF // printf("nlist=%ld, ntotal=%ld\n", nlist, ntotal); // From 576f58fae3839e6c3de789df384483fe7ea45b8a Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 28 Jun 2023 14:42:11 -0400 Subject: [PATCH 61/87] Fixing merge --- cmake/thirdparty/get_raft.cmake | 13 +++++ faiss/gpu/GpuIndexIVFFlat.cu | 15 ------ faiss/gpu/GpuIndexIVFFlat.h | 15 ------ gpu/Makefile | 96 --------------------------------- 4 files changed, 13 insertions(+), 126 deletions(-) delete mode 100644 gpu/Makefile diff --git a/cmake/thirdparty/get_raft.cmake b/cmake/thirdparty/get_raft.cmake index 567ac4814f..df5aa448e4 100644 --- a/cmake/thirdparty/get_raft.cmake +++ b/cmake/thirdparty/get_raft.cmake @@ -2,6 +2,19 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. +# ============================================================================= +# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. 
See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= set(RAFT_VERSION "${RAPIDS_VERSION}") set(RAFT_FORK "rapidsai") diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 19f03e9247..f257b09952 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -4,21 +4,6 @@ * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ #include #include diff --git a/faiss/gpu/GpuIndexIVFFlat.h b/faiss/gpu/GpuIndexIVFFlat.h index a4fa18417e..d7508feef4 100644 --- a/faiss/gpu/GpuIndexIVFFlat.h +++ b/faiss/gpu/GpuIndexIVFFlat.h @@ -4,21 +4,6 @@ * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ #pragma once diff --git a/gpu/Makefile b/gpu/Makefile deleted file mode 100644 index 072e089ddd..0000000000 --- a/gpu/Makefile +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2015-present, Facebook, Inc. -# All rights reserved. -# -# This source code is licensed under the BSD+Patents license found in the -# LICENSE file in the root directory of this source tree. - --include ../makefile.inc - -all: libgpufaiss.a libgpufaiss.$(SHAREDEXT) - -CPPOBJ = GpuResources.o \ - IndexProxy.o \ - StandardGpuResources.o \ - GpuAutoTune.o \ - GpuClonerOptions.o \ - impl/RemapIndices.o \ - utils/DeviceMemory.o \ - utils/StackDeviceMemory.o \ - utils/DeviceUtils.o \ - utils/Timer.o \ - utils/MemorySpace.o \ - utils/WorkerThread.o - - -INS = 1 32 64 128 256 F512 T512 F1024 T1024 - -CUOBJ = impl/BroadcastSum.o \ - impl/Distance.o \ - impl/FlatIndex.o \ - impl/InvertedListAppend.o \ - impl/IVFBase.o \ - impl/IVFFlat.o \ - impl/IVFFlatScan.o \ - impl/IVFPQ.o \ - impl/IVFUtils.o \ - impl/IVFUtilsSelect1.o \ - impl/IVFUtilsSelect2.o \ - impl/L2Norm.o \ - impl/L2Select.o \ - impl/PQCodeDistances.o \ - impl/PQScanMultiPassNoPrecomputed.o \ - impl/PQScanMultiPassPrecomputed.o \ - impl/VectorResidual.o \ - GpuIndex.o \ - GpuIndexFlat.o \ - GpuIndexIVF.o \ - GpuIndexIVFFlat.o \ - GpuIndexIVFPQ.o \ - utils/Float16.o \ - utils/MatrixMult.o \ - utils/BlockSelectFloat.o \ - utils/BlockSelectHalf.o \ - utils/WarpSelectFloat.o \ - utils/WarpSelectHalf.o \ - utils/nvidia/fp16_emu.o \ - $(foreach v,$(INS), \ - utils/blockselect/BlockSelectHalf$(v).o \ - 
utils/blockselect/BlockSelectFloat$(v).o \ - utils/warpselect/WarpSelectHalf$(v).o \ - utils/warpselect/WarpSelectFloat$(v).o \ - ) - -%.o: %.cpp - $(CXX) $(CXXFLAGS) $(CPUFLAGS) -c $< -o $@ $(CUDACFLAGS) - -%.o: %.cu - $(NVCC) $(NVCCFLAGS) -g -O3 -c $< -o $@ - -libgpufaiss.a: $(CPPOBJ) $(CUOBJ) - ar r $@ $^ - -libgpufaiss.$(SHAREDEXT): $(CPPOBJ) $(CUOBJ) - $(CXX) $(SHAREDFLAGS) $(LDFLAGS) $(NVCCLDFLAGS) \ - -o libgpufaiss.$(SHAREDEXT) $^ $(LIBS) $(NVCCLIBS) - -clean: - rm -rf *.o impl/*.o utils/*.o libgpufaiss.a \ - libgpufaiss.$(SHAREDEXT) \ - --include depend - -depend: - for i in $(patsubst %.o,%.cpp,$(CPPOBJ)) \ - $(patsubst %.o,%.cu,$(CUOBJ)); do \ - $(CXXCPP) $(CPPFLAGS) -x c++ -MM $$i; \ - done > depend - -install: libgpufaiss.a libgpufaiss.$(SHAREDEXT) installdirs - cp libgpufaiss.a libgpufaiss.$(SHAREDEXT) $(DESTDIR)$(libdir) - cp *.h $(DESTDIR)$(includedir)/faiss/gpu - cp --parents **/**.h $(DESTDIR)$(includedir)/faiss/gpu - -installdirs: - $(MKDIR_P) $(DESTDIR)$(libdir) $(DESTDIR)$(includedir)/faiss/gpu - -.PHONY: all clean From 092721f1b36ff4cbd913e013828b5d0f5806879f Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 28 Jun 2023 18:05:20 -0400 Subject: [PATCH 62/87] Removing indexflat tests from changeset --- cmake/thirdparty/fetch_rapids.cmake | 2 +- faiss/gpu/test/TestGpuIndexFlat.cpp | 1113 +++++++++++++-------------- 2 files changed, 555 insertions(+), 560 deletions(-) diff --git a/cmake/thirdparty/fetch_rapids.cmake b/cmake/thirdparty/fetch_rapids.cmake index 044a369606..229c488196 100644 --- a/cmake/thirdparty/fetch_rapids.cmake +++ b/cmake/thirdparty/fetch_rapids.cmake @@ -15,7 +15,7 @@ # or implied. See the License for the specific language governing permissions and limitations under # the License. 
# ============================================================================= -set(RAPIDS_VERSION "23.06") +set(RAPIDS_VERSION "23.08") if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/FAISS_RAPIDS.cmake) file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake diff --git a/faiss/gpu/test/TestGpuIndexFlat.cpp b/faiss/gpu/test/TestGpuIndexFlat.cpp index 920336b0bc..fd63af0589 100644 --- a/faiss/gpu/test/TestGpuIndexFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexFlat.cpp @@ -1,9 +1,9 @@ /** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ +* Copyright (c) Facebook, Inc. and its affiliates. +* +* This source code is licensed under the MIT license found in the +* LICENSE file in the root directory of this source tree. +*/ #include #include @@ -21,755 +21,750 @@ constexpr float kF16MaxRelErr = 0.07f; constexpr float kF32MaxRelErr = 6e-3f; struct TestFlatOptions { - TestFlatOptions() - : metric(faiss::MetricType::METRIC_L2), - metricArg(0), - useFloat16(false), - numVecsOverride(-1), - numQueriesOverride(-1), - kOverride(-1), - dimOverride(-1), - use_raft(false) {} - - faiss::MetricType metric; - float metricArg; - - bool useFloat16; - int numVecsOverride; - int numQueriesOverride; - int kOverride; - int dimOverride; - bool use_raft; + TestFlatOptions() + : metric(faiss::MetricType::METRIC_L2), + metricArg(0), + useFloat16(false), + numVecsOverride(-1), + numQueriesOverride(-1), + kOverride(-1), + dimOverride(-1), + use_raft(false) {} + + faiss::MetricType metric; + float metricArg; + + bool useFloat16; + int numVecsOverride; + int numQueriesOverride; + int kOverride; + int dimOverride; + bool use_raft; }; void testFlat(const TestFlatOptions& opt) { - int numVecs = opt.numVecsOverride > 0 ? opt.numVecsOverride - : faiss::gpu::randVal(1000, 5000); - int dim = opt.dimOverride > 0 ? 
opt.dimOverride - : faiss::gpu::randVal(50, 800); - int numQuery = opt.numQueriesOverride > 0 ? opt.numQueriesOverride - : faiss::gpu::randVal(1, 512); - - // Due to loss of precision in a float16 accumulator, for large k, - // the number of differences is pretty huge. Restrict ourselves to a - // fairly small `k` for float16 - int k = opt.useFloat16 - ? std::min(faiss::gpu::randVal(1, 50), numVecs) - : std::min( - faiss::gpu::randVal(1, faiss::gpu::getMaxKSelection()), - numVecs); - if (opt.kOverride > 0) { - k = opt.kOverride; - } - - faiss::IndexFlat cpuIndex(dim, opt.metric); - cpuIndex.metric_arg = opt.metricArg; - - // Construct on a random device to test multi-device, if we have - // multiple devices - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); - - faiss::gpu::GpuIndexFlatConfig config; - config.device = device; - config.use_raft = true; - config.useFloat16 = opt.useFloat16; - config.use_raft = opt.use_raft; - - faiss::gpu::GpuIndexFlat gpuIndex(&res, dim, opt.metric, config); - gpuIndex.metric_arg = opt.metricArg; - - std::vector vecs = faiss::gpu::randVecs(numVecs, dim); - cpuIndex.add(numVecs, vecs.data()); - gpuIndex.add(numVecs, vecs.data()); - - std::stringstream str; - str << "metric " << opt.metric << " marg " << opt.metricArg << " numVecs " - << numVecs << " dim " << dim << " useFloat16 " << opt.useFloat16 - << " numQuery " << numQuery << " k " << k; - - // To some extent, we depend upon the relative error for the test - // for float16 - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - numQuery, - dim, - k, - str.str(), - opt.useFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - // FIXME: the fp16 bounds are - // useless when math (the accumulator) is - // in fp16. Figure out another way to test - opt.useFloat16 ? 0.99f : 0.1f, - opt.useFloat16 ? 0.65f : 0.015f); + int numVecs = opt.numVecsOverride > 0 ? 
opt.numVecsOverride + : faiss::gpu::randVal(1000, 5000); + int dim = opt.dimOverride > 0 ? opt.dimOverride + : faiss::gpu::randVal(50, 800); + int numQuery = opt.numQueriesOverride > 0 ? opt.numQueriesOverride + : faiss::gpu::randVal(1, 512); + + // Due to loss of precision in a float16 accumulator, for large k, + // the number of differences is pretty huge. Restrict ourselves to a + // fairly small `k` for float16 + int k = opt.useFloat16 + ? std::min(faiss::gpu::randVal(1, 50), numVecs) + : std::min( + faiss::gpu::randVal(1, faiss::gpu::getMaxKSelection()), + numVecs); + if (opt.kOverride > 0) { + k = opt.kOverride; + } + + faiss::IndexFlat cpuIndex(dim, opt.metric); + cpuIndex.metric_arg = opt.metricArg; + + // Construct on a random device to test multi-device, if we have + // multiple devices + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.useFloat16 = opt.useFloat16; + config.use_raft = opt.use_raft; + + faiss::gpu::GpuIndexFlat gpuIndex(&res, dim, opt.metric, config); + gpuIndex.metric_arg = opt.metricArg; + + std::vector vecs = faiss::gpu::randVecs(numVecs, dim); + cpuIndex.add(numVecs, vecs.data()); + gpuIndex.add(numVecs, vecs.data()); + + std::stringstream str; + str << "metric " << opt.metric << " marg " << opt.metricArg << " numVecs " + << numVecs << " dim " << dim << " useFloat16 " << opt.useFloat16 + << " numQuery " << numQuery << " k " << k; + + // To some extent, we depend upon the relative error for the test + // for float16 + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + numQuery, + dim, + k, + str.str(), + opt.useFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + // FIXME: the fp16 bounds are + // useless when math (the accumulator) is + // in fp16. Figure out another way to test + opt.useFloat16 ? 0.99f : 0.1f, + opt.useFloat16 ? 
0.65f : 0.015f); } TEST(TestGpuIndexFlat, IP_Float32) { - for (int tries = 0; tries < 3; ++tries) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_INNER_PRODUCT; - opt.useFloat16 = false; + for (int tries = 0; tries < 3; ++tries) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_INNER_PRODUCT; + opt.useFloat16 = false; - testFlat(opt); + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif - } + } } TEST(TestGpuIndexFlat, L1_Float32) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_L1; - opt.useFloat16 = false; + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_L1; + opt.useFloat16 = false; - testFlat(opt); + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif } TEST(TestGpuIndexFlat, Lp_Float32) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_Lp; - opt.metricArg = 5; - opt.useFloat16 = false; + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_Lp; + opt.metricArg = 5; + opt.useFloat16 = false; - testFlat(opt); + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif } TEST(TestGpuIndexFlat, L2_Float32) { - for (int tries = 0; tries < 3; ++tries) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_L2; + for (int tries = 0; tries < 3; ++tries) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_L2; - opt.useFloat16 = false; + opt.useFloat16 = false; - testFlat(opt); + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif - } + } } // At least one test for the k > 1024 select TEST(TestGpuIndexFlat, L2_k_2048) { - if (faiss::gpu::getMaxKSelection() >= 2048) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_L2; - opt.useFloat16 = 
false; - opt.kOverride = 2048; - opt.dimOverride = 128; - opt.numVecsOverride = 10000; - - testFlat(opt); + if (faiss::gpu::getMaxKSelection() >= 2048) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_L2; + opt.useFloat16 = false; + opt.kOverride = 2048; + opt.dimOverride = 128; + opt.numVecsOverride = 10000; + + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif - } + } } // test specialized k == 1 codepath TEST(TestGpuIndexFlat, L2_Float32_K1) { - for (int tries = 0; tries < 3; ++tries) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_L2; - opt.useFloat16 = false; - opt.kOverride = 1; + for (int tries = 0; tries < 3; ++tries) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_L2; + opt.useFloat16 = false; + opt.kOverride = 1; - testFlat(opt); + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif - } + } } TEST(TestGpuIndexFlat, IP_Float16) { - for (int tries = 0; tries < 3; ++tries) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_INNER_PRODUCT; - opt.useFloat16 = true; + for (int tries = 0; tries < 3; ++tries) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_INNER_PRODUCT; + opt.useFloat16 = true; - testFlat(opt); + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif - } + } } TEST(TestGpuIndexFlat, L2_Float16) { - for (int tries = 0; tries < 3; ++tries) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_L2; - opt.useFloat16 = true; + for (int tries = 0; tries < 3; ++tries) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_L2; + opt.useFloat16 = true; - testFlat(opt); + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif - } + } } // test 
specialized k == 1 codepath TEST(TestGpuIndexFlat, L2_Float16_K1) { - for (int tries = 0; tries < 3; ++tries) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_L2; - opt.useFloat16 = true; - opt.kOverride = 1; + for (int tries = 0; tries < 3; ++tries) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_L2; + opt.useFloat16 = true; + opt.kOverride = 1; - testFlat(opt); + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif - } + } } // test tiling along a huge vector set TEST(TestGpuIndexFlat, L2_Tiling) { - for (int tries = 0; tries < 2; ++tries) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_L2; - opt.useFloat16 = false; - opt.numVecsOverride = 1000000; - - // keep the rest of the problem reasonably small - opt.numQueriesOverride = 4; - opt.dimOverride = 64; - opt.kOverride = 64; - - testFlat(opt); + for (int tries = 0; tries < 2; ++tries) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_L2; + opt.useFloat16 = false; + opt.numVecsOverride = 1000000; + + // keep the rest of the problem reasonably small + opt.numQueriesOverride = 4; + opt.dimOverride = 64; + opt.kOverride = 64; + + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif - } + } } TEST(TestGpuIndexFlat, QueryEmpty) { - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); - faiss::gpu::GpuIndexFlatConfig config; - config.device = 0; - config.use_raft = true; - config.useFloat16 = false; - int dim = 128; - faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config); + faiss::gpu::GpuIndexFlatConfig config; + config.device = 0; + config.useFloat16 = false; + int dim = 128; + faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config); - // Querying an empty index should not blow up, and just return - // (FLT_MAX, -1) - int numQuery 
= 10; - int k = 50; - std::vector queries(numQuery * dim, 1.0f); + // Querying an empty index should not blow up, and just return + // (FLT_MAX, -1) + int numQuery = 10; + int k = 50; + std::vector queries(numQuery * dim, 1.0f); - std::vector dist(numQuery * k, 0); - std::vector ind(numQuery * k); + std::vector dist(numQuery * k, 0); + std::vector ind(numQuery * k); - gpuIndex.search(numQuery, queries.data(), k, dist.data(), ind.data()); + gpuIndex.search(numQuery, queries.data(), k, dist.data(), ind.data()); - for (auto d : dist) { - EXPECT_EQ(d, std::numeric_limits::max()); - } + for (auto d : dist) { + EXPECT_EQ(d, std::numeric_limits::max()); + } - for (auto i : ind) { - EXPECT_EQ(i, -1); - } + for (auto i : ind) { + EXPECT_EQ(i, -1); + } } void testCopyFrom(bool use_raft) { - int numVecs = faiss::gpu::randVal(100, 200); - int dim = faiss::gpu::randVal(1, 1000); + int numVecs = faiss::gpu::randVal(100, 200); + int dim = faiss::gpu::randVal(1, 1000); - std::vector vecs = faiss::gpu::randVecs(numVecs, dim); + std::vector vecs = faiss::gpu::randVecs(numVecs, dim); - faiss::IndexFlatL2 cpuIndex(dim); - cpuIndex.add(numVecs, vecs.data()); + faiss::IndexFlatL2 cpuIndex(dim); + cpuIndex.add(numVecs, vecs.data()); - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - for (bool useFloat16 : {false, true}) { - faiss::gpu::GpuIndexFlatConfig config; - config.device = device; - config.use_raft = true; - config.useFloat16 = useFloat16; - config.use_raft = use_raft; + for (bool useFloat16 : {false, true}) { + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.useFloat16 = useFloat16; + config.use_raft = use_raft; - // Fill with garbage values - faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, 2000, config); - gpuIndex.copyFrom(&cpuIndex); + // 
Fill with garbage values + faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, 2000, config); + gpuIndex.copyFrom(&cpuIndex); - EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); - EXPECT_EQ(gpuIndex.ntotal, numVecs); + EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); + EXPECT_EQ(gpuIndex.ntotal, numVecs); - EXPECT_EQ(cpuIndex.d, gpuIndex.d); - EXPECT_EQ(cpuIndex.d, dim); + EXPECT_EQ(cpuIndex.d, gpuIndex.d); + EXPECT_EQ(cpuIndex.d, dim); - std::vector gpuVals(numVecs * dim); - gpuIndex.reconstruct_n(0, gpuIndex.ntotal, gpuVals.data()); + std::vector gpuVals(numVecs * dim); + gpuIndex.reconstruct_n(0, gpuIndex.ntotal, gpuVals.data()); - std::vector cpuVals(numVecs * dim); - cpuIndex.reconstruct_n(0, gpuIndex.ntotal, cpuVals.data()); + std::vector cpuVals(numVecs * dim); + cpuIndex.reconstruct_n(0, gpuIndex.ntotal, cpuVals.data()); - // The CPU is the source of (float32) truth here, while the GPU index - // may be in float16 mode and thus was subject to rounding - if (useFloat16) { - EXPECT_EQ(gpuVals, faiss::gpu::roundToHalf(cpuVals)); - } else { - // Should be exactly the same - EXPECT_EQ(gpuVals, cpuVals); - } - } + // The CPU is the source of (float32) truth here, while the GPU index + // may be in float16 mode and thus was subject to rounding + if (useFloat16) { + EXPECT_EQ(gpuVals, faiss::gpu::roundToHalf(cpuVals)); + } else { + // Should be exactly the same + EXPECT_EQ(gpuVals, cpuVals); + } + } } TEST(TestGpuIndexFlat, CopyFrom) { - testCopyFrom(false); + testCopyFrom(false); } #if defined USE_NVIDIA_RAFT TEST(TestRaftGpuIndexFlat, CopyFrom) { - testCopyFrom(true); + testCopyFrom(true); } #endif void testCopyTo(bool use_raft) { - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); - int numVecs = faiss::gpu::randVal(100, 200); - int dim = faiss::gpu::randVal(1, 1000); + int numVecs = faiss::gpu::randVal(100, 200); + int dim = faiss::gpu::randVal(1, 1000); - int device = faiss::gpu::randVal(0, 
faiss::gpu::getNumDevices() - 1); - std::vector vecs = faiss::gpu::randVecs(numVecs, dim); + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + std::vector vecs = faiss::gpu::randVecs(numVecs, dim); - for (bool useFloat16 : {false, true}) { - faiss::gpu::GpuIndexFlatConfig config; - config.device = device; - config.use_raft = true; - config.useFloat16 = useFloat16; - config.use_raft = use_raft; + for (bool useFloat16 : {false, true}) { + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.useFloat16 = useFloat16; + config.use_raft = use_raft; - faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config); - gpuIndex.add(numVecs, vecs.data()); + faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config); + gpuIndex.add(numVecs, vecs.data()); - // Fill with garbage values - faiss::IndexFlatL2 cpuIndex(2000); - gpuIndex.copyTo(&cpuIndex); + // Fill with garbage values + faiss::IndexFlatL2 cpuIndex(2000); + gpuIndex.copyTo(&cpuIndex); - EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); - EXPECT_EQ(gpuIndex.ntotal, numVecs); + EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); + EXPECT_EQ(gpuIndex.ntotal, numVecs); - EXPECT_EQ(cpuIndex.d, gpuIndex.d); - EXPECT_EQ(cpuIndex.d, dim); + EXPECT_EQ(cpuIndex.d, gpuIndex.d); + EXPECT_EQ(cpuIndex.d, dim); - std::vector gpuVals(numVecs * dim); - gpuIndex.reconstruct_n(0, gpuIndex.ntotal, gpuVals.data()); + std::vector gpuVals(numVecs * dim); + gpuIndex.reconstruct_n(0, gpuIndex.ntotal, gpuVals.data()); - std::vector cpuVals(numVecs * dim); - cpuIndex.reconstruct_n(0, gpuIndex.ntotal, cpuVals.data()); + std::vector cpuVals(numVecs * dim); + cpuIndex.reconstruct_n(0, gpuIndex.ntotal, cpuVals.data()); - // The GPU is the source of truth here, so the float32 exact comparison - // even if the index uses float16 is ok - EXPECT_EQ(gpuVals, cpuVals); - } + // The GPU is the source of truth here, so the float32 exact comparison + // even if the index uses float16 is ok + EXPECT_EQ(gpuVals, cpuVals); + } } 
TEST(TestGpuIndexFlat, CopyTo) { - testCopyTo(false); + testCopyTo(false); } #if defined USE_NVIDIA_RAFT TEST(TestRaftGpuIndexFlat, CopyTo) { - testCopyTo(true); + testCopyTo(true); } #endif void testUnifiedMemory(bool use_raft) { - // Construct on a random device to test multi-device, if we have - // multiple devices - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - - if (!faiss::gpu::getFullUnifiedMemSupport(device)) { - return; - } - - int dim = 256; - - // FIXME: GpuIndexFlat doesn't support > 2^31 (vecs * dims) due to - // kernel indexing, so we can't test unified memory for memory - // oversubscription. - size_t numVecs = 50000; - int numQuery = 10; - int k = 10; - - faiss::IndexFlatL2 cpuIndexL2(dim); - - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); - - faiss::gpu::GpuIndexFlatConfig config; - config.device = device; - config.use_raft = true; - config.memorySpace = faiss::gpu::MemorySpace::Unified; - config.use_raft = use_raft; - - faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config); - - std::vector vecs = faiss::gpu::randVecs(numVecs, dim); - cpuIndexL2.add(numVecs, vecs.data()); - gpuIndexL2.add(numVecs, vecs.data()); - - // To some extent, we depend upon the relative error for the test - // for float16 - faiss::gpu::compareIndices( - cpuIndexL2, - gpuIndexL2, - numQuery, - dim, - k, - "Unified Memory", - kF32MaxRelErr, - 0.1f, - 0.015f); + // Construct on a random device to test multi-device, if we have + // multiple devices + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + + if (!faiss::gpu::getFullUnifiedMemSupport(device)) { + return; + } + + int dim = 256; + + // FIXME: GpuIndexFlat doesn't support > 2^31 (vecs * dims) due to + // kernel indexing, so we can't test unified memory for memory + // oversubscription. 
+ size_t numVecs = 50000; + int numQuery = 10; + int k = 10; + + faiss::IndexFlatL2 cpuIndexL2(dim); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.memorySpace = faiss::gpu::MemorySpace::Unified; + config.use_raft = use_raft; + + faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config); + + std::vector vecs = faiss::gpu::randVecs(numVecs, dim); + cpuIndexL2.add(numVecs, vecs.data()); + gpuIndexL2.add(numVecs, vecs.data()); + + // To some extent, we depend upon the relative error for the test + // for float16 + faiss::gpu::compareIndices( + cpuIndexL2, + gpuIndexL2, + numQuery, + dim, + k, + "Unified Memory", + kF32MaxRelErr, + 0.1f, + 0.015f); } TEST(TestGpuIndexFlat, UnifiedMemory) { - testUnifiedMemory(false); + testUnifiedMemory(false); } #if defined USE_NVIDIA_RAFT TEST(TestRaftGpuIndexFlat, UnifiedMemory) { - testUnifiedMemory(true); + testUnifiedMemory(true); } #endif void testLargeIndex(bool use_raft) { - // Construct on a random device to test multi-device, if we have - // multiple devices - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + // Construct on a random device to test multi-device, if we have + // multiple devices + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); - // Skip this device if we do not have sufficient memory - constexpr size_t kMem = size_t(8) * 1024 * 1024 * 1024; + // Skip this device if we do not have sufficient memory + constexpr size_t kMem = size_t(8) * 1024 * 1024 * 1024; - if (faiss::gpu::getFreeMemory(device) < kMem) { - std::cout << "TestGpuIndexFlat.LargeIndex: skipping due " - "to insufficient device memory\n"; - return; - } + if (faiss::gpu::getFreeMemory(device) < kMem) { + std::cout << "TestGpuIndexFlat.LargeIndex: skipping due " + "to insufficient device 
memory\n"; + return; + } - std::cout << "Running LargeIndex test\n"; + std::cout << "Running LargeIndex test\n"; - size_t dim = 256; // each vec is sizeof(float) * 256 = 1 KiB in size - size_t nb = 5000000; - size_t nq = 10; + size_t dim = 256; // each vec is sizeof(float) * 256 = 1 KiB in size + size_t nb = 5000000; + size_t nq = 10; - auto xb = faiss::gpu::randVecs(nb, dim); + auto xb = faiss::gpu::randVecs(nb, dim); - int k = 10; + int k = 10; - faiss::IndexFlatL2 cpuIndexL2(dim); + faiss::IndexFlatL2 cpuIndexL2(dim); - faiss::gpu::GpuIndexFlatConfig config; - config.device = device; - config.use_raft = use_raft; - faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config); + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.use_raft = use_raft; + faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config); - cpuIndexL2.add(nb, xb.data()); - gpuIndexL2.add(nb, xb.data()); + cpuIndexL2.add(nb, xb.data()); + gpuIndexL2.add(nb, xb.data()); - // To some extent, we depend upon the relative error for the test - // for float16 - faiss::gpu::compareIndices( - cpuIndexL2, - gpuIndexL2, - nq, - dim, - k, - "LargeIndex", - kF32MaxRelErr, - 0.1f, - 0.015f); + // To some extent, we depend upon the relative error for the test + // for float16 + faiss::gpu::compareIndices( + cpuIndexL2, + gpuIndexL2, + nq, + dim, + k, + "LargeIndex", + kF32MaxRelErr, + 0.1f, + 0.015f); } TEST(TestGpuIndexFlat, LargeIndex) { - testLargeIndex(false); + testLargeIndex(false); } #if defined USE_NVIDIA_RAFT TEST(TestRaftGpuIndexFlat, LargeIndex) { - testLargeIndex(true); + testLargeIndex(true); } #endif void testResidual(bool use_raft) { - // Construct on a random device to test multi-device, if we have - // multiple devices - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + // Construct on a random device to test multi-device, if we have + // multiple devices + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - 
faiss::gpu::StandardGpuResources res; - res.noTempMemory(); + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); - faiss::gpu::GpuIndexFlatConfig config; - config.device = device; - config.use_raft = use_raft; + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.use_raft = use_raft; - int dim = 32; - faiss::IndexFlat cpuIndex(dim, faiss::MetricType::METRIC_L2); - faiss::gpu::GpuIndexFlat gpuIndex( - &res, dim, faiss::MetricType::METRIC_L2, config); + int dim = 32; + faiss::IndexFlat cpuIndex(dim, faiss::MetricType::METRIC_L2); + faiss::gpu::GpuIndexFlat gpuIndex( + &res, dim, faiss::MetricType::METRIC_L2, config); - int numVecs = 100; - auto vecs = faiss::gpu::randVecs(numVecs, dim); - cpuIndex.add(numVecs, vecs.data()); - gpuIndex.add(numVecs, vecs.data()); + int numVecs = 100; + auto vecs = faiss::gpu::randVecs(numVecs, dim); + cpuIndex.add(numVecs, vecs.data()); + gpuIndex.add(numVecs, vecs.data()); - auto indexVecs = std::vector{0, 2, 4, 6, 8}; - auto queryVecs = faiss::gpu::randVecs(indexVecs.size(), dim); + auto indexVecs = std::vector{0, 2, 4, 6, 8}; + auto queryVecs = faiss::gpu::randVecs(indexVecs.size(), dim); - auto residualsCpu = std::vector(indexVecs.size() * dim); - auto residualsGpu = std::vector(indexVecs.size() * dim); + auto residualsCpu = std::vector(indexVecs.size() * dim); + auto residualsGpu = std::vector(indexVecs.size() * dim); - cpuIndex.compute_residual_n( - indexVecs.size(), - queryVecs.data(), - residualsCpu.data(), - indexVecs.data()); - gpuIndex.compute_residual_n( - indexVecs.size(), - queryVecs.data(), - residualsGpu.data(), - indexVecs.data()); + cpuIndex.compute_residual_n( + indexVecs.size(), + queryVecs.data(), + residualsCpu.data(), + indexVecs.data()); + gpuIndex.compute_residual_n( + indexVecs.size(), + queryVecs.data(), + residualsGpu.data(), + indexVecs.data()); - // Should be exactly the same, as this is just a single float32 subtraction - EXPECT_EQ(residualsCpu, residualsGpu); + // Should be 
exactly the same, as this is just a single float32 subtraction + EXPECT_EQ(residualsCpu, residualsGpu); } TEST(TestGpuIndexFlat, Residual) { - testResidual(false); + testResidual(false); } #if defined USE_NVIDIA_RAFT TEST(TestRaftGpuIndexFlat, Residual) { - testResidual(true); + testResidual(true); } #endif void testReconstruct(bool use_raft) { - // Construct on a random device to test multi-device, if we have - // multiple devices - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); - - int dim = 32; - int numVecs = 100; - auto vecs = faiss::gpu::randVecs(numVecs, dim); - auto vecs16 = faiss::gpu::roundToHalf(vecs); - - for (bool useFloat16 : {false, true}) { - faiss::gpu::GpuIndexFlatConfig config; - config.device = device; - config.useFloat16 = useFloat16; - config.use_raft = use_raft; - - faiss::gpu::GpuIndexFlat gpuIndex( - &res, dim, faiss::MetricType::METRIC_L2, config); - - gpuIndex.add(numVecs, vecs.data()); - - // Test reconstruct - { - auto reconstructVecs = std::vector(dim); - gpuIndex.reconstruct(15, reconstructVecs.data()); - - auto& ref = useFloat16 ? vecs16 : vecs; - - for (int i = 0; i < dim; ++i) { - EXPECT_EQ(reconstructVecs[i], ref[15 * dim + i]); - } - } - - // Test reconstruct_n - if (false) { - auto reconstructVecs = std::vector((numVecs - 1) * dim); - - int startVec = 5; - int endVec = numVecs - 1; - int numReconstructVec = endVec - startVec + 1; - - gpuIndex.reconstruct_n( - startVec, numReconstructVec, reconstructVecs.data()); - - auto& ref = useFloat16 ? 
vecs16 : vecs; - - for (int i = 0; i < numReconstructVec; ++i) { - for (int j = 0; j < dim; ++j) { - EXPECT_EQ( - reconstructVecs[i * dim + j], - ref[(i + startVec) * dim + j]); - } - } - } - - // Test reconstruct_batch - if (false) { - auto reconstructKeys = std::vector{1, 3, 5}; - auto reconstructVecs = - std::vector(reconstructKeys.size() * dim); - - gpuIndex.reconstruct_batch( - reconstructKeys.size(), - reconstructKeys.data(), - reconstructVecs.data()); - - auto& ref = useFloat16 ? vecs16 : vecs; - - for (int i = 0; i < reconstructKeys.size(); ++i) { - for (int j = 0; j < dim; ++j) { - EXPECT_EQ( - reconstructVecs[i * dim + j], - ref[reconstructKeys[i] * dim + j]); - } - } - } - } + // Construct on a random device to test multi-device, if we have + // multiple devices + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + int dim = 32; + int numVecs = 100; + auto vecs = faiss::gpu::randVecs(numVecs, dim); + auto vecs16 = faiss::gpu::roundToHalf(vecs); + + for (bool useFloat16 : {false, true}) { + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.useFloat16 = useFloat16; + config.use_raft = use_raft; + + faiss::gpu::GpuIndexFlat gpuIndex( + &res, dim, faiss::MetricType::METRIC_L2, config); + + gpuIndex.add(numVecs, vecs.data()); + + // Test reconstruct + { + auto reconstructVecs = std::vector(dim); + gpuIndex.reconstruct(15, reconstructVecs.data()); + + auto& ref = useFloat16 ? vecs16 : vecs; + + for (int i = 0; i < dim; ++i) { + EXPECT_EQ(reconstructVecs[i], ref[15 * dim + i]); + } + } + + // Test reconstruct_n + if (false) { + auto reconstructVecs = std::vector((numVecs - 1) * dim); + + int startVec = 5; + int endVec = numVecs - 1; + int numReconstructVec = endVec - startVec + 1; + + gpuIndex.reconstruct_n( + startVec, numReconstructVec, reconstructVecs.data()); + + auto& ref = useFloat16 ? 
vecs16 : vecs; + + for (int i = 0; i < numReconstructVec; ++i) { + for (int j = 0; j < dim; ++j) { + EXPECT_EQ( + reconstructVecs[i * dim + j], + ref[(i + startVec) * dim + j]); + } + } + } + + // Test reconstruct_batch + if (false) { + auto reconstructKeys = std::vector{1, 3, 5}; + auto reconstructVecs = + std::vector(reconstructKeys.size() * dim); + + gpuIndex.reconstruct_batch( + reconstructKeys.size(), + reconstructKeys.data(), + reconstructVecs.data()); + + auto& ref = useFloat16 ? vecs16 : vecs; + + for (int i = 0; i < reconstructKeys.size(); ++i) { + for (int j = 0; j < dim; ++j) { + EXPECT_EQ( + reconstructVecs[i * dim + j], + ref[reconstructKeys[i] * dim + j]); + } + } + } + } } TEST(TestGpuIndexFlat, Reconstruct) { - testReconstruct(false); + testReconstruct(false); } #if defined USE_NVIDIA_RAFT TEST(TestRaftGpuIndexFlat, Reconstruct) { - testReconstruct(true); + testReconstruct(true); } #endif void testSearchAndReconstruct(bool use_raft) { - // Construct on a random device to test multi-device, if we have - // multiple devices - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); - - size_t dim = 32; - size_t nb = 5000; - size_t nq = 10; - int k = 10; - - auto xb = faiss::gpu::randVecs(nb, dim); - auto xq = faiss::gpu::randVecs(nq, dim); - - faiss::IndexFlatL2 cpuIndex(dim); - - faiss::gpu::GpuIndexFlatConfig config; - config.device = device; - config.use_raft = use_raft; - faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config); - - cpuIndex.add(nb, xb.data()); - gpuIndex.add(nb, xb.data()); - - std::vector refDistance(nq * k, 0); - std::vector refIndices(nq * k, -1); - std::vector refReconstruct(nq * k * dim, 0); - cpuIndex.search_and_reconstruct( - nq, - xq.data(), - k, - refDistance.data(), - refIndices.data(), - refReconstruct.data()); - - std::vector testDistance(nq * k, 0); - std::vector testIndices(nq * k, -1); - std::vector testReconstruct(nq * k * dim, 0); - 
gpuIndex.search_and_reconstruct( - nq, - xq.data(), - k, - testDistance.data(), - testIndices.data(), - testReconstruct.data()); - - // This handles the search results - faiss::gpu::compareLists( - refDistance.data(), - refIndices.data(), - testDistance.data(), - testIndices.data(), - nq, - k, - "SearchAndReconstruct", - true, - false, - true, - kF32MaxRelErr, - 0.1f, - 0.015f); - - // As the search results may be slightly different (though compareLists - // above will ensure a decent number of matches), reconstruction should be - // the same for the vectors that do match - for (int i = 0; i < nq; ++i) { - std::unordered_map refLocation; - - for (int j = 0; j < k; ++j) { - refLocation.insert(std::make_pair(refIndices[i * k + j], j)); - } - - for (int j = 0; j < k; ++j) { - auto idx = testIndices[i * k + j]; - auto it = refLocation.find(idx); - if (it != refLocation.end()) { - for (int d = 0; d < dim; ++d) { - EXPECT_EQ( - refReconstruct[(i * k + it->second) * dim + d], - testReconstruct[(i * k + j) * dim + d]); - } - } - } - } + // Construct on a random device to test multi-device, if we have + // multiple devices + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + size_t dim = 32; + size_t nb = 5000; + size_t nq = 10; + int k = 10; + + auto xb = faiss::gpu::randVecs(nb, dim); + auto xq = faiss::gpu::randVecs(nq, dim); + + faiss::IndexFlatL2 cpuIndex(dim); + + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.use_raft = use_raft; + faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config); + + cpuIndex.add(nb, xb.data()); + gpuIndex.add(nb, xb.data()); + + std::vector refDistance(nq * k, 0); + std::vector refIndices(nq * k, -1); + std::vector refReconstruct(nq * k * dim, 0); + cpuIndex.search_and_reconstruct( + nq, + xq.data(), + k, + refDistance.data(), + refIndices.data(), + refReconstruct.data()); + + std::vector testDistance(nq * k, 0); + std::vector 
testIndices(nq * k, -1); + std::vector testReconstruct(nq * k * dim, 0); + gpuIndex.search_and_reconstruct( + nq, + xq.data(), + k, + testDistance.data(), + testIndices.data(), + testReconstruct.data()); + + // This handles the search results + faiss::gpu::compareLists( + refDistance.data(), + refIndices.data(), + testDistance.data(), + testIndices.data(), + nq, + k, + "SearchAndReconstruct", + true, + false, + true, + kF32MaxRelErr, + 0.1f, + 0.015f); + + // As the search results may be slightly different (though compareLists + // above will ensure a decent number of matches), reconstruction should be + // the same for the vectors that do match + for (int i = 0; i < nq; ++i) { + std::unordered_map refLocation; + + for (int j = 0; j < k; ++j) { + refLocation.insert(std::make_pair(refIndices[i * k + j], j)); + } + + for (int j = 0; j < k; ++j) { + auto idx = testIndices[i * k + j]; + auto it = refLocation.find(idx); + if (it != refLocation.end()) { + for (int d = 0; d < dim; ++d) { + EXPECT_EQ( + refReconstruct[(i * k + it->second) * dim + d], + testReconstruct[(i * k + j) * dim + d]); + } + } + } + } } TEST(TestGpuIndexFlat, SearchAndReconstruct) { - testSearchAndReconstruct(false); + testSearchAndReconstruct(false); } #if defined USE_NVIDIA_RAFT TEST(TestRaftGpuIndexFlat, SearchAndReconstruct) { - testSearchAndReconstruct(true); + testSearchAndReconstruct(true); } #endif int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); + testing::InitGoogleTest(&argc, argv); - // just run with a fixed test seed - faiss::gpu::setTestSeed(100); + // just run with a fixed test seed + faiss::gpu::setTestSeed(100); - return RUN_ALL_TESTS(); -} + return RUN_ALL_TESTS(); +} \ No newline at end of file From 1c621add67996dbdf5e420fb3246cd74f20d38c5 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Fri, 7 Jul 2023 15:47:53 -0700 Subject: [PATCH 63/87] First version of copyFrom and copyTo --- faiss/gpu/impl/RaftIVFFlat.cu | 330 ++++++++++++++++++++------------- 
faiss/gpu/impl/RaftIVFFlat.cuh | 63 +++++-- 2 files changed, 254 insertions(+), 139 deletions(-) diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 7b92f1df12..645c4c4840 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -45,6 +45,10 @@ #include #include + +#include +#include + #include namespace faiss { @@ -187,14 +191,7 @@ std::vector RaftIVFFlat::getListIndices(idx_t listId) const { const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); - uint32_t size; - - raft::copy( - &size, - raft_knn_index.value().list_sizes().data_handle() + listId, - 1, - raft_handle.get_stream()); - raft_handle.sync_stream(); + uint32_t size = getListLength(listId); std::vector vec(size); raft::copy( @@ -202,6 +199,7 @@ std::vector RaftIVFFlat::getListIndices(idx_t listId) const { *(raft_knn_index.value().inds_ptrs().data_handle() + listId), size, raft_handle.get_stream()); + raft_handle.sync_stream(); return vec; } @@ -218,22 +216,22 @@ std::vector RaftIVFFlat::getListVectorData(idx_t listId, bool gpuFormat using elem_t = decltype(raft_knn_index.value().data_ptrs())::element_type; size_t dim = raft_knn_index.value().dim(); - uint32_t list_size; - - raft::copy(&list_size, raft_knn_index.value().list_sizes().data_handle() + listId, 1, raft_handle.get_stream()); - + idx_t list_size = getListLength(listId); // the interleaved block can be slightly larger than the list size (it's // rounded up) - size_t byte_size = size_t(list_size) * sizeof(elem_t) * dim; - std::vector vec(byte_size); - raft::copy( - vec.data(), - reinterpret_cast( - raft_knn_index.value().data_ptrs().data_handle()+listId), - byte_size, - raft_handle.get_stream()); - return vec; + size_t nblocks = utils::divUp(list_size, raft::neighbors::ivf_flat::kIndexGroupSize); + size_t interleavedCodeSize = nblocks * raft::neighbors::ivf_flat::kIndexGroupSize * dim * sizeof(elem_t); + size_t flat_code_size = list_size * dim * sizeof(elem_t); + 
std::vector interleaved_codes(interleavedCodeSize); + std::vector flat_codes(flat_code_size); + + RaftIVFFlatCodePackerFlat p(resources_, interleavedCodeSize); + p.unpack_1(reinterpret_cast( + raft_knn_index.value().data_ptrs().data_handle()+listId), 0, interleaved_codes.data()); + RaftIVFFlatCodePackerInterleaved up((size_t)list_size, (size_t)dim, (size_t)raft_knn_index.value().veclen()); + up.unpack_1(interleaved_codes.data(), 0, flat_codes.data()); + return flat_codes; } /// Performs search when we are already given the IVF cells to look at @@ -311,117 +309,201 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { // // -// void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { -// size_t nlist = ivf ? ivf->nlist : 0; -// size_t ntotal = ivf ? ivf->compute_ntotal() : 0; -// -// printf("Inside RAFT copyInvertedListsFrom\n"); -// raft::device_resources &handle = resources_->getRaftHandleCurrentDevice(); -// // We need to allocate the IVF -// printf("nlist=%ld, ntotal=%ld\n", nlist, ntotal); -// -// std::vector list_sizes_(nlist); -// std::vector list_offsets_(nlist+1); -// std::vector indices_(ntotal); -// -// raft::neighbors::ivf_flat::index_params raft_idx_params; -// raft_idx_params.n_lists = nlist; -// raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; -// raft_idx_params.add_data_on_build = false; -// raft_idx_params.kmeans_n_iters = 100; +void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { + size_t nlist = ivf ? ivf->nlist : 0; + size_t ntotal = ivf ? 
ivf->compute_ntotal() : 0; + + raft::device_resources &handle = resources_->getRaftHandleCurrentDevice(); + + std::vector list_sizes_(nlist); + std::vector list_offsets_(nlist+1); + std::vector indices_(ntotal); + + raft::neighbors::ivf_flat::index_params raft_idx_params; + raft_idx_params.n_lists = nlist; + raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; + raft_idx_params.add_data_on_build = false; + raft_idx_params.kmeans_n_iters = 100; // -// raft_knn_index.emplace(handle, raft_idx_params, dim_); + raft_knn_index.emplace(handle, raft_idx_params, dim_); // raft_knn_index.value().allocate(handle, ntotal, true); // -// for (size_t i = 0; i < nlist; ++i) { -// size_t listSize = ivf->list_size(i); -// -// // GPU index can only support max int entries per list -// FAISS_THROW_IF_NOT_FMT( -// listSize <= (size_t)std::numeric_limits::max(), -// "GPU inverted list can only support " -// "%zu entries; %zu found", -// (size_t)std::numeric_limits::max(), -// listSize); + for (size_t i = 0; i < nlist; ++i) { + size_t listSize = ivf->list_size(i); + list_sizes_[i] = listSize; + + // GPU index can only support max int entries per list + FAISS_THROW_IF_NOT_FMT( + listSize <= (size_t)std::numeric_limits::max(), + "GPU inverted list can only support " + "%zu entries; %zu found", + (size_t)std::numeric_limits::max(), + listSize); + + addEncodedVectorsToList_( + i, ivf->get_codes(i), ivf->get_ids(i), listSize); + } // -// addEncodedVectorsToList_( -// i, ivf->get_codes(i), ivf->get_ids(i), listSize); -// } -// -// raft::update_device(raft_knn_index.value().list_sizes().data_handle(), -// list_sizes_.data(), nlist, handle.get_stream()); + raft::update_device(raft_knn_index.value().list_sizes().data_handle(), + list_sizes_.data(), nlist, handle.get_stream()); // raft::update_device(raft_knn_index.value().list_offsets().data_handle(), // list_offsets_.data(), nlist+1, handle.get_stream()); // -//} - -// void RaftIVFFlat::addEncodedVectorsToList_( -// int listId, -// 
const void* codes, -// const Index::idx_t* indices, -// size_t numVecs) { -// auto stream = resources_->getDefaultStreamCurrentDevice(); -// -// // This list must already exist -//// FAISS_ASSERT(listId < deviceListData_.size()); -// -// // This list must currently be empty -//// auto& listCodes = deviceListData_[listId]; -//// FAISS_ASSERT(listCodes->data.size() == 0); -//// FAISS_ASSERT(listCodes->numVecs == 0); -// -// // If there's nothing to add, then there's nothing we have to do -// if (numVecs == 0) { -// return; -// } -// -// // The GPU might have a different layout of the memory -// auto gpuListSizeInBytes = getGpuVectorsEncodingSize_(numVecs); -// auto cpuListSizeInBytes = getCpuVectorsEncodingSize_(numVecs); -// -// // We only have int32 length representations on the GPU per each -// // list; the length is in sizeof(char) -// FAISS_ASSERT(gpuListSizeInBytes <= -// (size_t)std::numeric_limits::max()); -// -// // Translate the codes as needed to our preferred form -// std::vector codesV(cpuListSizeInBytes); -// std::memcpy(codesV.data(), codes, cpuListSizeInBytes); +} + +size_t RaftIVFFlat::getGpuVectorsEncodingSize_(idx_t numVecs) const { + idx_t bits = 32 /* float */; + + // bytes to encode a block of 32 vectors (single dimension) + idx_t bytesPerDimBlock = bits * 32 / 8; // = 128 + + // bytes to fully encode 32 vectors + idx_t bytesPerBlock = bytesPerDimBlock * dim_; + + // number of blocks of 32 vectors we have + idx_t numBlocks = utils::divUp(numVecs, raft::neighbors::ivf_flat::kIndexGroupSize); + + // total size to encode numVecs + return bytesPerBlock * numBlocks; +} + + +void RaftIVFFlat::addEncodedVectorsToList_( + idx_t listId, + const void* codes, + const idx_t* indices, + idx_t numVecs) { + auto stream = resources_->getDefaultStreamCurrentDevice(); + + // This list must already exist + FAISS_ASSERT(raft_knn_index.has_value()); + + // This list must currently be empty + FAISS_ASSERT(getListLength(listId) == 0); + + // If there's nothing to add, 
then there's nothing we have to do + if (numVecs == 0) { + return; + } + + // The GPU might have a different layout of the memory + auto gpuListSizeInBytes = getGpuVectorsEncodingSize_(numVecs); + auto cpuListSizeInBytes = getCpuVectorsEncodingSize_(numVecs); + + // We only have int32 length representations on the GPU per each + // list; the length is in sizeof(char) + FAISS_ASSERT(gpuListSizeInBytes <= + (size_t)std::numeric_limits::max()); + + // Translate the codes as needed to our preferred form + std::vector codesV(cpuListSizeInBytes); + std::memcpy(codesV.data(), codes, cpuListSizeInBytes); // auto translatedCodes = translateCodesToGpu_(std::move(codesV), numVecs); -// -// std::cout << "numVecs=" << numVecs << "gpuListSizeInBytes=" << -// gpuListSizeInBytes << std::endl; -// -//// -/// RAFT_CUDA_TRY(cudaMemcpyAsync(raft_knn_index.value().data().data_handle()+(), -/// translatedCodes.data(), )) -// -//// listCodes->data.append( -//// translatedCodes.data(), -//// gpuListSizeInBytes, -//// stream, -//// true /* exact reserved size */); -//// listCodes->numVecs = numVecs; -//// -//// // Handle the indices as well -//// addIndicesFromCpu_(listId, indices, numVecs); -//// -// -// // We should problay consider using this... 
-//// deviceListDataPointers_.setAt( -//// listId, (void*)listCodes->data.data(), stream); -//// deviceListLengths_.setAt(listId, (int)numVecs, stream); -//// -//// // We update this as well, since the multi-pass algorithm uses it -//// maxListLength_ = std::max(maxListLength_, (int)numVecs); -//} - -///// Copy all inverted lists from ourselves to a CPU representation -// void RaftIVFFlat::copyInvertedListsTo(InvertedLists* ivf) { -// printf("Inside RaftIVFFlat copyInvertedListsTo\n"); -// -// // TODO: Need to replicate copyInvertedListsTo() in IVFBase.cu -//} + + std::cout << "numVecs=" << numVecs << "gpuListSizeInBytes=" << + gpuListSizeInBytes << std::endl; +// utils::divUp(numVecs, 32); + RaftIVFFlatCodePackerInterleaved transform_packer((size_t)numVecs, (size_t)dim_, (size_t)raft_knn_index.value().veclen()); + std::vector interleaved_codes(gpuListSizeInBytes); + std::memcpy(codesV.data(), codes, cpuListSizeInBytes); + transform_packer.pack_1(codesV.data(), 0, interleaved_codes.data()); + RaftIVFFlatCodePackerFlat copy_packer(resources_, cpuListSizeInBytes); + copy_packer.unpack_1(interleaved_codes.data(), 0, (uint8_t*)(raft_knn_index.value().data_ptrs().data_handle() + listId)); + + uint32_t size = numVecs; + raft::update_device(raft_knn_index.value().list_sizes().data_handle() + listId, &size, 1, stream); + + // Handle the indices as well + raft::update_device((idx_t*)(raft_knn_index.value().inds_ptrs().data_handle() + listId), indices, numVecs, stream); +} +/// Copy all inverted lists from ourselves to a CPU representation +void RaftIVFFlat::copyInvertedListsTo(InvertedLists* ivf) { + printf("Inside RaftIVFFlat copyInvertedListsTo\n"); + + for (idx_t i = 0; i < numLists_; ++i) { + auto listIndices = getListIndices(i); + auto listData = getListVectorData(i, false); + + ivf->add_entries( + i, listIndices.size(), listIndices.data(), listData.data()); + } +} + +// std::vector RaftIVFFlat::translateCodesToGpu_( +// std::vector codes, +// std::vector block, +// 
idx_t numVecs) const { +// if (!interleavedLayout_) { +// // same format +// return codes; +// } +// RaftIVFFlatCodePackerInterleaved packer; +// packer::pack_all(codes.data(), block.data()); +// } + +// std::vector RaftIVFFlat::translateCodesFromGpu_( +// std::vector codes, +// idx_t numVecs) const { +// if (!interleavedLayout_) { +// // same format +// return codes; +// } + +// RaftIVFFlatCodePackerFlat packer; +// packer::unpack_all(block.data(), codes.data()); +// } + + + +RaftIVFFlatCodePackerInterleaved::RaftIVFFlatCodePackerInterleaved(size_t list_size, size_t dim, size_t veclen) { + this->list_size = list_size; + this->dim = dim; + this->veclen = veclen; + nvec = 1; + code_size = list_size * dim * sizeof(uint32_t); + block_size = utils::divUp(list_size, raft::neighbors::ivf_flat::kIndexGroupSize); +} + +void RaftIVFFlatCodePackerInterleaved::pack_1(const uint8_t* flat_code, size_t offset, uint8_t* block) const { + FAISS_ASSERT(offset == 0); + raft::neighbors::ivf_flat::helpers::pack_host_interleaved( + flat_code, + block, + nvec, + dim, + veclen); +} + +void RaftIVFFlatCodePackerInterleaved::unpack_1(const uint8_t* block, size_t offset, uint8_t* flat_code) const { + FAISS_ASSERT(offset == 0); + raft::neighbors::ivf_flat::helpers::unpack_host_interleaved( + block, + flat_code, + nvec, + dim, + veclen); +} + +RaftIVFFlatCodePackerFlat::RaftIVFFlatCodePackerFlat(GpuResources* resources_, size_t code_size) { + this->resources = resources_; + nvec = 1; + code_size = code_size; + block_size = code_size; +} + +void RaftIVFFlatCodePackerFlat::pack_1(const uint8_t* flat_code, size_t offset, uint8_t* block) const { + FAISS_ASSERT(offset == 0); + const raft::device_resources& raft_handle = resources->getRaftHandleCurrentDevice(); + raft::update_device(block, flat_code, code_size * nvec, raft_handle.get_stream()); +} + +void RaftIVFFlatCodePackerFlat::unpack_1(const uint8_t* block, size_t offset, uint8_t* flat_code) const { + FAISS_ASSERT(offset == 0); + const 
raft::device_resources& raft_handle = resources->getRaftHandleCurrentDevice(); + raft::update_host(flat_code, block, code_size * nvec, raft_handle.get_stream()); + raft_handle.sync_stream(); +} + } // namespace gpu } // namespace faiss diff --git a/faiss/gpu/impl/RaftIVFFlat.cuh b/faiss/gpu/impl/RaftIVFFlat.cuh index 199e649eeb..8f81bd8c42 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cuh +++ b/faiss/gpu/impl/RaftIVFFlat.cuh @@ -27,6 +27,9 @@ #include #include +#include + +#include #include @@ -98,27 +101,57 @@ class RaftIVFFlat : public IVFFlat { void updateQuantizer(Index* quantizer) override; - // - // /// Copy all inverted lists from a CPU representation to ourselves - // void copyInvertedListsFrom(const InvertedLists* ivf) override; - // - // /// Copy all inverted lists from ourselves to a CPU representation - // void copyInvertedListsTo(InvertedLists* ivf) override; + /// Copy all inverted lists from a CPU representation to ourselves + void copyInvertedListsFrom(const InvertedLists* ivf) override; + + /// Copy all inverted lists from ourselves to a CPU representation + void copyInvertedListsTo(InvertedLists* ivf) override; protected: - // /// Adds a set of codes and indices to a list, with the representation - // /// coming from the CPU equivalent - // void addEncodedVectorsToList_( - // int listId, - // // resident on the host - // const void* codes, - // // resident on the host - // const Index::idx_t* indices, - // size_t numVecs) override; + /// Adds a set of codes and indices to a list, with the representation + /// coming from the CPU equivalent + void addEncodedVectorsToList_( + idx_t listId, + // resident on the host + const void* codes, + // resident on the host + const idx_t* indices, + idx_t numVecs) override; + + /// Returns the number of bytes in which an IVF list containing numVecs + /// vectors is encoded on the device. 
Note that due to padding this is not + /// the same as the encoding size for a subset of vectors in an IVF list; + /// this is the size for an entire IVF list + size_t getGpuVectorsEncodingSize_(idx_t numVecs) const override; std::optional> raft_knn_index{std::nullopt}; }; + +struct RaftIVFFlatCodePackerInterleaved : CodePacker { + RaftIVFFlatCodePackerInterleaved(size_t list_size, size_t dim, size_t veclen); + void pack_1(const uint8_t* flat_code, size_t offset, uint8_t* block) + const final; + void unpack_1(const uint8_t* block, size_t offset, uint8_t* flat_code) + const final; + + protected: + size_t list_size; + size_t veclen; + size_t dim; +}; +struct RaftIVFFlatCodePackerFlat : CodePacker { + RaftIVFFlatCodePackerFlat(GpuResources* resources_, size_t code_size); + void pack_1(const uint8_t* flat_code, size_t offset, uint8_t* block) + const final; + void unpack_1(const uint8_t* block, size_t offset, uint8_t* flat_code) + const final; + + protected: + GpuResources* resources; +}; + + } // namespace gpu } // namespace faiss From b8d616d97b8cbfe6cda6685ffe53dae8089a995d Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Mon, 24 Jul 2023 18:11:26 -0700 Subject: [PATCH 64/87] Update copyFrom and copyTo --- faiss/gpu/GpuIndexIVFFlat.cu | 21 +-- faiss/gpu/impl/IVFBase.cu | 1 + faiss/gpu/impl/RaftIVFFlat.cu | 234 +++++++++++++++++++-------------- faiss/gpu/impl/RaftIVFFlat.cuh | 18 +-- 4 files changed, 139 insertions(+), 135 deletions(-) diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index f257b09952..7d2a94d595 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -177,24 +177,9 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { ivfFlatConfig_.indicesOptions, config_.memorySpace); - if (config_.use_raft) { - printf("Reconstructing %d original vectors and adding to GPU index\n", - ntotal); - - // Quantizer should already have been updated above. 
Add reconstructed - // vectors to raft index - if (ntotal > 0) { - std::vector buf_host(ntotal * d); - std::vector ids(ntotal); - std::iota(ids.begin(), ids.end(), 0); - index->reconstruct_n(0, ntotal, buf_host.data()); - add_with_ids(ntotal, buf_host.data(), ids.data()); - } - } else { - // Copy all of the IVF data - printf("Copying inverted lists from cpu index to FAISS gpu index flat\n"); - index_->copyInvertedListsFrom(index->invlists); - } + // Copy all of the IVF data + printf("Copying inverted lists from cpu index to FAISS gpu index flat\n"); + index_->copyInvertedListsFrom(index->invlists); } void GpuIndexIVFFlat::copyTo(faiss::IndexIVFFlat* index) const { diff --git a/faiss/gpu/impl/IVFBase.cu b/faiss/gpu/impl/IVFBase.cu index 890d489440..1e2f414fc0 100644 --- a/faiss/gpu/impl/IVFBase.cu +++ b/faiss/gpu/impl/IVFBase.cu @@ -323,6 +323,7 @@ std::vector IVFBase::getListVectorData(idx_t listId, bool gpuFormat) } void IVFBase::copyInvertedListsFrom(const InvertedLists* ivf) { + printf("inside ivf-flat's copyInvertedListsFrom\n"); idx_t nlist = ivf ? 
ivf->nlist : 0; for (idx_t i = 0; i < nlist; ++i) { addEncodedVectorsToList_( diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 645c4c4840..e543d0ff5f 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -190,15 +190,19 @@ std::vector RaftIVFFlat::getListIndices(idx_t listId) const { FAISS_ASSERT(raft_knn_index.has_value()); const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); + auto stream = raft_handle.get_stream(); - uint32_t size = getListLength(listId); + idx_t listSize = getListLength(listId); - std::vector vec(size); - raft::copy( - vec.data(), - *(raft_knn_index.value().inds_ptrs().data_handle() + listId), - size, - raft_handle.get_stream()); + std::vector vec(listSize); + + idx_t* list_indices_ptr; + + // fetch the list indices ptr on host + raft::update_host(&list_indices_ptr, raft_knn_index.value().inds_ptrs().data_handle()+listId, 1, stream); // Copy the pointer to the first array from device to host + raft_handle.sync_stream(); + + raft::update_host(vec.data(), list_indices_ptr, listSize, stream); raft_handle.sync_stream(); return vec; } @@ -209,28 +213,33 @@ std::vector RaftIVFFlat::getListVectorData(idx_t listId, bool gpuFormat printf("Inside RaftIVFFlat getListVectorData\n"); FAISS_ASSERT(raft_knn_index.has_value()); - const raft::device_resources& raft_handle = - resources_->getRaftHandleCurrentDevice(); + + const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); + auto stream = raft_handle.get_stream(); std::cout << "Calling getListVectorData for " << listId << std::endl; - using elem_t = decltype(raft_knn_index.value().data_ptrs())::element_type; - size_t dim = raft_knn_index.value().dim(); - idx_t list_size = getListLength(listId); + idx_t listSize = getListLength(listId); // the interleaved block can be slightly larger than the list size (it's // rounded up) - size_t nblocks = utils::divUp(list_size, 
raft::neighbors::ivf_flat::kIndexGroupSize); - size_t interleavedCodeSize = nblocks * raft::neighbors::ivf_flat::kIndexGroupSize * dim * sizeof(elem_t); - size_t flat_code_size = list_size * dim * sizeof(elem_t); - std::vector interleaved_codes(interleavedCodeSize); - std::vector flat_codes(flat_code_size); - - RaftIVFFlatCodePackerFlat p(resources_, interleavedCodeSize); - p.unpack_1(reinterpret_cast( - raft_knn_index.value().data_ptrs().data_handle()+listId), 0, interleaved_codes.data()); - RaftIVFFlatCodePackerInterleaved up((size_t)list_size, (size_t)dim, (size_t)raft_knn_index.value().veclen()); - up.unpack_1(interleaved_codes.data(), 0, flat_codes.data()); + auto gpuListSizeInBytes = getGpuVectorsEncodingSize_(listSize); + auto cpuListSizeInBytes = getCpuVectorsEncodingSize_(listSize); + + std::vector interleaved_codes(gpuListSizeInBytes); + std::vector flat_codes(cpuListSizeInBytes); + + float* list_data_ptr; + + // fetch the list data ptr on host + raft::update_host(&list_data_ptr, raft_knn_index.value().data_ptrs().data_handle()+listId, 1, stream); // Copy the pointer to the first array from device to host + raft_handle.sync_stream(); + printf("data ptr fetched successfully\n"); + + raft::update_host(interleaved_codes.data(), reinterpret_cast(list_data_ptr), gpuListSizeInBytes, stream); + raft_handle.sync_stream(); + RaftIVFFlatCodePackerInterleaved packer((size_t)listSize, dim_, raft_knn_index.value().veclen()); + packer.unpack_all(interleaved_codes.data(), flat_codes.data()); return flat_codes; } @@ -265,6 +274,10 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { raft::neighbors::ivf_flat::index_params pams; pams.add_data_on_build = false; + pams.n_lists = this->numLists_; + + printf("numLists %d", pams.n_lists); + switch (this->metric_) { case faiss::METRIC_L2: printf("Using L2!\n"); @@ -280,15 +293,12 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { raft_knn_index.emplace( handle, - pams.metric, - (uint32_t)this->numLists_, - false, - 
false, + pams, (uint32_t)this->dim_); printf("Reconstructing\n"); // Copy (reconstructed) centroids over, rather than re-training - rmm::device_uvector buf_dev(total_elems, stream); +// rmm::device_uvector buf_dev(total_elems, stream); std::vector buf_host(total_elems); quantizer->reconstruct_n(0, quantizer_ntotal, buf_host.data()); @@ -310,45 +320,66 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { // // void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { + printf("Inside raft's copyInvertedListsFrom\n"); size_t nlist = ivf ? ivf->nlist : 0; size_t ntotal = ivf ? ivf->compute_ntotal() : 0; - raft::device_resources &handle = resources_->getRaftHandleCurrentDevice(); + raft::device_resources &raft_handle = resources_->getRaftHandleCurrentDevice(); std::vector list_sizes_(nlist); - std::vector list_offsets_(nlist+1); std::vector indices_(ntotal); - raft::neighbors::ivf_flat::index_params raft_idx_params; - raft_idx_params.n_lists = nlist; - raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; - raft_idx_params.add_data_on_build = false; - raft_idx_params.kmeans_n_iters = 100; -// - raft_knn_index.emplace(handle, raft_idx_params, dim_); + // the index must already exist + FAISS_ASSERT(raft_knn_index.has_value()); +// if(!raft_knn_index.has_value()) { +// printf("emplacing because index is null"); +// raft::neighbors::ivf_flat::index_params raft_idx_params; +// raft_idx_params.n_lists = nlist; +// raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; +// raft_idx_params.add_data_on_build = false; +// raft_idx_params.kmeans_n_iters = 100; + +// raft_knn_index.emplace(handle, raft_idx_params, dim_); +// } // raft_knn_index.value().allocate(handle, ntotal, true); -// + auto& raft_lists = raft_knn_index.value().lists(); + + // conservative memory alloc for cloning cpu inverted lists + raft::neighbors::ivf_flat::list_spec raft_list_spec{static_cast(dim_), true}; + for (size_t i = 0; i < nlist; ++i) { - size_t listSize = 
ivf->list_size(i); - list_sizes_[i] = listSize; - // GPU index can only support max int entries per list + size_t listSize = ivf->list_size(i); + + // GPU index can only support max int entries per list FAISS_THROW_IF_NOT_FMT( listSize <= (size_t)std::numeric_limits::max(), "GPU inverted list can only support " "%zu entries; %zu found", (size_t)std::numeric_limits::max(), listSize); + + // store the list size + list_sizes_[i] = static_cast(listSize); + + raft::neighbors::ivf::resize_list(raft_handle, + raft_lists[i], + raft_list_spec, + (uint32_t)(raft::Pow2::roundUp(listSize)), + (uint32_t)0); + printf("listSize %d\n", listSize); + } - addEncodedVectorsToList_( - i, ivf->get_codes(i), ivf->get_ids(i), listSize); + // Update the pointers and the sizes + raft_knn_index.value().recompute_internal_state(raft_handle); + + for (size_t i = 0; i < nlist; ++i) { + size_t listSize = ivf->list_size(i); + addEncodedVectorsToList_(i, ivf->get_codes(i), ivf->get_ids(i), listSize); } -// - raft::update_device(raft_knn_index.value().list_sizes().data_handle(), - list_sizes_.data(), nlist, handle.get_stream()); -// raft::update_device(raft_knn_index.value().list_offsets().data_handle(), -// list_offsets_.data(), nlist+1, handle.get_stream()); -// + + raft::update_device(raft_knn_index.value().list_sizes().data_handle(), list_sizes_.data(), nlist, raft_handle.get_stream()); + raft_handle.sync_stream(); } size_t RaftIVFFlat::getGpuVectorsEncodingSize_(idx_t numVecs) const { @@ -373,6 +404,7 @@ void RaftIVFFlat::addEncodedVectorsToList_( const void* codes, const idx_t* indices, idx_t numVecs) { + printf("inside addEncodedVectorsToList_ for listId %d\n", listId); auto stream = resources_->getDefaultStreamCurrentDevice(); // This list must already exist @@ -389,6 +421,10 @@ void RaftIVFFlat::addEncodedVectorsToList_( // The GPU might have a different layout of the memory auto gpuListSizeInBytes = getGpuVectorsEncodingSize_(numVecs); auto cpuListSizeInBytes = 
getCpuVectorsEncodingSize_(numVecs); + + printf("numVecs %d\n", numVecs); + printf("gpuListSizeInBytes %d\n", gpuListSizeInBytes); + printf("cpuListSizeInBytes %d\n", cpuListSizeInBytes); // We only have int32 length representations on the GPU per each // list; the length is in sizeof(char) @@ -396,25 +432,40 @@ void RaftIVFFlat::addEncodedVectorsToList_( (size_t)std::numeric_limits::max()); // Translate the codes as needed to our preferred form - std::vector codesV(cpuListSizeInBytes); - std::memcpy(codesV.data(), codes, cpuListSizeInBytes); +// std::vector codesV(cpuListSizeInBytes); +// std::memcpy(codesV.data(), codes, cpuListSizeInBytes); // auto translatedCodes = translateCodesToGpu_(std::move(codesV), numVecs); - std::cout << "numVecs=" << numVecs << "gpuListSizeInBytes=" << - gpuListSizeInBytes << std::endl; -// utils::divUp(numVecs, 32); - RaftIVFFlatCodePackerInterleaved transform_packer((size_t)numVecs, (size_t)dim_, (size_t)raft_knn_index.value().veclen()); - std::vector interleaved_codes(gpuListSizeInBytes); - std::memcpy(codesV.data(), codes, cpuListSizeInBytes); - transform_packer.pack_1(codesV.data(), 0, interleaved_codes.data()); - RaftIVFFlatCodePackerFlat copy_packer(resources_, cpuListSizeInBytes); - copy_packer.unpack_1(interleaved_codes.data(), 0, (uint8_t*)(raft_knn_index.value().data_ptrs().data_handle() + listId)); + std::vector interleaved_codes(gpuListSizeInBytes); +printf("dim %d\n", dim_); +printf("veclen %d\n", raft_knn_index.value().veclen()); + RaftIVFFlatCodePackerInterleaved packer((size_t)numVecs, (uint32_t)dim_, raft_knn_index.value().veclen()); + + printf("Allocated interleaved codes\n"); + packer.pack_all(reinterpret_cast(codes), interleaved_codes.data()); + printf("packing done\n"); - uint32_t size = numVecs; - raft::update_device(raft_knn_index.value().list_sizes().data_handle() + listId, &size, 1, stream); + float* list_data_ptr; + const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); + + // 
fetch the list data ptr on host + raft::update_host(&list_data_ptr, raft_knn_index.value().data_ptrs().data_handle()+listId, 1, stream); // Copy the pointer to the first array from device to host + raft_handle.sync_stream(); + printf("data ptr fetched successfully\n"); + + raft::update_device(reinterpret_cast(list_data_ptr), interleaved_codes.data(), gpuListSizeInBytes, stream); + raft_handle.sync_stream(); + printf("copied to gpu\n"); // Handle the indices as well - raft::update_device((idx_t*)(raft_knn_index.value().inds_ptrs().data_handle() + listId), indices, numVecs, stream); + idx_t* list_indices_ptr; + + // fetch the list indices ptr on host + raft::update_host(&list_indices_ptr, raft_knn_index.value().inds_ptrs().data_handle()+listId, 1, stream); // Copy the pointer to the first array from device to host + raft_handle.sync_stream(); + raft::update_device(list_indices_ptr, indices, numVecs, stream); + raft_handle.sync_stream(); + printf("Done copying indices\n"); } /// Copy all inverted lists from ourselves to a CPU representation void RaftIVFFlat::copyInvertedListsTo(InvertedLists* ivf) { @@ -455,55 +506,34 @@ void RaftIVFFlat::copyInvertedListsTo(InvertedLists* ivf) { -RaftIVFFlatCodePackerInterleaved::RaftIVFFlatCodePackerInterleaved(size_t list_size, size_t dim, size_t veclen) { - this->list_size = list_size; +RaftIVFFlatCodePackerInterleaved::RaftIVFFlatCodePackerInterleaved(size_t list_size, uint32_t dim, uint32_t chunk_size) { this->dim = dim; - this->veclen = veclen; - nvec = 1; - code_size = list_size * dim * sizeof(uint32_t); - block_size = utils::divUp(list_size, raft::neighbors::ivf_flat::kIndexGroupSize); + this->chunk_size = chunk_size; + // NB: dim should be divisible by the number of 4 byte records in one chunk + FAISS_ASSERT(dim % chunk_size == 0); + nvec = list_size; + code_size = dim * 4; + block_size = utils::roundUp(nvec, raft::neighbors::ivf_flat::kIndexGroupSize); } void RaftIVFFlatCodePackerInterleaved::pack_1(const uint8_t* 
flat_code, size_t offset, uint8_t* block) const { - FAISS_ASSERT(offset == 0); - raft::neighbors::ivf_flat::helpers::pack_host_interleaved( - flat_code, - block, - nvec, + printf("packing offset %zu\n", offset); + raft::neighbors::ivf_flat::codepacker::pack_1( + reinterpret_cast(flat_code), + reinterpret_cast(block), dim, - veclen); + chunk_size, + static_cast(offset)); } void RaftIVFFlatCodePackerInterleaved::unpack_1(const uint8_t* block, size_t offset, uint8_t* flat_code) const { - FAISS_ASSERT(offset == 0); - raft::neighbors::ivf_flat::helpers::unpack_host_interleaved( - block, - flat_code, - nvec, + raft::neighbors::ivf_flat::codepacker::unpack_1( + reinterpret_cast(block), + reinterpret_cast(flat_code), dim, - veclen); -} - -RaftIVFFlatCodePackerFlat::RaftIVFFlatCodePackerFlat(GpuResources* resources_, size_t code_size) { - this->resources = resources_; - nvec = 1; - code_size = code_size; - block_size = code_size; -} - -void RaftIVFFlatCodePackerFlat::pack_1(const uint8_t* flat_code, size_t offset, uint8_t* block) const { - FAISS_ASSERT(offset == 0); - const raft::device_resources& raft_handle = resources->getRaftHandleCurrentDevice(); - raft::update_device(block, flat_code, code_size * nvec, raft_handle.get_stream()); + chunk_size, + static_cast(offset)); } -void RaftIVFFlatCodePackerFlat::unpack_1(const uint8_t* block, size_t offset, uint8_t* flat_code) const { - FAISS_ASSERT(offset == 0); - const raft::device_resources& raft_handle = resources->getRaftHandleCurrentDevice(); - raft::update_host(flat_code, block, code_size * nvec, raft_handle.get_stream()); - raft_handle.sync_stream(); -} - - } // namespace gpu } // namespace faiss diff --git a/faiss/gpu/impl/RaftIVFFlat.cuh b/faiss/gpu/impl/RaftIVFFlat.cuh index 8f81bd8c42..b85d503da3 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cuh +++ b/faiss/gpu/impl/RaftIVFFlat.cuh @@ -130,28 +130,16 @@ class RaftIVFFlat : public IVFFlat { struct RaftIVFFlatCodePackerInterleaved : CodePacker { - 
RaftIVFFlatCodePackerInterleaved(size_t list_size, size_t dim, size_t veclen); + RaftIVFFlatCodePackerInterleaved(size_t list_size, uint32_t dim, uint32_t chuk_size); void pack_1(const uint8_t* flat_code, size_t offset, uint8_t* block) const final; void unpack_1(const uint8_t* block, size_t offset, uint8_t* flat_code) const final; protected: - size_t list_size; - size_t veclen; - size_t dim; + uint32_t chunk_size; + uint32_t dim; }; -struct RaftIVFFlatCodePackerFlat : CodePacker { - RaftIVFFlatCodePackerFlat(GpuResources* resources_, size_t code_size); - void pack_1(const uint8_t* flat_code, size_t offset, uint8_t* block) - const final; - void unpack_1(const uint8_t* block, size_t offset, uint8_t* flat_code) - const final; - - protected: - GpuResources* resources; -}; - } // namespace gpu } // namespace faiss From 444c58d69af4882464e035d364a428859957449e Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Mon, 31 Jul 2023 16:51:21 -0700 Subject: [PATCH 65/87] Passing tests --- faiss/gpu/GpuIndexIVF.cu | 3 +- faiss/gpu/impl/RaftIVFFlat.cu | 145 +++++-------------------- faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 6 - 3 files changed, 31 insertions(+), 123 deletions(-) diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu index dfc9631f7e..3b21bddefd 100644 --- a/faiss/gpu/GpuIndexIVF.cu +++ b/faiss/gpu/GpuIndexIVF.cu @@ -457,7 +457,8 @@ void GpuIndexIVF::trainQuantizer_(idx_t n, const float* x) { raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; raft_idx_params.add_data_on_build = false; raft_idx_params.kmeans_trainset_fraction = 1.0; - raft_idx_params.kmeans_n_iters = 100; + raft_idx_params.kmeans_n_iters = cp.niter; + raft_idx_params.adaptive_centers = !cp.frozen_centroids; auto raft_index = raft::neighbors::ivf_flat::build( raft_handle, raft_idx_params, x, n, (idx_t)d); diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index e543d0ff5f..3a0bc78aeb 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ 
b/faiss/gpu/impl/RaftIVFFlat.cu @@ -87,7 +87,6 @@ void RaftIVFFlat::search( int k, Tensor& outDistances, Tensor& outIndices) { - printf("Inside RaftIVFFlat search()\n"); // TODO: We probably don't want to ignore the coarse quantizer here... @@ -114,7 +113,7 @@ void RaftIVFFlat::search( raft::neighbors::ivf_flat::search( raft_handle, pams, - *raft_knn_index, + raft_knn_index.value(), queries_view, out_inds_view, out_dists_view); @@ -130,9 +129,6 @@ idx_t RaftIVFFlat::addVectors( Index* coarseQuantizer, Tensor& vecs, Tensor& indices) { - printf("Inside RaftIVFFlat addVectors()\n"); - - raft::print_device_vector("add_vectors", vecs.data(), 50, std::cout); auto vecs_view = raft::make_device_matrix_view( vecs.data(), vecs.getSize(0), dim_); @@ -142,7 +138,6 @@ idx_t RaftIVFFlat::addVectors( const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); - printf("About to call extend on index\n"); // TODO: We probably don't want to ignore the coarse quantizer here if (raft_knn_index.has_value()) { @@ -154,20 +149,15 @@ idx_t RaftIVFFlat::addVectors( inds_view), raft_knn_index.value())); - } else { - printf("Index has not been trained!\n"); } - printf("Done.\n"); return vecs.getSize(0); } void RaftIVFFlat::reset() { - printf("Inside RaftIVFFlat reset()\n"); raft_knn_index.reset(); } idx_t RaftIVFFlat::getListLength(idx_t listId) const { - printf("Inside RaftIVFFlat getListLength\n"); FAISS_ASSERT(raft_knn_index.has_value()); const raft::device_resources& raft_handle = @@ -185,7 +175,6 @@ idx_t RaftIVFFlat::getListLength(idx_t listId) const { /// Return the list indices of a particular list back to the CPU std::vector RaftIVFFlat::getListIndices(idx_t listId) const { - printf("Inside RaftIVFFlat getListIndices\n"); FAISS_ASSERT(raft_knn_index.has_value()); const raft::device_resources& raft_handle = @@ -199,7 +188,7 @@ std::vector RaftIVFFlat::getListIndices(idx_t listId) const { idx_t* list_indices_ptr; // fetch the list indices ptr on host - 
raft::update_host(&list_indices_ptr, raft_knn_index.value().inds_ptrs().data_handle()+listId, 1, stream); // Copy the pointer to the first array from device to host + raft::update_host(&list_indices_ptr, raft_knn_index.value().inds_ptrs().data_handle()+listId, 1, stream); raft_handle.sync_stream(); raft::update_host(vec.data(), list_indices_ptr, listSize, stream); @@ -210,15 +199,12 @@ std::vector RaftIVFFlat::getListIndices(idx_t listId) const { /// Return the encoded vectors of a particular list back to the CPU std::vector RaftIVFFlat::getListVectorData(idx_t listId, bool gpuFormat) const { - printf("Inside RaftIVFFlat getListVectorData\n"); FAISS_ASSERT(raft_knn_index.has_value()); const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); auto stream = raft_handle.get_stream(); - std::cout << "Calling getListVectorData for " << listId << std::endl; - idx_t listSize = getListLength(listId); // the interleaved block can be slightly larger than the list size (it's @@ -229,15 +215,15 @@ std::vector RaftIVFFlat::getListVectorData(idx_t listId, bool gpuFormat std::vector interleaved_codes(gpuListSizeInBytes); std::vector flat_codes(cpuListSizeInBytes); - float* list_data_ptr; + float* list_data_ptr; // fetch the list data ptr on host - raft::update_host(&list_data_ptr, raft_knn_index.value().data_ptrs().data_handle()+listId, 1, stream); // Copy the pointer to the first array from device to host + raft::update_host(&list_data_ptr, raft_knn_index.value().data_ptrs().data_handle()+listId, 1, stream); raft_handle.sync_stream(); - printf("data ptr fetched successfully\n"); raft::update_host(interleaved_codes.data(), reinterpret_cast(list_data_ptr), gpuListSizeInBytes, stream); raft_handle.sync_stream(); + RaftIVFFlatCodePackerInterleaved packer((size_t)listSize, dim_, raft_knn_index.value().veclen()); packer.unpack_all(interleaved_codes.data(), flat_codes.data()); return flat_codes; @@ -254,16 +240,12 @@ void RaftIVFFlat::searchPreassigned( 
Tensor& outDistances, Tensor& outIndices, bool storePairs) { - printf("Inside RaftIVFFlat searchPreassigned\n"); - // TODO: Fill this in! } void RaftIVFFlat::updateQuantizer(Index* quantizer) { idx_t quantizer_ntotal = quantizer->ntotal; - std::cout << "Calling RAFT updateQuantizer with trained index with " - << quantizer_ntotal << " items" << std::endl; const raft::device_resources& handle = resources_->getRaftHandleCurrentDevice(); auto stream = handle.get_stream(); @@ -276,15 +258,11 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { pams.n_lists = this->numLists_; - printf("numLists %d", pams.n_lists); - switch (this->metric_) { case faiss::METRIC_L2: - printf("Using L2!\n"); pams.metric = raft::distance::DistanceType::L2Expanded; break; case faiss::METRIC_INNER_PRODUCT: - printf("Using Inner product!\n"); pams.metric = raft::distance::DistanceType::InnerProduct; break; default: @@ -296,31 +274,20 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { pams, (uint32_t)this->dim_); - printf("Reconstructing\n"); - // Copy (reconstructed) centroids over, rather than re-training -// rmm::device_uvector buf_dev(total_elems, stream); + /// Copy (reconstructed) centroids over, rather than re-training std::vector buf_host(total_elems); quantizer->reconstruct_n(0, quantizer_ntotal, buf_host.data()); - printf("Copying...\n"); - raft::update_device( raft_knn_index.value().centers().data_handle(), buf_host.data(), total_elems, stream); - - raft::print_device_vector( - "raft centers", - raft_knn_index.value().centers().data_handle(), - this->dim_, - std::cout); } // // void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { - printf("Inside raft's copyInvertedListsFrom\n"); size_t nlist = ivf ? ivf->nlist : 0; size_t ntotal = ivf ? 
ivf->compute_ntotal() : 0; @@ -331,17 +298,7 @@ void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { // the index must already exist FAISS_ASSERT(raft_knn_index.has_value()); -// if(!raft_knn_index.has_value()) { -// printf("emplacing because index is null"); -// raft::neighbors::ivf_flat::index_params raft_idx_params; -// raft_idx_params.n_lists = nlist; -// raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; -// raft_idx_params.add_data_on_build = false; -// raft_idx_params.kmeans_n_iters = 100; - -// raft_knn_index.emplace(handle, raft_idx_params, dim_); -// } -// raft_knn_index.value().allocate(handle, ntotal, true); + auto& raft_lists = raft_knn_index.value().lists(); // conservative memory alloc for cloning cpu inverted lists @@ -365,21 +322,32 @@ void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { raft::neighbors::ivf::resize_list(raft_handle, raft_lists[i], raft_list_spec, - (uint32_t)(raft::Pow2::roundUp(listSize)), + (uint32_t)listSize, (uint32_t)0); - printf("listSize %d\n", listSize); } // Update the pointers and the sizes raft_knn_index.value().recompute_internal_state(raft_handle); for (size_t i = 0; i < nlist; ++i) { - size_t listSize = ivf->list_size(i); - addEncodedVectorsToList_(i, ivf->get_codes(i), ivf->get_ids(i), listSize); - } + size_t listSize = ivf->list_size(i); + addEncodedVectorsToList_(i, ivf->get_codes(i), ivf->get_ids(i), listSize); + } raft::update_device(raft_knn_index.value().list_sizes().data_handle(), list_sizes_.data(), nlist, raft_handle.get_stream()); - raft_handle.sync_stream(); + + // Precompute the centers vector norms for L2Expanded distance + if (this->metric_ == faiss::METRIC_L2) { + raft_knn_index.value().allocate_center_norms(raft_handle); + raft::linalg::rowNorm(raft_knn_index.value().center_norms()->data_handle(), + raft_knn_index.value().centers().data_handle(), + raft_knn_index.value().dim(), + (uint32_t)nlist, + raft::linalg::L2Norm, + true, + 
raft_handle.get_stream()); + } + raft_handle.sync_stream(); } size_t RaftIVFFlat::getGpuVectorsEncodingSize_(idx_t numVecs) const { @@ -404,7 +372,6 @@ void RaftIVFFlat::addEncodedVectorsToList_( const void* codes, const idx_t* indices, idx_t numVecs) { - printf("inside addEncodedVectorsToList_ for listId %d\n", listId); auto stream = resources_->getDefaultStreamCurrentDevice(); // This list must already exist @@ -421,91 +388,37 @@ void RaftIVFFlat::addEncodedVectorsToList_( // The GPU might have a different layout of the memory auto gpuListSizeInBytes = getGpuVectorsEncodingSize_(numVecs); auto cpuListSizeInBytes = getCpuVectorsEncodingSize_(numVecs); - - printf("numVecs %d\n", numVecs); - printf("gpuListSizeInBytes %d\n", gpuListSizeInBytes); - printf("cpuListSizeInBytes %d\n", cpuListSizeInBytes); // We only have int32 length representations on the GPU per each // list; the length is in sizeof(char) FAISS_ASSERT(gpuListSizeInBytes <= (size_t)std::numeric_limits::max()); - // Translate the codes as needed to our preferred form -// std::vector codesV(cpuListSizeInBytes); -// std::memcpy(codesV.data(), codes, cpuListSizeInBytes); -// auto translatedCodes = translateCodesToGpu_(std::move(codesV), numVecs); - std::vector interleaved_codes(gpuListSizeInBytes); -printf("dim %d\n", dim_); -printf("veclen %d\n", raft_knn_index.value().veclen()); RaftIVFFlatCodePackerInterleaved packer((size_t)numVecs, (uint32_t)dim_, raft_knn_index.value().veclen()); - printf("Allocated interleaved codes\n"); packer.pack_all(reinterpret_cast(codes), interleaved_codes.data()); - printf("packing done\n"); float* list_data_ptr; const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); - // fetch the list data ptr on host - raft::update_host(&list_data_ptr, raft_knn_index.value().data_ptrs().data_handle()+listId, 1, stream); // Copy the pointer to the first array from device to host + /// fetch the list data ptr on host + raft::update_host(&list_data_ptr, 
raft_knn_index.value().data_ptrs().data_handle()+listId, 1, stream); raft_handle.sync_stream(); - printf("data ptr fetched successfully\n"); raft::update_device(reinterpret_cast(list_data_ptr), interleaved_codes.data(), gpuListSizeInBytes, stream); raft_handle.sync_stream(); - printf("copied to gpu\n"); - // Handle the indices as well + /// Handle the indices as well idx_t* list_indices_ptr; // fetch the list indices ptr on host - raft::update_host(&list_indices_ptr, raft_knn_index.value().inds_ptrs().data_handle()+listId, 1, stream); // Copy the pointer to the first array from device to host + raft::update_host(&list_indices_ptr, raft_knn_index.value().inds_ptrs().data_handle()+listId, 1, stream); raft_handle.sync_stream(); raft::update_device(list_indices_ptr, indices, numVecs, stream); raft_handle.sync_stream(); - printf("Done copying indices\n"); -} -/// Copy all inverted lists from ourselves to a CPU representation -void RaftIVFFlat::copyInvertedListsTo(InvertedLists* ivf) { - printf("Inside RaftIVFFlat copyInvertedListsTo\n"); - - for (idx_t i = 0; i < numLists_; ++i) { - auto listIndices = getListIndices(i); - auto listData = getListVectorData(i, false); - - ivf->add_entries( - i, listIndices.size(), listIndices.data(), listData.data()); - } } -// std::vector RaftIVFFlat::translateCodesToGpu_( -// std::vector codes, -// std::vector block, -// idx_t numVecs) const { -// if (!interleavedLayout_) { -// // same format -// return codes; -// } -// RaftIVFFlatCodePackerInterleaved packer; -// packer::pack_all(codes.data(), block.data()); -// } - -// std::vector RaftIVFFlat::translateCodesFromGpu_( -// std::vector codes, -// idx_t numVecs) const { -// if (!interleavedLayout_) { -// // same format -// return codes; -// } - -// RaftIVFFlatCodePackerFlat packer; -// packer::unpack_all(block.data(), codes.data()); -// } - - - RaftIVFFlatCodePackerInterleaved::RaftIVFFlatCodePackerInterleaved(size_t list_size, uint32_t dim, uint32_t chunk_size) { this->dim = dim; 
this->chunk_size = chunk_size; @@ -517,7 +430,7 @@ RaftIVFFlatCodePackerInterleaved::RaftIVFFlatCodePackerInterleaved(size_t list_s } void RaftIVFFlatCodePackerInterleaved::pack_1(const uint8_t* flat_code, size_t offset, uint8_t* block) const { - printf("packing offset %zu\n", offset); + // printf("packing offset %zu\n", offset); raft::neighbors::ivf_flat::codepacker::pack_1( reinterpret_cast(flat_code), reinterpret_cast(block), diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index ba8638f010..a0f414a773 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -139,12 +139,6 @@ void addTest(faiss::MetricType metricType, bool useFloat16CoarseQuantizer) { faiss::gpu::randVecs(opt.numTrain, opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - printf("original add vectors: ["); - for (int i = 0; i < 50; ++i) { - printf("%f, ", addVecs[i]); - } - printf("]\n"); - faiss::IndexFlatL2 quantizerL2(opt.dim); faiss::IndexFlatIP quantizerIP(opt.dim); faiss::Index* quantizer = metricType == faiss::METRIC_L2 From f148f09a78088a2a26fe788c9c4f26c92d358e9e Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Thu, 3 Aug 2023 17:28:13 -0700 Subject: [PATCH 66/87] Passing copyTo --- build.sh | 1 + faiss/gpu/GpuIndex.cu | 2 ++ faiss/gpu/GpuIndexIVF.cu | 47 ++++++-------------------- faiss/gpu/GpuIndexIVFFlat.cu | 32 +++++++++++++++--- faiss/gpu/impl/RaftIVFFlat.cu | 15 ++++++-- faiss/gpu/impl/RaftIVFFlat.cuh | 8 +++-- faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 32 +++++++++++------- 7 files changed, 78 insertions(+), 59 deletions(-) diff --git a/build.sh b/build.sh index 5a0c3c58da..6a353379f8 100755 --- a/build.sh +++ b/build.sh @@ -16,6 +16,7 @@ fi if [ "$1" == "clean" ]; then rm -rf build + rm -rf .cache exit 0 fi diff --git a/faiss/gpu/GpuIndex.cu b/faiss/gpu/GpuIndex.cu index 89952b1121..53a0179334 100644 --- a/faiss/gpu/GpuIndex.cu +++ b/faiss/gpu/GpuIndex.cu @@ -102,6 +102,8 
@@ size_t GpuIndex::getMinPagingSize() const { void GpuIndex::add(idx_t n, const float* x) { // Pass to add_with_ids + printf("add called with n = %d\n", n); + raft::print_host_vector("x", x, 5, std::cout); add_with_ids(n, x, nullptr); } diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu index 3b21bddefd..159b3730cd 100644 --- a/faiss/gpu/GpuIndexIVF.cu +++ b/faiss/gpu/GpuIndexIVF.cu @@ -5,11 +5,6 @@ * LICENSE file in the root directory of this source tree. */ -#include -#include -#include -#include - #include #include #include @@ -268,6 +263,7 @@ void GpuIndexIVF::addImpl_(idx_t n, const float* x, const idx_t* xids) { // Device is already set in GpuIndex::add FAISS_ASSERT(baseIndex_); FAISS_ASSERT(n > 0); + printf("addVectors called from gpuindexivf"); // Data is already resident on the GPU Tensor data(const_cast(x), {n, this->d}); @@ -445,41 +441,18 @@ void GpuIndexIVF::trainQuantizer_(idx_t n, const float* x) { return; } - printf("Training IVF quantizer on %ld vectors in %dD\n", n, d); - - if (config_.use_raft) { - printf("Using raft to train quantizer for %d vectors\n", n); - const raft::device_resources& raft_handle = - resources_->getRaftHandleCurrentDevice(); - - raft::neighbors::ivf_flat::index_params raft_idx_params; - raft_idx_params.n_lists = nlist; - raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; - raft_idx_params.add_data_on_build = false; - raft_idx_params.kmeans_trainset_fraction = 1.0; - raft_idx_params.kmeans_n_iters = cp.niter; - raft_idx_params.adaptive_centers = !cp.frozen_centroids; - - auto raft_index = raft::neighbors::ivf_flat::build( - raft_handle, raft_idx_params, x, n, (idx_t)d); - - raft_handle.sync_stream(); - - // TODO: Validate this is all we need to do - quantizer->reset(); - quantizer->train(nlist, raft_index.centers().data_handle()); - quantizer->add(nlist, raft_index.centers().data_handle()); - - } else { - // leverage the CPU-side k-means code, which works for the GPU - // flat index as well - 
quantizer->reset(); - Clustering clus(this->d, nlist, this->cp); - clus.verbose = verbose; - clus.train(n, x, *quantizer); + if (this->verbose) { + printf("Training IVF quantizer on %ld vectors in %dD\n", n, d); } + // leverage the CPU-side k-means code, which works for the GPU + // flat index as well + quantizer->reset(); + Clustering clus(this->d, nlist, this->cp); + clus.verbose = verbose; + clus.train(n, x, *quantizer); quantizer->is_trained = true; + FAISS_ASSERT(quantizer->ntotal == nlist); } diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 7d2a94d595..997e4bfa1c 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -18,6 +18,11 @@ #include #include +#include +#include +#include +#include + #include namespace faiss { @@ -85,8 +90,6 @@ GpuIndexIVFFlat::GpuIndexIVFFlat( ivfFlatConfig_.interleavedLayout, ivfFlatConfig_.indicesOptions, config_.memorySpace); - baseIndex_ = std::static_pointer_cast(index_); - updateQuantizer(); } } @@ -235,6 +238,7 @@ void GpuIndexIVFFlat::updateQuantizer() { } void GpuIndexIVFFlat::train(idx_t n, const float* x) { + printf("Inside train"); DeviceScope scope(config_.device); // just in case someone changed our quantizer @@ -250,12 +254,14 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) { // FIXME: GPUize more of this // First, make sure that the data is resident on the CPU, if it is not on // the CPU, as we depend upon parts of the CPU code + if (!config_.use_raft) { auto hostData = toHost( (float*)x, resources_->getDefaultStream(config_.device), {n, this->d}); trainQuantizer_(n, hostData.data()); + } // The quantizer is now trained; construct the IVF index set_index_( @@ -269,11 +275,29 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) { ivfFlatConfig_.interleavedLayout, ivfFlatConfig_.indicesOptions, config_.memorySpace); - - if (reserveMemoryVecs_) { + + if (!config_.use_raft && reserveMemoryVecs_) { index_->reserveMemory(reserveMemoryVecs_); } + if 
(config_.use_raft) { + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + + raft::neighbors::ivf_flat::index_params raft_idx_params; + raft_idx_params.n_lists = nlist; + raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; + raft_idx_params.add_data_on_build = false; + raft_idx_params.kmeans_trainset_fraction = 1.0; + raft_idx_params.kmeans_n_iters = cp.niter; + raft_idx_params.adaptive_centers = !cp.frozen_centroids; + + printf("raft_idx_params.k_means_n_iters %u\n", cp.niter); + + std::dynamic_pointer_cast(index_)->set_index_(std::make_optional>(raft::neighbors::ivf_flat::build( + raft_handle, raft_idx_params, x, n, (idx_t)d))); + } + this->is_trained = true; } diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 3a0bc78aeb..3bab3b67fa 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -130,17 +130,25 @@ idx_t RaftIVFFlat::addVectors( Tensor& vecs, Tensor& indices) { + raft::print_device_vector("vecs", vecs.data(), 5, std::cout); + raft::print_device_vector("indices", indices.data(), indices.getSize(0), std::cout); + auto vecs_view = raft::make_device_matrix_view( vecs.data(), vecs.getSize(0), dim_); auto inds_view = raft::make_device_vector_view( indices.data(), (idx_t)indices.getSize(0)); + + printf("vecs.getSize(0) %d", vecs.getSize(0)); + printf("indices.getSize(0) %d", indices.getSize(0)); + const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); // TODO: We probably don't want to ignore the coarse quantizer here - if (raft_knn_index.has_value()) { + FAISS_ASSERT(raft_knn_index.has_value()); +// cudaMemcpyAsync(raft_knn_index.value().centers().data_handle(), coarseQuantizer.codes.data(), raft_knn_index.value().n_lists() * dim_ * sizeof(float), cudaMemcpyDefault, raft_handle.get_stream()); raft_knn_index.emplace(raft::neighbors::ivf_flat::extend( raft_handle, vecs_view, @@ -149,7 +157,6 @@ idx_t RaftIVFFlat::addVectors( 
inds_view), raft_knn_index.value())); - } return vecs.getSize(0); } @@ -350,6 +357,10 @@ void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { raft_handle.sync_stream(); } +void RaftIVFFlat::set_index_(std::optional> idx) { + raft_knn_index.emplace(std::move(idx.value())); +} + size_t RaftIVFFlat::getGpuVectorsEncodingSize_(idx_t numVecs) const { idx_t bits = 32 /* float */; diff --git a/faiss/gpu/impl/RaftIVFFlat.cuh b/faiss/gpu/impl/RaftIVFFlat.cuh index b85d503da3..30a7378570 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cuh +++ b/faiss/gpu/impl/RaftIVFFlat.cuh @@ -84,6 +84,9 @@ class RaftIVFFlat : public IVFFlat { Tensor& vecs, Tensor& indices) override; + /// Reserve GPU memory in our inverted lists for this number of vectors +// void reserveMemory(idx_t numVecs) override; + /// Clear out all inverted lists, but retain the coarse quantizer /// and the product quantizer info void reset() override; @@ -103,10 +106,9 @@ class RaftIVFFlat : public IVFFlat { /// Copy all inverted lists from a CPU representation to ourselves void copyInvertedListsFrom(const InvertedLists* ivf) override; - - /// Copy all inverted lists from ourselves to a CPU representation - void copyInvertedListsTo(InvertedLists* ivf) override; + void set_index_(std::optional> idx); + protected: /// Adds a set of codes and indices to a list, with the representation /// coming from the CPU equivalent diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index a0f414a773..e84fbd665c 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -232,6 +232,7 @@ void copyToTest(bool useFloat16CoarseQuantizer) { compFloat16 ? 
0.30f : 0.015f); } + void copyFromTest(bool useFloat16CoarseQuantizer) { Options opt; std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); @@ -399,6 +400,7 @@ TEST(TestGpuIndexIVFFlat, Float32_negative) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = opt.device; config.indicesOptions = opt.indicesOpt; + config.use_raft = true; faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); @@ -447,6 +449,7 @@ TEST(TestGpuIndexIVFFlat, QueryNaN) { config.device = opt.device; config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = faiss::gpu::randBool(); + config.use_raft = true; faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); @@ -485,6 +488,7 @@ TEST(TestGpuIndexIVFFlat, AddNaN) { config.device = opt.device; config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = faiss::gpu::randBool(); + config.use_raft = true; faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); @@ -505,19 +509,19 @@ TEST(TestGpuIndexIVFFlat, AddNaN) { // should not crash EXPECT_EQ(gpuIndex.ntotal, 0); - gpuIndex.add(numNans, nans.data()); - - std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); - std::vector distance(opt.numQuery * opt.k, 0); - std::vector indices(opt.numQuery * opt.k, 0); - - // should not crash - gpuIndex.search( - opt.numQuery, - queryVecs.data(), - opt.k, - distance.data(), - indices.data()); + // gpuIndex.add(numNans, nans.data()); + + // std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + // std::vector distance(opt.numQuery * opt.k, 0); + // std::vector indices(opt.numQuery * opt.k, 0); + + // // should not crash + // gpuIndex.search( + // opt.numQuery, + // queryVecs.data(), + // opt.k, + // distance.data(), + // indices.data()); } TEST(TestGpuIndexIVFFlat, UnifiedMemory) { @@ -558,6 +562,7 @@ TEST(TestGpuIndexIVFFlat, UnifiedMemory) { 
faiss::gpu::GpuIndexIVFFlatConfig config; config.device = device; config.memorySpace = faiss::gpu::MemorySpace::Unified; + config.use_raft = true; faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, dim, numCentroids, faiss::METRIC_L2, config); @@ -615,6 +620,7 @@ TEST(TestGpuIndexIVFFlat, LongIVFList) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = device; + config.use_raft = true; faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, dim, numCentroids, faiss::METRIC_L2, config); From 575650864a782368a1f54ce10eb7cfb2c2b53d9b Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Tue, 8 Aug 2023 16:37:50 -0700 Subject: [PATCH 67/87] All tests passing --- faiss/gpu/GpuIndexIVF.cu | 1 - faiss/gpu/GpuIndexIVFFlat.cu | 1 - faiss/gpu/impl/IVFFlat.cuh | 2 +- faiss/gpu/impl/RaftIVFFlat.cu | 462 ++++++++++++++++--------- faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 26 +- 5 files changed, 310 insertions(+), 182 deletions(-) diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu index 159b3730cd..f2ed323605 100644 --- a/faiss/gpu/GpuIndexIVF.cu +++ b/faiss/gpu/GpuIndexIVF.cu @@ -263,7 +263,6 @@ void GpuIndexIVF::addImpl_(idx_t n, const float* x, const idx_t* xids) { // Device is already set in GpuIndex::add FAISS_ASSERT(baseIndex_); FAISS_ASSERT(n > 0); - printf("addVectors called from gpuindexivf"); // Data is already resident on the GPU Tensor data(const_cast(x), {n, this->d}); diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 997e4bfa1c..3458177dd3 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -76,7 +76,6 @@ GpuIndexIVFFlat::GpuIndexIVFFlat( reserveMemoryVecs_(0) { // We could have been passed an already trained coarse quantizer. 
There is // no other quantizer that we need to train, so this is sufficient - if (this->is_trained) { FAISS_ASSERT(this->quantizer); set_index_( diff --git a/faiss/gpu/impl/IVFFlat.cuh b/faiss/gpu/impl/IVFFlat.cuh index 1c3048490e..726d62c1da 100644 --- a/faiss/gpu/impl/IVFFlat.cuh +++ b/faiss/gpu/impl/IVFFlat.cuh @@ -7,7 +7,7 @@ #pragma once -#ifdef FAISS_ENABLE_RAFT +#if defined USE_NVIDIA_RAFT #include #include #endif diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 3bab3b67fa..9a08fb7d51 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -21,6 +21,8 @@ */ #include #include +#include +#include #include #include @@ -45,7 +47,6 @@ #include #include - #include #include @@ -87,12 +88,11 @@ void RaftIVFFlat::search( int k, Tensor& outDistances, Tensor& outIndices) { - // TODO: We probably don't want to ignore the coarse quantizer here... - std::uint32_t n = queries.getSize(0); - std::uint32_t cols = queries.getSize(1); - std::uint32_t k_ = k; + uint32_t n = queries.getSize(0); + uint32_t cols = queries.getSize(1); + uint32_t k_ = k; // Device is already set in GpuIndex::search FAISS_ASSERT(raft_knn_index.has_value()); @@ -104,19 +104,66 @@ void RaftIVFFlat::search( raft::neighbors::ivf_flat::search_params pams; pams.n_probes = nprobe; - auto queries_view = - raft::make_device_matrix_view(queries.data(), n, cols); + uint32_t n_rows = n; + + auto nan_flag = raft::make_device_vector(raft_handle, n_rows); + + thrust::fill_n( + raft_handle.get_thrust_policy(), + nan_flag.data_handle(), + n_rows, + true); + raft::linalg::map_offset( + raft_handle, + nan_flag.view(), + [queries = queries.data(), dim_ = this->dim_] __device__(idx_t i) { + for (idx_t col = 0; col < dim_; col++) { + if (!isfinite(queries[i * dim_ + col])) { + return false; + } + } + return true; + }); + + // TODO: We probably don't want to ignore the coarse quantizer here + + auto queries_view = raft::make_device_matrix_view( + queries.data(), 
n_rows, cols); auto out_inds_view = - raft::make_device_matrix_view(outIndices.data(), n, k_); - auto out_dists_view = - raft::make_device_matrix_view(outDistances.data(), n, k_); + raft::make_device_matrix_view(outIndices.data(), n_rows, k_); + auto out_dists_view = raft::make_device_matrix_view( + outDistances.data(), n_rows, k_); raft::neighbors::ivf_flat::search( raft_handle, - pams, + pams, raft_knn_index.value(), queries_view, out_inds_view, out_dists_view); + float max_val = std::numeric_limits::max(); + raft::linalg::map_offset( + raft_handle, + raft::make_device_vector_view(outIndices.data(), n_rows * k_), + [nan_flag = nan_flag.data_handle(), + out_inds = outIndices.data(), + k_] __device__(uint32_t i) { + uint32_t row = i / k_; + if (!nan_flag[row]) + return idx_t(-1); + return out_inds[i]; + }); + raft::linalg::map_offset( + raft_handle, + raft::make_device_vector_view(outDistances.data(), n_rows * k_), + [nan_flag = nan_flag.data_handle(), + out_dists = outDistances.data(), + max_val, + k_] __device__(uint32_t i) { + uint32_t row = i / k_; + if (!nan_flag[row]) + return max_val; + return out_dists[i]; + }); raft_handle.sync_stream(); } @@ -129,35 +176,78 @@ idx_t RaftIVFFlat::addVectors( Index* coarseQuantizer, Tensor& vecs, Tensor& indices) { + idx_t n_rows = vecs.getSize(0); - raft::print_device_vector("vecs", vecs.data(), 5, std::cout); - raft::print_device_vector("indices", indices.data(), indices.getSize(0), std::cout); - - auto vecs_view = raft::make_device_matrix_view( - vecs.data(), vecs.getSize(0), dim_); - auto inds_view = raft::make_device_vector_view( - indices.data(), (idx_t)indices.getSize(0)); + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + /// Remove NaN values + auto nan_flag = raft::make_device_vector(raft_handle, n_rows); - printf("vecs.getSize(0) %d", vecs.getSize(0)); - printf("indices.getSize(0) %d", indices.getSize(0)); + thrust::fill_n( + raft_handle.get_thrust_policy(), + 
nan_flag.data_handle(), + n_rows, + true); + raft::linalg::map_offset( + raft_handle, + nan_flag.view(), + [vecs = vecs.data(), dim_ = this->dim_] __device__(idx_t i) { + for (idx_t col = 0; col < dim_; col++) { + if (!isfinite(vecs[i * dim_ + col])) { + return false; + } + } + return true; + }); + raft_handle.sync_stream(); + idx_t n_rows_valid = thrust::reduce( + raft_handle.get_thrust_policy(), + nan_flag.data_handle(), + nan_flag.data_handle() + n_rows, + 0); + auto gather_indices = + raft::make_device_vector(raft_handle, n_rows_valid); + auto count = thrust::make_counting_iterator(0); + thrust::copy_if( + raft_handle.get_thrust_policy(), + count, + count + n_rows, + gather_indices.data_handle(), + [nan_flag = nan_flag.data_handle()] __device__(auto i) { + return nan_flag[i]; + }); + if (n_rows_valid < n_rows) { + raft::matrix::gather( + raft_handle, + raft::make_device_matrix_view( + vecs.data(), n_rows, dim_), + raft::make_const_mdspan(gather_indices.view()), + (idx_t)16); + } + auto valid_indices = + raft::make_device_vector(raft_handle, n_rows); - const raft::device_resources& raft_handle = - resources_->getRaftHandleCurrentDevice(); + raft::matrix::gather( + raft_handle, + raft::make_device_matrix_view( + indices.data(), n_rows, (idx_t)1), + raft::make_const_mdspan(gather_indices.view()), + raft::make_device_matrix_view( + valid_indices.data_handle(), n_rows_valid, (idx_t)1)); - // TODO: We probably don't want to ignore the coarse quantizer here + /// TODO: We probably don't want to ignore the coarse quantizer here FAISS_ASSERT(raft_knn_index.has_value()); -// cudaMemcpyAsync(raft_knn_index.value().centers().data_handle(), coarseQuantizer.codes.data(), raft_knn_index.value().n_lists() * dim_ * sizeof(float), cudaMemcpyDefault, raft_handle.get_stream()); - raft_knn_index.emplace(raft::neighbors::ivf_flat::extend( - raft_handle, - vecs_view, - std::make_optional< - raft::device_vector_view>( - inds_view), - raft_knn_index.value())); + 
raft_knn_index.emplace(raft::neighbors::ivf_flat::extend( + raft_handle, + raft::make_device_matrix_view( + vecs.data(), n_rows_valid, dim_), + std::make_optional>( + valid_indices.view()), + raft_knn_index.value())); - return vecs.getSize(0); + return n_rows_valid; } void RaftIVFFlat::reset() { @@ -165,7 +255,6 @@ void RaftIVFFlat::reset() { } idx_t RaftIVFFlat::getListLength(idx_t listId) const { - FAISS_ASSERT(raft_knn_index.has_value()); const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); @@ -182,7 +271,6 @@ idx_t RaftIVFFlat::getListLength(idx_t listId) const { /// Return the list indices of a particular list back to the CPU std::vector RaftIVFFlat::getListIndices(idx_t listId) const { - FAISS_ASSERT(raft_knn_index.has_value()); const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); @@ -195,7 +283,11 @@ std::vector RaftIVFFlat::getListIndices(idx_t listId) const { idx_t* list_indices_ptr; // fetch the list indices ptr on host - raft::update_host(&list_indices_ptr, raft_knn_index.value().inds_ptrs().data_handle()+listId, 1, stream); + raft::update_host( + &list_indices_ptr, + raft_knn_index.value().inds_ptrs().data_handle() + listId, + 1, + stream); raft_handle.sync_stream(); raft::update_host(vec.data(), list_indices_ptr, listSize, stream); @@ -204,12 +296,13 @@ std::vector RaftIVFFlat::getListIndices(idx_t listId) const { } /// Return the encoded vectors of a particular list back to the CPU -std::vector RaftIVFFlat::getListVectorData(idx_t listId, bool gpuFormat) - const { - +std::vector RaftIVFFlat::getListVectorData( + idx_t listId, + bool gpuFormat) const { FAISS_ASSERT(raft_knn_index.has_value()); - const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); auto stream = raft_handle.get_stream(); idx_t listSize = getListLength(listId); @@ -224,14 +317,23 @@ std::vector 
RaftIVFFlat::getListVectorData(idx_t listId, bool gpuFormat float* list_data_ptr; - // fetch the list data ptr on host - raft::update_host(&list_data_ptr, raft_knn_index.value().data_ptrs().data_handle()+listId, 1, stream); + // fetch the list data ptr on host + raft::update_host( + &list_data_ptr, + raft_knn_index.value().data_ptrs().data_handle() + listId, + 1, + stream); raft_handle.sync_stream(); - raft::update_host(interleaved_codes.data(), reinterpret_cast(list_data_ptr), gpuListSizeInBytes, stream); + raft::update_host( + interleaved_codes.data(), + reinterpret_cast(list_data_ptr), + gpuListSizeInBytes, + stream); raft_handle.sync_stream(); - RaftIVFFlatCodePackerInterleaved packer((size_t)listSize, dim_, raft_knn_index.value().veclen()); + RaftIVFFlatCodePackerInterleaved packer( + (size_t)listSize, dim_, raft_knn_index.value().veclen()); packer.unpack_all(interleaved_codes.data(), flat_codes.data()); return flat_codes; } @@ -253,7 +355,8 @@ void RaftIVFFlat::searchPreassigned( void RaftIVFFlat::updateQuantizer(Index* quantizer) { idx_t quantizer_ntotal = quantizer->ntotal; - const raft::device_resources& handle = resources_->getRaftHandleCurrentDevice(); + const raft::device_resources& handle = + resources_->getRaftHandleCurrentDevice(); auto stream = handle.get_stream(); auto total_elems = size_t(quantizer_ntotal) * size_t(quantizer->d); @@ -276,10 +379,7 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { FAISS_THROW_MSG("Metric is not supported."); } - raft_knn_index.emplace( - handle, - pams, - (uint32_t)this->dim_); + raft_knn_index.emplace(handle, pams, (uint32_t)this->dim_); /// Copy (reconstructed) centroids over, rather than re-training std::vector buf_host(total_elems); @@ -292,171 +392,201 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { stream); } -// -// void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { - size_t nlist = ivf ? ivf->nlist : 0; - size_t ntotal = ivf ? 
ivf->compute_ntotal() : 0; - - raft::device_resources &raft_handle = resources_->getRaftHandleCurrentDevice(); + size_t nlist = ivf ? ivf->nlist : 0; + size_t ntotal = ivf ? ivf->compute_ntotal() : 0; - std::vector list_sizes_(nlist); - std::vector indices_(ntotal); + raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); - // the index must already exist - FAISS_ASSERT(raft_knn_index.has_value()); + std::vector list_sizes_(nlist); + std::vector indices_(ntotal); - auto& raft_lists = raft_knn_index.value().lists(); + // the index must already exist + FAISS_ASSERT(raft_knn_index.has_value()); - // conservative memory alloc for cloning cpu inverted lists - raft::neighbors::ivf_flat::list_spec raft_list_spec{static_cast(dim_), true}; + auto& raft_lists = raft_knn_index.value().lists(); - for (size_t i = 0; i < nlist; ++i) { + // conservative memory alloc for cloning cpu inverted lists + raft::neighbors::ivf_flat::list_spec raft_list_spec{ + static_cast(dim_), true}; + for (size_t i = 0; i < nlist; ++i) { size_t listSize = ivf->list_size(i); // GPU index can only support max int entries per list - FAISS_THROW_IF_NOT_FMT( - listSize <= (size_t)std::numeric_limits::max(), - "GPU inverted list can only support " - "%zu entries; %zu found", - (size_t)std::numeric_limits::max(), - listSize); - + FAISS_THROW_IF_NOT_FMT( + listSize <= (size_t)std::numeric_limits::max(), + "GPU inverted list can only support " + "%zu entries; %zu found", + (size_t)std::numeric_limits::max(), + listSize); + // store the list size list_sizes_[i] = static_cast(listSize); - raft::neighbors::ivf::resize_list(raft_handle, - raft_lists[i], - raft_list_spec, - (uint32_t)listSize, - (uint32_t)0); - } - - // Update the pointers and the sizes - raft_knn_index.value().recompute_internal_state(raft_handle); - - for (size_t i = 0; i < nlist; ++i) { - size_t listSize = ivf->list_size(i); - addEncodedVectorsToList_(i, ivf->get_codes(i), ivf->get_ids(i), listSize); - } - - 
raft::update_device(raft_knn_index.value().list_sizes().data_handle(), list_sizes_.data(), nlist, raft_handle.get_stream()); - - // Precompute the centers vector norms for L2Expanded distance - if (this->metric_ == faiss::METRIC_L2) { - raft_knn_index.value().allocate_center_norms(raft_handle); - raft::linalg::rowNorm(raft_knn_index.value().center_norms()->data_handle(), - raft_knn_index.value().centers().data_handle(), - raft_knn_index.value().dim(), - (uint32_t)nlist, - raft::linalg::L2Norm, - true, - raft_handle.get_stream()); - } - raft_handle.sync_stream(); + raft::neighbors::ivf::resize_list( + raft_handle, + raft_lists[i], + raft_list_spec, + (uint32_t)listSize, + (uint32_t)0); + } + + // Update the pointers and the sizes + raft_knn_index.value().recompute_internal_state(raft_handle); + + for (size_t i = 0; i < nlist; ++i) { + size_t listSize = ivf->list_size(i); + addEncodedVectorsToList_( + i, ivf->get_codes(i), ivf->get_ids(i), listSize); + } + + raft::update_device( + raft_knn_index.value().list_sizes().data_handle(), + list_sizes_.data(), + nlist, + raft_handle.get_stream()); + + // Precompute the centers vector norms for L2Expanded distance + if (this->metric_ == faiss::METRIC_L2) { + raft_knn_index.value().allocate_center_norms(raft_handle); + raft::linalg::rowNorm( + raft_knn_index.value().center_norms()->data_handle(), + raft_knn_index.value().centers().data_handle(), + raft_knn_index.value().dim(), + (uint32_t)nlist, + raft::linalg::L2Norm, + true, + raft_handle.get_stream()); + } + raft_handle.sync_stream(); } -void RaftIVFFlat::set_index_(std::optional> idx) { - raft_knn_index.emplace(std::move(idx.value())); +void RaftIVFFlat::set_index_( + std::optional> idx) { + raft_knn_index.emplace(std::move(idx.value())); } size_t RaftIVFFlat::getGpuVectorsEncodingSize_(idx_t numVecs) const { - idx_t bits = 32 /* float */; + idx_t bits = 32 /* float */; - // bytes to encode a block of 32 vectors (single dimension) - idx_t bytesPerDimBlock = bits * 32 / 8; 
// = 128 + // bytes to encode a block of 32 vectors (single dimension) + idx_t bytesPerDimBlock = bits * 32 / 8; // = 128 - // bytes to fully encode 32 vectors - idx_t bytesPerBlock = bytesPerDimBlock * dim_; + // bytes to fully encode 32 vectors + idx_t bytesPerBlock = bytesPerDimBlock * dim_; - // number of blocks of 32 vectors we have - idx_t numBlocks = utils::divUp(numVecs, raft::neighbors::ivf_flat::kIndexGroupSize); + // number of blocks of 32 vectors we have + idx_t numBlocks = + utils::divUp(numVecs, raft::neighbors::ivf_flat::kIndexGroupSize); - // total size to encode numVecs - return bytesPerBlock * numBlocks; + // total size to encode numVecs + return bytesPerBlock * numBlocks; } - void RaftIVFFlat::addEncodedVectorsToList_( - idx_t listId, - const void* codes, - const idx_t* indices, - idx_t numVecs) { - auto stream = resources_->getDefaultStreamCurrentDevice(); - - // This list must already exist - FAISS_ASSERT(raft_knn_index.has_value()); - - // This list must currently be empty - FAISS_ASSERT(getListLength(listId) == 0); - - // If there's nothing to add, then there's nothing we have to do - if (numVecs == 0) { - return; - } - - // The GPU might have a different layout of the memory - auto gpuListSizeInBytes = getGpuVectorsEncodingSize_(numVecs); - auto cpuListSizeInBytes = getCpuVectorsEncodingSize_(numVecs); - - // We only have int32 length representations on the GPU per each - // list; the length is in sizeof(char) - FAISS_ASSERT(gpuListSizeInBytes <= - (size_t)std::numeric_limits::max()); - - std::vector interleaved_codes(gpuListSizeInBytes); - RaftIVFFlatCodePackerInterleaved packer((size_t)numVecs, (uint32_t)dim_, raft_knn_index.value().veclen()); - - packer.pack_all(reinterpret_cast(codes), interleaved_codes.data()); - - float* list_data_ptr; - const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); - - /// fetch the list data ptr on host - raft::update_host(&list_data_ptr, 
raft_knn_index.value().data_ptrs().data_handle()+listId, 1, stream); + idx_t listId, + const void* codes, + const idx_t* indices, + idx_t numVecs) { + auto stream = resources_->getDefaultStreamCurrentDevice(); + + // This list must already exist + FAISS_ASSERT(raft_knn_index.has_value()); + + // This list must currently be empty + FAISS_ASSERT(getListLength(listId) == 0); + + // If there's nothing to add, then there's nothing we have to do + if (numVecs == 0) { + return; + } + + // The GPU might have a different layout of the memory + auto gpuListSizeInBytes = getGpuVectorsEncodingSize_(numVecs); + auto cpuListSizeInBytes = getCpuVectorsEncodingSize_(numVecs); + + // We only have int32 length representations on the GPU per each + // list; the length is in sizeof(char) + FAISS_ASSERT(gpuListSizeInBytes <= (size_t)std::numeric_limits::max()); + + std::vector interleaved_codes(gpuListSizeInBytes); + RaftIVFFlatCodePackerInterleaved packer( + (size_t)numVecs, (uint32_t)dim_, raft_knn_index.value().veclen()); + + packer.pack_all( + reinterpret_cast(codes), interleaved_codes.data()); + + float* list_data_ptr; + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + + /// fetch the list data ptr on host + raft::update_host( + &list_data_ptr, + raft_knn_index.value().data_ptrs().data_handle() + listId, + 1, + stream); + raft_handle.sync_stream(); + + raft::update_device( + reinterpret_cast(list_data_ptr), + interleaved_codes.data(), + gpuListSizeInBytes, + stream); raft_handle.sync_stream(); - - raft::update_device(reinterpret_cast(list_data_ptr), interleaved_codes.data(), gpuListSizeInBytes, stream); - raft_handle.sync_stream(); /// Handle the indices as well idx_t* list_indices_ptr; // fetch the list indices ptr on host - raft::update_host(&list_indices_ptr, raft_knn_index.value().inds_ptrs().data_handle()+listId, 1, stream); - raft_handle.sync_stream(); + raft::update_host( + &list_indices_ptr, + 
raft_knn_index.value().inds_ptrs().data_handle() + listId, + 1, + stream); + raft_handle.sync_stream(); raft::update_device(list_indices_ptr, indices, numVecs, stream); raft_handle.sync_stream(); } -RaftIVFFlatCodePackerInterleaved::RaftIVFFlatCodePackerInterleaved(size_t list_size, uint32_t dim, uint32_t chunk_size) { +RaftIVFFlatCodePackerInterleaved::RaftIVFFlatCodePackerInterleaved( + size_t list_size, + uint32_t dim, + uint32_t chunk_size) { this->dim = dim; this->chunk_size = chunk_size; // NB: dim should be divisible by the number of 4 byte records in one chunk FAISS_ASSERT(dim % chunk_size == 0); nvec = list_size; code_size = dim * 4; - block_size = utils::roundUp(nvec, raft::neighbors::ivf_flat::kIndexGroupSize); + block_size = + utils::roundUp(nvec, raft::neighbors::ivf_flat::kIndexGroupSize); } -void RaftIVFFlatCodePackerInterleaved::pack_1(const uint8_t* flat_code, size_t offset, uint8_t* block) const { - // printf("packing offset %zu\n", offset); +void RaftIVFFlatCodePackerInterleaved::pack_1( + const uint8_t* flat_code, + size_t offset, + uint8_t* block) const { raft::neighbors::ivf_flat::codepacker::pack_1( - reinterpret_cast(flat_code), - reinterpret_cast(block), - dim, - chunk_size, - static_cast(offset)); + reinterpret_cast(flat_code), + reinterpret_cast(block), + dim, + chunk_size, + static_cast(offset)); } -void RaftIVFFlatCodePackerInterleaved::unpack_1(const uint8_t* block, size_t offset, uint8_t* flat_code) const { +void RaftIVFFlatCodePackerInterleaved::unpack_1( + const uint8_t* block, + size_t offset, + uint8_t* flat_code) const { raft::neighbors::ivf_flat::codepacker::unpack_1( - reinterpret_cast(block), - reinterpret_cast(flat_code), - dim, - chunk_size, - static_cast(offset)); + reinterpret_cast(block), + reinterpret_cast(flat_code), + dim, + chunk_size, + static_cast(offset)); } } // namespace gpu diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index e84fbd665c..1a79207a58 100644 --- 
a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -509,19 +509,19 @@ TEST(TestGpuIndexIVFFlat, AddNaN) { // should not crash EXPECT_EQ(gpuIndex.ntotal, 0); - // gpuIndex.add(numNans, nans.data()); - - // std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); - // std::vector distance(opt.numQuery * opt.k, 0); - // std::vector indices(opt.numQuery * opt.k, 0); - - // // should not crash - // gpuIndex.search( - // opt.numQuery, - // queryVecs.data(), - // opt.k, - // distance.data(), - // indices.data()); + gpuIndex.add(numNans, nans.data()); + + std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + std::vector distance(opt.numQuery * opt.k, 0); + std::vector indices(opt.numQuery * opt.k, 0); + + // should not crash + gpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + distance.data(), + indices.data()); } TEST(TestGpuIndexIVFFlat, UnifiedMemory) { From 82e979160ccefbebc9e53ad29ee06ad6cd99f3b6 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Tue, 8 Aug 2023 16:53:49 -0700 Subject: [PATCH 68/87] cleanup --- faiss/gpu/GpuIndex.cu | 2 - faiss/gpu/impl/IVFFlat.cu | 4 - faiss/gpu/test/TestGpuIndexFlat.cpp | 171 ++++++++++++------------- faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 1 - 4 files changed, 85 insertions(+), 93 deletions(-) diff --git a/faiss/gpu/GpuIndex.cu b/faiss/gpu/GpuIndex.cu index 53a0179334..89952b1121 100644 --- a/faiss/gpu/GpuIndex.cu +++ b/faiss/gpu/GpuIndex.cu @@ -102,8 +102,6 @@ size_t GpuIndex::getMinPagingSize() const { void GpuIndex::add(idx_t n, const float* x) { // Pass to add_with_ids - printf("add called with n = %d\n", n); - raft::print_host_vector("x", x, 5, std::cout); add_with_ids(n, x, nullptr); } diff --git a/faiss/gpu/impl/IVFFlat.cu b/faiss/gpu/impl/IVFFlat.cu index 9fb5603a73..09ffcef2ac 100644 --- a/faiss/gpu/impl/IVFFlat.cu +++ b/faiss/gpu/impl/IVFFlat.cu @@ -92,10 +92,6 @@ std::vector IVFFlat::translateCodesToGpu_( } bool sc = scalarQ_ ? 
true : false; - int bitsPerCode = scalarQ_ ? scalarQ_->bits : 32; - std::cout << "dim_=" << dim_ << ", scalarQ_=" << sc - << ", bitsPerCode=" << bitsPerCode - << ", interleavedLayout_=" << interleavedLayout_ << std::endl; auto up = unpackNonInterleaved(std::move(codes), numVecs, dim_, bitsPerCode); diff --git a/faiss/gpu/test/TestGpuIndexFlat.cpp b/faiss/gpu/test/TestGpuIndexFlat.cpp index fd63af0589..2ab616caf4 100644 --- a/faiss/gpu/test/TestGpuIndexFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexFlat.cpp @@ -663,93 +663,92 @@ TEST(TestRaftGpuIndexFlat, Reconstruct) { #endif void testSearchAndReconstruct(bool use_raft) { - // Construct on a random device to test multi-device, if we have - // multiple devices - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); - - size_t dim = 32; - size_t nb = 5000; - size_t nq = 10; - int k = 10; - - auto xb = faiss::gpu::randVecs(nb, dim); - auto xq = faiss::gpu::randVecs(nq, dim); - - faiss::IndexFlatL2 cpuIndex(dim); - - faiss::gpu::GpuIndexFlatConfig config; - config.device = device; - config.use_raft = use_raft; - faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config); - - cpuIndex.add(nb, xb.data()); - gpuIndex.add(nb, xb.data()); - - std::vector refDistance(nq * k, 0); - std::vector refIndices(nq * k, -1); - std::vector refReconstruct(nq * k * dim, 0); - cpuIndex.search_and_reconstruct( - nq, - xq.data(), - k, - refDistance.data(), - refIndices.data(), - refReconstruct.data()); - - std::vector testDistance(nq * k, 0); - std::vector testIndices(nq * k, -1); - std::vector testReconstruct(nq * k * dim, 0); - gpuIndex.search_and_reconstruct( - nq, - xq.data(), - k, - testDistance.data(), - testIndices.data(), - testReconstruct.data()); - - // This handles the search results - faiss::gpu::compareLists( - refDistance.data(), - refIndices.data(), - testDistance.data(), - testIndices.data(), - nq, - k, - "SearchAndReconstruct", - true, - false, - true, 
- kF32MaxRelErr, - 0.1f, - 0.015f); - - // As the search results may be slightly different (though compareLists - // above will ensure a decent number of matches), reconstruction should be - // the same for the vectors that do match - for (int i = 0; i < nq; ++i) { - std::unordered_map refLocation; - - for (int j = 0; j < k; ++j) { - refLocation.insert(std::make_pair(refIndices[i * k + j], j)); - } - - for (int j = 0; j < k; ++j) { - auto idx = testIndices[i * k + j]; - auto it = refLocation.find(idx); - if (it != refLocation.end()) { - for (int d = 0; d < dim; ++d) { - EXPECT_EQ( - refReconstruct[(i * k + it->second) * dim + d], - testReconstruct[(i * k + j) * dim + d]); - } - } - } - } + // Construct on a random device to test multi-device, if we have + // multiple devices + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + size_t dim = 32; + size_t nb = 5000; + size_t nq = 10; + int k = 10; + + auto xb = faiss::gpu::randVecs(nb, dim); + auto xq = faiss::gpu::randVecs(nq, dim); + + faiss::IndexFlatL2 cpuIndex(dim); + + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.use_raft = use_raft; + faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config); + + cpuIndex.add(nb, xb.data()); + gpuIndex.add(nb, xb.data()); + + std::vector refDistance(nq * k, 0); + std::vector refIndices(nq * k, -1); + std::vector refReconstruct(nq * k * dim, 0); + cpuIndex.search_and_reconstruct( + nq, + xq.data(), + k, + refDistance.data(), + refIndices.data(), + refReconstruct.data()); + + std::vector testDistance(nq * k, 0); + std::vector testIndices(nq * k, -1); + std::vector testReconstruct(nq * k * dim, 0); + gpuIndex.search_and_reconstruct( + nq, + xq.data(), + k, + testDistance.data(), + testIndices.data(), + testReconstruct.data()); + + // This handles the search results + faiss::gpu::compareLists( + refDistance.data(), + refIndices.data(), + testDistance.data(), + 
testIndices.data(), + nq, + k, + "SearchAndReconstruct", + true, + false, + true, + kF32MaxRelErr, + 0.1f, + 0.015f); + + // As the search results may be slightly different (though compareLists + // above will ensure a decent number of matches), reconstruction should be + // the same for the vectors that do match + for (int i = 0; i < nq; ++i) { + std::unordered_map refLocation; + + for (int j = 0; j < k; ++j) { + refLocation.insert(std::make_pair(refIndices[i * k + j], j)); + } + + for (int j = 0; j < k; ++j) { + auto idx = testIndices[i * k + j]; + auto it = refLocation.find(idx); + if (it != refLocation.end()) { + for (int d = 0; d < dim; ++d) { + EXPECT_EQ( + refReconstruct[(i * k + it->second) * dim + d], + testReconstruct[(i * k + j) * dim + d]); + } + } + } + } } - TEST(TestGpuIndexFlat, SearchAndReconstruct) { testSearchAndReconstruct(false); } diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index 1a79207a58..2f2fd87cd1 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -232,7 +232,6 @@ void copyToTest(bool useFloat16CoarseQuantizer) { compFloat16 ? 
0.30f : 0.015f); } - void copyFromTest(bool useFloat16CoarseQuantizer) { Options opt; std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); From 8486b9b1d45cd1920e1e15d560fd7d60ac8fe3c3 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Tue, 8 Aug 2023 17:06:29 -0700 Subject: [PATCH 69/87] cleanup --- faiss/gpu/GpuIndexIVFFlat.cu | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 3458177dd3..3a67d2240c 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -11,13 +11,13 @@ #include #include #include -#include #include #include #include #include #include +#include #include #include #include @@ -107,7 +107,6 @@ void GpuIndexIVFFlat::set_index_( IndicesOptions indicesOptions, MemorySpace space) { if (config_.use_raft) { - printf("Setting RaftIVFFlat index\n"); index_.reset(new RaftIVFFlat( resources, dim, @@ -237,7 +236,6 @@ void GpuIndexIVFFlat::updateQuantizer() { } void GpuIndexIVFFlat::train(idx_t n, const float* x) { - printf("Inside train"); DeviceScope scope(config_.device); // just in case someone changed our quantizer @@ -254,12 +252,12 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) { // First, make sure that the data is resident on the CPU, if it is not on // the CPU, as we depend upon parts of the CPU code if (!config_.use_raft) { - auto hostData = toHost( - (float*)x, - resources_->getDefaultStream(config_.device), - {n, this->d}); + auto hostData = toHost( + (float*)x, + resources_->getDefaultStream(config_.device), + {n, this->d}); - trainQuantizer_(n, hostData.data()); + trainQuantizer_(n, hostData.data()); } // The quantizer is now trained; construct the IVF index @@ -274,7 +272,7 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) { ivfFlatConfig_.interleavedLayout, ivfFlatConfig_.indicesOptions, config_.memorySpace); - + if (!config_.use_raft && reserveMemoryVecs_) { 
index_->reserveMemory(reserveMemoryVecs_); } @@ -291,10 +289,11 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) { raft_idx_params.kmeans_n_iters = cp.niter; raft_idx_params.adaptive_centers = !cp.frozen_centroids; - printf("raft_idx_params.k_means_n_iters %u\n", cp.niter); - - std::dynamic_pointer_cast(index_)->set_index_(std::make_optional>(raft::neighbors::ivf_flat::build( - raft_handle, raft_idx_params, x, n, (idx_t)d))); + std::dynamic_pointer_cast(index_)->set_index_( + std::make_optional< + raft::neighbors::ivf_flat::index>( + raft::neighbors::ivf_flat::build( + raft_handle, raft_idx_params, x, n, (idx_t)d))); } this->is_trained = true; From 38215bc9cbe166337367ffddf51c997260f42082 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Tue, 8 Aug 2023 17:15:03 -0700 Subject: [PATCH 70/87] cleanup --- faiss/gpu/impl/IVFBase.cu | 1 - faiss/gpu/impl/IVFFlat.cu | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/faiss/gpu/impl/IVFBase.cu b/faiss/gpu/impl/IVFBase.cu index 1e2f414fc0..890d489440 100644 --- a/faiss/gpu/impl/IVFBase.cu +++ b/faiss/gpu/impl/IVFBase.cu @@ -323,7 +323,6 @@ std::vector IVFBase::getListVectorData(idx_t listId, bool gpuFormat) } void IVFBase::copyInvertedListsFrom(const InvertedLists* ivf) { - printf("inside ivf-flat's copyInvertedListsFrom\n"); idx_t nlist = ivf ? 
ivf->nlist : 0; for (idx_t i = 0; i < nlist; ++i) { addEncodedVectorsToList_( diff --git a/faiss/gpu/impl/IVFFlat.cu b/faiss/gpu/impl/IVFFlat.cu index 09ffcef2ac..376b98ef06 100644 --- a/faiss/gpu/impl/IVFFlat.cu +++ b/faiss/gpu/impl/IVFFlat.cu @@ -126,6 +126,7 @@ void IVFFlat::appendVectors_( // // Append the new encodings // + // Append indices to the IVF lists runIVFIndicesAppend( listIds, From ac678974a7059da7f7cdb6252c43507c747ea48b Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Tue, 8 Aug 2023 17:19:39 -0700 Subject: [PATCH 71/87] cleanup --- faiss/gpu/impl/IVFFlat.cu | 8 -------- 1 file changed, 8 deletions(-) diff --git a/faiss/gpu/impl/IVFFlat.cu b/faiss/gpu/impl/IVFFlat.cu index 376b98ef06..ac6f155aeb 100644 --- a/faiss/gpu/impl/IVFFlat.cu +++ b/faiss/gpu/impl/IVFFlat.cu @@ -91,8 +91,6 @@ std::vector IVFFlat::translateCodesToGpu_( return codes; } - bool sc = scalarQ_ ? true : false; - auto up = unpackNonInterleaved(std::move(codes), numVecs, dim_, bitsPerCode); return packInterleaved(std::move(up), numVecs, dim_, bitsPerCode); @@ -286,15 +284,9 @@ void IVFFlat::searchPreassigned( void IVFFlat::searchImpl_( Tensor& queries, - /** - * - */ Tensor& coarseDistances, Tensor& coarseIndices, - /** - * This is raft::neighbors::ivf_flat::index::centers_ - */ Tensor& ivfCentroids, int k, Tensor& outDistances, From 94817aa6e2f1382ff674a45b784f91f3764f6fd4 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Tue, 8 Aug 2023 17:23:57 -0700 Subject: [PATCH 72/87] cleanup --- faiss/gpu/impl/IVFFlat.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faiss/gpu/impl/IVFFlat.cu b/faiss/gpu/impl/IVFFlat.cu index ac6f155aeb..ac06fd0156 100644 --- a/faiss/gpu/impl/IVFFlat.cu +++ b/faiss/gpu/impl/IVFFlat.cu @@ -58,7 +58,7 @@ size_t IVFFlat::getGpuVectorsEncodingSize_(idx_t numVecs) const { idx_t bits = scalarQ_ ? 
scalarQ_->bits : 32 /* float */; // bytes to encode a block of 32 vectors (single dimension) - idx_t bytesPerDimBlock = bits * 32 / 8; // = 128 if bits == 32 + idx_t bytesPerDimBlock = bits * 32 / 8; // bytes to fully encode 32 vectors idx_t bytesPerBlock = bytesPerDimBlock * dim_; From 91b1e32e64c62cecde88db5208cf6c1e6841a1ba Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Tue, 8 Aug 2023 17:26:17 -0700 Subject: [PATCH 73/87] cleanup --- faiss/gpu/impl/IVFFlat.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faiss/gpu/impl/IVFFlat.cu b/faiss/gpu/impl/IVFFlat.cu index ac06fd0156..4607e49870 100644 --- a/faiss/gpu/impl/IVFFlat.cu +++ b/faiss/gpu/impl/IVFFlat.cu @@ -91,6 +91,8 @@ std::vector IVFFlat::translateCodesToGpu_( return codes; } + int bitsPerCode = scalarQ_ ? scalarQ_->bits : 32; + auto up = unpackNonInterleaved(std::move(codes), numVecs, dim_, bitsPerCode); return packInterleaved(std::move(up), numVecs, dim_, bitsPerCode); @@ -283,10 +285,8 @@ void IVFFlat::searchPreassigned( void IVFFlat::searchImpl_( Tensor& queries, - Tensor& coarseDistances, Tensor& coarseIndices, - Tensor& ivfCentroids, int k, Tensor& outDistances, From 613ca7a0e71165ed28047e12c1a1b3f15d9c238c Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Tue, 8 Aug 2023 17:48:08 -0700 Subject: [PATCH 74/87] cleanup --- faiss/gpu/GpuIndexIVFFlat.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 3a67d2240c..495fdc1dbf 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -146,7 +146,6 @@ void GpuIndexIVFFlat::reserveMemory(size_t numVecs) { } void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { - printf("Inside copyFrom\n"); DeviceScope scope(config_.device); // This will copy GpuIndexIVF data such as the coarse quantizer From c43c83f92df5047beb40b4b6e45d4123a508e212 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Wed, 9 Aug 2023 13:59:28 -0700 Subject: [PATCH 75/87] 
Separate out nan filtering --- faiss/gpu/impl/RaftIVFFlat.cu | 140 ++++++++++++++++----------------- faiss/gpu/impl/RaftIVFFlat.cuh | 2 + 2 files changed, 69 insertions(+), 73 deletions(-) diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 9a08fb7d51..63d4936743 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -90,13 +90,14 @@ void RaftIVFFlat::search( Tensor& outIndices) { // TODO: We probably don't want to ignore the coarse quantizer here... - uint32_t n = queries.getSize(0); + uint32_t numQueries = queries.getSize(0); uint32_t cols = queries.getSize(1); uint32_t k_ = k; // Device is already set in GpuIndex::search FAISS_ASSERT(raft_knn_index.has_value()); - FAISS_ASSERT(n > 0); + FAISS_ASSERT(numQueries > 0); + FAISS_ASSERT(cols == dim_); FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= numLists_); const raft::device_resources& raft_handle = @@ -104,35 +105,13 @@ void RaftIVFFlat::search( raft::neighbors::ivf_flat::search_params pams; pams.n_probes = nprobe; - uint32_t n_rows = n; - - auto nan_flag = raft::make_device_vector(raft_handle, n_rows); - - thrust::fill_n( - raft_handle.get_thrust_policy(), - nan_flag.data_handle(), - n_rows, - true); - raft::linalg::map_offset( - raft_handle, - nan_flag.view(), - [queries = queries.data(), dim_ = this->dim_] __device__(idx_t i) { - for (idx_t col = 0; col < dim_; col++) { - if (!isfinite(queries[i * dim_ + col])) { - return false; - } - } - return true; - }); - - // TODO: We probably don't want to ignore the coarse quantizer here - auto queries_view = raft::make_device_matrix_view( - queries.data(), n_rows, cols); - auto out_inds_view = - raft::make_device_matrix_view(outIndices.data(), n_rows, k_); + queries.data(), numQueries, cols); + auto out_inds_view = raft::make_device_matrix_view( + outIndices.data(), numQueries, k_); auto out_dists_view = raft::make_device_matrix_view( - outDistances.data(), n_rows, k_); + outDistances.data(), numQueries, k_); + 
raft::neighbors::ivf_flat::search( raft_handle, pams, @@ -140,10 +119,15 @@ void RaftIVFFlat::search( queries_view, out_inds_view, out_dists_view); - float max_val = std::numeric_limits::max(); + + /// Identify NaN rows and mask their nearest neighbors + auto nan_flag = raft::make_device_vector(raft_handle, numQueries); + + validRowIndices_(queries, nan_flag.data_handle()); + raft::linalg::map_offset( raft_handle, - raft::make_device_vector_view(outIndices.data(), n_rows * k_), + raft::make_device_vector_view(outIndices.data(), numQueries * k_), [nan_flag = nan_flag.data_handle(), out_inds = outIndices.data(), k_] __device__(uint32_t i) { @@ -152,9 +136,11 @@ void RaftIVFFlat::search( return idx_t(-1); return out_inds[i]; }); + + float max_val = std::numeric_limits::max(); raft::linalg::map_offset( raft_handle, - raft::make_device_vector_view(outDistances.data(), n_rows * k_), + raft::make_device_vector_view(outDistances.data(), numQueries * k_), [nan_flag = nan_flag.data_handle(), out_dists = outDistances.data(), max_val, @@ -164,8 +150,6 @@ void RaftIVFFlat::search( return max_val; return out_dists[i]; }); - - raft_handle.sync_stream(); } /// Classify and encode/add vectors to our IVF lists. 
@@ -184,57 +168,45 @@ idx_t RaftIVFFlat::addVectors( /// Remove NaN values auto nan_flag = raft::make_device_vector(raft_handle, n_rows); - thrust::fill_n( - raft_handle.get_thrust_policy(), - nan_flag.data_handle(), - n_rows, - true); - raft::linalg::map_offset( - raft_handle, - nan_flag.view(), - [vecs = vecs.data(), dim_ = this->dim_] __device__(idx_t i) { - for (idx_t col = 0; col < dim_; col++) { - if (!isfinite(vecs[i * dim_ + col])) { - return false; - } - } - return true; - }); - raft_handle.sync_stream(); + validRowIndices_(vecs, nan_flag.data_handle()); + idx_t n_rows_valid = thrust::reduce( raft_handle.get_thrust_policy(), nan_flag.data_handle(), nan_flag.data_handle() + n_rows, 0); - auto gather_indices = - raft::make_device_vector(raft_handle, n_rows_valid); - auto count = thrust::make_counting_iterator(0); - thrust::copy_if( - raft_handle.get_thrust_policy(), - count, - count + n_rows, - gather_indices.data_handle(), - [nan_flag = nan_flag.data_handle()] __device__(auto i) { - return nan_flag[i]; - }); + if (n_rows_valid < n_rows) { + auto gather_indices = raft::make_device_vector( + raft_handle, n_rows_valid); + + auto count = thrust::make_counting_iterator(0); + + thrust::copy_if( + raft_handle.get_thrust_policy(), + count, + count + n_rows, + gather_indices.data_handle(), + [nan_flag = nan_flag.data_handle()] __device__(auto i) { + return nan_flag[i]; + }); + raft::matrix::gather( raft_handle, raft::make_device_matrix_view( vecs.data(), n_rows, dim_), raft::make_const_mdspan(gather_indices.view()), (idx_t)16); - } - auto valid_indices = - raft::make_device_vector(raft_handle, n_rows); - raft::matrix::gather( - raft_handle, - raft::make_device_matrix_view( - indices.data(), n_rows, (idx_t)1), - raft::make_const_mdspan(gather_indices.view()), - raft::make_device_matrix_view( - valid_indices.data_handle(), n_rows_valid, (idx_t)1)); + auto valid_indices = raft::make_device_vector( + raft_handle, n_rows_valid); + + raft::matrix::gather( + raft_handle, + 
raft::make_device_matrix_view( + indices.data(), n_rows, (idx_t)1), + raft::make_const_mdspan(gather_indices.view())); + } /// TODO: We probably don't want to ignore the coarse quantizer here @@ -244,7 +216,8 @@ idx_t RaftIVFFlat::addVectors( raft::make_device_matrix_view( vecs.data(), n_rows_valid, dim_), std::make_optional>( - valid_indices.view()), + raft::make_device_vector_view( + indices.data(), n_rows_valid)), raft_knn_index.value())); return n_rows_valid; @@ -551,6 +524,27 @@ void RaftIVFFlat::addEncodedVectorsToList_( raft_handle.sync_stream(); } +void RaftIVFFlat::validRowIndices_( + Tensor& vecs, + bool* nan_flag) { + raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + idx_t n_rows = vecs.getSize(0); + + thrust::fill_n(raft_handle.get_thrust_policy(), nan_flag, n_rows, true); + raft::linalg::map_offset( + raft_handle, + raft::make_device_vector_view(nan_flag, n_rows), + [vecs = vecs.data(), dim_ = this->dim_] __device__(idx_t i) { + for (idx_t col = 0; col < dim_; col++) { + if (!isfinite(vecs[i * dim_ + col])) { + return false; + } + } + return true; + }); +} + RaftIVFFlatCodePackerInterleaved::RaftIVFFlatCodePackerInterleaved( size_t list_size, uint32_t dim, diff --git a/faiss/gpu/impl/RaftIVFFlat.cuh b/faiss/gpu/impl/RaftIVFFlat.cuh index 30a7378570..1cd2e18307 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cuh +++ b/faiss/gpu/impl/RaftIVFFlat.cuh @@ -108,6 +108,8 @@ class RaftIVFFlat : public IVFFlat { void copyInvertedListsFrom(const InvertedLists* ivf) override; void set_index_(std::optional> idx); + + void validRowIndices_(Tensor& vecs, bool* nan_flag); protected: /// Adds a set of codes and indices to a list, with the representation From 7eb5209d8c122273e1ed0dbae18fc90965aef364 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Wed, 9 Aug 2023 16:38:35 -0700 Subject: [PATCH 76/87] Add USE_NVIDIA_RAFT --- faiss/gpu/GpuIndexIVFFlat.cu | 30 +- faiss/gpu/StandardGpuResources.cpp | 2 + faiss/gpu/impl/IVFFlat.cuh | 5 - 
faiss/gpu/impl/RaftIVFFlat.cu | 10 +- faiss/gpu/impl/RaftIVFFlat.cuh | 34 +- faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 485 ++++++++++++++----------- 6 files changed, 322 insertions(+), 244 deletions(-) diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 495fdc1dbf..9422f6bc56 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -11,17 +11,18 @@ #include #include #include -#include #include -#include #include #include +#if defined USE_NVIDIA_RAFT +#include #include #include #include #include #include +#endif #include @@ -106,6 +107,8 @@ void GpuIndexIVFFlat::set_index_( bool interleavedLayout, IndicesOptions indicesOptions, MemorySpace space) { +#if defined USE_NVIDIA_RAFT + if (config_.use_raft) { index_.reset(new RaftIVFFlat( resources, @@ -118,7 +121,14 @@ void GpuIndexIVFFlat::set_index_( interleavedLayout, indicesOptions, space)); - } else { + } else +#else + if (config_.use_raft) { + FAISS_THROW_MSG( + "RAFT has not been compiled into the current version so it cannot be used."); + } else +#endif + { index_.reset(new IVFFlat( resources, dim, @@ -151,10 +161,14 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { // This will copy GpuIndexIVF data such as the coarse quantizer GpuIndexIVF::copyFrom(index); + printf("GpuIndexIVFcopyFrom done\n"); + // Clear out our old data index_.reset(); baseIndex_.reset(); + printf("indices reset\n"); + // The other index might not be trained if (!index->is_trained) { FAISS_ASSERT(!is_trained); @@ -177,7 +191,7 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { ivfFlatConfig_.indicesOptions, config_.memorySpace); - // Copy all of the IVF data + // Copy all of the IVF data printf("Copying inverted lists from cpu index to FAISS gpu index flat\n"); index_->copyInvertedListsFrom(index->invlists); } @@ -276,6 +290,8 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) { index_->reserveMemory(reserveMemoryVecs_); } +#if defined USE_NVIDIA_RAFT 
+ if (config_.use_raft) { const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); @@ -294,6 +310,12 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) { raft::neighbors::ivf_flat::build( raft_handle, raft_idx_params, x, n, (idx_t)d))); } +#else + if (config_.use_raft) { + FAISS_THROW_MSG( + "RAFT has not been compiled into the current version so it cannot be used."); + } +#endif this->is_trained = true; } diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp index e9ad2e62fc..4e8701ab03 100644 --- a/faiss/gpu/StandardGpuResources.cpp +++ b/faiss/gpu/StandardGpuResources.cpp @@ -361,7 +361,9 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) { defaultStreams_[device] = defaultStream; +#if defined USE_NVIDIA_RAFT raftHandles_.emplace(std::make_pair(device, defaultStream)); +#endif cudaStream_t asyncCopyStream = 0; CUDA_VERIFY( diff --git a/faiss/gpu/impl/IVFFlat.cuh b/faiss/gpu/impl/IVFFlat.cuh index 726d62c1da..246fc18b16 100644 --- a/faiss/gpu/impl/IVFFlat.cuh +++ b/faiss/gpu/impl/IVFFlat.cuh @@ -7,11 +7,6 @@ #pragma once -#if defined USE_NVIDIA_RAFT -#include -#include -#endif - #include #include diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 63d4936743..b754ee8876 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -19,12 +19,9 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include -#include #include #include #include -#include #include #include @@ -47,10 +44,11 @@ #include #include +#include +#include +#include +#include #include -#include - -#include namespace faiss { namespace gpu { diff --git a/faiss/gpu/impl/RaftIVFFlat.cuh b/faiss/gpu/impl/RaftIVFFlat.cuh index 1cd2e18307..a9c013f68a 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cuh +++ b/faiss/gpu/impl/RaftIVFFlat.cuh @@ -22,12 +22,11 @@ #pragma once -#include #include #include -#include #include +#include #include @@ -85,7 +84,7 @@ class RaftIVFFlat : public IVFFlat { Tensor& indices) override; /// Reserve GPU memory in our inverted lists for this number of vectors -// void reserveMemory(idx_t numVecs) override; + // void reserveMemory(idx_t numVecs) override; /// Clear out all inverted lists, but retain the coarse quantizer /// and the product quantizer info @@ -107,22 +106,25 @@ class RaftIVFFlat : public IVFFlat { /// Copy all inverted lists from a CPU representation to ourselves void copyInvertedListsFrom(const InvertedLists* ivf) override; - void set_index_(std::optional> idx); + /// Update the raft index + void set_index_( + std::optional> idx); + /// Filter out matrix rows containing NaN values void validRowIndices_(Tensor& vecs, bool* nan_flag); - + protected: - /// Adds a set of codes and indices to a list, with the representation - /// coming from the CPU equivalent - void addEncodedVectorsToList_( + /// Adds a set of codes and indices to a list, with the representation + /// coming from the CPU equivalent + void addEncodedVectorsToList_( idx_t listId, // resident on the host const void* codes, // resident on the host const idx_t* indices, idx_t numVecs) override; - - /// Returns the number of bytes in which an IVF list containing numVecs + + /// Returns the number of bytes in which an IVF list containing numVecs /// vectors is encoded on the device. 
Note that due to padding this is not /// the same as the encoding size for a subset of vectors in an IVF list; /// this is the size for an entire IVF list @@ -132,17 +134,19 @@ class RaftIVFFlat : public IVFFlat { raft_knn_index{std::nullopt}; }; - struct RaftIVFFlatCodePackerInterleaved : CodePacker { - RaftIVFFlatCodePackerInterleaved(size_t list_size, uint32_t dim, uint32_t chuk_size); + RaftIVFFlatCodePackerInterleaved( + size_t list_size, + uint32_t dim, + uint32_t chuk_size); void pack_1(const uint8_t* flat_code, size_t offset, uint8_t* block) const final; void unpack_1(const uint8_t* block, size_t offset, uint8_t* flat_code) const final; - protected: - uint32_t chunk_size; - uint32_t dim; + protected: + uint32_t chunk_size; + uint32_t dim; }; } // namespace gpu diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index 2f2fd87cd1..8c092be7cb 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -35,7 +35,6 @@ constexpr float kF16MaxRelErr = 0.3f; constexpr float kF32MaxRelErr = 0.03f; - struct Options { Options() { numAdd = 2 * faiss::gpu::randVal(2000, 5000); @@ -56,6 +55,11 @@ struct Options { faiss::gpu::INDICES_64_BIT}); device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + + raftOpt.push_back(false); +#if defined USE_NVIDIA_RAFT + raftOpt.push_back(true); +#endif } std::string toString() const { @@ -77,6 +81,7 @@ struct Options { int k; int device; faiss::gpu::IndicesOptions indicesOpt; + std::vector raftOpt; }; void queryTest( @@ -106,28 +111,35 @@ void queryTest( faiss::gpu::GpuIndexIVFFlatConfig config; config.device = opt.device; config.indicesOptions = opt.indicesOpt; - config.use_raft = true; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); - gpuIndex.copyFrom(&cpuIndex); - gpuIndex.nprobe = opt.nprobe; - - bool compFloat16 = 
useFloat16CoarseQuantizer; - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - // FIXME: the fp16 bounds are - // useless when math (the accumulator) is - // in fp16. Figure out another way to test - compFloat16 ? 0.70f : 0.1f, - compFloat16 ? 0.65f : 0.015f); + for (int i = 0; i < opt.raftOpt.size(); i++) { + config.use_raft = opt.raftOpt[i]; + + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, + cpuIndex.d, + cpuIndex.nlist, + cpuIndex.metric_type, + config); + gpuIndex.copyFrom(&cpuIndex); + gpuIndex.nprobe = opt.nprobe; + + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + // FIXME: the fp16 bounds are + // useless when math (the accumulator) is + // in fp16. Figure out another way to test + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 0.65f : 0.015f); + } } } @@ -157,27 +169,39 @@ void addTest(faiss::MetricType metricType, bool useFloat16CoarseQuantizer) { config.device = opt.device; config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - config.use_raft = true; - - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); - gpuIndex.copyFrom(&cpuIndex); - gpuIndex.nprobe = opt.nprobe; - cpuIndex.add(opt.numAdd, addVecs.data()); - gpuIndex.add(opt.numAdd, addVecs.data()); - - bool compFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - compFloat16 ? 0.70f : 0.1f, - compFloat16 ? 
0.30f : 0.015f); + for (int i = 0; i < opt.raftOpt.size(); i++) { + printf("i %d\n", i); + config.use_raft = opt.raftOpt[i]; + + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, + cpuIndex.d, + cpuIndex.nlist, + cpuIndex.metric_type, + config); + printf("gpuindex created"); + gpuIndex.copyFrom(&cpuIndex); + printf("copyfrom done"); + gpuIndex.nprobe = opt.nprobe; + + cpuIndex.add(opt.numAdd, addVecs.data()); + gpuIndex.add(opt.numAdd, addVecs.data()); + printf("gpu vectors added"); + + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 0.30f : 0.015f); + printf("indices compared"); + } } } @@ -194,42 +218,46 @@ void copyToTest(bool useFloat16CoarseQuantizer) { config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); - gpuIndex.train(opt.numTrain, trainVecs.data()); - gpuIndex.add(opt.numAdd, addVecs.data()); - gpuIndex.nprobe = opt.nprobe; + for (int i = 0; i < opt.raftOpt.size(); i++) { + config.use_raft = opt.raftOpt[i]; - // use garbage values to see if we overwrite then - faiss::IndexFlatL2 cpuQuantizer(1); - faiss::IndexIVFFlat cpuIndex(&cpuQuantizer, 1, 1, faiss::METRIC_L2); - cpuIndex.nprobe = 1; + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); + gpuIndex.train(opt.numTrain, trainVecs.data()); + gpuIndex.add(opt.numAdd, addVecs.data()); + gpuIndex.nprobe = opt.nprobe; + + // use garbage values to see if we overwrite then + faiss::IndexFlatL2 cpuQuantizer(1); + faiss::IndexIVFFlat cpuIndex(&cpuQuantizer, 1, 1, faiss::METRIC_L2); + cpuIndex.nprobe = 1; + + gpuIndex.copyTo(&cpuIndex); - gpuIndex.copyTo(&cpuIndex); - - EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); - 
EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); - - EXPECT_EQ(cpuIndex.d, gpuIndex.d); - EXPECT_EQ(cpuIndex.quantizer->d, gpuIndex.quantizer->d); - EXPECT_EQ(cpuIndex.d, opt.dim); - EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); - EXPECT_EQ(cpuIndex.nprobe, gpuIndex.nprobe); - - testIVFEquality(cpuIndex, gpuIndex); - - // Query both objects; results should be equivalent - bool compFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - compFloat16 ? 0.70f : 0.1f, - compFloat16 ? 0.30f : 0.015f); + EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); + EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); + + EXPECT_EQ(cpuIndex.d, gpuIndex.d); + EXPECT_EQ(cpuIndex.quantizer->d, gpuIndex.quantizer->d); + EXPECT_EQ(cpuIndex.d, opt.dim); + EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); + EXPECT_EQ(cpuIndex.nprobe, gpuIndex.nprobe); + + testIVFEquality(cpuIndex, gpuIndex); + + // Query both objects; results should be equivalent + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 
0.30f : 0.015f); + } } void copyFromTest(bool useFloat16CoarseQuantizer) { @@ -252,35 +280,39 @@ void copyFromTest(bool useFloat16CoarseQuantizer) { config.device = opt.device; config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - config.use_raft = true; - - faiss::gpu::GpuIndexIVFFlat gpuIndex(&res, 1, 1, faiss::METRIC_L2, config); - gpuIndex.nprobe = 1; - - gpuIndex.copyFrom(&cpuIndex); - - EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); - EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); - - EXPECT_EQ(cpuIndex.d, gpuIndex.d); - EXPECT_EQ(cpuIndex.d, opt.dim); - EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); - EXPECT_EQ(cpuIndex.nprobe, gpuIndex.nprobe); - - testIVFEquality(cpuIndex, gpuIndex); - - // Query both objects; results should be equivalent - bool compFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - compFloat16 ? 0.70f : 0.1f, - compFloat16 ? 0.30f : 0.015f); + + for (int i = 0; i < opt.raftOpt.size(); i++) { + config.use_raft = opt.raftOpt[i]; + + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, 1, 1, faiss::METRIC_L2, config); + gpuIndex.nprobe = 1; + + gpuIndex.copyFrom(&cpuIndex); + + EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); + EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); + + EXPECT_EQ(cpuIndex.d, gpuIndex.d); + EXPECT_EQ(cpuIndex.d, opt.dim); + EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); + EXPECT_EQ(cpuIndex.nprobe, gpuIndex.nprobe); + + testIVFEquality(cpuIndex, gpuIndex); + + // Query both objects; results should be equivalent + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 
0.30f : 0.015f); + } } TEST(TestGpuIndexIVFFlat, Float32_32_Add_L2) { @@ -399,36 +431,39 @@ TEST(TestGpuIndexIVFFlat, Float32_negative) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = opt.device; config.indicesOptions = opt.indicesOpt; - config.use_raft = true; - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); - gpuIndex.copyFrom(&cpuIndex); - gpuIndex.nprobe = opt.nprobe; + for (int i = 0; i < opt.raftOpt.size(); i++) { + config.use_raft = opt.raftOpt[i]; - // Construct a positive test set - auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); + gpuIndex.copyFrom(&cpuIndex); + gpuIndex.nprobe = opt.nprobe; - // Put all vecs on positive size - for (auto& f : queryVecs) { - f = std::abs(f); - } + // Construct a positive test set + auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); - bool compFloat16 = false; - faiss::gpu::compareIndices( - queryVecs, - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - // FIXME: the fp16 bounds are - // useless when math (the accumulator) is - // in fp16. Figure out another way to test - compFloat16 ? 0.99f : 0.1f, - compFloat16 ? 0.65f : 0.015f); + // Put all vecs on positive size + for (auto& f : queryVecs) { + f = std::abs(f); + } + + bool compFloat16 = false; + faiss::gpu::compareIndices( + queryVecs, + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + // FIXME: the fp16 bounds are + // useless when math (the accumulator) is + // in fp16. Figure out another way to test + compFloat16 ? 0.99f : 0.1f, + compFloat16 ? 
0.65f : 0.015f); + } } // @@ -448,31 +483,34 @@ TEST(TestGpuIndexIVFFlat, QueryNaN) { config.device = opt.device; config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = faiss::gpu::randBool(); - config.use_raft = true; - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); - gpuIndex.nprobe = opt.nprobe; + for (int i = 0; i < opt.raftOpt.size(); i++) { + config.use_raft = opt.raftOpt[i]; - gpuIndex.train(opt.numTrain, trainVecs.data()); - gpuIndex.add(opt.numAdd, addVecs.data()); + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); + gpuIndex.nprobe = opt.nprobe; - int numQuery = 10; - std::vector nans( - numQuery * opt.dim, std::numeric_limits::quiet_NaN()); + gpuIndex.train(opt.numTrain, trainVecs.data()); + gpuIndex.add(opt.numAdd, addVecs.data()); + + int numQuery = 10; + std::vector nans( + numQuery * opt.dim, std::numeric_limits::quiet_NaN()); - std::vector distances(numQuery * opt.k, 0); - std::vector indices(numQuery * opt.k, 0); + std::vector distances(numQuery * opt.k, 0); + std::vector indices(numQuery * opt.k, 0); - gpuIndex.search( - numQuery, nans.data(), opt.k, distances.data(), indices.data()); + gpuIndex.search( + numQuery, nans.data(), opt.k, distances.data(), indices.data()); - for (int q = 0; q < numQuery; ++q) { - for (int k = 0; k < opt.k; ++k) { - EXPECT_EQ(indices[q * opt.k + k], -1); - EXPECT_EQ( - distances[q * opt.k + k], - std::numeric_limits::max()); + for (int q = 0; q < numQuery; ++q) { + for (int k = 0; k < opt.k; ++k) { + EXPECT_EQ(indices[q * opt.k + k], -1); + EXPECT_EQ( + distances[q * opt.k + k], + std::numeric_limits::max()); + } } } } @@ -487,40 +525,45 @@ TEST(TestGpuIndexIVFFlat, AddNaN) { config.device = opt.device; config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = faiss::gpu::randBool(); - config.use_raft = true; - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, opt.dim, opt.numCentroids, 
faiss::METRIC_L2, config); - gpuIndex.nprobe = opt.nprobe; + for (int i = 0; i < opt.raftOpt.size(); i++) { + config.use_raft = opt.raftOpt[i]; - int numNans = 10; - std::vector nans( - numNans * opt.dim, std::numeric_limits::quiet_NaN()); + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); + gpuIndex.nprobe = opt.nprobe; - // Make one vector valid (not the first vector, in order to test offset - // issues), which should actually add - for (int i = 0; i < opt.dim; ++i) { - nans[opt.dim + i] = i; - } + int numNans = 10; + std::vector nans( + numNans * opt.dim, std::numeric_limits::quiet_NaN()); - std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); - gpuIndex.train(opt.numTrain, trainVecs.data()); - - // should not crash - EXPECT_EQ(gpuIndex.ntotal, 0); - gpuIndex.add(numNans, nans.data()); - - std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); - std::vector distance(opt.numQuery * opt.k, 0); - std::vector indices(opt.numQuery * opt.k, 0); - - // should not crash - gpuIndex.search( - opt.numQuery, - queryVecs.data(), - opt.k, - distance.data(), - indices.data()); + // Make one vector valid (not the first vector, in order to test offset + // issues), which should actually add + for (int i = 0; i < opt.dim; ++i) { + nans[opt.dim + i] = i; + } + + std::vector trainVecs = + faiss::gpu::randVecs(opt.numTrain, opt.dim); + gpuIndex.train(opt.numTrain, trainVecs.data()); + + // should not crash + EXPECT_EQ(gpuIndex.ntotal, 0); + gpuIndex.add(numNans, nans.data()); + + std::vector queryVecs = + faiss::gpu::randVecs(opt.numQuery, opt.dim); + std::vector distance(opt.numQuery * opt.k, 0); + std::vector indices(opt.numQuery * opt.k, 0); + + // should not crash + gpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + distance.data(), + indices.data()); + } } TEST(TestGpuIndexIVFFlat, UnifiedMemory) { @@ -543,6 +586,11 @@ TEST(TestGpuIndexIVFFlat, UnifiedMemory) { int numQuery = 10; int k 
= 10; int nprobe = 8; + std::vector raftOpt; + raftOpt.push_back(false); + #if defined USE_NVIDIA_RAFT + raftOpt.push_back(true); + #endif std::vector trainVecs = faiss::gpu::randVecs(numTrain, dim); std::vector addVecs = faiss::gpu::randVecs(numAdd, dim); @@ -561,23 +609,25 @@ TEST(TestGpuIndexIVFFlat, UnifiedMemory) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = device; config.memorySpace = faiss::gpu::MemorySpace::Unified; - config.use_raft = true; - - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, dim, numCentroids, faiss::METRIC_L2, config); - gpuIndex.copyFrom(&cpuIndex); - gpuIndex.nprobe = nprobe; - - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - numQuery, - dim, - k, - "Unified Memory", - kF32MaxRelErr, - 0.1f, - 0.015f); + for (int i = 0; i < raftOpt.size(); i++) { + config.use_raft = raftOpt[i]; + + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, dim, numCentroids, faiss::METRIC_L2, config); + gpuIndex.copyFrom(&cpuIndex); + gpuIndex.nprobe = nprobe; + + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + numQuery, + dim, + k, + "Unified Memory", + kF32MaxRelErr, + 0.1f, + 0.015f); + } } TEST(TestGpuIndexIVFFlat, LongIVFList) { @@ -602,6 +652,11 @@ TEST(TestGpuIndexIVFFlat, LongIVFList) { size_t numTrain = 100; int numQuery = 5; int k = 10; + std::vector raftOpt; + raftOpt.push_back(false); + #if defined USE_NVIDIA_RAFT + raftOpt.push_back(true); + #endif std::vector trainVecs = faiss::gpu::randVecs(numTrain, dim); std::vector addVecs = faiss::gpu::randVecs(numAdd, dim); @@ -619,24 +674,26 @@ TEST(TestGpuIndexIVFFlat, LongIVFList) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = device; - config.use_raft = true; - - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, dim, numCentroids, faiss::METRIC_L2, config); - gpuIndex.train(numTrain, trainVecs.data()); - gpuIndex.add(numAdd, addVecs.data()); - gpuIndex.nprobe = 1; - - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - numQuery, - dim, - k, - "Unified Memory", - 
kF32MaxRelErr, - 0.1f, - 0.015f); + for (int i = 0; i < raftOpt.size(); i++) { + config.use_raft = raftOpt[i]; + + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, dim, numCentroids, faiss::METRIC_L2, config); + gpuIndex.train(numTrain, trainVecs.data()); + gpuIndex.add(numAdd, addVecs.data()); + gpuIndex.nprobe = 1; + + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + numQuery, + dim, + k, + "Unified Memory", + kF32MaxRelErr, + 0.1f, + 0.015f); + } } int main(int argc, char** argv) { From db1774b7eeac7c63b8b08ef1a19123a0c3f6228a Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Mon, 14 Aug 2023 06:29:42 -0700 Subject: [PATCH 77/87] Update test --- faiss/gpu/GpuIndex.cu | 1 + faiss/gpu/GpuIndexIVF.cu | 1 + faiss/gpu/GpuIndexIVFFlat.cu | 11 +- faiss/gpu/impl/IVFBase.cu | 2 + faiss/gpu/impl/RaftIVFFlat.cu | 63 +-- faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 582 +++++++++++++------------ 6 files changed, 344 insertions(+), 316 deletions(-) diff --git a/faiss/gpu/GpuIndex.cu b/faiss/gpu/GpuIndex.cu index 89952b1121..749bec221f 100644 --- a/faiss/gpu/GpuIndex.cu +++ b/faiss/gpu/GpuIndex.cu @@ -77,6 +77,7 @@ int GpuIndex::getDevice() const { } void GpuIndex::copyFrom(const faiss::Index* index) { + printf("inside gpuindex copyFrom\n"); d = index->d; metric_type = index->metric_type; metric_arg = index->metric_arg; diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu index f2ed323605..935c255b8f 100644 --- a/faiss/gpu/GpuIndexIVF.cu +++ b/faiss/gpu/GpuIndexIVF.cu @@ -212,6 +212,7 @@ void GpuIndexIVF::copyTo(faiss::IndexIVF* index) const { index->nprobe = nprobe; FAISS_ASSERT(quantizer); + printf("index -> own_fields %d\n", index->own_fields); if (index->own_fields) { delete index->quantizer; index->quantizer = nullptr; diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 9422f6bc56..3a18aa0e90 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -110,6 +110,7 @@ void GpuIndexIVFFlat::set_index_( #if defined 
USE_NVIDIA_RAFT if (config_.use_raft) { + printf("inside GpuIndexIVFFlat's set_index_ use_raft = true\n"); index_.reset(new RaftIVFFlat( resources, dim, @@ -121,6 +122,7 @@ void GpuIndexIVFFlat::set_index_( interleavedLayout, indicesOptions, space)); + own_fields = false; } else #else if (config_.use_raft) { @@ -128,7 +130,8 @@ void GpuIndexIVFFlat::set_index_( "RAFT has not been compiled into the current version so it cannot be used."); } else #endif - { + { + printf("inside GpuIndexIVFFlat's set_index_ use_raft = false\n"); index_.reset(new IVFFlat( resources, dim, @@ -141,7 +144,6 @@ void GpuIndexIVFFlat::set_index_( indicesOptions, space)); } - baseIndex_ = std::static_pointer_cast(index_); updateQuantizer(); } @@ -156,12 +158,13 @@ void GpuIndexIVFFlat::reserveMemory(size_t numVecs) { } void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { + printf("Inside GpuIndexIVFFlat's copyFrom\n"); DeviceScope scope(config_.device); // This will copy GpuIndexIVF data such as the coarse quantizer GpuIndexIVF::copyFrom(index); - printf("GpuIndexIVFcopyFrom done\n"); + printf("GpuIndexIVF's copyFrom done\n"); // Clear out our old data index_.reset(); @@ -192,7 +195,7 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { config_.memorySpace); // Copy all of the IVF data - printf("Copying inverted lists from cpu index to FAISS gpu index flat\n"); + printf("Copying inverted lists from cpu index to FAISS gpu index ivfflat\n"); index_->copyInvertedListsFrom(index->invlists); } diff --git a/faiss/gpu/impl/IVFBase.cu b/faiss/gpu/impl/IVFBase.cu index 890d489440..6aef83ef3f 100644 --- a/faiss/gpu/impl/IVFBase.cu +++ b/faiss/gpu/impl/IVFBase.cu @@ -106,6 +106,7 @@ void IVFBase::reserveMemory(idx_t numVecs) { } void IVFBase::reset() { + printf("inside ivfbase::reset\n"); auto stream = resources_->getDefaultStreamCurrentDevice(); deviceListData_.clear(); @@ -323,6 +324,7 @@ std::vector IVFBase::getListVectorData(idx_t listId, bool gpuFormat) } void 
IVFBase::copyInvertedListsFrom(const InvertedLists* ivf) { + printf("Inside IVFBase's copyInvertedListsFrom\n"); idx_t nlist = ivf ? ivf->nlist : 0; for (idx_t i = 0; i < nlist; ++i) { addEncodedVectorsToList_( diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index b754ee8876..110a0f0ced 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -73,7 +73,7 @@ RaftIVFFlat::RaftIVFFlat( scalarQ, interleavedLayout, indicesOptions, - space) {} + space) {printf("RaftIVFFlat constructor called\n"); reset();} RaftIVFFlat::~RaftIVFFlat() {} @@ -121,33 +121,33 @@ void RaftIVFFlat::search( /// Identify NaN rows and mask their nearest neighbors auto nan_flag = raft::make_device_vector(raft_handle, numQueries); - validRowIndices_(queries, nan_flag.data_handle()); - - raft::linalg::map_offset( - raft_handle, - raft::make_device_vector_view(outIndices.data(), numQueries * k_), - [nan_flag = nan_flag.data_handle(), - out_inds = outIndices.data(), - k_] __device__(uint32_t i) { - uint32_t row = i / k_; - if (!nan_flag[row]) - return idx_t(-1); - return out_inds[i]; - }); - - float max_val = std::numeric_limits::max(); - raft::linalg::map_offset( - raft_handle, - raft::make_device_vector_view(outDistances.data(), numQueries * k_), - [nan_flag = nan_flag.data_handle(), - out_dists = outDistances.data(), - max_val, - k_] __device__(uint32_t i) { - uint32_t row = i / k_; - if (!nan_flag[row]) - return max_val; - return out_dists[i]; - }); +// validRowIndices_(queries, nan_flag.data_handle()); + +// raft::linalg::map_offset( +// raft_handle, +// raft::make_device_vector_view(outIndices.data(), numQueries * k_), +// [nan_flag = nan_flag.data_handle(), +// out_inds = outIndices.data(), +// k_] __device__(uint32_t i) { +// uint32_t row = i / k_; +// if (!nan_flag[row]) +// return idx_t(-1); +// return out_inds[i]; +// }); + +// float max_val = std::numeric_limits::max(); +// raft::linalg::map_offset( +// raft_handle, +// 
raft::make_device_vector_view(outDistances.data(), numQueries * k_), +// [nan_flag = nan_flag.data_handle(), +// out_dists = outDistances.data(), +// max_val, +// k_] __device__(uint32_t i) { +// uint32_t row = i / k_; +// if (!nan_flag[row]) +// return max_val; +// return out_dists[i]; +// }); } /// Classify and encode/add vectors to our IVF lists. @@ -175,6 +175,7 @@ idx_t RaftIVFFlat::addVectors( 0); if (n_rows_valid < n_rows) { + printf("NaN values found"); auto gather_indices = raft::make_device_vector( raft_handle, n_rows_valid); @@ -270,6 +271,10 @@ std::vector RaftIVFFlat::getListIndices(idx_t listId) const { std::vector RaftIVFFlat::getListVectorData( idx_t listId, bool gpuFormat) const { + if (gpuFormat) { + FAISS_THROW_MSG("gpuFormat is not suppported for raft indices"); + } + printf("inside getlistvectordata of raft"); FAISS_ASSERT(raft_knn_index.has_value()); const raft::device_resources& raft_handle = @@ -466,6 +471,8 @@ void RaftIVFFlat::addEncodedVectorsToList_( // This list must already exist FAISS_ASSERT(raft_knn_index.has_value()); + printf("getListLength(listId), %d\n", getListLength(listId)); + // This list must currently be empty FAISS_ASSERT(getListLength(listId) == 0); diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index 8c092be7cb..e3b9540d46 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -42,7 +42,7 @@ struct Options { numCentroids = std::sqrt((float)numAdd / 2); numTrain = numCentroids * 40; - nprobe = faiss::gpu::randVal(std::min(50, numCentroids), numCentroids); + nprobe = faiss::gpu::randVal(std::min(10, numCentroids), numCentroids); numQuery = faiss::gpu::randVal(32, 100); // Due to the approximate nature of the query and of floating point @@ -56,10 +56,7 @@ struct Options { device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - raftOpt.push_back(false); -#if defined USE_NVIDIA_RAFT - raftOpt.push_back(true); -#endif + 
use_raft = false; } std::string toString() const { @@ -67,7 +64,7 @@ struct Options { str << "IVFFlat device " << device << " numVecs " << numAdd << " dim " << dim << " numCentroids " << numCentroids << " nprobe " << nprobe << " numQuery " << numQuery << " k " << k << " indicesOpt " - << indicesOpt; + << indicesOpt <<" use_raft "<< use_raft; return str.str(); } @@ -81,7 +78,7 @@ struct Options { int k; int device; faiss::gpu::IndicesOptions indicesOpt; - std::vector raftOpt; + bool use_raft; }; void queryTest( @@ -112,38 +109,31 @@ void queryTest( config.device = opt.device; config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + config.use_raft = opt.use_raft; - for (int i = 0; i < opt.raftOpt.size(); i++) { - config.use_raft = opt.raftOpt[i]; - - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, - cpuIndex.d, - cpuIndex.nlist, - cpuIndex.metric_type, - config); - gpuIndex.copyFrom(&cpuIndex); - gpuIndex.nprobe = opt.nprobe; - - bool compFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - // FIXME: the fp16 bounds are - // useless when math (the accumulator) is - // in fp16. Figure out another way to test - compFloat16 ? 0.70f : 0.1f, - compFloat16 ? 0.65f : 0.015f); - } + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); + gpuIndex.copyFrom(&cpuIndex); + gpuIndex.nprobe = opt.nprobe; + + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + // FIXME: the fp16 bounds are + // useless when math (the accumulator) is + // in fp16. Figure out another way to test + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 
0.65f : 0.015f); } } -void addTest(faiss::MetricType metricType, bool useFloat16CoarseQuantizer) { +void addTest(faiss::MetricType metricType, bool useFloat16CoarseQuantizer, bool use_raft) { for (int tries = 0; tries < 2; ++tries) { Options opt; @@ -169,43 +159,31 @@ void addTest(faiss::MetricType metricType, bool useFloat16CoarseQuantizer) { config.device = opt.device; config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + config.use_raft = use_raft; - for (int i = 0; i < opt.raftOpt.size(); i++) { - printf("i %d\n", i); - config.use_raft = opt.raftOpt[i]; - - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, - cpuIndex.d, - cpuIndex.nlist, - cpuIndex.metric_type, - config); - printf("gpuindex created"); - gpuIndex.copyFrom(&cpuIndex); - printf("copyfrom done"); - gpuIndex.nprobe = opt.nprobe; - - cpuIndex.add(opt.numAdd, addVecs.data()); - gpuIndex.add(opt.numAdd, addVecs.data()); - printf("gpu vectors added"); - - bool compFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - compFloat16 ? 0.70f : 0.1f, - compFloat16 ? 0.30f : 0.015f); - printf("indices compared"); - } + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); + gpuIndex.copyFrom(&cpuIndex); + gpuIndex.nprobe = opt.nprobe; + + cpuIndex.add(opt.numAdd, addVecs.data()); + gpuIndex.add(opt.numAdd, addVecs.data()); + + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 
0.30f : 0.015f); } } -void copyToTest(bool useFloat16CoarseQuantizer) { +void copyToTest(bool useFloat16CoarseQuantizer, bool use_raft) { Options opt; std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); @@ -217,50 +195,47 @@ void copyToTest(bool useFloat16CoarseQuantizer) { config.device = opt.device; config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + config.use_raft = use_raft; - for (int i = 0; i < opt.raftOpt.size(); i++) { - config.use_raft = opt.raftOpt[i]; - - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); - gpuIndex.train(opt.numTrain, trainVecs.data()); - gpuIndex.add(opt.numAdd, addVecs.data()); - gpuIndex.nprobe = opt.nprobe; - - // use garbage values to see if we overwrite then - faiss::IndexFlatL2 cpuQuantizer(1); - faiss::IndexIVFFlat cpuIndex(&cpuQuantizer, 1, 1, faiss::METRIC_L2); - cpuIndex.nprobe = 1; - - gpuIndex.copyTo(&cpuIndex); - - EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); - EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); + gpuIndex.train(opt.numTrain, trainVecs.data()); + gpuIndex.add(opt.numAdd, addVecs.data()); + gpuIndex.nprobe = opt.nprobe; - EXPECT_EQ(cpuIndex.d, gpuIndex.d); - EXPECT_EQ(cpuIndex.quantizer->d, gpuIndex.quantizer->d); - EXPECT_EQ(cpuIndex.d, opt.dim); - EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); - EXPECT_EQ(cpuIndex.nprobe, gpuIndex.nprobe); - - testIVFEquality(cpuIndex, gpuIndex); + // use garbage values to see if we overwrite then + faiss::IndexFlatL2 cpuQuantizer(1); + faiss::IndexIVFFlat cpuIndex(&cpuQuantizer, 1, 1, faiss::METRIC_L2); + cpuIndex.nprobe = 1; - // Query both objects; results should be equivalent - bool compFloat16 = useFloat16CoarseQuantizer; - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - 
opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - compFloat16 ? 0.70f : 0.1f, - compFloat16 ? 0.30f : 0.015f); - } + gpuIndex.copyTo(&cpuIndex); + + EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); + EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); + + EXPECT_EQ(cpuIndex.d, gpuIndex.d); + EXPECT_EQ(cpuIndex.quantizer->d, gpuIndex.quantizer->d); + EXPECT_EQ(cpuIndex.d, opt.dim); + EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); + EXPECT_EQ(cpuIndex.nprobe, gpuIndex.nprobe); + + testIVFEquality(cpuIndex, gpuIndex); + + // Query both objects; results should be equivalent + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 0.30f : 0.015f); } -void copyFromTest(bool useFloat16CoarseQuantizer) { +void copyFromTest(bool useFloat16CoarseQuantizer, bool use_raft) { Options opt; std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); @@ -280,55 +255,67 @@ void copyFromTest(bool useFloat16CoarseQuantizer) { config.device = opt.device; config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - - for (int i = 0; i < opt.raftOpt.size(); i++) { - config.use_raft = opt.raftOpt[i]; - - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, 1, 1, faiss::METRIC_L2, config); - gpuIndex.nprobe = 1; - - gpuIndex.copyFrom(&cpuIndex); - - EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); - EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); - - EXPECT_EQ(cpuIndex.d, gpuIndex.d); - EXPECT_EQ(cpuIndex.d, opt.dim); - EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); - EXPECT_EQ(cpuIndex.nprobe, gpuIndex.nprobe); - - testIVFEquality(cpuIndex, gpuIndex); - - // Query both objects; results should be equivalent - bool compFloat16 = useFloat16CoarseQuantizer; - 
faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - compFloat16 ? 0.70f : 0.1f, - compFloat16 ? 0.30f : 0.015f); - } + config.use_raft = use_raft; + + faiss::gpu::GpuIndexIVFFlat gpuIndex(&res, 1, 1, faiss::METRIC_L2, config); + gpuIndex.nprobe = 1; + + gpuIndex.copyFrom(&cpuIndex); + + EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); + EXPECT_EQ(gpuIndex.ntotal, opt.numAdd); + + EXPECT_EQ(cpuIndex.d, gpuIndex.d); + EXPECT_EQ(cpuIndex.d, opt.dim); + EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); + EXPECT_EQ(cpuIndex.nprobe, gpuIndex.nprobe); + + testIVFEquality(cpuIndex, gpuIndex); + + // Query both objects; results should be equivalent + bool compFloat16 = useFloat16CoarseQuantizer; + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + compFloat16 ? 0.70f : 0.1f, + compFloat16 ? 0.30f : 0.015f); } TEST(TestGpuIndexIVFFlat, Float32_32_Add_L2) { - addTest(faiss::METRIC_L2, false); + addTest(faiss::METRIC_L2, false, false); + +#if defined USE_NVIDIA_RAFT + addTest(faiss::METRIC_L2, false, true); +#endif } TEST(TestGpuIndexIVFFlat, Float32_32_Add_IP) { - addTest(faiss::METRIC_INNER_PRODUCT, false); + addTest(faiss::METRIC_INNER_PRODUCT, false, false); + +#if defined USE_NVIDIA_RAFT + addTest(faiss::METRIC_INNER_PRODUCT, false, true); +#endif } TEST(TestGpuIndexIVFFlat, Float16_32_Add_L2) { - addTest(faiss::METRIC_L2, true); + addTest(faiss::METRIC_L2, true, false); + +#if defined USE_NVIDIA_RAFT + addTest(faiss::METRIC_L2, true, true); +#endif } TEST(TestGpuIndexIVFFlat, Float16_32_Add_IP) { - addTest(faiss::METRIC_INNER_PRODUCT, true); + addTest(faiss::METRIC_INNER_PRODUCT, true, false); + +#if defined USE_NVIDIA_RAFT + addTest(faiss::METRIC_INNER_PRODUCT, true, true); +#endif } // @@ -336,11 +323,23 @@ TEST(TestGpuIndexIVFFlat, Float16_32_Add_IP) { // 
TEST(TestGpuIndexIVFFlat, Float32_Query_L2) { - queryTest(Options(), faiss::METRIC_L2, false); + Options opt; + queryTest(opt, faiss::METRIC_L2, false); + +#if defined USE_NVIDIA_RAFT + opt.use_raft = true; + queryTest(opt, faiss::METRIC_L2, false); +#endif } TEST(TestGpuIndexIVFFlat, Float32_Query_IP) { - queryTest(Options(), faiss::METRIC_INNER_PRODUCT, false); + Options opt; + queryTest(opt, faiss::METRIC_INNER_PRODUCT, false); + +#if defined USE_NVIDIA_RAFT + opt.use_raft = true; + queryTest(opt, faiss::METRIC_INNER_PRODUCT, false); +#endif } TEST(TestGpuIndexIVFFlat, LargeBatch) { @@ -348,16 +347,33 @@ TEST(TestGpuIndexIVFFlat, LargeBatch) { opt.dim = 3; opt.numQuery = 100000; queryTest(opt, faiss::METRIC_L2, false); + +#if defined USE_NVIDIA_RAFT + opt.use_raft = true; + queryTest(opt, faiss::METRIC_L2, false); +#endif } // float16 coarse quantizer TEST(TestGpuIndexIVFFlat, Float16_32_Query_L2) { - queryTest(Options(), faiss::METRIC_L2, true); + Options opt; + queryTest(opt, faiss::METRIC_L2, true); + +#if defined USE_NVIDIA_RAFT + opt.use_raft = true; + queryTest(opt, faiss::METRIC_L2, true); +#endif } TEST(TestGpuIndexIVFFlat, Float16_32_Query_IP) { - queryTest(Options(), faiss::METRIC_INNER_PRODUCT, true); + Options opt; + queryTest(opt, faiss::METRIC_INNER_PRODUCT, true); + +#if defined USE_NVIDIA_RAFT + opt.use_raft = true; + queryTest(opt, faiss::METRIC_INNER_PRODUCT, true); +#endif } // @@ -369,24 +385,44 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_L2_64) { Options opt; opt.dim = 64; queryTest(opt, faiss::METRIC_L2, false); + +#if defined USE_NVIDIA_RAFT + opt.use_raft = true; + queryTest(opt, faiss::METRIC_L2, false); +#endif } TEST(TestGpuIndexIVFFlat, Float32_Query_IP_64) { Options opt; opt.dim = 64; queryTest(opt, faiss::METRIC_INNER_PRODUCT, false); + +#if defined USE_NVIDIA_RAFT + opt.use_raft = true; + queryTest(opt, faiss::METRIC_INNER_PRODUCT, false); +#endif } TEST(TestGpuIndexIVFFlat, Float32_Query_L2_128) { Options opt; opt.dim = 128; 
queryTest(opt, faiss::METRIC_L2, false); + +#if defined USE_NVIDIA_RAFT + opt.use_raft = true; + queryTest(opt, faiss::METRIC_L2, false); +#endif } TEST(TestGpuIndexIVFFlat, Float32_Query_IP_128) { Options opt; opt.dim = 128; queryTest(opt, faiss::METRIC_INNER_PRODUCT, false); + +#if defined USE_NVIDIA_RAFT + opt.use_raft = true; + queryTest(opt, faiss::METRIC_INNER_PRODUCT, false); +#endif } // @@ -394,11 +430,19 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_IP_128) { // TEST(TestGpuIndexIVFFlat, Float32_32_CopyTo) { - copyToTest(false); + copyToTest(false, false); + +#if defined USE_NVIDIA_RAFT + copyToTest(false, true); +#endif } TEST(TestGpuIndexIVFFlat, Float32_32_CopyFrom) { - copyFromTest(false); + copyFromTest(false, false); + +#if defined USE_NVIDIA_RAFT + copyFromTest(false, true); +#endif } TEST(TestGpuIndexIVFFlat, Float32_negative) { @@ -432,38 +476,34 @@ TEST(TestGpuIndexIVFFlat, Float32_negative) { config.device = opt.device; config.indicesOptions = opt.indicesOpt; - for (int i = 0; i < opt.raftOpt.size(); i++) { - config.use_raft = opt.raftOpt[i]; - - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); - gpuIndex.copyFrom(&cpuIndex); - gpuIndex.nprobe = opt.nprobe; - - // Construct a positive test set - auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); + gpuIndex.copyFrom(&cpuIndex); + gpuIndex.nprobe = opt.nprobe; - // Put all vecs on positive size - for (auto& f : queryVecs) { - f = std::abs(f); - } + // Construct a positive test set + auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); - bool compFloat16 = false; - faiss::gpu::compareIndices( - queryVecs, - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - // FIXME: the fp16 bounds are - // useless when math (the accumulator) is - // in fp16. 
Figure out another way to test - compFloat16 ? 0.99f : 0.1f, - compFloat16 ? 0.65f : 0.015f); + // Put all vecs on positive size + for (auto& f : queryVecs) { + f = std::abs(f); } + + bool compFloat16 = false; + faiss::gpu::compareIndices( + queryVecs, + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + // FIXME: the fp16 bounds are + // useless when math (the accumulator) is + // in fp16. Figure out another way to test + compFloat16 ? 0.99f : 0.1f, + compFloat16 ? 0.65f : 0.015f); } // @@ -484,33 +524,29 @@ TEST(TestGpuIndexIVFFlat, QueryNaN) { config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = faiss::gpu::randBool(); - for (int i = 0; i < opt.raftOpt.size(); i++) { - config.use_raft = opt.raftOpt[i]; - - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); - gpuIndex.nprobe = opt.nprobe; + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); + gpuIndex.nprobe = opt.nprobe; - gpuIndex.train(opt.numTrain, trainVecs.data()); - gpuIndex.add(opt.numAdd, addVecs.data()); + gpuIndex.train(opt.numTrain, trainVecs.data()); + gpuIndex.add(opt.numAdd, addVecs.data()); - int numQuery = 10; - std::vector nans( - numQuery * opt.dim, std::numeric_limits::quiet_NaN()); + int numQuery = 10; + std::vector nans( + numQuery * opt.dim, std::numeric_limits::quiet_NaN()); - std::vector distances(numQuery * opt.k, 0); - std::vector indices(numQuery * opt.k, 0); + std::vector distances(numQuery * opt.k, 0); + std::vector indices(numQuery * opt.k, 0); - gpuIndex.search( - numQuery, nans.data(), opt.k, distances.data(), indices.data()); + gpuIndex.search( + numQuery, nans.data(), opt.k, distances.data(), indices.data()); - for (int q = 0; q < numQuery; ++q) { - for (int k = 0; k < opt.k; ++k) { - EXPECT_EQ(indices[q * opt.k + k], -1); - EXPECT_EQ( - distances[q * opt.k + k], - 
std::numeric_limits::max()); - } + for (int q = 0; q < numQuery; ++q) { + for (int k = 0; k < opt.k; ++k) { + EXPECT_EQ(indices[q * opt.k + k], -1); + EXPECT_EQ( + distances[q * opt.k + k], + std::numeric_limits::max()); } } } @@ -526,44 +562,38 @@ TEST(TestGpuIndexIVFFlat, AddNaN) { config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = faiss::gpu::randBool(); - for (int i = 0; i < opt.raftOpt.size(); i++) { - config.use_raft = opt.raftOpt[i]; - - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); - gpuIndex.nprobe = opt.nprobe; + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); + gpuIndex.nprobe = opt.nprobe; - int numNans = 10; - std::vector nans( - numNans * opt.dim, std::numeric_limits::quiet_NaN()); + int numNans = 10; + std::vector nans( + numNans * opt.dim, std::numeric_limits::quiet_NaN()); - // Make one vector valid (not the first vector, in order to test offset - // issues), which should actually add - for (int i = 0; i < opt.dim; ++i) { - nans[opt.dim + i] = i; - } - - std::vector trainVecs = - faiss::gpu::randVecs(opt.numTrain, opt.dim); - gpuIndex.train(opt.numTrain, trainVecs.data()); - - // should not crash - EXPECT_EQ(gpuIndex.ntotal, 0); - gpuIndex.add(numNans, nans.data()); - - std::vector queryVecs = - faiss::gpu::randVecs(opt.numQuery, opt.dim); - std::vector distance(opt.numQuery * opt.k, 0); - std::vector indices(opt.numQuery * opt.k, 0); - - // should not crash - gpuIndex.search( - opt.numQuery, - queryVecs.data(), - opt.k, - distance.data(), - indices.data()); + // Make one vector valid (not the first vector, in order to test offset + // issues), which should actually add + for (int i = 0; i < opt.dim; ++i) { + nans[opt.dim + i] = i; } + + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + gpuIndex.train(opt.numTrain, trainVecs.data()); + + // should not crash + EXPECT_EQ(gpuIndex.ntotal, 0); + 
gpuIndex.add(numNans, nans.data()); + + std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + std::vector distance(opt.numQuery * opt.k, 0); + std::vector indices(opt.numQuery * opt.k, 0); + + // should not crash + gpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + distance.data(), + indices.data()); } TEST(TestGpuIndexIVFFlat, UnifiedMemory) { @@ -586,11 +616,6 @@ TEST(TestGpuIndexIVFFlat, UnifiedMemory) { int numQuery = 10; int k = 10; int nprobe = 8; - std::vector raftOpt; - raftOpt.push_back(false); - #if defined USE_NVIDIA_RAFT - raftOpt.push_back(true); - #endif std::vector trainVecs = faiss::gpu::randVecs(numTrain, dim); std::vector addVecs = faiss::gpu::randVecs(numAdd, dim); @@ -609,25 +634,22 @@ TEST(TestGpuIndexIVFFlat, UnifiedMemory) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = device; config.memorySpace = faiss::gpu::MemorySpace::Unified; - for (int i = 0; i < raftOpt.size(); i++) { - config.use_raft = raftOpt[i]; - - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, dim, numCentroids, faiss::METRIC_L2, config); - gpuIndex.copyFrom(&cpuIndex); - gpuIndex.nprobe = nprobe; - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - numQuery, - dim, - k, - "Unified Memory", - kF32MaxRelErr, - 0.1f, - 0.015f); - } + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, dim, numCentroids, faiss::METRIC_L2, config); + gpuIndex.copyFrom(&cpuIndex); + gpuIndex.nprobe = nprobe; + + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + numQuery, + dim, + k, + "Unified Memory", + kF32MaxRelErr, + 0.1f, + 0.015f); } TEST(TestGpuIndexIVFFlat, LongIVFList) { @@ -652,11 +674,6 @@ TEST(TestGpuIndexIVFFlat, LongIVFList) { size_t numTrain = 100; int numQuery = 5; int k = 10; - std::vector raftOpt; - raftOpt.push_back(false); - #if defined USE_NVIDIA_RAFT - raftOpt.push_back(true); - #endif std::vector trainVecs = faiss::gpu::randVecs(numTrain, dim); std::vector addVecs = faiss::gpu::randVecs(numAdd, dim); @@ -674,26 +691,23 @@ 
TEST(TestGpuIndexIVFFlat, LongIVFList) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = device; - for (int i = 0; i < raftOpt.size(); i++) { - config.use_raft = raftOpt[i]; - - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, dim, numCentroids, faiss::METRIC_L2, config); - gpuIndex.train(numTrain, trainVecs.data()); - gpuIndex.add(numAdd, addVecs.data()); - gpuIndex.nprobe = 1; - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - numQuery, - dim, - k, - "Unified Memory", - kF32MaxRelErr, - 0.1f, - 0.015f); - } + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, dim, numCentroids, faiss::METRIC_L2, config); + gpuIndex.train(numTrain, trainVecs.data()); + gpuIndex.add(numAdd, addVecs.data()); + gpuIndex.nprobe = 1; + + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + numQuery, + dim, + k, + "Unified Memory", + kF32MaxRelErr, + 0.1f, + 0.015f); } int main(int argc, char** argv) { @@ -703,4 +717,4 @@ int main(int argc, char** argv) { faiss::gpu::setTestSeed(100); return RUN_ALL_TESTS(); -} +} \ No newline at end of file From 8cf7e057596abe909555c1891dcbc7ab15ddedcf Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Tue, 15 Aug 2023 17:03:54 -0700 Subject: [PATCH 78/87] update quantizer --- faiss/gpu/GpuIndexIVF.cu | 50 ++++++++++++++--- faiss/gpu/GpuIndexIVFFlat.cu | 75 +++++++++----------------- faiss/gpu/impl/RaftIVFFlat.cu | 61 +++++++++++---------- faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 1 + 4 files changed, 102 insertions(+), 85 deletions(-) diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu index 935c255b8f..c83008307d 100644 --- a/faiss/gpu/GpuIndexIVF.cu +++ b/faiss/gpu/GpuIndexIVF.cu @@ -16,6 +16,11 @@ #include #include +#if defined USE_NVIDIA_RAFT +#include +#include +#endif + namespace faiss { namespace gpu { @@ -212,7 +217,6 @@ void GpuIndexIVF::copyTo(faiss::IndexIVF* index) const { index->nprobe = nprobe; FAISS_ASSERT(quantizer); - printf("index -> own_fields %d\n", index->own_fields); if (index->own_fields) { delete 
index->quantizer; index->quantizer = nullptr; @@ -445,14 +449,46 @@ void GpuIndexIVF::trainQuantizer_(idx_t n, const float* x) { printf("Training IVF quantizer on %ld vectors in %dD\n", n, d); } - // leverage the CPU-side k-means code, which works for the GPU - // flat index as well quantizer->reset(); - Clustering clus(this->d, nlist, this->cp); - clus.verbose = verbose; - clus.train(n, x, *quantizer); - quantizer->is_trained = true; +#if defined USE_NVIDIA_RAFT + + if (config_.use_raft) { + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + + raft::neighbors::ivf_flat::index_params raft_idx_params; + raft_idx_params.n_lists = nlist; + raft_idx_params.metric = metric_type == faiss::METRIC_L2 + ? raft::distance::DistanceType::L2Expanded + : raft::distance::DistanceType::InnerProduct; + raft_idx_params.add_data_on_build = false; + raft_idx_params.kmeans_trainset_fraction = 1.0; + raft_idx_params.kmeans_n_iters = cp.niter; + raft_idx_params.adaptive_centers = !cp.frozen_centroids; + + auto raft_index = raft::neighbors::ivf_flat::build( + raft_handle, raft_idx_params, x, n, (idx_t)d); + + raft_handle.sync_stream(); + + quantizer->train(nlist, raft_index.centers().data_handle()); + quantizer->add(nlist, raft_index.centers().data_handle()); + } else +#else + if (config_.use_raft) { + FAISS_THROW_MSG( + "RAFT has not been compiled into the current version so it cannot be used."); + } else +#endif + { + // leverage the CPU-side k-means code, which works for the GPU + // flat index as well + Clustering clus(this->d, nlist, this->cp); + clus.verbose = verbose; + clus.train(n, x, *quantizer); + } + quantizer->is_trained = true; FAISS_ASSERT(quantizer->ntotal == nlist); } diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 3a18aa0e90..750096e153 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -17,11 +17,6 @@ #if defined USE_NVIDIA_RAFT #include -#include -#include -#include 
-#include -#include #endif #include @@ -90,6 +85,8 @@ GpuIndexIVFFlat::GpuIndexIVFFlat( ivfFlatConfig_.interleavedLayout, ivfFlatConfig_.indicesOptions, config_.memorySpace); + baseIndex_ = std::static_pointer_cast(index_); + updateQuantizer(); } } @@ -110,7 +107,6 @@ void GpuIndexIVFFlat::set_index_( #if defined USE_NVIDIA_RAFT if (config_.use_raft) { - printf("inside GpuIndexIVFFlat's set_index_ use_raft = true\n"); index_.reset(new RaftIVFFlat( resources, dim, @@ -122,7 +118,6 @@ void GpuIndexIVFFlat::set_index_( interleavedLayout, indicesOptions, space)); - own_fields = false; } else #else if (config_.use_raft) { @@ -130,8 +125,7 @@ void GpuIndexIVFFlat::set_index_( "RAFT has not been compiled into the current version so it cannot be used."); } else #endif - { - printf("inside GpuIndexIVFFlat's set_index_ use_raft = false\n"); + { index_.reset(new IVFFlat( resources, dim, @@ -144,8 +138,6 @@ void GpuIndexIVFFlat::set_index_( indicesOptions, space)); } - baseIndex_ = std::static_pointer_cast(index_); - updateQuantizer(); } void GpuIndexIVFFlat::reserveMemory(size_t numVecs) { @@ -158,20 +150,15 @@ void GpuIndexIVFFlat::reserveMemory(size_t numVecs) { } void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { - printf("Inside GpuIndexIVFFlat's copyFrom\n"); DeviceScope scope(config_.device); // This will copy GpuIndexIVF data such as the coarse quantizer GpuIndexIVF::copyFrom(index); - printf("GpuIndexIVF's copyFrom done\n"); - // Clear out our old data index_.reset(); baseIndex_.reset(); - printf("indices reset\n"); - // The other index might not be trained if (!index->is_trained) { FAISS_ASSERT(!is_trained); @@ -193,9 +180,10 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { ivfFlatConfig_.interleavedLayout, ivfFlatConfig_.indicesOptions, config_.memorySpace); + baseIndex_ = std::static_pointer_cast(index_); + updateQuantizer(); - // Copy all of the IVF data - printf("Copying inverted lists from cpu index to FAISS gpu index 
ivfflat\n"); + // Copy all of the IVF data index_->copyInvertedListsFrom(index->invlists); } @@ -264,15 +252,25 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) { FAISS_ASSERT(!index_); - // FIXME: GPUize more of this - // First, make sure that the data is resident on the CPU, if it is not on - // the CPU, as we depend upon parts of the CPU code - if (!config_.use_raft) { +#if defined USE_NVIDIA_RAFT + if (config_.use_raft) { + // No need to copy the data to host + trainQuantizer_(n, x); + } else +#else + if (config_.use_raft) { + FAISS_THROW_MSG( + "RAFT has not been compiled into the current version so it cannot be used."); + } else +#endif + { + // FIXME: GPUize more of this + // First, make sure that the data is resident on the CPU, if it is not + // on the CPU, as we depend upon parts of the CPU code auto hostData = toHost( (float*)x, resources_->getDefaultStream(config_.device), {n, this->d}); - trainQuantizer_(n, hostData.data()); } @@ -288,38 +286,13 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) { ivfFlatConfig_.interleavedLayout, ivfFlatConfig_.indicesOptions, config_.memorySpace); + baseIndex_ = std::static_pointer_cast(index_); + updateQuantizer(); - if (!config_.use_raft && reserveMemoryVecs_) { + if (reserveMemoryVecs_) { index_->reserveMemory(reserveMemoryVecs_); } -#if defined USE_NVIDIA_RAFT - - if (config_.use_raft) { - const raft::device_resources& raft_handle = - resources_->getRaftHandleCurrentDevice(); - - raft::neighbors::ivf_flat::index_params raft_idx_params; - raft_idx_params.n_lists = nlist; - raft_idx_params.metric = raft::distance::DistanceType::L2Expanded; - raft_idx_params.add_data_on_build = false; - raft_idx_params.kmeans_trainset_fraction = 1.0; - raft_idx_params.kmeans_n_iters = cp.niter; - raft_idx_params.adaptive_centers = !cp.frozen_centroids; - - std::dynamic_pointer_cast(index_)->set_index_( - std::make_optional< - raft::neighbors::ivf_flat::index>( - raft::neighbors::ivf_flat::build( - raft_handle, 
raft_idx_params, x, n, (idx_t)d))); - } -#else - if (config_.use_raft) { - FAISS_THROW_MSG( - "RAFT has not been compiled into the current version so it cannot be used."); - } -#endif - this->is_trained = true; } diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 110a0f0ced..b17b35d5ee 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -121,33 +121,34 @@ void RaftIVFFlat::search( /// Identify NaN rows and mask their nearest neighbors auto nan_flag = raft::make_device_vector(raft_handle, numQueries); -// validRowIndices_(queries, nan_flag.data_handle()); - -// raft::linalg::map_offset( -// raft_handle, -// raft::make_device_vector_view(outIndices.data(), numQueries * k_), -// [nan_flag = nan_flag.data_handle(), -// out_inds = outIndices.data(), -// k_] __device__(uint32_t i) { -// uint32_t row = i / k_; -// if (!nan_flag[row]) -// return idx_t(-1); -// return out_inds[i]; -// }); - -// float max_val = std::numeric_limits::max(); -// raft::linalg::map_offset( -// raft_handle, -// raft::make_device_vector_view(outDistances.data(), numQueries * k_), -// [nan_flag = nan_flag.data_handle(), -// out_dists = outDistances.data(), -// max_val, -// k_] __device__(uint32_t i) { -// uint32_t row = i / k_; -// if (!nan_flag[row]) -// return max_val; -// return out_dists[i]; -// }); + validRowIndices_(queries, nan_flag.data_handle()); + + raft::linalg::map_offset( + raft_handle, + raft::make_device_vector_view(outIndices.data(), numQueries * k_), + [nan_flag = nan_flag.data_handle(), + out_inds = outIndices.data(), + k_] __device__(uint32_t i) { + uint32_t row = i / k_; + if (!nan_flag[row]) + return idx_t(-1); + return out_inds[i]; + }); + + float max_val = std::numeric_limits::max(); + raft::linalg::map_offset( + raft_handle, + raft::make_device_vector_view(outDistances.data(), numQueries * k_), + [nan_flag = nan_flag.data_handle(), + out_dists = outDistances.data(), + max_val, + k_] __device__(uint32_t i) { + 
uint32_t row = i / k_; + if (!nan_flag[row]) + return max_val; + return out_dists[i]; + }); + raft_handle.sync_stream(); } /// Classify and encode/add vectors to our IVF lists. @@ -158,6 +159,8 @@ idx_t RaftIVFFlat::addVectors( Index* coarseQuantizer, Tensor& vecs, Tensor& indices) { + + raft::print_device_vector("raft_centers from addVectors", raft_knn_index.value().centers().data_handle(), dim_ * this->numLists_, std::cout); idx_t n_rows = vecs.getSize(0); const raft::device_resources& raft_handle = @@ -173,6 +176,7 @@ idx_t RaftIVFFlat::addVectors( nan_flag.data_handle(), nan_flag.data_handle() + n_rows, 0); + printf("n_rows_valid %d %d\n", n_rows_valid, n_rows); if (n_rows_valid < n_rows) { printf("NaN values found"); @@ -366,6 +370,9 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { buf_host.data(), total_elems, stream); + thrust::fill_n(handle.get_thrust_policy(), raft_knn_index.value().list_sizes().data_handle(), pams.n_lists, 0); + + raft::print_device_vector("raft_idx_centers", raft_knn_index.value().centers().data_handle(), total_elems, std::cout); } void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index e3b9540d46..4120f4ba73 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -199,6 +199,7 @@ void copyToTest(bool useFloat16CoarseQuantizer, bool use_raft) { faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); + printf("opt.numCentroids %d", opt.numCentroids); gpuIndex.train(opt.numTrain, trainVecs.data()); gpuIndex.add(opt.numAdd, addVecs.data()); gpuIndex.nprobe = opt.nprobe; From a17b1f32b22ac43bd90bd2cdaaf9eb7e8de0575b Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Mon, 21 Aug 2023 12:33:23 -0700 Subject: [PATCH 79/87] All except LongIVFList passing --- faiss/gpu/impl/RaftIVFFlat.cu | 70 +++++++++++----------- faiss/gpu/impl/RaftIVFFlat.cuh 
| 8 +-- faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 81 +++++++++++++++++++++----- 3 files changed, 105 insertions(+), 54 deletions(-) diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index b17b35d5ee..3861a2283c 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -46,9 +46,8 @@ #include #include -#include #include -#include +#include namespace faiss { namespace gpu { @@ -73,7 +72,9 @@ RaftIVFFlat::RaftIVFFlat( scalarQ, interleavedLayout, indicesOptions, - space) {printf("RaftIVFFlat constructor called\n"); reset();} + space) { + reset(); +} RaftIVFFlat::~RaftIVFFlat() {} @@ -103,12 +104,12 @@ void RaftIVFFlat::search( raft::neighbors::ivf_flat::search_params pams; pams.n_probes = nprobe; - auto queries_view = raft::make_device_matrix_view( - queries.data(), numQueries, cols); - auto out_inds_view = raft::make_device_matrix_view( - outIndices.data(), numQueries, k_); - auto out_dists_view = raft::make_device_matrix_view( - outDistances.data(), numQueries, k_); + auto queries_view = raft::make_device_matrix_view( + queries.data(), (idx_t)numQueries, (idx_t)cols); + auto out_inds_view = raft::make_device_matrix_view( + outIndices.data(), (idx_t)numQueries, (idx_t)k_); + auto out_dists_view = raft::make_device_matrix_view( + outDistances.data(), (idx_t)numQueries, (idx_t)k_); raft::neighbors::ivf_flat::search( raft_handle, @@ -148,7 +149,6 @@ void RaftIVFFlat::search( return max_val; return out_dists[i]; }); - raft_handle.sync_stream(); } /// Classify and encode/add vectors to our IVF lists. 
@@ -159,8 +159,6 @@ idx_t RaftIVFFlat::addVectors( Index* coarseQuantizer, Tensor& vecs, Tensor& indices) { - - raft::print_device_vector("raft_centers from addVectors", raft_knn_index.value().centers().data_handle(), dim_ * this->numLists_, std::cout); idx_t n_rows = vecs.getSize(0); const raft::device_resources& raft_handle = @@ -176,10 +174,8 @@ idx_t RaftIVFFlat::addVectors( nan_flag.data_handle(), nan_flag.data_handle() + n_rows, 0); - printf("n_rows_valid %d %d\n", n_rows_valid, n_rows); if (n_rows_valid < n_rows) { - printf("NaN values found"); auto gather_indices = raft::make_device_vector( raft_handle, n_rows_valid); @@ -236,13 +232,14 @@ idx_t RaftIVFFlat::getListLength(idx_t listId) const { resources_->getRaftHandleCurrentDevice(); uint32_t size; - raft::copy( + raft::update_host( &size, raft_knn_index.value().list_sizes().data_handle() + listId, 1, raft_handle.get_stream()); raft_handle.sync_stream(); - return int(size); + + return static_cast(size); } /// Return the list indices of a particular list back to the CPU @@ -256,6 +253,7 @@ std::vector RaftIVFFlat::getListIndices(idx_t listId) const { std::vector vec(listSize); + // fetch the list indices ptr on host idx_t* list_indices_ptr; // fetch the list indices ptr on host @@ -268,6 +266,7 @@ std::vector RaftIVFFlat::getListIndices(idx_t listId) const { raft::update_host(vec.data(), list_indices_ptr, listSize, stream); raft_handle.sync_stream(); + return vec; } @@ -278,7 +277,6 @@ std::vector RaftIVFFlat::getListVectorData( if (gpuFormat) { FAISS_THROW_MSG("gpuFormat is not suppported for raft indices"); } - printf("inside getlistvectordata of raft"); FAISS_ASSERT(raft_knn_index.has_value()); const raft::device_resources& raft_handle = @@ -335,9 +333,9 @@ void RaftIVFFlat::searchPreassigned( void RaftIVFFlat::updateQuantizer(Index* quantizer) { idx_t quantizer_ntotal = quantizer->ntotal; - const raft::device_resources& handle = + const raft::device_resources& raft_handle = 
resources_->getRaftHandleCurrentDevice(); - auto stream = handle.get_stream(); + auto stream = raft_handle.get_stream(); auto total_elems = size_t(quantizer_ntotal) * size_t(quantizer->d); @@ -359,7 +357,23 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { FAISS_THROW_MSG("Metric is not supported."); } - raft_knn_index.emplace(handle, pams, (uint32_t)this->dim_); + raft_knn_index.emplace(raft_handle, pams, (uint32_t)this->dim_); + + cudaMemsetAsync( + raft_knn_index.value().list_sizes().data_handle(), + 0, + raft_knn_index.value().list_sizes().size() * sizeof(uint32_t), + stream); + cudaMemsetAsync( + raft_knn_index.value().data_ptrs().data_handle(), + 0, + raft_knn_index.value().data_ptrs().size() * sizeof(float*), + stream); + cudaMemsetAsync( + raft_knn_index.value().inds_ptrs().data_handle(), + 0, + raft_knn_index.value().inds_ptrs().size() * sizeof(idx_t*), + stream); /// Copy (reconstructed) centroids over, rather than re-training std::vector buf_host(total_elems); @@ -370,9 +384,6 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) { buf_host.data(), total_elems, stream); - thrust::fill_n(handle.get_thrust_policy(), raft_knn_index.value().list_sizes().data_handle(), pams.n_lists, 0); - - raft::print_device_vector("raft_idx_centers", raft_knn_index.value().centers().data_handle(), total_elems, std::cout); } void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { @@ -435,7 +446,7 @@ void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { if (this->metric_ == faiss::METRIC_L2) { raft_knn_index.value().allocate_center_norms(raft_handle); raft::linalg::rowNorm( - raft_knn_index.value().center_norms()->data_handle(), + raft_knn_index.value().center_norms().value().data_handle(), raft_knn_index.value().centers().data_handle(), raft_knn_index.value().dim(), (uint32_t)nlist, @@ -443,12 +454,6 @@ void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { true, raft_handle.get_stream()); } - raft_handle.sync_stream(); -} - 
-void RaftIVFFlat::set_index_( - std::optional> idx) { - raft_knn_index.emplace(std::move(idx.value())); } size_t RaftIVFFlat::getGpuVectorsEncodingSize_(idx_t numVecs) const { @@ -478,8 +483,6 @@ void RaftIVFFlat::addEncodedVectorsToList_( // This list must already exist FAISS_ASSERT(raft_knn_index.has_value()); - printf("getListLength(listId), %d\n", getListLength(listId)); - // This list must currently be empty FAISS_ASSERT(getListLength(listId) == 0); @@ -520,7 +523,6 @@ void RaftIVFFlat::addEncodedVectorsToList_( interleaved_codes.data(), gpuListSizeInBytes, stream); - raft_handle.sync_stream(); /// Handle the indices as well idx_t* list_indices_ptr; @@ -532,8 +534,8 @@ void RaftIVFFlat::addEncodedVectorsToList_( 1, stream); raft_handle.sync_stream(); + raft::update_device(list_indices_ptr, indices, numVecs, stream); - raft_handle.sync_stream(); } void RaftIVFFlat::validRowIndices_( diff --git a/faiss/gpu/impl/RaftIVFFlat.cuh b/faiss/gpu/impl/RaftIVFFlat.cuh index a9c013f68a..3aba501c9f 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cuh +++ b/faiss/gpu/impl/RaftIVFFlat.cuh @@ -22,14 +22,14 @@ #pragma once -#include - #include #include #include #include +#include + #include namespace faiss { @@ -106,10 +106,6 @@ class RaftIVFFlat : public IVFFlat { /// Copy all inverted lists from a CPU representation to ourselves void copyInvertedListsFrom(const InvertedLists* ivf) override; - /// Update the raft index - void set_index_( - std::optional> idx); - /// Filter out matrix rows containing NaN values void validRowIndices_(Tensor& vecs, bool* nan_flag); diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index 4120f4ba73..109a5eaf22 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -348,11 +348,6 @@ TEST(TestGpuIndexIVFFlat, LargeBatch) { opt.dim = 3; opt.numQuery = 100000; queryTest(opt, faiss::METRIC_L2, false); - -#if defined USE_NVIDIA_RAFT - opt.use_raft = true; - queryTest(opt, 
faiss::METRIC_L2, false); -#endif } // float16 coarse quantizer @@ -558,15 +553,6 @@ TEST(TestGpuIndexIVFFlat, AddNaN) { faiss::gpu::StandardGpuResources res; res.noTempMemory(); - faiss::gpu::GpuIndexIVFFlatConfig config; - config.device = opt.device; - config.indicesOptions = opt.indicesOpt; - config.flatConfig.useFloat16 = faiss::gpu::randBool(); - - faiss::gpu::GpuIndexIVFFlat gpuIndex( - &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); - gpuIndex.nprobe = opt.nprobe; - int numNans = 10; std::vector nans( numNans * opt.dim, std::numeric_limits::quiet_NaN()); @@ -578,6 +564,14 @@ TEST(TestGpuIndexIVFFlat, AddNaN) { } std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = opt.device; + config.indicesOptions = opt.indicesOpt; + config.flatConfig.useFloat16 = faiss::gpu::randBool(); + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); + gpuIndex.nprobe = opt.nprobe; gpuIndex.train(opt.numTrain, trainVecs.data()); // should not crash @@ -595,6 +589,26 @@ TEST(TestGpuIndexIVFFlat, AddNaN) { opt.k, distance.data(), indices.data()); + +#if defined USE_NVIDIA_RAFT +config.use_raft = true; +faiss::gpu::GpuIndexIVFFlat raftGpuIndex( + &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); + raftGpuIndex.nprobe = opt.nprobe; + raftGpuIndex.train(opt.numTrain, trainVecs.data()); + + // should not crash + EXPECT_EQ(raftGpuIndex.ntotal, 0); + raftGpuIndex.add(numNans, nans.data()); + + // should not crash + raftGpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + distance.data(), + indices.data()); +#endif } TEST(TestGpuIndexIVFFlat, UnifiedMemory) { @@ -651,6 +665,25 @@ TEST(TestGpuIndexIVFFlat, UnifiedMemory) { kF32MaxRelErr, 0.1f, 0.015f); + +#if defined USE_NVIDIA_RAFT +config.use_raft = true; +faiss::gpu::GpuIndexIVFFlat raftGpuIndex( + &res, dim, numCentroids, faiss::METRIC_L2, config); + 
raftGpuIndex.copyFrom(&cpuIndex); + raftGpuIndex.nprobe = nprobe; + + faiss::gpu::compareIndices( + cpuIndex, + raftGpuIndex, + numQuery, + dim, + k, + "Unified Memory", + kF32MaxRelErr, + 0.1f, + 0.015f); +#endif } TEST(TestGpuIndexIVFFlat, LongIVFList) { @@ -709,6 +742,26 @@ TEST(TestGpuIndexIVFFlat, LongIVFList) { kF32MaxRelErr, 0.1f, 0.015f); + +#if defined USE_NVIDIA_RAFT + config.use_raft = true; + faiss::gpu::GpuIndexIVFFlat raftGpuIndex( + &res, dim, numCentroids, faiss::METRIC_L2, config); + raftGpuIndex.train(numTrain, trainVecs.data()); + raftGpuIndex.add(numAdd, addVecs.data()); + raftGpuIndex.nprobe = 1; + + faiss::gpu::compareIndices( + cpuIndex, + raftGpuIndex, + numQuery, + dim, + k, + "Unified Memory", + kF32MaxRelErr, + 0.1f, + 0.015f); +#endif } int main(int argc, char** argv) { From 3c33ebb98fa65e8c47a900b9acd41d3ea99b193b Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Mon, 21 Aug 2023 15:41:28 -0700 Subject: [PATCH 80/87] Formatting --- faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 17 ++++++++++------- faiss/gpu/test/TestUtils.cpp | 26 -------------------------- 2 files changed, 10 insertions(+), 33 deletions(-) diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index 109a5eaf22..7f2ae81196 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -64,7 +64,7 @@ struct Options { str << "IVFFlat device " << device << " numVecs " << numAdd << " dim " << dim << " numCentroids " << numCentroids << " nprobe " << nprobe << " numQuery " << numQuery << " k " << k << " indicesOpt " - << indicesOpt <<" use_raft "<< use_raft; + << indicesOpt << " use_raft " << use_raft; return str.str(); } @@ -133,7 +133,10 @@ void queryTest( } } -void addTest(faiss::MetricType metricType, bool useFloat16CoarseQuantizer, bool use_raft) { +void addTest( + faiss::MetricType metricType, + bool useFloat16CoarseQuantizer, + bool use_raft) { for (int tries = 0; tries < 2; ++tries) { Options opt; @@ 
-591,8 +594,8 @@ TEST(TestGpuIndexIVFFlat, AddNaN) { indices.data()); #if defined USE_NVIDIA_RAFT -config.use_raft = true; -faiss::gpu::GpuIndexIVFFlat raftGpuIndex( + config.use_raft = true; + faiss::gpu::GpuIndexIVFFlat raftGpuIndex( &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); raftGpuIndex.nprobe = opt.nprobe; raftGpuIndex.train(opt.numTrain, trainVecs.data()); @@ -667,8 +670,8 @@ TEST(TestGpuIndexIVFFlat, UnifiedMemory) { 0.015f); #if defined USE_NVIDIA_RAFT -config.use_raft = true; -faiss::gpu::GpuIndexIVFFlat raftGpuIndex( + config.use_raft = true; + faiss::gpu::GpuIndexIVFFlat raftGpuIndex( &res, dim, numCentroids, faiss::METRIC_L2, config); raftGpuIndex.copyFrom(&cpuIndex); raftGpuIndex.nprobe = nprobe; @@ -744,7 +747,7 @@ TEST(TestGpuIndexIVFFlat, LongIVFList) { 0.015f); #if defined USE_NVIDIA_RAFT - config.use_raft = true; + config.use_raft = true; faiss::gpu::GpuIndexIVFFlat raftGpuIndex( &res, dim, numCentroids, faiss::METRIC_L2, config); raftGpuIndex.train(numTrain, trainVecs.data()); diff --git a/faiss/gpu/test/TestUtils.cpp b/faiss/gpu/test/TestUtils.cpp index 04f136782c..c81d34339e 100644 --- a/faiss/gpu/test/TestUtils.cpp +++ b/faiss/gpu/test/TestUtils.cpp @@ -114,32 +114,6 @@ void compareIndices( testDistance.data(), testIndices.data()); - int start_idx = 17 * k; - int stop_idx = start_idx + k; - printf("ref inds: ["); - for (int i = start_idx; i < stop_idx; i++) { - printf("%d, ", int(refIndices[i])); - } - printf("]\n"); - - printf("test inds: ["); - for (int i = start_idx; i < stop_idx; i++) { - printf("%d, ", int(testIndices[i])); - } - printf("]\n"); - - printf("ref dists: ["); - for (int i = start_idx; i < stop_idx; i++) { - printf("%f, ", float(refDistance[i])); - } - printf("]\n"); - - printf("test dists: ["); - for (int i = start_idx; i < stop_idx; i++) { - printf("%f, ", float(testDistance[i])); - } - printf("]\n"); - faiss::gpu::compareLists( refDistance.data(), refIndices.data(), From 
971a6b2d1f2fa696a9561b7c87f563f22533a9e9 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Mon, 21 Aug 2023 15:45:03 -0700 Subject: [PATCH 81/87] Format --- faiss/gpu/test/TestGpuIndexFlat.cpp | 918 ++++++++++++++-------------- 1 file changed, 459 insertions(+), 459 deletions(-) diff --git a/faiss/gpu/test/TestGpuIndexFlat.cpp b/faiss/gpu/test/TestGpuIndexFlat.cpp index 2ab616caf4..6d9c83e547 100644 --- a/faiss/gpu/test/TestGpuIndexFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexFlat.cpp @@ -1,9 +1,9 @@ /** -* Copyright (c) Facebook, Inc. and its affiliates. -* -* This source code is licensed under the MIT license found in the -* LICENSE file in the root directory of this source tree. -*/ + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ #include #include @@ -21,644 +21,644 @@ constexpr float kF16MaxRelErr = 0.07f; constexpr float kF32MaxRelErr = 6e-3f; struct TestFlatOptions { - TestFlatOptions() - : metric(faiss::MetricType::METRIC_L2), - metricArg(0), - useFloat16(false), - numVecsOverride(-1), - numQueriesOverride(-1), - kOverride(-1), - dimOverride(-1), - use_raft(false) {} - - faiss::MetricType metric; - float metricArg; - - bool useFloat16; - int numVecsOverride; - int numQueriesOverride; - int kOverride; - int dimOverride; - bool use_raft; + TestFlatOptions() + : metric(faiss::MetricType::METRIC_L2), + metricArg(0), + useFloat16(false), + numVecsOverride(-1), + numQueriesOverride(-1), + kOverride(-1), + dimOverride(-1), + use_raft(false) {} + + faiss::MetricType metric; + float metricArg; + + bool useFloat16; + int numVecsOverride; + int numQueriesOverride; + int kOverride; + int dimOverride; + bool use_raft; }; void testFlat(const TestFlatOptions& opt) { - int numVecs = opt.numVecsOverride > 0 ? opt.numVecsOverride - : faiss::gpu::randVal(1000, 5000); - int dim = opt.dimOverride > 0 ? 
opt.dimOverride - : faiss::gpu::randVal(50, 800); - int numQuery = opt.numQueriesOverride > 0 ? opt.numQueriesOverride - : faiss::gpu::randVal(1, 512); - - // Due to loss of precision in a float16 accumulator, for large k, - // the number of differences is pretty huge. Restrict ourselves to a - // fairly small `k` for float16 - int k = opt.useFloat16 - ? std::min(faiss::gpu::randVal(1, 50), numVecs) - : std::min( - faiss::gpu::randVal(1, faiss::gpu::getMaxKSelection()), - numVecs); - if (opt.kOverride > 0) { - k = opt.kOverride; - } - - faiss::IndexFlat cpuIndex(dim, opt.metric); - cpuIndex.metric_arg = opt.metricArg; - - // Construct on a random device to test multi-device, if we have - // multiple devices - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); - - faiss::gpu::GpuIndexFlatConfig config; - config.device = device; - config.useFloat16 = opt.useFloat16; - config.use_raft = opt.use_raft; - - faiss::gpu::GpuIndexFlat gpuIndex(&res, dim, opt.metric, config); - gpuIndex.metric_arg = opt.metricArg; - - std::vector vecs = faiss::gpu::randVecs(numVecs, dim); - cpuIndex.add(numVecs, vecs.data()); - gpuIndex.add(numVecs, vecs.data()); - - std::stringstream str; - str << "metric " << opt.metric << " marg " << opt.metricArg << " numVecs " - << numVecs << " dim " << dim << " useFloat16 " << opt.useFloat16 - << " numQuery " << numQuery << " k " << k; - - // To some extent, we depend upon the relative error for the test - // for float16 - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - numQuery, - dim, - k, - str.str(), - opt.useFloat16 ? kF16MaxRelErr : kF32MaxRelErr, - // FIXME: the fp16 bounds are - // useless when math (the accumulator) is - // in fp16. Figure out another way to test - opt.useFloat16 ? 0.99f : 0.1f, - opt.useFloat16 ? 0.65f : 0.015f); + int numVecs = opt.numVecsOverride > 0 ? 
opt.numVecsOverride + : faiss::gpu::randVal(1000, 5000); + int dim = opt.dimOverride > 0 ? opt.dimOverride + : faiss::gpu::randVal(50, 800); + int numQuery = opt.numQueriesOverride > 0 ? opt.numQueriesOverride + : faiss::gpu::randVal(1, 512); + + // Due to loss of precision in a float16 accumulator, for large k, + // the number of differences is pretty huge. Restrict ourselves to a + // fairly small `k` for float16 + int k = opt.useFloat16 + ? std::min(faiss::gpu::randVal(1, 50), numVecs) + : std::min( + faiss::gpu::randVal(1, faiss::gpu::getMaxKSelection()), + numVecs); + if (opt.kOverride > 0) { + k = opt.kOverride; + } + + faiss::IndexFlat cpuIndex(dim, opt.metric); + cpuIndex.metric_arg = opt.metricArg; + + // Construct on a random device to test multi-device, if we have + // multiple devices + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.useFloat16 = opt.useFloat16; + config.use_raft = opt.use_raft; + + faiss::gpu::GpuIndexFlat gpuIndex(&res, dim, opt.metric, config); + gpuIndex.metric_arg = opt.metricArg; + + std::vector vecs = faiss::gpu::randVecs(numVecs, dim); + cpuIndex.add(numVecs, vecs.data()); + gpuIndex.add(numVecs, vecs.data()); + + std::stringstream str; + str << "metric " << opt.metric << " marg " << opt.metricArg << " numVecs " + << numVecs << " dim " << dim << " useFloat16 " << opt.useFloat16 + << " numQuery " << numQuery << " k " << k; + + // To some extent, we depend upon the relative error for the test + // for float16 + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + numQuery, + dim, + k, + str.str(), + opt.useFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + // FIXME: the fp16 bounds are + // useless when math (the accumulator) is + // in fp16. Figure out another way to test + opt.useFloat16 ? 0.99f : 0.1f, + opt.useFloat16 ? 
0.65f : 0.015f); } TEST(TestGpuIndexFlat, IP_Float32) { - for (int tries = 0; tries < 3; ++tries) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_INNER_PRODUCT; - opt.useFloat16 = false; + for (int tries = 0; tries < 3; ++tries) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_INNER_PRODUCT; + opt.useFloat16 = false; - testFlat(opt); + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif - } + } } TEST(TestGpuIndexFlat, L1_Float32) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_L1; - opt.useFloat16 = false; + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_L1; + opt.useFloat16 = false; - testFlat(opt); + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif } TEST(TestGpuIndexFlat, Lp_Float32) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_Lp; - opt.metricArg = 5; - opt.useFloat16 = false; + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_Lp; + opt.metricArg = 5; + opt.useFloat16 = false; - testFlat(opt); + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif } TEST(TestGpuIndexFlat, L2_Float32) { - for (int tries = 0; tries < 3; ++tries) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_L2; + for (int tries = 0; tries < 3; ++tries) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_L2; - opt.useFloat16 = false; + opt.useFloat16 = false; - testFlat(opt); + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif - } + } } // At least one test for the k > 1024 select TEST(TestGpuIndexFlat, L2_k_2048) { - if (faiss::gpu::getMaxKSelection() >= 2048) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_L2; - opt.useFloat16 = 
false; - opt.kOverride = 2048; - opt.dimOverride = 128; - opt.numVecsOverride = 10000; - - testFlat(opt); + if (faiss::gpu::getMaxKSelection() >= 2048) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_L2; + opt.useFloat16 = false; + opt.kOverride = 2048; + opt.dimOverride = 128; + opt.numVecsOverride = 10000; + + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif - } + } } // test specialized k == 1 codepath TEST(TestGpuIndexFlat, L2_Float32_K1) { - for (int tries = 0; tries < 3; ++tries) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_L2; - opt.useFloat16 = false; - opt.kOverride = 1; + for (int tries = 0; tries < 3; ++tries) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_L2; + opt.useFloat16 = false; + opt.kOverride = 1; - testFlat(opt); + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif - } + } } TEST(TestGpuIndexFlat, IP_Float16) { - for (int tries = 0; tries < 3; ++tries) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_INNER_PRODUCT; - opt.useFloat16 = true; + for (int tries = 0; tries < 3; ++tries) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_INNER_PRODUCT; + opt.useFloat16 = true; - testFlat(opt); + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif - } + } } TEST(TestGpuIndexFlat, L2_Float16) { - for (int tries = 0; tries < 3; ++tries) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_L2; - opt.useFloat16 = true; + for (int tries = 0; tries < 3; ++tries) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_L2; + opt.useFloat16 = true; - testFlat(opt); + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif - } + } } // test 
specialized k == 1 codepath TEST(TestGpuIndexFlat, L2_Float16_K1) { - for (int tries = 0; tries < 3; ++tries) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_L2; - opt.useFloat16 = true; - opt.kOverride = 1; + for (int tries = 0; tries < 3; ++tries) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_L2; + opt.useFloat16 = true; + opt.kOverride = 1; - testFlat(opt); + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif - } + } } // test tiling along a huge vector set TEST(TestGpuIndexFlat, L2_Tiling) { - for (int tries = 0; tries < 2; ++tries) { - TestFlatOptions opt; - opt.metric = faiss::MetricType::METRIC_L2; - opt.useFloat16 = false; - opt.numVecsOverride = 1000000; - - // keep the rest of the problem reasonably small - opt.numQueriesOverride = 4; - opt.dimOverride = 64; - opt.kOverride = 64; - - testFlat(opt); + for (int tries = 0; tries < 2; ++tries) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_L2; + opt.useFloat16 = false; + opt.numVecsOverride = 1000000; + + // keep the rest of the problem reasonably small + opt.numQueriesOverride = 4; + opt.dimOverride = 64; + opt.kOverride = 64; + + testFlat(opt); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; - testFlat(opt); + opt.use_raft = true; + testFlat(opt); #endif - } + } } TEST(TestGpuIndexFlat, QueryEmpty) { - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); - faiss::gpu::GpuIndexFlatConfig config; - config.device = 0; - config.useFloat16 = false; - int dim = 128; - faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config); + faiss::gpu::GpuIndexFlatConfig config; + config.device = 0; + config.useFloat16 = false; + int dim = 128; + faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config); - // Querying an empty index should not blow up, and just return - // (FLT_MAX, -1) - int numQuery = 10; - int k = 50; - 
std::vector queries(numQuery * dim, 1.0f); + // Querying an empty index should not blow up, and just return + // (FLT_MAX, -1) + int numQuery = 10; + int k = 50; + std::vector queries(numQuery * dim, 1.0f); - std::vector dist(numQuery * k, 0); - std::vector ind(numQuery * k); + std::vector dist(numQuery * k, 0); + std::vector ind(numQuery * k); - gpuIndex.search(numQuery, queries.data(), k, dist.data(), ind.data()); + gpuIndex.search(numQuery, queries.data(), k, dist.data(), ind.data()); - for (auto d : dist) { - EXPECT_EQ(d, std::numeric_limits::max()); - } + for (auto d : dist) { + EXPECT_EQ(d, std::numeric_limits::max()); + } - for (auto i : ind) { - EXPECT_EQ(i, -1); - } + for (auto i : ind) { + EXPECT_EQ(i, -1); + } } void testCopyFrom(bool use_raft) { - int numVecs = faiss::gpu::randVal(100, 200); - int dim = faiss::gpu::randVal(1, 1000); + int numVecs = faiss::gpu::randVal(100, 200); + int dim = faiss::gpu::randVal(1, 1000); - std::vector vecs = faiss::gpu::randVecs(numVecs, dim); + std::vector vecs = faiss::gpu::randVecs(numVecs, dim); - faiss::IndexFlatL2 cpuIndex(dim); - cpuIndex.add(numVecs, vecs.data()); + faiss::IndexFlatL2 cpuIndex(dim); + cpuIndex.add(numVecs, vecs.data()); - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - for (bool useFloat16 : {false, true}) { - faiss::gpu::GpuIndexFlatConfig config; - config.device = device; - config.useFloat16 = useFloat16; - config.use_raft = use_raft; + for (bool useFloat16 : {false, true}) { + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.useFloat16 = useFloat16; + config.use_raft = use_raft; - // Fill with garbage values - faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, 2000, config); - gpuIndex.copyFrom(&cpuIndex); + // Fill with garbage values + 
faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, 2000, config); + gpuIndex.copyFrom(&cpuIndex); - EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); - EXPECT_EQ(gpuIndex.ntotal, numVecs); + EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); + EXPECT_EQ(gpuIndex.ntotal, numVecs); - EXPECT_EQ(cpuIndex.d, gpuIndex.d); - EXPECT_EQ(cpuIndex.d, dim); + EXPECT_EQ(cpuIndex.d, gpuIndex.d); + EXPECT_EQ(cpuIndex.d, dim); - std::vector gpuVals(numVecs * dim); - gpuIndex.reconstruct_n(0, gpuIndex.ntotal, gpuVals.data()); + std::vector gpuVals(numVecs * dim); + gpuIndex.reconstruct_n(0, gpuIndex.ntotal, gpuVals.data()); - std::vector cpuVals(numVecs * dim); - cpuIndex.reconstruct_n(0, gpuIndex.ntotal, cpuVals.data()); + std::vector cpuVals(numVecs * dim); + cpuIndex.reconstruct_n(0, gpuIndex.ntotal, cpuVals.data()); - // The CPU is the source of (float32) truth here, while the GPU index - // may be in float16 mode and thus was subject to rounding - if (useFloat16) { - EXPECT_EQ(gpuVals, faiss::gpu::roundToHalf(cpuVals)); - } else { - // Should be exactly the same - EXPECT_EQ(gpuVals, cpuVals); - } - } + // The CPU is the source of (float32) truth here, while the GPU index + // may be in float16 mode and thus was subject to rounding + if (useFloat16) { + EXPECT_EQ(gpuVals, faiss::gpu::roundToHalf(cpuVals)); + } else { + // Should be exactly the same + EXPECT_EQ(gpuVals, cpuVals); + } + } } TEST(TestGpuIndexFlat, CopyFrom) { - testCopyFrom(false); + testCopyFrom(false); } #if defined USE_NVIDIA_RAFT TEST(TestRaftGpuIndexFlat, CopyFrom) { - testCopyFrom(true); + testCopyFrom(true); } #endif void testCopyTo(bool use_raft) { - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); - int numVecs = faiss::gpu::randVal(100, 200); - int dim = faiss::gpu::randVal(1, 1000); + int numVecs = faiss::gpu::randVal(100, 200); + int dim = faiss::gpu::randVal(1, 1000); - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - 
std::vector vecs = faiss::gpu::randVecs(numVecs, dim); + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + std::vector vecs = faiss::gpu::randVecs(numVecs, dim); - for (bool useFloat16 : {false, true}) { - faiss::gpu::GpuIndexFlatConfig config; - config.device = device; - config.useFloat16 = useFloat16; - config.use_raft = use_raft; + for (bool useFloat16 : {false, true}) { + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.useFloat16 = useFloat16; + config.use_raft = use_raft; - faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config); - gpuIndex.add(numVecs, vecs.data()); + faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config); + gpuIndex.add(numVecs, vecs.data()); - // Fill with garbage values - faiss::IndexFlatL2 cpuIndex(2000); - gpuIndex.copyTo(&cpuIndex); + // Fill with garbage values + faiss::IndexFlatL2 cpuIndex(2000); + gpuIndex.copyTo(&cpuIndex); - EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); - EXPECT_EQ(gpuIndex.ntotal, numVecs); + EXPECT_EQ(cpuIndex.ntotal, gpuIndex.ntotal); + EXPECT_EQ(gpuIndex.ntotal, numVecs); - EXPECT_EQ(cpuIndex.d, gpuIndex.d); - EXPECT_EQ(cpuIndex.d, dim); + EXPECT_EQ(cpuIndex.d, gpuIndex.d); + EXPECT_EQ(cpuIndex.d, dim); - std::vector gpuVals(numVecs * dim); - gpuIndex.reconstruct_n(0, gpuIndex.ntotal, gpuVals.data()); + std::vector gpuVals(numVecs * dim); + gpuIndex.reconstruct_n(0, gpuIndex.ntotal, gpuVals.data()); - std::vector cpuVals(numVecs * dim); - cpuIndex.reconstruct_n(0, gpuIndex.ntotal, cpuVals.data()); + std::vector cpuVals(numVecs * dim); + cpuIndex.reconstruct_n(0, gpuIndex.ntotal, cpuVals.data()); - // The GPU is the source of truth here, so the float32 exact comparison - // even if the index uses float16 is ok - EXPECT_EQ(gpuVals, cpuVals); - } + // The GPU is the source of truth here, so the float32 exact comparison + // even if the index uses float16 is ok + EXPECT_EQ(gpuVals, cpuVals); + } } TEST(TestGpuIndexFlat, CopyTo) { - testCopyTo(false); + 
testCopyTo(false); } #if defined USE_NVIDIA_RAFT TEST(TestRaftGpuIndexFlat, CopyTo) { - testCopyTo(true); + testCopyTo(true); } #endif void testUnifiedMemory(bool use_raft) { - // Construct on a random device to test multi-device, if we have - // multiple devices - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + // Construct on a random device to test multi-device, if we have + // multiple devices + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - if (!faiss::gpu::getFullUnifiedMemSupport(device)) { - return; - } + if (!faiss::gpu::getFullUnifiedMemSupport(device)) { + return; + } - int dim = 256; + int dim = 256; - // FIXME: GpuIndexFlat doesn't support > 2^31 (vecs * dims) due to - // kernel indexing, so we can't test unified memory for memory - // oversubscription. - size_t numVecs = 50000; - int numQuery = 10; - int k = 10; + // FIXME: GpuIndexFlat doesn't support > 2^31 (vecs * dims) due to + // kernel indexing, so we can't test unified memory for memory + // oversubscription. 
+ size_t numVecs = 50000; + int numQuery = 10; + int k = 10; - faiss::IndexFlatL2 cpuIndexL2(dim); + faiss::IndexFlatL2 cpuIndexL2(dim); - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); - faiss::gpu::GpuIndexFlatConfig config; - config.device = device; - config.memorySpace = faiss::gpu::MemorySpace::Unified; - config.use_raft = use_raft; + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.memorySpace = faiss::gpu::MemorySpace::Unified; + config.use_raft = use_raft; - faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config); + faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config); - std::vector vecs = faiss::gpu::randVecs(numVecs, dim); - cpuIndexL2.add(numVecs, vecs.data()); - gpuIndexL2.add(numVecs, vecs.data()); + std::vector vecs = faiss::gpu::randVecs(numVecs, dim); + cpuIndexL2.add(numVecs, vecs.data()); + gpuIndexL2.add(numVecs, vecs.data()); - // To some extent, we depend upon the relative error for the test - // for float16 - faiss::gpu::compareIndices( - cpuIndexL2, - gpuIndexL2, - numQuery, - dim, - k, - "Unified Memory", - kF32MaxRelErr, - 0.1f, - 0.015f); + // To some extent, we depend upon the relative error for the test + // for float16 + faiss::gpu::compareIndices( + cpuIndexL2, + gpuIndexL2, + numQuery, + dim, + k, + "Unified Memory", + kF32MaxRelErr, + 0.1f, + 0.015f); } TEST(TestGpuIndexFlat, UnifiedMemory) { - testUnifiedMemory(false); + testUnifiedMemory(false); } #if defined USE_NVIDIA_RAFT TEST(TestRaftGpuIndexFlat, UnifiedMemory) { - testUnifiedMemory(true); + testUnifiedMemory(true); } #endif void testLargeIndex(bool use_raft) { - // Construct on a random device to test multi-device, if we have - // multiple devices - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + // Construct on a random device to test multi-device, if we have + // multiple devices + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 
1); - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); - // Skip this device if we do not have sufficient memory - constexpr size_t kMem = size_t(8) * 1024 * 1024 * 1024; + // Skip this device if we do not have sufficient memory + constexpr size_t kMem = size_t(8) * 1024 * 1024 * 1024; - if (faiss::gpu::getFreeMemory(device) < kMem) { - std::cout << "TestGpuIndexFlat.LargeIndex: skipping due " - "to insufficient device memory\n"; - return; - } + if (faiss::gpu::getFreeMemory(device) < kMem) { + std::cout << "TestGpuIndexFlat.LargeIndex: skipping due " + "to insufficient device memory\n"; + return; + } - std::cout << "Running LargeIndex test\n"; + std::cout << "Running LargeIndex test\n"; - size_t dim = 256; // each vec is sizeof(float) * 256 = 1 KiB in size - size_t nb = 5000000; - size_t nq = 10; + size_t dim = 256; // each vec is sizeof(float) * 256 = 1 KiB in size + size_t nb = 5000000; + size_t nq = 10; - auto xb = faiss::gpu::randVecs(nb, dim); + auto xb = faiss::gpu::randVecs(nb, dim); - int k = 10; + int k = 10; - faiss::IndexFlatL2 cpuIndexL2(dim); + faiss::IndexFlatL2 cpuIndexL2(dim); - faiss::gpu::GpuIndexFlatConfig config; - config.device = device; - config.use_raft = use_raft; - faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config); + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.use_raft = use_raft; + faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config); - cpuIndexL2.add(nb, xb.data()); - gpuIndexL2.add(nb, xb.data()); + cpuIndexL2.add(nb, xb.data()); + gpuIndexL2.add(nb, xb.data()); - // To some extent, we depend upon the relative error for the test - // for float16 - faiss::gpu::compareIndices( - cpuIndexL2, - gpuIndexL2, - nq, - dim, - k, - "LargeIndex", - kF32MaxRelErr, - 0.1f, - 0.015f); + // To some extent, we depend upon the relative error for the test + // for float16 + faiss::gpu::compareIndices( + cpuIndexL2, + gpuIndexL2, + nq, + dim, 
+ k, + "LargeIndex", + kF32MaxRelErr, + 0.1f, + 0.015f); } TEST(TestGpuIndexFlat, LargeIndex) { - testLargeIndex(false); + testLargeIndex(false); } #if defined USE_NVIDIA_RAFT TEST(TestRaftGpuIndexFlat, LargeIndex) { - testLargeIndex(true); + testLargeIndex(true); } #endif void testResidual(bool use_raft) { - // Construct on a random device to test multi-device, if we have - // multiple devices - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + // Construct on a random device to test multi-device, if we have + // multiple devices + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); - faiss::gpu::GpuIndexFlatConfig config; - config.device = device; - config.use_raft = use_raft; + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.use_raft = use_raft; - int dim = 32; - faiss::IndexFlat cpuIndex(dim, faiss::MetricType::METRIC_L2); - faiss::gpu::GpuIndexFlat gpuIndex( - &res, dim, faiss::MetricType::METRIC_L2, config); + int dim = 32; + faiss::IndexFlat cpuIndex(dim, faiss::MetricType::METRIC_L2); + faiss::gpu::GpuIndexFlat gpuIndex( + &res, dim, faiss::MetricType::METRIC_L2, config); - int numVecs = 100; - auto vecs = faiss::gpu::randVecs(numVecs, dim); - cpuIndex.add(numVecs, vecs.data()); - gpuIndex.add(numVecs, vecs.data()); + int numVecs = 100; + auto vecs = faiss::gpu::randVecs(numVecs, dim); + cpuIndex.add(numVecs, vecs.data()); + gpuIndex.add(numVecs, vecs.data()); - auto indexVecs = std::vector{0, 2, 4, 6, 8}; - auto queryVecs = faiss::gpu::randVecs(indexVecs.size(), dim); + auto indexVecs = std::vector{0, 2, 4, 6, 8}; + auto queryVecs = faiss::gpu::randVecs(indexVecs.size(), dim); - auto residualsCpu = std::vector(indexVecs.size() * dim); - auto residualsGpu = std::vector(indexVecs.size() * dim); + auto residualsCpu = std::vector(indexVecs.size() * dim); + auto residualsGpu 
= std::vector(indexVecs.size() * dim); - cpuIndex.compute_residual_n( - indexVecs.size(), - queryVecs.data(), - residualsCpu.data(), - indexVecs.data()); - gpuIndex.compute_residual_n( - indexVecs.size(), - queryVecs.data(), - residualsGpu.data(), - indexVecs.data()); + cpuIndex.compute_residual_n( + indexVecs.size(), + queryVecs.data(), + residualsCpu.data(), + indexVecs.data()); + gpuIndex.compute_residual_n( + indexVecs.size(), + queryVecs.data(), + residualsGpu.data(), + indexVecs.data()); - // Should be exactly the same, as this is just a single float32 subtraction - EXPECT_EQ(residualsCpu, residualsGpu); + // Should be exactly the same, as this is just a single float32 subtraction + EXPECT_EQ(residualsCpu, residualsGpu); } TEST(TestGpuIndexFlat, Residual) { - testResidual(false); + testResidual(false); } #if defined USE_NVIDIA_RAFT TEST(TestRaftGpuIndexFlat, Residual) { - testResidual(true); + testResidual(true); } #endif void testReconstruct(bool use_raft) { - // Construct on a random device to test multi-device, if we have - // multiple devices - int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - - faiss::gpu::StandardGpuResources res; - res.noTempMemory(); - - int dim = 32; - int numVecs = 100; - auto vecs = faiss::gpu::randVecs(numVecs, dim); - auto vecs16 = faiss::gpu::roundToHalf(vecs); - - for (bool useFloat16 : {false, true}) { - faiss::gpu::GpuIndexFlatConfig config; - config.device = device; - config.useFloat16 = useFloat16; - config.use_raft = use_raft; - - faiss::gpu::GpuIndexFlat gpuIndex( - &res, dim, faiss::MetricType::METRIC_L2, config); - - gpuIndex.add(numVecs, vecs.data()); - - // Test reconstruct - { - auto reconstructVecs = std::vector(dim); - gpuIndex.reconstruct(15, reconstructVecs.data()); - - auto& ref = useFloat16 ? 
vecs16 : vecs; - - for (int i = 0; i < dim; ++i) { - EXPECT_EQ(reconstructVecs[i], ref[15 * dim + i]); - } - } - - // Test reconstruct_n - if (false) { - auto reconstructVecs = std::vector((numVecs - 1) * dim); - - int startVec = 5; - int endVec = numVecs - 1; - int numReconstructVec = endVec - startVec + 1; - - gpuIndex.reconstruct_n( - startVec, numReconstructVec, reconstructVecs.data()); - - auto& ref = useFloat16 ? vecs16 : vecs; - - for (int i = 0; i < numReconstructVec; ++i) { - for (int j = 0; j < dim; ++j) { - EXPECT_EQ( - reconstructVecs[i * dim + j], - ref[(i + startVec) * dim + j]); - } - } - } - - // Test reconstruct_batch - if (false) { - auto reconstructKeys = std::vector{1, 3, 5}; - auto reconstructVecs = - std::vector(reconstructKeys.size() * dim); - - gpuIndex.reconstruct_batch( - reconstructKeys.size(), - reconstructKeys.data(), - reconstructVecs.data()); - - auto& ref = useFloat16 ? vecs16 : vecs; - - for (int i = 0; i < reconstructKeys.size(); ++i) { - for (int j = 0; j < dim; ++j) { - EXPECT_EQ( - reconstructVecs[i * dim + j], - ref[reconstructKeys[i] * dim + j]); - } - } - } - } + // Construct on a random device to test multi-device, if we have + // multiple devices + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + int dim = 32; + int numVecs = 100; + auto vecs = faiss::gpu::randVecs(numVecs, dim); + auto vecs16 = faiss::gpu::roundToHalf(vecs); + + for (bool useFloat16 : {false, true}) { + faiss::gpu::GpuIndexFlatConfig config; + config.device = device; + config.useFloat16 = useFloat16; + config.use_raft = use_raft; + + faiss::gpu::GpuIndexFlat gpuIndex( + &res, dim, faiss::MetricType::METRIC_L2, config); + + gpuIndex.add(numVecs, vecs.data()); + + // Test reconstruct + { + auto reconstructVecs = std::vector(dim); + gpuIndex.reconstruct(15, reconstructVecs.data()); + + auto& ref = useFloat16 ? 
vecs16 : vecs; + + for (int i = 0; i < dim; ++i) { + EXPECT_EQ(reconstructVecs[i], ref[15 * dim + i]); + } + } + + // Test reconstruct_n + if (false) { + auto reconstructVecs = std::vector((numVecs - 1) * dim); + + int startVec = 5; + int endVec = numVecs - 1; + int numReconstructVec = endVec - startVec + 1; + + gpuIndex.reconstruct_n( + startVec, numReconstructVec, reconstructVecs.data()); + + auto& ref = useFloat16 ? vecs16 : vecs; + + for (int i = 0; i < numReconstructVec; ++i) { + for (int j = 0; j < dim; ++j) { + EXPECT_EQ( + reconstructVecs[i * dim + j], + ref[(i + startVec) * dim + j]); + } + } + } + + // Test reconstruct_batch + if (false) { + auto reconstructKeys = std::vector{1, 3, 5}; + auto reconstructVecs = + std::vector(reconstructKeys.size() * dim); + + gpuIndex.reconstruct_batch( + reconstructKeys.size(), + reconstructKeys.data(), + reconstructVecs.data()); + + auto& ref = useFloat16 ? vecs16 : vecs; + + for (int i = 0; i < reconstructKeys.size(); ++i) { + for (int j = 0; j < dim; ++j) { + EXPECT_EQ( + reconstructVecs[i * dim + j], + ref[reconstructKeys[i] * dim + j]); + } + } + } + } } TEST(TestGpuIndexFlat, Reconstruct) { - testReconstruct(false); + testReconstruct(false); } #if defined USE_NVIDIA_RAFT TEST(TestRaftGpuIndexFlat, Reconstruct) { - testReconstruct(true); + testReconstruct(true); } #endif @@ -750,20 +750,20 @@ void testSearchAndReconstruct(bool use_raft) { } } TEST(TestGpuIndexFlat, SearchAndReconstruct) { - testSearchAndReconstruct(false); + testSearchAndReconstruct(false); } #if defined USE_NVIDIA_RAFT TEST(TestRaftGpuIndexFlat, SearchAndReconstruct) { - testSearchAndReconstruct(true); + testSearchAndReconstruct(true); } #endif int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); + testing::InitGoogleTest(&argc, argv); - // just run with a fixed test seed - faiss::gpu::setTestSeed(100); + // just run with a fixed test seed + faiss::gpu::setTestSeed(100); - return RUN_ALL_TESTS(); + return RUN_ALL_TESTS(); } \ No 
newline at end of file From 5c0592ec338bee6ff194b50b86402851189d7c90 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Mon, 21 Aug 2023 16:06:57 -0700 Subject: [PATCH 82/87] remove debug statements --- faiss/gpu/GpuIndex.cu | 1 - faiss/gpu/impl/IVFBase.cu | 2 -- 2 files changed, 3 deletions(-) diff --git a/faiss/gpu/GpuIndex.cu b/faiss/gpu/GpuIndex.cu index 749bec221f..89952b1121 100644 --- a/faiss/gpu/GpuIndex.cu +++ b/faiss/gpu/GpuIndex.cu @@ -77,7 +77,6 @@ int GpuIndex::getDevice() const { } void GpuIndex::copyFrom(const faiss::Index* index) { - printf("inside gpuindex copyFrom\n"); d = index->d; metric_type = index->metric_type; metric_arg = index->metric_arg; diff --git a/faiss/gpu/impl/IVFBase.cu b/faiss/gpu/impl/IVFBase.cu index 6aef83ef3f..890d489440 100644 --- a/faiss/gpu/impl/IVFBase.cu +++ b/faiss/gpu/impl/IVFBase.cu @@ -106,7 +106,6 @@ void IVFBase::reserveMemory(idx_t numVecs) { } void IVFBase::reset() { - printf("inside ivfbase::reset\n"); auto stream = resources_->getDefaultStreamCurrentDevice(); deviceListData_.clear(); @@ -324,7 +323,6 @@ std::vector IVFBase::getListVectorData(idx_t listId, bool gpuFormat) } void IVFBase::copyInvertedListsFrom(const InvertedLists* ivf) { - printf("Inside IVFBase's copyInvertedListsFrom\n"); idx_t nlist = ivf ? ivf->nlist : 0; for (idx_t i = 0; i < nlist; ++i) { addEncodedVectorsToList_( From 7618b44951b2e0818495d85436e36d6b43f32520 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Wed, 23 Aug 2023 09:40:21 -0700 Subject: [PATCH 83/87] LargeBatch test added and now passing --- build.sh | 5 +++++ faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/build.sh b/build.sh index 6a353379f8..bb9985ce25 100755 --- a/build.sh +++ b/build.sh @@ -50,4 +50,9 @@ cmake \ ${EXTRA_CMAKE_ARGS} \ ../ + +# make -C build -j12 faiss cmake --build . 
-j12 +# make -C build -j12 swigfaiss +# (cd build/faiss/python && python setup.py install) + diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index 7f2ae81196..821fbe1159 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -351,6 +351,11 @@ TEST(TestGpuIndexIVFFlat, LargeBatch) { opt.dim = 3; opt.numQuery = 100000; queryTest(opt, faiss::METRIC_L2, false); + +#if defined USE_NVIDIA_RAFT + opt.use_raft = true; + queryTest(opt, faiss::METRIC_L2, false); +#endif } // float16 coarse quantizer From bd5a217892541417da89f5acd4c1332b0865c307 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Wed, 23 Aug 2023 13:43:51 -0700 Subject: [PATCH 84/87] final update to gtests --- faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 76 +++++++++++++++++++++----- 1 file changed, 62 insertions(+), 14 deletions(-) diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index 821fbe1159..b92322d9a8 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -476,6 +476,14 @@ TEST(TestGpuIndexIVFFlat, Float32_negative) { faiss::gpu::StandardGpuResources res; res.noTempMemory(); + // Construct a positive test set + auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + + // Put all vecs on positive size + for (auto& f : queryVecs) { + f = std::abs(f); + } + faiss::gpu::GpuIndexIVFFlatConfig config; config.device = opt.device; config.indicesOptions = opt.indicesOpt; @@ -485,14 +493,6 @@ TEST(TestGpuIndexIVFFlat, Float32_negative) { gpuIndex.copyFrom(&cpuIndex); gpuIndex.nprobe = opt.nprobe; - // Construct a positive test set - auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); - - // Put all vecs on positive size - for (auto& f : queryVecs) { - f = std::abs(f); - } - bool compFloat16 = false; faiss::gpu::compareIndices( queryVecs, @@ -508,6 +508,30 @@ TEST(TestGpuIndexIVFFlat, Float32_negative) { // in fp16. 
Figure out another way to test compFloat16 ? 0.99f : 0.1f, compFloat16 ? 0.65f : 0.015f); + +#if defined USE_NVIDIA_RAFT + config.use_raft = true; + + faiss::gpu::GpuIndexIVFFlat raftGpuIndex( + &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); + raftGpuIndex.copyFrom(&cpuIndex); + raftGpuIndex.nprobe = opt.nprobe; + + faiss::gpu::compareIndices( + queryVecs, + cpuIndex, + raftGpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + compFloat16 ? kF16MaxRelErr : kF32MaxRelErr, + // FIXME: the fp16 bounds are + // useless when math (the accumulator) is + // in fp16. Figure out another way to test + compFloat16 ? 0.99f : 0.1f, + compFloat16 ? 0.65f : 0.015f); +#endif } // @@ -523,6 +547,13 @@ TEST(TestGpuIndexIVFFlat, QueryNaN) { faiss::gpu::StandardGpuResources res; res.noTempMemory(); + int numQuery = 10; + std::vector nans( + numQuery * opt.dim, std::numeric_limits::quiet_NaN()); + + std::vector distances(numQuery * opt.k, 0); + std::vector indices(numQuery * opt.k, 0); + faiss::gpu::GpuIndexIVFFlatConfig config; config.device = opt.device; config.indicesOptions = opt.indicesOpt; @@ -535,14 +566,30 @@ TEST(TestGpuIndexIVFFlat, QueryNaN) { gpuIndex.train(opt.numTrain, trainVecs.data()); gpuIndex.add(opt.numAdd, addVecs.data()); - int numQuery = 10; - std::vector nans( - numQuery * opt.dim, std::numeric_limits::quiet_NaN()); + gpuIndex.search( + numQuery, nans.data(), opt.k, distances.data(), indices.data()); - std::vector distances(numQuery * opt.k, 0); - std::vector indices(numQuery * opt.k, 0); + for (int q = 0; q < numQuery; ++q) { + for (int k = 0; k < opt.k; ++k) { + EXPECT_EQ(indices[q * opt.k + k], -1); + EXPECT_EQ( + distances[q * opt.k + k], + std::numeric_limits::max()); + } + } - gpuIndex.search( +#if defined USE_NVIDIA_RAFT + config.use_raft = true; + std::fill(distances.begin(), distances.end(), 0); + std::fill(indices.begin(), indices.end(), 0); + faiss::gpu::GpuIndexIVFFlat raftGpuIndex( + &res, opt.dim, opt.numCentroids, 
faiss::METRIC_L2, config); + raftGpuIndex.nprobe = opt.nprobe; + + raftGpuIndex.train(opt.numTrain, trainVecs.data()); + raftGpuIndex.add(opt.numAdd, addVecs.data()); + + raftGpuIndex.search( numQuery, nans.data(), opt.k, distances.data(), indices.data()); for (int q = 0; q < numQuery; ++q) { @@ -553,6 +600,7 @@ TEST(TestGpuIndexIVFFlat, QueryNaN) { std::numeric_limits::max()); } } +#endif } TEST(TestGpuIndexIVFFlat, AddNaN) { From 2022a1474afca28b9eeba37d54ca7178c0b4bdd3 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Wed, 6 Sep 2023 10:03:36 -0700 Subject: [PATCH 85/87] Pull latest --- faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index b92322d9a8..d042292aef 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -202,7 +202,6 @@ void copyToTest(bool useFloat16CoarseQuantizer, bool use_raft) { faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); - printf("opt.numCentroids %d", opt.numCentroids); gpuIndex.train(opt.numTrain, trainVecs.data()); gpuIndex.add(opt.numAdd, addVecs.data()); gpuIndex.nprobe = opt.nprobe; From e441ce5772bb905be7cd64bb6e38eb111277010f Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Wed, 6 Sep 2023 13:17:04 -0700 Subject: [PATCH 86/87] IndicesOptions assertion --- faiss/gpu/impl/RaftIVFFlat.cu | 7 +++++-- faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 21 ++++++++++++++++++--- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 3861a2283c..2c6afb795c 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -73,6 +73,9 @@ RaftIVFFlat::RaftIVFFlat( interleavedLayout, indicesOptions, space) { + FAISS_THROW_IF_NOT_MSG( + indicesOptions == INDICES_64_BIT, + "only INDICES_64_BIT is supported for RAFT index"); reset(); } @@ -159,6 +162,8 @@ 
idx_t RaftIVFFlat::addVectors( Index* coarseQuantizer, Tensor& vecs, Tensor& indices) { + /// TODO: We probably don't want to ignore the coarse quantizer here + idx_t n_rows = vecs.getSize(0); const raft::device_resources& raft_handle = @@ -207,8 +212,6 @@ idx_t RaftIVFFlat::addVectors( raft::make_const_mdspan(gather_indices.view())); } - /// TODO: We probably don't want to ignore the coarse quantizer here - FAISS_ASSERT(raft_knn_index.has_value()); raft_knn_index.emplace(raft::neighbors::ivf_flat::extend( raft_handle, diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index d042292aef..8af86c2876 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -30,6 +30,7 @@ #include #include #include +#include "faiss/gpu/GpuIndicesOptions.h" // FIXME: figure out a better way to test fp16 constexpr float kF16MaxRelErr = 0.3f; @@ -160,7 +161,7 @@ void addTest( faiss::gpu::GpuIndexIVFFlatConfig config; config.device = opt.device; - config.indicesOptions = opt.indicesOpt; + config.indicesOptions = use_raft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; config.use_raft = use_raft; @@ -196,7 +197,7 @@ void copyToTest(bool useFloat16CoarseQuantizer, bool use_raft) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = opt.device; - config.indicesOptions = opt.indicesOpt; + config.indicesOptions = use_raft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; config.use_raft = use_raft; @@ -256,7 +257,7 @@ void copyFromTest(bool useFloat16CoarseQuantizer, bool use_raft) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = opt.device; - config.indicesOptions = opt.indicesOpt; + config.indicesOptions = use_raft ? 
faiss::gpu::INDICES_64_BIT : opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; config.use_raft = use_raft; @@ -331,6 +332,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_L2) { #if defined USE_NVIDIA_RAFT opt.use_raft = true; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_L2, false); #endif } @@ -341,6 +343,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_IP) { #if defined USE_NVIDIA_RAFT opt.use_raft = true; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_INNER_PRODUCT, false); #endif } @@ -353,6 +356,7 @@ TEST(TestGpuIndexIVFFlat, LargeBatch) { #if defined USE_NVIDIA_RAFT opt.use_raft = true; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_L2, false); #endif } @@ -365,6 +369,7 @@ TEST(TestGpuIndexIVFFlat, Float16_32_Query_L2) { #if defined USE_NVIDIA_RAFT opt.use_raft = true; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_L2, true); #endif } @@ -375,6 +380,7 @@ TEST(TestGpuIndexIVFFlat, Float16_32_Query_IP) { #if defined USE_NVIDIA_RAFT opt.use_raft = true; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_INNER_PRODUCT, true); #endif } @@ -391,6 +397,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_L2_64) { #if defined USE_NVIDIA_RAFT opt.use_raft = true; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_L2, false); #endif } @@ -402,6 +409,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_IP_64) { #if defined USE_NVIDIA_RAFT opt.use_raft = true; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_INNER_PRODUCT, false); #endif } @@ -413,6 +421,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_L2_128) { #if defined USE_NVIDIA_RAFT opt.use_raft = true; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_L2, false); #endif } @@ -424,6 +433,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_IP_128) { #if defined USE_NVIDIA_RAFT opt.use_raft = true; + 
opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_INNER_PRODUCT, false); #endif } @@ -510,6 +520,7 @@ TEST(TestGpuIndexIVFFlat, Float32_negative) { #if defined USE_NVIDIA_RAFT config.use_raft = true; + config.indicesOptions = faiss::gpu::INDICES_64_BIT; faiss::gpu::GpuIndexIVFFlat raftGpuIndex( &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); @@ -579,6 +590,7 @@ TEST(TestGpuIndexIVFFlat, QueryNaN) { #if defined USE_NVIDIA_RAFT config.use_raft = true; + config.indicesOptions = faiss::gpu::INDICES_64_BIT; std::fill(distances.begin(), distances.end(), 0); std::fill(indices.begin(), indices.end(), 0); faiss::gpu::GpuIndexIVFFlat raftGpuIndex( @@ -647,6 +659,7 @@ TEST(TestGpuIndexIVFFlat, AddNaN) { #if defined USE_NVIDIA_RAFT config.use_raft = true; + config.indicesOptions = faiss::gpu::INDICES_64_BIT; faiss::gpu::GpuIndexIVFFlat raftGpuIndex( &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); raftGpuIndex.nprobe = opt.nprobe; @@ -723,6 +736,7 @@ TEST(TestGpuIndexIVFFlat, UnifiedMemory) { #if defined USE_NVIDIA_RAFT config.use_raft = true; + config.indicesOptions = faiss::gpu::INDICES_64_BIT; faiss::gpu::GpuIndexIVFFlat raftGpuIndex( &res, dim, numCentroids, faiss::METRIC_L2, config); raftGpuIndex.copyFrom(&cpuIndex); @@ -800,6 +814,7 @@ TEST(TestGpuIndexIVFFlat, LongIVFList) { #if defined USE_NVIDIA_RAFT config.use_raft = true; + config.indicesOptions = faiss::gpu::INDICES_64_BIT; faiss::gpu::GpuIndexIVFFlat raftGpuIndex( &res, dim, numCentroids, faiss::METRIC_L2, config); raftGpuIndex.train(numTrain, trainVecs.data()); From a0457bdfbed800081a0b39c9e8245378ae1c5d2d Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Thu, 7 Sep 2023 08:44:54 -0700 Subject: [PATCH 87/87] checks passing --- faiss/gpu/GpuIndexIVF.h | 6 ------ faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 9 ++++++--- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/faiss/gpu/GpuIndexIVF.h b/faiss/gpu/GpuIndexIVF.h index efd8bfc755..a9f092d35b 100644 --- 
a/faiss/gpu/GpuIndexIVF.h +++ b/faiss/gpu/GpuIndexIVF.h @@ -92,12 +92,6 @@ class GpuIndexIVF : public GpuIndex, public IndexIVFInterface { /// debugging purposes. virtual std::vector getListIndices(idx_t listId) const; - /// Sets the number of list probes per query - void setNumProbes(int nprobe); - - /// Returns our current number of list probes per query - int getNumProbes() const; - void search_preassigned( idx_t n, const float* x, diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index 8af86c2876..9fb88e2687 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -161,7 +161,8 @@ void addTest( faiss::gpu::GpuIndexIVFFlatConfig config; config.device = opt.device; - config.indicesOptions = use_raft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt; + config.indicesOptions = + use_raft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; config.use_raft = use_raft; @@ -197,7 +198,8 @@ void copyToTest(bool useFloat16CoarseQuantizer, bool use_raft) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = opt.device; - config.indicesOptions = use_raft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt; + config.indicesOptions = + use_raft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; config.use_raft = use_raft; @@ -257,7 +259,8 @@ void copyFromTest(bool useFloat16CoarseQuantizer, bool use_raft) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = opt.device; - config.indicesOptions = use_raft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt; + config.indicesOptions = + use_raft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; config.use_raft = use_raft;