diff --git a/faiss/Index.h b/faiss/Index.h index 2474f08be4..95af05df74 100644 --- a/faiss/Index.h +++ b/faiss/Index.h @@ -11,6 +11,8 @@ #define FAISS_INDEX_H #include +#include + #include #include #include @@ -56,6 +58,23 @@ struct IDSelector; struct RangeSearchResult; struct DistanceComputer; +enum NumericType { + Float32, + Float16, +}; + +inline size_t get_numeric_type_size(NumericType numeric_type) { + switch (numeric_type) { + case NumericType::Float32: + return 4; + case NumericType::Float16: + return 2; + default: + FAISS_THROW_MSG( + "Unknown Numeric Type. Only supports Float32, Float16"); + } +} + /** Parent class for the optional search paramenters. * * Sub-classes with additional search parameters should inherit this class. @@ -107,6 +126,14 @@ struct Index { */ virtual void train(idx_t n, const float* x); + virtual void train(idx_t n, const void* x, NumericType numeric_type) { + if (numeric_type == NumericType::Float32) { + train(n, static_cast(x)); + } else { + FAISS_THROW_MSG("Index::train: unsupported numeric type"); + } + } + /** Add n vectors of dimension d to the index. * * Vectors are implicitly assigned labels ntotal .. ntotal + n - 1 @@ -117,6 +144,14 @@ struct Index { */ virtual void add(idx_t n, const float* x) = 0; + virtual void add(idx_t n, const void* x, NumericType numeric_type) { + if (numeric_type == NumericType::Float32) { + add(n, static_cast(x)); + } else { + FAISS_THROW_MSG("Index::add: unsupported numeric type"); + } + } + /** Same as add, but stores xids instead of sequential ids. 
* * The default implementation fails with an assertion, as it is @@ -127,6 +162,17 @@ struct Index { * @param xids if non-null, ids to store for the vectors (size n) */ virtual void add_with_ids(idx_t n, const float* x, const idx_t* xids); + virtual void add_with_ids( + idx_t n, + const void* x, + NumericType numeric_type, + const idx_t* xids) { + if (numeric_type == NumericType::Float32) { + add_with_ids(n, static_cast(x), xids); + } else { + FAISS_THROW_MSG("Index::add_with_ids: unsupported numeric type"); + } + } /** query n vectors of dimension d to the index. * @@ -147,6 +193,26 @@ struct Index { idx_t* labels, const SearchParameters* params = nullptr) const = 0; + virtual void search( + idx_t n, + const void* x, + NumericType numeric_type, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params = nullptr) const { + if (numeric_type == NumericType::Float32) { + search(n, + static_cast(x), + k, + distances, + labels, + params); + } else { + FAISS_THROW_MSG("Index::search: unsupported numeric type"); + } + } + /** query n vectors of dimension d to the index. * * return all vectors with distance < radius. Note that many diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp index 5983e9d831..1ee15f4484 100644 --- a/faiss/IndexHNSW.cpp +++ b/faiss/IndexHNSW.cpp @@ -19,6 +19,7 @@ #include #include +#include "faiss/Index.h" #include #include @@ -893,15 +894,31 @@ IndexHNSWCagra::IndexHNSWCagra() { is_trained = true; } -IndexHNSWCagra::IndexHNSWCagra(int d, int M, MetricType metric) - : IndexHNSW( - (metric == METRIC_L2) - ? 
static_cast(new IndexFlatL2(d)) - : static_cast(new IndexFlatIP(d)), - M) { +IndexHNSWCagra::IndexHNSWCagra( + int d, + int M, + MetricType metric, + NumericType numeric_type) + : IndexHNSW(d, M, metric) { FAISS_THROW_IF_NOT_MSG( ((metric == METRIC_L2) || (metric == METRIC_INNER_PRODUCT)), "unsupported metric type for IndexHNSWCagra"); + numeric_type_ = numeric_type; + if (numeric_type == NumericType::Float32) { + // Use flat storage with full precision for fp32 + storage = (metric == METRIC_L2) + ? static_cast(new IndexFlatL2(d)) + : static_cast(new IndexFlatIP(d)); + } else if (numeric_type == NumericType::Float16) { + auto qtype = ScalarQuantizer::QT_fp16; + storage = new IndexScalarQuantizer(d, qtype, metric); + } else { + FAISS_THROW_MSG( + "Unsupported numeric_type: only F16 and F32 are supported for IndexHNSWCagra"); + } + + metric_arg = storage->metric_arg; + own_fields = true; is_trained = true; init_level0 = true; @@ -967,4 +984,12 @@ void IndexHNSWCagra::search( } } +faiss::NumericType IndexHNSWCagra::get_numeric_type() const { + return numeric_type_; +} + +void IndexHNSWCagra::set_numeric_type(faiss::NumericType numeric_type) { + numeric_type_ = numeric_type; +} + } // namespace faiss diff --git a/faiss/IndexHNSW.h b/faiss/IndexHNSW.h index 2d983b3c16..c6e80df462 100644 --- a/faiss/IndexHNSW.h +++ b/faiss/IndexHNSW.h @@ -10,6 +10,7 @@ #pragma once #include +#include "faiss/Index.h" #include #include @@ -170,7 +171,11 @@ struct IndexHNSW2Level : IndexHNSW { struct IndexHNSWCagra : IndexHNSW { IndexHNSWCagra(); - IndexHNSWCagra(int d, int M, MetricType metric = METRIC_L2); + IndexHNSWCagra( + int d, + int M, + MetricType metric = METRIC_L2, + NumericType numeric_type = NumericType::Float32); /// When set to true, the index is immutable. 
/// This option is used to copy the knn graph from GpuIndexCagra @@ -195,6 +200,10 @@ struct IndexHNSWCagra : IndexHNSW { float* distances, idx_t* labels, const SearchParameters* params = nullptr) const override; + + faiss::NumericType get_numeric_type() const; + void set_numeric_type(faiss::NumericType numeric_type); + NumericType numeric_type_; }; } // namespace faiss diff --git a/faiss/gpu/GpuCloner.cpp b/faiss/gpu/GpuCloner.cpp index 575ee2e0a5..faea7519d9 100644 --- a/faiss/gpu/GpuCloner.cpp +++ b/faiss/gpu/GpuCloner.cpp @@ -95,6 +95,9 @@ Index* ToCPUCloner::clone_Index(const Index* index) { #if defined USE_NVIDIA_CUVS else if (auto icg = dynamic_cast(index)) { IndexHNSWCagra* res = new IndexHNSWCagra(); + if (icg->get_numeric_type() == faiss::NumericType::Float16) { + res->base_level_only = true; + } icg->copyTo(res); return res; } @@ -236,7 +239,7 @@ Index* ToGpuCloner::clone_Index(const Index* index) { config.device = device; GpuIndexCagra* res = new GpuIndexCagra(provider, icg->d, icg->metric_type, config); - res->copyFrom(icg); + res->copyFrom(icg, icg->get_numeric_type()); return res; } #endif diff --git a/faiss/gpu/GpuIndex.cu b/faiss/gpu/GpuIndex.cu index 033cb9aaa9..fb157c84a7 100644 --- a/faiss/gpu/GpuIndex.cu +++ b/faiss/gpu/GpuIndex.cu @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -109,12 +110,20 @@ size_t GpuIndex::getMinPagingSize() const { return minPagedSize_; } +void GpuIndex::add(idx_t n, const void* x, NumericType numeric_type) { + add_with_ids(n, x, numeric_type, nullptr); +} + void GpuIndex::add(idx_t n, const float* x) { // Pass to add_with_ids - add_with_ids(n, x, nullptr); + add(n, x, NumericType::Float32); } -void GpuIndex::add_with_ids(idx_t n, const float* x, const idx_t* ids) { +void GpuIndex::add_with_ids( + idx_t n, + const void* x, + NumericType numeric_type, + const idx_t* ids) { DeviceScope scope(config_.device); FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained"); @@ -134,18 +143,33 @@ 
void GpuIndex::add_with_ids(idx_t n, const float* x, const idx_t* ids) { } } - addPaged_(n, x, ids ? ids : generatedIds.data()); + addPaged_(n, x, numeric_type, ids ? ids : generatedIds.data()); } -void GpuIndex::addPaged_(idx_t n, const float* x, const idx_t* ids) { - if (n > 0) { - idx_t totalSize = n * this->d * sizeof(float); +void GpuIndex::add_with_ids(idx_t n, const float* x, const idx_t* ids) { + add_with_ids(n, static_cast(x), NumericType::Float32, ids); +} + +void GpuIndex::addPaged_( + idx_t n, + const void* x, + NumericType numeric_type, + const idx_t* ids) { + if (n <= 0) { + return; + } + + auto dispatch = [&](auto dummy_type) { + using data_t = decltype(dummy_type); + const data_t* typed_x = reinterpret_cast(x); + + idx_t totalSize = n * this->d * sizeof(data_t); if (!should_use_cuvs(config_) && (totalSize > kAddPageSize || n > kAddVecSize)) { // How many vectors fit into kAddPageSize? idx_t maxNumVecsForPageSize = - kAddPageSize / (this->d * sizeof(float)); + kAddPageSize / (this->d * sizeof(data_t)); // Always add at least 1 vector, if we have huge vectors maxNumVecsForPageSize = std::max(maxNumVecsForPageSize, idx_t(1)); @@ -155,16 +179,35 @@ void GpuIndex::addPaged_(idx_t n, const float* x, const idx_t* ids) { for (idx_t i = 0; i < n; i += tileSize) { auto curNum = std::min(tileSize, n - i); - - addPage_(curNum, x + i * this->d, ids ? ids + i : nullptr); + addPage_( + curNum, + static_cast(typed_x + i * this->d), + numeric_type, + ids ? 
ids + i : nullptr); } } else { - addPage_(n, x, ids); + addPage_(n, static_cast(typed_x), numeric_type, ids); } + }; + + if (numeric_type == NumericType::Float32) { + dispatch(float{}); + } else if (numeric_type == NumericType::Float16) { + dispatch(half{}); + } else { + FAISS_THROW_MSG("GpuIndex::addPaged_: Unsupported numeric type"); } } -void GpuIndex::addPage_(idx_t n, const float* x, const idx_t* ids) { +void GpuIndex::addPaged_(idx_t n, const float* x, const idx_t* ids) { + addPaged_(n, static_cast(x), NumericType::Float32, ids); +} + +void GpuIndex::addPage_( + idx_t n, + const void* x, + NumericType numeric_type, + const idx_t* ids) { // At this point, `x` can be resident on CPU or GPU, and `ids` may be // resident on CPU, GPU or may be null. // @@ -172,27 +215,51 @@ void GpuIndex::addPage_(idx_t n, const float* x, const idx_t* ids) { // GPU. auto stream = resources_->getDefaultStreamCurrentDevice(); - auto vecs = toDeviceTemporary( - resources_.get(), - config_.device, - const_cast(x), - stream, - {n, this->d}); + auto dispatch = [&](auto dummy_type) { + using data_t = decltype(dummy_type); - if (ids) { - auto indices = toDeviceTemporary( + auto vecs = toDeviceTemporary( resources_.get(), config_.device, - const_cast(ids), + const_cast(reinterpret_cast(x)), stream, - {n}); + {n, this->d}); + + if (ids) { + auto indices = toDeviceTemporary( + resources_.get(), + config_.device, + const_cast(ids), + stream, + {n}); + + addImpl_( + n, + static_cast(vecs.data()), + numeric_type, + ids ? indices.data() : nullptr); + } else { + addImpl_( + n, + static_cast(vecs.data()), + numeric_type, + nullptr); + } + }; - addImpl_(n, vecs.data(), ids ? 
indices.data() : nullptr); + if (numeric_type == NumericType::Float32) { + dispatch(float{}); + } else if (numeric_type == NumericType::Float16) { + dispatch(half{}); } else { - addImpl_(n, vecs.data(), nullptr); + FAISS_THROW_MSG("GpuIndex::addPage_: Unsupported numeric type"); } } +void GpuIndex::addPage_(idx_t n, const float* x, const idx_t* ids) { + addPage_(n, static_cast(x), NumericType::Float32, ids); +} + void GpuIndex::assign(idx_t n, const float* x, idx_t* labels, idx_t k) const { DeviceScope scope(config_.device); FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained"); @@ -212,7 +279,8 @@ void GpuIndex::assign(idx_t n, const float* x, idx_t* labels, idx_t k) const { void GpuIndex::search( idx_t n, - const float* x, + const void* x, + NumericType numeric_type, idx_t k, float* distances, idx_t* labels, @@ -253,17 +321,31 @@ void GpuIndex::search( // -> GPU. // Currently, we don't handle the case where the output data won't // fit on the GPU (e.g., n * k is too large for the GPU memory). 
- size_t dataSize = (size_t)n * this->d * sizeof(float); + size_t dataSize = + (size_t)n * this->d * get_numeric_type_size(numeric_type); if (dataSize >= minPagedSize_) { searchFromCpuPaged_( - n, x, k, outDistances.data(), outLabels.data(), params); + n, + x, + numeric_type, + k, + outDistances.data(), + outLabels.data(), + params); usePaged = true; } } if (!usePaged) { - searchNonPaged_(n, x, k, outDistances.data(), outLabels.data(), params); + searchNonPaged_( + n, + x, + numeric_type, + k, + outDistances.data(), + outLabels.data(), + params); } // Copy back if necessary @@ -271,6 +353,22 @@ void GpuIndex::search( fromDevice(outLabels, labels, stream); } +void GpuIndex::search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params) const { + search(n, + static_cast(x), + NumericType::Float32, + k, + distances, + labels, + params); +} + void GpuIndex::search_and_reconstruct( idx_t n, const float* x, @@ -285,7 +383,8 @@ void GpuIndex::search_and_reconstruct( void GpuIndex::searchNonPaged_( idx_t n, - const float* x, + const void* x, + NumericType numeric_type, int k, float* outDistancesData, idx_t* outIndicesData, @@ -294,49 +393,103 @@ void GpuIndex::searchNonPaged_( // Make sure arguments are on the device we desire; use temporary // memory allocations to move it if necessary - auto vecs = toDeviceTemporary( - resources_.get(), - config_.device, - const_cast(x), - stream, - {n, this->d}); - - searchImpl_(n, vecs.data(), k, outDistancesData, outIndicesData, params); + if (numeric_type == NumericType::Float32) { + auto vecs = toDeviceTemporary( + resources_.get(), + config_.device, + const_cast(static_cast(x)), + stream, + {n, this->d}); + + searchImpl_( + n, vecs.data(), k, outDistancesData, outIndicesData, params); + } else if (numeric_type == NumericType::Float16) { + auto vecs = toDeviceTemporary( + resources_.get(), + config_.device, + const_cast(static_cast(x)), + stream, + {n, this->d}); + + 
searchImpl_( + n, + static_cast(vecs.data()), + numeric_type, + k, + outDistancesData, + outIndicesData, + params); + } else { + FAISS_THROW_MSG("GpuIndex::search: Unsupported numeric type"); + } } -void GpuIndex::searchFromCpuPaged_( +void GpuIndex::searchNonPaged_( idx_t n, const float* x, int k, float* outDistancesData, idx_t* outIndicesData, const SearchParameters* params) const { + searchNonPaged_( + n, + static_cast(x), + NumericType::Float32, + k, + outDistancesData, + outIndicesData, + params); +} + +void GpuIndex::searchFromCpuPaged_( + idx_t n, + const void* x, + NumericType numeric_type, + int k, + float* outDistancesData, + idx_t* outIndicesData, + const SearchParameters* params) const { Tensor outDistances(outDistancesData, {n, k}); Tensor outIndices(outIndicesData, {n, k}); // Is pinned memory available? auto pinnedAlloc = resources_->getPinnedMemory(); idx_t pageSizeInVecs = - ((pinnedAlloc.second / 2) / (sizeof(float) * this->d)); + ((pinnedAlloc.second / 2) / + (get_numeric_type_size(numeric_type) * this->d)); if (!pinnedAlloc.first || pageSizeInVecs < 1) { // Just page without overlapping copy with compute idx_t batchSize = utils::nextHighestPowerOf2( - (kNonPinnedPageSize / (sizeof(float) * this->d))); + (kNonPinnedPageSize / + (get_numeric_type_size(numeric_type) * this->d))); for (idx_t cur = 0; cur < n; cur += batchSize) { auto num = std::min(batchSize, n - cur); auto outDistancesSlice = outDistances.narrowOutermost(cur, num); auto outIndicesSlice = outIndices.narrowOutermost(cur, num); - - searchNonPaged_( - num, - x + cur * this->d, - k, - outDistancesSlice.data(), - outIndicesSlice.data(), - params); + if (numeric_type == NumericType::Float32) { + searchNonPaged_( + num, + static_cast( + static_cast(x) + cur * this->d), + numeric_type, + k, + outDistancesSlice.data(), + outIndicesSlice.data(), + params); + } else if (numeric_type == NumericType::Float16) { + searchNonPaged_( + num, + static_cast( + static_cast(x) + cur * this->d), + 
numeric_type, + k, + outDistancesSlice.data(), + outIndicesSlice.data(), + params); + } } return; @@ -359,127 +512,162 @@ void GpuIndex::searchFromCpuPaged_( auto defaultStream = resources_->getDefaultStream(config_.device); auto copyStream = resources_->getAsyncCopyStream(config_.device); - float* bufPinnedA = (float*)pinnedAlloc.first; - float* bufPinnedB = bufPinnedA + (size_t)pageSizeInVecs * this->d; - float* bufPinned[2] = {bufPinnedA, bufPinnedB}; - - // Reserve space on the GPU for the destination of the pinned buffer - // copy - DeviceTensor bufGpuA( - resources_.get(), - makeTempAlloc(AllocType::Other, defaultStream), - {pageSizeInVecs, this->d}); - DeviceTensor bufGpuB( - resources_.get(), - makeTempAlloc(AllocType::Other, defaultStream), - {pageSizeInVecs, this->d}); - DeviceTensor* bufGpus[2] = {&bufGpuA, &bufGpuB}; - - // Copy completion events for the pinned buffers - std::unique_ptr eventPinnedCopyDone[2]; - - // Execute completion events for the GPU buffers - std::unique_ptr eventGpuExecuteDone[2]; - - // All offsets are in terms of number of vectors - - // Current start offset for buffer 1 - idx_t cur1 = 0; - idx_t cur1BufIndex = 0; - - // Current start offset for buffer 2 - idx_t cur2 = -1; - idx_t cur2BufIndex = 0; - - // Current start offset for buffer 3 - idx_t cur3 = -1; - idx_t cur3BufIndex = 0; - - while (cur3 < n) { - // Start async pinned -> GPU copy first (buf 2) - if (cur2 != -1 && cur2 < n) { - // Copy pinned to GPU - auto numToCopy = std::min(pageSizeInVecs, n - cur2); - - // Make sure any previous execution has completed before continuing - auto& eventPrev = eventGpuExecuteDone[cur2BufIndex]; - if (eventPrev.get()) { - eventPrev->streamWaitOnEvent(copyStream); - } - - CUDA_VERIFY(cudaMemcpyAsync( - bufGpus[cur2BufIndex]->data(), - bufPinned[cur2BufIndex], - numToCopy * this->d * sizeof(float), - cudaMemcpyHostToDevice, - copyStream)); - - // Mark a completion event in this stream - eventPinnedCopyDone[cur2BufIndex].reset(new 
CudaEvent(copyStream)); - - // We pick up from here - cur3 = cur2; - cur2 += numToCopy; - cur2BufIndex = (cur2BufIndex == 0) ? 1 : 0; - } - - if (cur3 != idx_t(-1) && cur3 < n) { - // Process on GPU - auto numToProcess = std::min(pageSizeInVecs, n - cur3); - - // Make sure the previous copy has completed before continuing - auto& eventPrev = eventPinnedCopyDone[cur3BufIndex]; - FAISS_ASSERT(eventPrev.get()); + auto dispatch = [&](auto dummy_type) { + using data_t = decltype(dummy_type); - eventPrev->streamWaitOnEvent(defaultStream); + data_t* bufPinnedA = (data_t*)pinnedAlloc.first; + data_t* bufPinnedB = bufPinnedA + (size_t)pageSizeInVecs * this->d; + data_t* bufPinned[2] = {bufPinnedA, bufPinnedB}; - // Create tensor wrappers - // DeviceTensor input(bufGpus[cur3BufIndex]->data(), - // {numToProcess, this->d}); - auto outDistancesSlice = - outDistances.narrowOutermost(cur3, numToProcess); - auto outIndicesSlice = - outIndices.narrowOutermost(cur3, numToProcess); - - searchImpl_( - numToProcess, - bufGpus[cur3BufIndex]->data(), - k, - outDistancesSlice.data(), - outIndicesSlice.data(), - params); - - // Create completion event - eventGpuExecuteDone[cur3BufIndex].reset( - new CudaEvent(defaultStream)); - - // We pick up from here - cur3BufIndex = (cur3BufIndex == 0) ? 
1 : 0; - cur3 += numToProcess; - } - - if (cur1 < n) { - // Copy CPU mem to CPU pinned - auto numToCopy = std::min(pageSizeInVecs, n - cur1); - - // Make sure any previous copy has completed before continuing - auto& eventPrev = eventPinnedCopyDone[cur1BufIndex]; - if (eventPrev.get()) { - eventPrev->cpuWaitOnEvent(); + // Reserve space on the GPU for the destination of the pinned buffer + // copy + DeviceTensor bufGpuA( + resources_.get(), + makeTempAlloc(AllocType::Other, defaultStream), + {pageSizeInVecs, this->d}); + DeviceTensor bufGpuB( + resources_.get(), + makeTempAlloc(AllocType::Other, defaultStream), + {pageSizeInVecs, this->d}); + DeviceTensor* bufGpus[2] = {&bufGpuA, &bufGpuB}; + + // Copy completion events for the pinned buffers + std::unique_ptr eventPinnedCopyDone[2]; + + // Execute completion events for the GPU buffers + std::unique_ptr eventGpuExecuteDone[2]; + + // All offsets are in terms of number of vectors + + // Current start offset for buffer 1 + idx_t cur1 = 0; + idx_t cur1BufIndex = 0; + + // Current start offset for buffer 2 + idx_t cur2 = -1; + idx_t cur2BufIndex = 0; + + // Current start offset for buffer 3 + idx_t cur3 = -1; + idx_t cur3BufIndex = 0; + + while (cur3 < n) { + // Start async pinned -> GPU copy first (buf 2) + if (cur2 != -1 && cur2 < n) { + // Copy pinned to GPU + auto numToCopy = std::min(pageSizeInVecs, n - cur2); + + // Make sure any previous execution has completed before + // continuing + auto& eventPrev = eventGpuExecuteDone[cur2BufIndex]; + if (eventPrev.get()) { + eventPrev->streamWaitOnEvent(copyStream); + } + + CUDA_VERIFY(cudaMemcpyAsync( + bufGpus[cur2BufIndex]->data(), + bufPinned[cur2BufIndex], + numToCopy * this->d * sizeof(data_t), + cudaMemcpyHostToDevice, + copyStream)); + + // Mark a completion event in this stream + eventPinnedCopyDone[cur2BufIndex].reset( + new CudaEvent(copyStream)); + + // We pick up from here + cur3 = cur2; + cur2 += numToCopy; + cur2BufIndex = (cur2BufIndex == 0) ? 
1 : 0; } - memcpy(bufPinned[cur1BufIndex], - x + cur1 * this->d, - numToCopy * this->d * sizeof(float)); + if (cur3 != idx_t(-1) && cur3 < n) { + // Process on GPU + auto numToProcess = std::min(pageSizeInVecs, n - cur3); + + // Make sure the previous copy has completed before continuing + auto& eventPrev = eventPinnedCopyDone[cur3BufIndex]; + FAISS_ASSERT(eventPrev.get()); + + eventPrev->streamWaitOnEvent(defaultStream); + + // Create tensor wrappers + // DeviceTensor + // input(bufGpus[cur3BufIndex]->data(), + // {numToProcess, this->d}); + auto outDistancesSlice = + outDistances.narrowOutermost(cur3, numToProcess); + auto outIndicesSlice = + outIndices.narrowOutermost(cur3, numToProcess); + + searchImpl_( + numToProcess, + static_cast(bufGpus[cur3BufIndex]->data()), + numeric_type, + k, + outDistancesSlice.data(), + outIndicesSlice.data(), + params); + + // Create completion event + eventGpuExecuteDone[cur3BufIndex].reset( + new CudaEvent(defaultStream)); + + // We pick up from here + cur3BufIndex = (cur3BufIndex == 0) ? 1 : 0; + cur3 += numToProcess; + } - // We pick up from here - cur2 = cur1; - cur1 += numToCopy; - cur1BufIndex = (cur1BufIndex == 0) ? 1 : 0; + if (cur1 < n) { + // Copy CPU mem to CPU pinned + auto numToCopy = std::min(pageSizeInVecs, n - cur1); + + // Make sure any previous copy has completed before continuing + auto& eventPrev = eventPinnedCopyDone[cur1BufIndex]; + if (eventPrev.get()) { + eventPrev->cpuWaitOnEvent(); + } + + memcpy(bufPinned[cur1BufIndex], + static_cast(x) + cur1 * this->d, + numToCopy * this->d * + get_numeric_type_size(numeric_type)); + + // We pick up from here + cur2 = cur1; + cur1 += numToCopy; + cur1BufIndex = (cur1BufIndex == 0) ? 
1 : 0; + } } + }; + + if (numeric_type == NumericType::Float32) { + dispatch(float{}); + } else if (numeric_type == NumericType::Float16) { + dispatch(half{}); + } else { + FAISS_THROW_MSG( + "GpuIndex::searchFromCpuPaged_: Unsupported numeric type"); } } +void GpuIndex::searchFromCpuPaged_( + idx_t n, + const float* x, + int k, + float* outDistancesData, + idx_t* outIndicesData, + const SearchParameters* params) const { + searchFromCpuPaged_( + n, + static_cast(x), + NumericType::Float32, + k, + outDistancesData, + outIndicesData, + params); +} + void GpuIndex::compute_residual(const float* x, float* residual, idx_t key) const { FAISS_THROW_MSG("compute_residual not implemented for this type of index"); diff --git a/faiss/gpu/GpuIndex.h b/faiss/gpu/GpuIndex.h index 33fd37158e..bdeb362d31 100644 --- a/faiss/gpu/GpuIndex.h +++ b/faiss/gpu/GpuIndex.h @@ -77,11 +77,17 @@ class GpuIndex : public faiss::Index { /// as needed /// Handles paged adds if the add set is too large; calls addInternal_ void add(idx_t, const float* x) override; + void add(idx_t, const void* x, NumericType numeric_type) override; /// `x` and `ids` can be resident on the CPU or any GPU; copies are /// performed as needed /// Handles paged adds if the add set is too large; calls addInternal_ void add_with_ids(idx_t n, const float* x, const idx_t* ids) override; + void add_with_ids( + idx_t n, + const void* x, + NumericType numeric_type, + const idx_t* xids) override; /// `x` and `labels` can be resident on the CPU or any GPU; copies are /// performed as needed @@ -97,6 +103,14 @@ class GpuIndex : public faiss::Index { float* distances, idx_t* labels, const SearchParameters* params = nullptr) const override; + void search( + idx_t n, + const void* x, + NumericType numeric_type, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params = nullptr) const override; /// `x`, `distances` and `labels` and `recons` can be resident on the CPU or /// any GPU; copies are performed as 
needed @@ -125,9 +139,23 @@ protected: /// Copy what we need from the CPU equivalent void copyFrom(const faiss::Index* index); + void copyFrom(const faiss::Index* index, NumericType numeric_type) { + if (numeric_type == NumericType::Float32) { + copyFrom(index); + } else { + FAISS_THROW_MSG("GpuIndex::copyFrom: unsupported numeric type"); + } + } /// Copy what we have to the CPU equivalent void copyTo(faiss::Index* index) const; + void copyTo(faiss::Index* index, NumericType numeric_type) const { + if (numeric_type == NumericType::Float32) { + copyTo(index); + } else { + FAISS_THROW_MSG("GpuIndex::copyTo: unsupported numeric type"); + } + } /// Does addImpl_ require IDs? If so, and no IDs are provided, we will /// generate them sequentially based on the order in which the IDs are added @@ -137,6 +165,18 @@ class GpuIndex : public faiss::Index { /// All data is guaranteed to be resident on our device virtual void addImpl_(idx_t n, const float* x, const idx_t* ids) = 0; + virtual void addImpl_( + idx_t n, + const void* x, + NumericType numeric_type, + const idx_t* ids) { + if (numeric_type == NumericType::Float32) { + addImpl_(n, static_cast<const float*>(x), ids); + } else { + FAISS_THROW_MSG("GpuIndex::addImpl_: unsupported numeric type"); + } + }; + /// Overridden to actually perform the search /// All data is guaranteed to be resident on our device virtual void searchImpl_( @@ -147,13 +187,44 @@ class GpuIndex : public faiss::Index { idx_t n, const float* x, int k, float* distances, idx_t* labels, const SearchParameters* params) const = 0; + virtual void searchImpl_( + idx_t n, + const void* x, + NumericType numeric_type, + int k, + float* distances, + idx_t* labels, + const SearchParameters* params) const { + if (numeric_type == NumericType::Float32) { + searchImpl_( + n, + static_cast<const float*>(x), + k, + distances, + labels, + params); + } else { + FAISS_THROW_MSG("GpuIndex::searchImpl_: unsupported numeric type"); + } + } + private: /// Handles paged 
adds if the add set is too large, passes to /// addImpl_ to actually perform the add for the current page void addPaged_(idx_t n, const float* x, const idx_t* ids); + void addPaged_( + idx_t n, + const void* x, + NumericType numeric_type, + const idx_t* ids); /// Calls addImpl_ for a single page of GPU-resident data void addPage_(idx_t n, const float* x, const idx_t* ids); + void addPage_( + idx_t n, + const void* x, + NumericType numeric_type, + const idx_t* ids); /// Calls searchImpl_ for a single page of GPU-resident data void searchNonPaged_( @@ -164,6 +235,15 @@ class GpuIndex : public faiss::Index { idx_t* outIndicesData, const SearchParameters* params) const; + void searchNonPaged_( + idx_t n, + const void* x, + NumericType numeric_type, + int k, + float* outDistancesData, + idx_t* outIndicesData, + const SearchParameters* params) const; + /// Calls searchImpl_ for a single page of GPU-resident data, /// handling paging of the data and copies from the CPU void searchFromCpuPaged_( @@ -173,6 +253,14 @@ class GpuIndex : public faiss::Index { float* outDistancesData, idx_t* outIndicesData, const SearchParameters* params) const; + void searchFromCpuPaged_( + idx_t n, + const void* x, + NumericType numeric_type, + int k, + float* outDistancesData, + idx_t* outIndicesData, + const SearchParameters* params) const; protected: /// Manages streams, cuBLAS handles and scratch memory for devices diff --git a/faiss/gpu/GpuIndexCagra.cu b/faiss/gpu/GpuIndexCagra.cu index e5ff7344cc..6bc4bc1cf5 100644 --- a/faiss/gpu/GpuIndexCagra.cu +++ b/faiss/gpu/GpuIndexCagra.cu @@ -27,6 +27,7 @@ #include #include #include +#include namespace faiss { namespace gpu { @@ -41,14 +42,18 @@ GpuIndexCagra::GpuIndexCagra( this->is_trained = false; } -void GpuIndexCagra::train(idx_t n, const float* x) { +void GpuIndexCagra::train(idx_t n, const void* x, NumericType numeric_type) { + numeric_type_ = numeric_type; + bool index_is_initialized = !std::holds_alternative(index_); + DeviceScope 
scope(config_.device); if (this->is_trained) { - FAISS_ASSERT(index_); + FAISS_ASSERT(index_is_initialized); return; } - FAISS_ASSERT(!index_); + // CuvsCagra not initialized + FAISS_ASSERT(!index_is_initialized); std::optional ivf_pq_params = std::nullopt; @@ -82,30 +87,61 @@ void GpuIndexCagra::train(idx_t n, const float* x) { ivf_pq_search_params->preferred_shmem_carveout = cagraConfig_.ivf_pq_search_params->preferred_shmem_carveout; } - index_ = std::make_shared( - this->resources_.get(), - this->d, - cagraConfig_.intermediate_graph_degree, - cagraConfig_.graph_degree, - static_cast(cagraConfig_.build_algo), - cagraConfig_.nn_descent_niter, - cagraConfig_.store_dataset, - this->metric_type, - this->metric_arg, - INDICES_64_BIT, - ivf_pq_params, - ivf_pq_search_params, - cagraConfig_.refine_rate, - cagraConfig_.guarantee_connectivity); - - index_->train(n, x); + + if (numeric_type == NumericType::Float32) { + index_ = std::make_shared>( + this->resources_.get(), + this->d, + cagraConfig_.intermediate_graph_degree, + cagraConfig_.graph_degree, + static_cast(cagraConfig_.build_algo), + cagraConfig_.nn_descent_niter, + cagraConfig_.store_dataset, + this->metric_type, + this->metric_arg, + INDICES_64_BIT, + ivf_pq_params, + ivf_pq_search_params, + cagraConfig_.refine_rate, + cagraConfig_.guarantee_connectivity); + std::get>>(index_)->train( + n, static_cast(x)); + } else if (numeric_type == NumericType::Float16) { + index_ = std::make_shared>( + this->resources_.get(), + this->d, + cagraConfig_.intermediate_graph_degree, + cagraConfig_.graph_degree, + static_cast(cagraConfig_.build_algo), + cagraConfig_.nn_descent_niter, + cagraConfig_.store_dataset, + this->metric_type, + this->metric_arg, + INDICES_64_BIT, + ivf_pq_params, + ivf_pq_search_params, + cagraConfig_.refine_rate, + cagraConfig_.guarantee_connectivity); + std::get>>(index_)->train( + n, static_cast(x)); + } else { + FAISS_THROW_MSG("GpuIndexCagra::train unsupported data type"); + } this->is_trained = 
true; this->ntotal = n; } +void GpuIndexCagra::train(idx_t n, const float* x) { + train(n, static_cast(x), NumericType::Float32); +} + +void GpuIndexCagra::add(idx_t n, const void* x, NumericType numeric_type) { + train(n, x, numeric_type); +} + void GpuIndexCagra::add(idx_t n, const float* x) { - train(n, x); + add(n, x, NumericType::Float32); } bool GpuIndexCagra::addImplRequiresIDs_() const { @@ -118,17 +154,19 @@ void GpuIndexCagra::addImpl_(idx_t n, const float* x, const idx_t* ids) { void GpuIndexCagra::searchImpl_( idx_t n, - const float* x, + const void* x, + NumericType numeric_type, int k, float* distances, idx_t* labels, const SearchParameters* search_params) const { - FAISS_ASSERT(this->is_trained && index_); + FAISS_ASSERT( + this->is_trained && + !std::holds_alternative(index_)); FAISS_ASSERT(n > 0); - - Tensor queries(const_cast(x), {n, this->d}); - Tensor outDistances(distances, {n, k}); - Tensor outLabels(const_cast(labels), {n, k}); + FAISS_THROW_IF_NOT_MSG( + numeric_type == numeric_type_, + "Inconsistent numeric type for train and search"); SearchParametersCagra* params; if (search_params) { @@ -138,41 +176,90 @@ void GpuIndexCagra::searchImpl_( params = new SearchParametersCagra{}; } - index_->search( - queries, - k, - outDistances, - outLabels, - params->max_queries, - params->itopk_size, - params->max_iterations, - static_cast(params->algo), - params->team_size, - params->search_width, - params->min_iterations, - params->thread_block_size, - static_cast(params->hashmap_mode), - params->hashmap_min_bitlen, - params->hashmap_max_fill_rate, - params->num_random_samplings, - params->seed); + Tensor outDistances(distances, {n, k}); + Tensor outLabels(const_cast(labels), {n, k}); + + if (numeric_type == NumericType::Float32) { + Tensor queries( + const_cast(static_cast(x)), {n, this->d}); + + std::get>>(index_)->search( + queries, + k, + outDistances, + outLabels, + params->max_queries, + params->itopk_size, + params->max_iterations, + 
static_cast(params->algo), + params->team_size, + params->search_width, + params->min_iterations, + params->thread_block_size, + static_cast(params->hashmap_mode), + params->hashmap_min_bitlen, + params->hashmap_max_fill_rate, + params->num_random_samplings, + params->seed); + + } else if (numeric_type == NumericType::Float16) { + Tensor queries( + const_cast(static_cast(x)), {n, this->d}); + + std::get>>(index_)->search( + queries, + k, + outDistances, + outLabels, + params->max_queries, + params->itopk_size, + params->max_iterations, + static_cast(params->algo), + params->team_size, + params->search_width, + params->min_iterations, + params->thread_block_size, + static_cast(params->hashmap_mode), + params->hashmap_min_bitlen, + params->hashmap_max_fill_rate, + params->num_random_samplings, + params->seed); + } else { + FAISS_THROW_MSG("GpuIndexCagra::searchImpl_ unsupported data type"); + } if (not search_params) { delete params; } } -void GpuIndexCagra::copyFrom(const faiss::IndexHNSWCagra* index) { +void GpuIndexCagra::searchImpl_( + idx_t n, + const float* x, + int k, + float* distances, + idx_t* labels, + const SearchParameters* search_params) const { + searchImpl_( + n, + static_cast(x), + NumericType::Float32, + k, + distances, + labels, + search_params); +} + +void GpuIndexCagra::copyFrom( + const faiss::IndexHNSWCagra* index, + NumericType numeric_type) { FAISS_ASSERT(index); + numeric_type_ = numeric_type; DeviceScope scope(config_.device); GpuIndex::copyFrom(index); - auto base_index = dynamic_cast(index->storage); - FAISS_ASSERT(base_index); - auto distances = base_index->get_xb(); - auto hnsw = index->hnsw; // copy level 0 to a dense knn graph matrix std::vector knn_graph; @@ -189,23 +276,51 @@ void GpuIndexCagra::copyFrom(const faiss::IndexHNSWCagra* index) { } } - index_ = std::make_shared( - this->resources_.get(), - this->d, - index->ntotal, - hnsw.nb_neighbors(0), - distances, - knn_graph.data(), - this->metric_type, - this->metric_arg, - 
INDICES_64_BIT); + if (numeric_type == NumericType::Float32) { + auto base_index = dynamic_cast(index->storage); + FAISS_ASSERT(base_index); + auto dataset = base_index->get_xb(); + + index_ = std::make_shared>( + this->resources_.get(), + this->d, + index->ntotal, + hnsw.nb_neighbors(0), + dataset, + knn_graph.data(), + this->metric_type, + this->metric_arg, + INDICES_64_BIT); + } else if (numeric_type == NumericType::Float16) { + auto base_index = dynamic_cast(index->storage); + FAISS_ASSERT(base_index); + auto dataset = (half*)base_index->codes.data(); + + index_ = std::make_shared>( + this->resources_.get(), + this->d, + index->ntotal, + hnsw.nb_neighbors(0), + dataset, + knn_graph.data(), + this->metric_type, + this->metric_arg, + INDICES_64_BIT); + } else { + FAISS_THROW_MSG("GpuIndexCagra::copyFrom unsupported data type"); + } this->is_trained = true; } -void GpuIndexCagra::copyTo(faiss::IndexHNSWCagra* index) const { - FAISS_ASSERT(index_ && this->is_trained && index); +void GpuIndexCagra::copyFrom(const faiss::IndexHNSWCagra* index) { + copyFrom(index, NumericType::Float32); +} +void GpuIndexCagra::copyTo(faiss::IndexHNSWCagra* index) const { + FAISS_ASSERT( + !std::holds_alternative(index_) && + this->is_trained && index); DeviceScope scope(config_.device); // @@ -215,18 +330,38 @@ void GpuIndexCagra::copyTo(faiss::IndexHNSWCagra* index) const { // This needs to be zeroed out as this implementation adds vectors to the // cpuIndex instead of copying fields index->ntotal = 0; + index->set_numeric_type(numeric_type_); + + idx_t graph_degree; + + if (numeric_type_ == NumericType::Float32) { + graph_degree = std::get>>(index_) + ->get_knngraph_degree(); + } else if (numeric_type_ == NumericType::Float16) { + graph_degree = std::get>>(index_) + ->get_knngraph_degree(); + } else { + FAISS_THROW_MSG("GpuIndexCagra::copyTo unsupported data type"); + } - auto graph_degree = index_->get_knngraph_degree(); auto M = graph_degree / 2; if (index->storage and 
index->own_fields) { delete index->storage; } - if (this->metric_type == METRIC_L2) { - index->storage = new IndexFlatL2(index->d); - } else if (this->metric_type == METRIC_INNER_PRODUCT) { - index->storage = new IndexFlatIP(index->d); + // storage depends on numerictype + if (numeric_type_ == NumericType::Float32) { + if (this->metric_type == METRIC_L2) { + index->storage = new IndexFlatL2(index->d); + } else if (this->metric_type == METRIC_INNER_PRODUCT) { + index->storage = new IndexFlatIP(index->d); + } + } else if (numeric_type_ == NumericType::Float16) { + auto qtype = ScalarQuantizer::QT_fp16; + index->storage = + new IndexScalarQuantizer(index->d, qtype, this->metric_type); } + index->own_fields = true; index->keep_max_size_level0 = true; index->hnsw.reset(); @@ -235,32 +370,69 @@ void GpuIndexCagra::copyTo(faiss::IndexHNSWCagra* index) const { index->hnsw.set_default_probas(M, 1.0 / log(M)); auto n_train = this->ntotal; - float* train_dataset; - auto dataset = index_->get_training_dataset(); bool allocation = false; - if (getDeviceForAddress(dataset) >= 0) { - train_dataset = new float[n_train * index->d]; - allocation = true; - raft::copy( - train_dataset, - dataset, - n_train * index->d, - this->resources_->getRaftHandleCurrentDevice().get_stream()); - } else { - train_dataset = const_cast(dataset); - } - // turn off as level 0 is copied from CAGRA graph - index->init_level0 = false; - if (!index->base_level_only) { - index->add(n_train, train_dataset); - } else { - index->hnsw.prepare_level_tab(n_train, false); - index->storage->add(n_train, train_dataset); - index->ntotal = n_train; - } - if (allocation) { - delete[] train_dataset; + if (numeric_type_ == NumericType::Float32) { + float* train_dataset; + const float* dataset = + std::get>>(index_) + ->get_training_dataset(); + if (getDeviceForAddress(dataset) >= 0) { + train_dataset = new float[n_train * index->d]; + allocation = true; + raft::copy( + train_dataset, + dataset, + n_train * index->d, + 
this->resources_->getRaftHandleCurrentDevice() + .get_stream()); + } else { + train_dataset = const_cast(dataset); + } + + // turn off as level 0 is copied from CAGRA graph + index->init_level0 = false; + if (!index->base_level_only) { + index->add(n_train, train_dataset); + } else { + index->hnsw.prepare_level_tab(n_train, false); + index->storage->add(n_train, train_dataset); + index->ntotal = n_train; + } + if (allocation) { + delete[] train_dataset; + } + } else if (numeric_type_ == NumericType::Float16) { + half* train_dataset; + const half* dataset = std::get>>(index_) + ->get_training_dataset(); + if (getDeviceForAddress(dataset) >= 0) { + train_dataset = new half[n_train * index->d]; + allocation = true; + raft::copy( + train_dataset, + dataset, + n_train * index->d, + this->resources_->getRaftHandleCurrentDevice() + .get_stream()); + } else { + train_dataset = const_cast(dataset); + } + + index->init_level0 = false; + if (!index->base_level_only) { + FAISS_THROW_MSG( + "Only base level copy is supported for FP16 types in GpuIndexCagra::copyTo"); + } else { + index->hnsw.prepare_level_tab(n_train, false); + index->storage->add_sa_codes( + n_train, (uint8_t*)train_dataset, nullptr); + index->ntotal = n_train; + } + + if (allocation) { + delete[] train_dataset; + } } auto graph = get_knngraph(); @@ -281,8 +453,18 @@ void GpuIndexCagra::copyTo(faiss::IndexHNSWCagra* index) const { void GpuIndexCagra::reset() { DeviceScope scope(config_.device); - if (index_) { - index_->reset(); + if (!std::holds_alternative(index_)) { + std::visit( + [](auto& index_ptr) { + using IndexPtrT = std::decay_t; + if constexpr (std::is_same_v) { + FAISS_THROW_MSG( + "CuvsCagra not initialized when calling GpuIndexCagra::reset"); + } else { + return index_ptr->reset(); + } + }, + index_); this->ntotal = 0; this->is_trained = false; } else { @@ -291,9 +473,26 @@ void GpuIndexCagra::reset() { } std::vector GpuIndexCagra::get_knngraph() const { - FAISS_ASSERT(index_ && 
this->is_trained); + FAISS_ASSERT( + !std::holds_alternative(index_) && + this->is_trained); + + return std::visit( + [](auto&& index_ptr) -> std::vector { + using IndexPtrT = std::decay_t; + + if constexpr (std::is_same_v) { + FAISS_THROW_MSG( + "CuvsCagra not initialized when calling GpuIndexCagra::get_knngraph"); + } else { + return index_ptr->get_knngraph(); + } + }, + index_); +} - return index_->get_knngraph(); +faiss::NumericType GpuIndexCagra::get_numeric_type() const { + return numeric_type_; } } // namespace gpu diff --git a/faiss/gpu/GpuIndexCagra.h b/faiss/gpu/GpuIndexCagra.h index bbf74cf5ee..cf4a706e7d 100644 --- a/faiss/gpu/GpuIndexCagra.h +++ b/faiss/gpu/GpuIndexCagra.h @@ -27,6 +27,9 @@ #include #include +#include +#include "faiss/Index.h" + namespace faiss { struct IndexHNSWCagra; } @@ -34,6 +37,7 @@ struct IndexHNSWCagra; namespace faiss { namespace gpu { +template class CuvsCagra; enum class graph_build_algo { @@ -252,6 +256,7 @@ struct GpuIndexCagra : public GpuIndex { /// the base dataset. Use this function when you want to add vectors with /// ids. Ref: https://github.com/facebookresearch/faiss/issues/4107 void add(idx_t n, const float* x) override; + void add(idx_t n, const void* x, NumericType numeric_type) override; /// Trains CAGRA based on the given vector data. /// NB: The use of the train function here is to build the CAGRA graph on @@ -259,10 +264,12 @@ struct GpuIndexCagra : public GpuIndex { /// of vectors (without IDs) to the index. There is no external quantizer to /// be trained here. 
void train(idx_t n, const float* x) override; + void train(idx_t n, const void* x, NumericType numeric_type) override; /// Initialize ourselves from the given CPU index; will overwrite /// all data in ourselves void copyFrom(const faiss::IndexHNSWCagra* index); + void copyFrom(const faiss::IndexHNSWCagra* index, NumericType numeric_type); /// Copy ourselves to the given CPU index; will overwrite all data /// in the index instance @@ -272,6 +279,8 @@ struct GpuIndexCagra : public GpuIndex { std::vector get_knngraph() const; + faiss::NumericType get_numeric_type() const; + protected: bool addImplRequiresIDs_() const override; @@ -285,12 +294,26 @@ struct GpuIndexCagra : public GpuIndex { float* distances, idx_t* labels, const SearchParameters* search_params) const override; + void searchImpl_( + idx_t n, + const void* x, + NumericType numeric_type, + int k, + float* distances, + idx_t* labels, + const SearchParameters* search_params) const override; /// Our configuration options const GpuIndexCagraConfig cagraConfig_; + faiss::NumericType numeric_type_; + /// Instance that we own; contains the inverted lists - std::shared_ptr index_; + std::variant< + std::monostate, + std::shared_ptr>, + std::shared_ptr>> + index_; }; } // namespace gpu diff --git a/faiss/gpu/impl/CuvsCagra.cu b/faiss/gpu/impl/CuvsCagra.cu index 22009d2852..acac6bcbbb 100644 --- a/faiss/gpu/impl/CuvsCagra.cu +++ b/faiss/gpu/impl/CuvsCagra.cu @@ -34,7 +34,8 @@ namespace faiss { namespace gpu { -CuvsCagra::CuvsCagra( +template +CuvsCagra::CuvsCagra( GpuResources* resources, int dim, idx_t intermediate_graph_degree, @@ -83,12 +84,13 @@ CuvsCagra::CuvsCagra( reset(); } -CuvsCagra::CuvsCagra( +template +CuvsCagra::CuvsCagra( GpuResources* resources, int dim, idx_t n, int graph_degree, - const float* distances, + const data_t* dataset, const idx_t* knn_graph, faiss::MetricType metric, float metricArg, @@ -104,18 +106,18 @@ CuvsCagra::CuvsCagra( indicesOptions == faiss::gpu::INDICES_64_BIT, "only 
INDICES_64_BIT is supported for cuVS CAGRA index"); - auto distances_on_gpu = getDeviceForAddress(distances) >= 0; + auto dataset_on_gpu = getDeviceForAddress(dataset) >= 0; auto knn_graph_on_gpu = getDeviceForAddress(knn_graph) >= 0; - FAISS_ASSERT(distances_on_gpu == knn_graph_on_gpu); + FAISS_ASSERT(dataset_on_gpu == knn_graph_on_gpu); - storage_ = distances; + storage_ = dataset; n_ = n; const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); - if (distances_on_gpu && knn_graph_on_gpu) { + if (dataset_on_gpu && knn_graph_on_gpu) { raft_handle.sync_stream(); // Copying to host so that cuvs::neighbors::cagra::index // creates an owning copy of the knn graph on device @@ -126,17 +128,16 @@ CuvsCagra::CuvsCagra( thrust::device_ptr(knn_graph + (n * graph_degree)), knn_graph_copy.data_handle()); - auto distances_mds = - raft::make_device_matrix_view( - distances, n, dim); + auto dataset_mds = raft::make_device_matrix_view( + dataset, n, dim); cuvs_index = std::make_shared< - cuvs::neighbors::cagra::index>( + cuvs::neighbors::cagra::index>( raft_handle, metricFaissToCuvs(metric_, false), - distances_mds, + dataset_mds, raft::make_const_mdspan(knn_graph_copy.view())); - } else if (!distances_on_gpu && !knn_graph_on_gpu) { + } else if (!dataset_on_gpu && !knn_graph_on_gpu) { // copy idx_t (int64_t) host knn_graph to uint32_t host knn_graph auto knn_graph_copy = raft::make_host_matrix(n, graph_degree); @@ -145,22 +146,23 @@ CuvsCagra::CuvsCagra( knn_graph + (n * graph_degree), knn_graph_copy.data_handle()); - auto distances_mds = raft::make_host_matrix_view( - distances, n, dim); + auto dataset_mds = raft::make_host_matrix_view( + dataset, n, dim); cuvs_index = std::make_shared< - cuvs::neighbors::cagra::index>( + cuvs::neighbors::cagra::index>( raft_handle, metricFaissToCuvs(metric_, false), - distances_mds, + dataset_mds, raft::make_const_mdspan(knn_graph_copy.view())); } else { FAISS_THROW_MSG( - "distances and knn_graph must both be in 
device or host memory"); + "dataset and knn_graph must both be in device or host memory"); } } -void CuvsCagra::train(idx_t n, const float* x) { +template +void CuvsCagra::train(idx_t n, const data_t* x) { storage_ = x; n_ = n; @@ -193,24 +195,25 @@ void CuvsCagra::train(idx_t n, const float* x) { } if (getDeviceForAddress(x) >= 0) { - auto dataset = - raft::make_device_matrix_view(x, n, dim_); + auto dataset = raft::make_device_matrix_view( + x, n, dim_); cuvs_index = std::make_shared< - cuvs::neighbors::cagra::index>( + cuvs::neighbors::cagra::index>( cuvs::neighbors::cagra::build( raft_handle, index_params_, dataset)); } else { auto dataset = - raft::make_host_matrix_view(x, n, dim_); + raft::make_host_matrix_view(x, n, dim_); cuvs_index = std::make_shared< - cuvs::neighbors::cagra::index>( + cuvs::neighbors::cagra::index>( cuvs::neighbors::cagra::build( raft_handle, index_params_, dataset)); } } -void CuvsCagra::search( - Tensor& queries, +template +void CuvsCagra::search( + Tensor& queries, int k, Tensor& outDistances, Tensor& outIndices, @@ -239,18 +242,18 @@ void CuvsCagra::search( if (!store_dataset_) { if (getDeviceForAddress(storage_) >= 0) { - auto dataset = raft::make_device_matrix_view( + auto dataset = raft::make_device_matrix_view( storage_, n_, dim_); cuvs_index->update_dataset(raft_handle, dataset); } else { - auto dataset = raft::make_host_matrix_view( + auto dataset = raft::make_host_matrix_view( storage_, n_, dim_); cuvs_index->update_dataset(raft_handle, dataset); } store_dataset_ = true; } - auto queries_view = raft::make_device_matrix_view( + auto queries_view = raft::make_device_matrix_view( queries.data(), numQueries, cols); auto distances_view = raft::make_device_matrix_view( outDistances.data(), numQueries, k_); @@ -291,16 +294,19 @@ void CuvsCagra::search( indices_view.data_handle()); } -void CuvsCagra::reset() { +template +void CuvsCagra::reset() { cuvs_index.reset(); } -idx_t CuvsCagra::get_knngraph_degree() const { +template +idx_t 
CuvsCagra::get_knngraph_degree() const { FAISS_ASSERT(cuvs_index); return static_cast(cuvs_index->graph_degree()); } -std::vector CuvsCagra::get_knngraph() const { +template +std::vector CuvsCagra::get_knngraph() const { FAISS_ASSERT(cuvs_index); const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); @@ -322,9 +328,12 @@ std::vector CuvsCagra::get_knngraph() const { return host_graph; } -const float* CuvsCagra::get_training_dataset() const { +template +const data_t* CuvsCagra::get_training_dataset() const { return storage_; } +template class CuvsCagra; +template class CuvsCagra; } // namespace gpu } // namespace faiss diff --git a/faiss/gpu/impl/CuvsCagra.cuh b/faiss/gpu/impl/CuvsCagra.cuh index 5202c4251c..a10e9fba61 100644 --- a/faiss/gpu/impl/CuvsCagra.cuh +++ b/faiss/gpu/impl/CuvsCagra.cuh @@ -45,6 +45,7 @@ enum class cagra_hash_mode { HASH, SMALL, AUTO }; namespace gpu { +template class CuvsCagra { public: CuvsCagra( @@ -70,7 +71,7 @@ class CuvsCagra { int dim, idx_t n, int graph_degree, - const float* distances, + const data_t* dataset, const idx_t* knn_graph, faiss::MetricType metric, float metricArg, @@ -78,10 +79,10 @@ class CuvsCagra { ~CuvsCagra() = default; - void train(idx_t n, const float* x); + void train(idx_t n, const data_t* x); void search( - Tensor& queries, + Tensor& queries, int k, Tensor& outDistances, Tensor& outIndices, @@ -105,14 +106,14 @@ class CuvsCagra { std::vector get_knngraph() const; - const float* get_training_dataset() const; + const data_t* get_training_dataset() const; private: /// Collection of GPU resources that we use GpuResources* resources_; /// Training dataset - const float* storage_; + const data_t* storage_; int n_; /// Expected dimensionality of the vectors @@ -147,9 +148,8 @@ class CuvsCagra { bool guarantee_connectivity_ = false; /// Instance of trained cuVS CAGRA index - std::shared_ptr> cuvs_index{ + std::shared_ptr> cuvs_index{ nullptr}; }; - } // namespace gpu } // namespace faiss 
diff --git a/faiss/gpu/test/TestGpuIndexCagra.cu b/faiss/gpu/test/TestGpuIndexCagra.cu index 16b94703c4..31dcfc03c9 100644 --- a/faiss/gpu/test/TestGpuIndexCagra.cu +++ b/faiss/gpu/test/TestGpuIndexCagra.cu @@ -21,6 +21,7 @@ * limitations under the License. */ +#include #include #include #include @@ -195,6 +196,142 @@ TEST(TestGpuIndexCagra, Float32_Query_IP) { queryTest(faiss::METRIC_INNER_PRODUCT, 0.98); } +void queryTestFP16(faiss::MetricType metric, double expected_recall) { + for (int tries = 0; tries < 5; ++tries) { + Options opt; + if (opt.buildAlgo == faiss::gpu::graph_build_algo::NN_DESCENT && + metric == faiss::METRIC_INNER_PRODUCT) { + continue; + } + + std::vector trainVecs = + faiss::gpu::randVecs(opt.numTrain, opt.dim); + if (metric == faiss::METRIC_INNER_PRODUCT) { + faiss::fvec_renorm_L2(opt.numTrain, opt.dim, trainVecs.data()); + } + + // train cpu index + faiss::IndexHNSWFlat cpuIndex(opt.dim, opt.graphDegree / 2, metric); + cpuIndex.hnsw.efConstruction = opt.k * 2; + cpuIndex.add(opt.numTrain, trainVecs.data()); + + // train gpu index + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexCagraConfig config; + config.device = opt.device; + config.graph_degree = opt.graphDegree; + config.intermediate_graph_degree = opt.intermediateGraphDegree; + config.build_algo = opt.buildAlgo; + + faiss::gpu::GpuIndexCagra gpuIndex(&res, cpuIndex.d, metric, config); + + // Create half vector + std::vector<__half> trainVecs_half(trainVecs.size()); + + for (size_t i = 0; i < trainVecs.size(); ++i) { + trainVecs_half[i] = __float2half(trainVecs[i]); + } + + gpuIndex.train( + opt.numTrain, + static_cast(trainVecs_half.data()), + faiss::NumericType::Float16); + + // query + auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + if (metric == faiss::METRIC_INNER_PRODUCT) { + faiss::fvec_renorm_L2(opt.numQuery, opt.dim, queryVecs.data()); + } + + std::vector refDistance(opt.numQuery * opt.k, 0); + std::vector 
refIndices(opt.numQuery * opt.k, -1); + faiss::SearchParametersHNSW cpuSearchParams; + cpuSearchParams.efSearch = opt.k * 2; + cpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + refDistance.data(), + refIndices.data(), + &cpuSearchParams); + + // test quality of searches + auto gpuRes = res.getResources(); + auto devAlloc = faiss::gpu::makeDevAlloc( + faiss::gpu::AllocType::FlatData, + gpuRes->getDefaultStreamCurrentDevice()); + faiss::gpu::DeviceTensor testDistance( + gpuRes.get(), devAlloc, {opt.numQuery, opt.k}); + faiss::gpu::DeviceTensor testIndices( + gpuRes.get(), devAlloc, {opt.numQuery, opt.k}); + // Create half vector + std::vector<__half> queryVecs_half(queryVecs.size()); + + for (size_t i = 0; i < queryVecs.size(); ++i) { + queryVecs_half[i] = __float2half(queryVecs[i]); + } + gpuIndex.search( + opt.numQuery, + queryVecs_half.data(), + faiss::NumericType::Float16, + opt.k, + testDistance.data(), + testIndices.data()); + + auto refDistanceDev = faiss::gpu::toDeviceTemporary( + gpuRes.get(), + refDistance, + gpuRes->getDefaultStreamCurrentDevice()); + auto refIndicesDev = faiss::gpu::toDeviceTemporary( + gpuRes.get(), + refIndices, + gpuRes->getDefaultStreamCurrentDevice()); + + auto raft_handle = gpuRes->getRaftHandleCurrentDevice(); + + auto ref_dis_mds = raft::make_device_matrix_view( + refDistanceDev.data(), opt.numQuery, opt.k); + auto ref_dis_mds_opt = + std::optional>( + ref_dis_mds); + auto ref_ind_mds = + raft::make_device_matrix_view( + refIndicesDev.data(), opt.numQuery, opt.k); + + auto test_dis_mds = raft::make_device_matrix_view( + testDistance.data(), opt.numQuery, opt.k); + auto test_dis_mds_opt = + std::optional>( + test_dis_mds); + + auto test_ind_mds = + raft::make_device_matrix_view( + testIndices.data(), opt.numQuery, opt.k); + + double scalar_init = 0; + auto recall_score = raft::make_host_scalar(scalar_init); + + raft::stats::neighborhood_recall( + raft_handle, + test_ind_mds, + ref_ind_mds, + recall_score.view(), + 
test_dis_mds_opt, + ref_dis_mds_opt); + ASSERT_TRUE(*recall_score.data_handle() > expected_recall); + } +} + +TEST(TestGpuIndexCagra, Float16_Query_L2) { + queryTestFP16(faiss::METRIC_L2, 0.98); +} + +TEST(TestGpuIndexCagra, Float16_Query_IP) { + queryTestFP16(faiss::METRIC_INNER_PRODUCT, 0.98); +} + void copyToTest( faiss::MetricType metric, double expected_recall, @@ -351,6 +488,158 @@ TEST(TestGpuIndexCagra, Float32_CopyTo_IP_BaseLevelOnly) { copyToTest(faiss::METRIC_INNER_PRODUCT, 0.98, true); } +void copyToTestFP16( + faiss::MetricType metric, + double expected_recall, + bool base_level_only) { + for (int tries = 0; tries < 5; ++tries) { + Options opt; + if (opt.buildAlgo == faiss::gpu::graph_build_algo::NN_DESCENT && + metric == faiss::METRIC_INNER_PRODUCT) { + continue; + } + + std::vector trainVecs = + faiss::gpu::randVecs(opt.numTrain, opt.dim); + if (metric == faiss::METRIC_INNER_PRODUCT) { + faiss::fvec_renorm_L2(opt.numTrain, opt.dim, trainVecs.data()); + } + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + if (metric == faiss::METRIC_INNER_PRODUCT) { + faiss::fvec_renorm_L2(opt.numAdd, opt.dim, addVecs.data()); + } + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + // train gpu index and copy to cpu index + faiss::gpu::GpuIndexCagraConfig config; + config.device = opt.device; + config.graph_degree = opt.graphDegree; + config.intermediate_graph_degree = opt.intermediateGraphDegree; + config.build_algo = opt.buildAlgo; + + // Create half vector + std::vector<__half> trainVecs_half(trainVecs.size()); + + for (size_t i = 0; i < trainVecs.size(); ++i) { + trainVecs_half[i] = __float2half(trainVecs[i]); + } + + faiss::gpu::GpuIndexCagra gpuIndex(&res, opt.dim, metric, config); + gpuIndex.train( + opt.numTrain, + static_cast(trainVecs_half.data()), + faiss::NumericType::Float16); + + faiss::IndexHNSWCagra copiedCpuIndex( + opt.dim, + opt.graphDegree / 2, + metric, + faiss::NumericType::Float16); + 
copiedCpuIndex.base_level_only = base_level_only; + gpuIndex.copyTo(&copiedCpuIndex); + copiedCpuIndex.hnsw.efConstruction = opt.k * 2; + + // train cpu index + faiss::IndexHNSWFlat cpuIndex(opt.dim, opt.graphDegree / 2, metric); + cpuIndex.hnsw.efConstruction = opt.k * 2; + cpuIndex.add(opt.numTrain, trainVecs.data()); + + // query indexes + auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + if (metric == faiss::METRIC_INNER_PRODUCT) { + faiss::fvec_renorm_L2(opt.numQuery, opt.dim, queryVecs.data()); + } + + std::vector refDistance(opt.numQuery * opt.k, 0); + std::vector refIndices(opt.numQuery * opt.k, -1); + faiss::SearchParametersHNSW cpuSearchParams; + cpuSearchParams.efSearch = opt.k * 2; + cpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + refDistance.data(), + refIndices.data(), + &cpuSearchParams); + + std::vector copyRefDistance(opt.numQuery * opt.k, 0); + std::vector copyRefIndices(opt.numQuery * opt.k, -1); + faiss::SearchParametersHNSW cpuSearchParamstwo; + cpuSearchParamstwo.efSearch = opt.k * 2; + copiedCpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + copyRefDistance.data(), + copyRefIndices.data(), + &cpuSearchParamstwo); + + // test quality of search + auto gpuRes = res.getResources(); + + auto refDistanceDev = faiss::gpu::toDeviceTemporary( + gpuRes.get(), + refDistance, + gpuRes->getDefaultStreamCurrentDevice()); + auto refIndicesDev = faiss::gpu::toDeviceTemporary( + gpuRes.get(), + refIndices, + gpuRes->getDefaultStreamCurrentDevice()); + + auto copyRefDistanceDev = faiss::gpu::toDeviceTemporary( + gpuRes.get(), + copyRefDistance, + gpuRes->getDefaultStreamCurrentDevice()); + auto copyRefIndicesDev = faiss::gpu::toDeviceTemporary( + gpuRes.get(), + copyRefIndices, + gpuRes->getDefaultStreamCurrentDevice()); + + auto raft_handle = gpuRes->getRaftHandleCurrentDevice(); + + auto ref_dis_mds = raft::make_device_matrix_view( + refDistanceDev.data(), opt.numQuery, opt.k); + auto ref_dis_mds_opt = + 
std::optional>( + ref_dis_mds); + auto ref_ind_mds = + raft::make_device_matrix_view( + refIndicesDev.data(), opt.numQuery, opt.k); + + auto copy_ref_dis_mds = raft::make_device_matrix_view( + copyRefDistanceDev.data(), opt.numQuery, opt.k); + auto copy_ref_dis_mds_opt = + std::optional>( + copy_ref_dis_mds); + auto copy_ref_ind_mds = + raft::make_device_matrix_view( + copyRefIndicesDev.data(), opt.numQuery, opt.k); + + double scalar_init = 0; + auto recall_score = raft::make_host_scalar(scalar_init); + + raft::stats::neighborhood_recall( + raft_handle, + copy_ref_ind_mds, + ref_ind_mds, + recall_score.view(), + copy_ref_dis_mds_opt, + ref_dis_mds_opt); + ASSERT_TRUE(*recall_score.data_handle() > expected_recall); + } +} + +// For fp16, only base level copy is supported +TEST(TestGpuIndexCagra, Float16_CopyTo_L2_BaseLevelOnly) { + copyToTestFP16(faiss::METRIC_L2, 0.98, true); +} + +TEST(TestGpuIndexCagra, Float16_CopyTo_IP_BaseLevelOnly) { + copyToTestFP16(faiss::METRIC_INNER_PRODUCT, 0.98, true); +} + void copyFromTest(faiss::MetricType metric, double expected_recall) { for (int tries = 0; tries < 5; ++tries) { Options opt; @@ -465,6 +754,147 @@ TEST(TestGpuIndexCagra, Float32_CopyFrom_IP) { copyFromTest(faiss::METRIC_INNER_PRODUCT, 0.98); } +void copyFromTestFP16(faiss::MetricType metric, double expected_recall) { + for (int tries = 0; tries < 5; ++tries) { + Options opt; + if (opt.buildAlgo == faiss::gpu::graph_build_algo::NN_DESCENT && + metric == faiss::METRIC_INNER_PRODUCT) { + continue; + } + + std::vector trainVecs = + faiss::gpu::randVecs(opt.numTrain, opt.dim); + if (metric == faiss::METRIC_INNER_PRODUCT) { + faiss::fvec_renorm_L2(opt.numTrain, opt.dim, trainVecs.data()); + } + + // train cpu index + faiss::IndexHNSWCagra cpuIndex( + opt.dim, + opt.graphDegree / 2, + metric, + faiss::NumericType::Float16); + cpuIndex.hnsw.efConstruction = opt.k * 2; + cpuIndex.add(opt.numTrain, trainVecs.data()); + + faiss::gpu::StandardGpuResources res; + 
res.noTempMemory(); + + // convert to gpu index + faiss::gpu::GpuIndexCagra copiedGpuIndex(&res, cpuIndex.d, metric); + copiedGpuIndex.copyFrom(&cpuIndex, faiss::NumericType::Float16); + + // train gpu index + faiss::gpu::GpuIndexCagraConfig config; + config.device = opt.device; + config.graph_degree = opt.graphDegree; + config.intermediate_graph_degree = opt.intermediateGraphDegree; + config.build_algo = opt.buildAlgo; + + // faiss::gpu::GpuIndexCagra gpuIndex(&res, opt.dim, metric, config); + // gpuIndex.train(opt.numTrain, trainVecs.data()); + + faiss::gpu::GpuIndexCagra gpuIndex(&res, cpuIndex.d, metric, config); + + // Create half vector + std::vector<__half> trainVecs_half(trainVecs.size()); + + for (size_t i = 0; i < trainVecs.size(); ++i) { + trainVecs_half[i] = __float2half(trainVecs[i]); + } + + gpuIndex.train( + opt.numTrain, + static_cast(trainVecs_half.data()), + faiss::NumericType::Float16); + + // query + auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + if (metric == faiss::METRIC_INNER_PRODUCT) { + faiss::fvec_renorm_L2(opt.numQuery, opt.dim, queryVecs.data()); + } + + // Create half vector + std::vector<__half> queryVecs_half(queryVecs.size()); + + for (size_t i = 0; i < queryVecs.size(); ++i) { + queryVecs_half[i] = __float2half(queryVecs[i]); + } + + auto gpuRes = res.getResources(); + auto devAlloc = faiss::gpu::makeDevAlloc( + faiss::gpu::AllocType::FlatData, + gpuRes->getDefaultStreamCurrentDevice()); + faiss::gpu::DeviceTensor copyTestDistance( + gpuRes.get(), devAlloc, {opt.numQuery, opt.k}); + faiss::gpu::DeviceTensor copyTestIndices( + gpuRes.get(), devAlloc, {opt.numQuery, opt.k}); + copiedGpuIndex.search( + opt.numQuery, + queryVecs_half.data(), + faiss::NumericType::Float16, + opt.k, + copyTestDistance.data(), + copyTestIndices.data()); + + faiss::gpu::DeviceTensor testDistance( + gpuRes.get(), devAlloc, {opt.numQuery, opt.k}); + faiss::gpu::DeviceTensor testIndices( + gpuRes.get(), devAlloc, {opt.numQuery, opt.k}); + 
gpuIndex.search( + opt.numQuery, + queryVecs_half.data(), + faiss::NumericType::Float16, + opt.k, + testDistance.data(), + testIndices.data()); + + // test quality of searches + auto raft_handle = gpuRes->getRaftHandleCurrentDevice(); + + auto test_dis_mds = raft::make_device_matrix_view( + testDistance.data(), opt.numQuery, opt.k); + auto test_dis_mds_opt = + std::optional>( + test_dis_mds); + + auto test_ind_mds = + raft::make_device_matrix_view( + testIndices.data(), opt.numQuery, opt.k); + + auto copy_test_dis_mds = + raft::make_device_matrix_view( + copyTestDistance.data(), opt.numQuery, opt.k); + auto copy_test_dis_mds_opt = + std::optional>( + copy_test_dis_mds); + + auto copy_test_ind_mds = + raft::make_device_matrix_view( + copyTestIndices.data(), opt.numQuery, opt.k); + + double scalar_init = 0; + auto recall_score = raft::make_host_scalar(scalar_init); + + raft::stats::neighborhood_recall( + raft_handle, + copy_test_ind_mds, + test_ind_mds, + recall_score.view(), + copy_test_dis_mds_opt, + test_dis_mds_opt); + ASSERT_TRUE(*recall_score.data_handle() > expected_recall); + } +} + +TEST(TestGpuIndexCagra, Float16_CopyFrom_L2) { + copyFromTestFP16(faiss::METRIC_L2, 0.98); +} + +TEST(TestGpuIndexCagra, Float16_CopyFrom_IP) { + copyFromTestFP16(faiss::METRIC_INNER_PRODUCT, 0.98); +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); diff --git a/faiss/gpu/test/test_cagra.py b/faiss/gpu/test/test_cagra.py index 164f700e9f..9c9297c888 100644 --- a/faiss/gpu/test/test_cagra.py +++ b/faiss/gpu/test/test_cagra.py @@ -8,7 +8,7 @@ import faiss from faiss.contrib import datasets, evaluation - +import numpy as np @unittest.skipIf( "CUVS" not in faiss.get_compile_options(), @@ -42,6 +42,40 @@ def test_compute_GT_L2(self): def test_compute_GT_IP(self): self.do_compute_GT(faiss.METRIC_INNER_PRODUCT) +@unittest.skipIf( + "CUVS" not in faiss.get_compile_options(), + "only if cuVS is compiled in") +class TestComputeGTFP16(unittest.TestCase): + + def 
do_compute_GT(self, metric): + d = 64 + k = 12 + ds = datasets.SyntheticDataset(d, 0, 10000, 100) + Dref, Iref = faiss.knn(ds.get_queries(), ds.get_database(), k, metric) + + res = faiss.StandardGpuResources() + + # attempt to set custom IVF-PQ params + cagraIndexConfig = faiss.GpuIndexCagraConfig() + cagraIndexIVFPQConfig = faiss.IVFPQBuildCagraConfig() + cagraIndexIVFPQConfig.kmeans_trainset_fraction = 0.1 + cagraIndexConfig.ivf_pq_params = cagraIndexIVFPQConfig + cagraIndexConfig.build_algo = faiss.graph_build_algo_IVF_PQ + + index = faiss.GpuIndexCagra(res, d, metric, cagraIndexConfig) + fp16_data = ds.get_database().astype(np.float16) + index.train(fp16_data, faiss.Float16) + fp16_queries = ds.get_queries().astype(np.float16) + Dnew, Inew = index.search(fp16_queries, k, numeric_type=faiss.Float16) + + evaluation.check_ref_knn_with_draws(Dref, Iref, Dnew, Inew, k) + + def test_compute_GT_L2(self): + self.do_compute_GT(faiss.METRIC_L2) + + def test_compute_GT_IP(self): + self.do_compute_GT(faiss.METRIC_INNER_PRODUCT) + @unittest.skipIf( "CUVS" not in faiss.get_compile_options(), "only if cuVS is compiled in") @@ -76,3 +110,40 @@ def test_interop_L2(self): def test_interop_IP(self): self.do_interop(faiss.METRIC_INNER_PRODUCT) + +@unittest.skipIf( + "CUVS" not in faiss.get_compile_options(), + "only if cuVS is compiled in") +class TestInteropFP16(unittest.TestCase): + + def do_interop(self, metric): + d = 64 + k = 12 + ds = datasets.SyntheticDataset(d, 0, 10000, 100) + + res = faiss.StandardGpuResources() + + index = faiss.GpuIndexCagra(res, d, metric) + fp16_data = ds.get_database().astype(np.float16) + index.train(fp16_data, faiss.Float16) + fp16_queries = ds.get_queries().astype(np.float16) + Dnew, Inew = index.search(fp16_queries, k, numeric_type=faiss.Float16) + + cpu_index = faiss.index_gpu_to_cpu(index) + Dref, Iref = cpu_index.search(ds.get_queries(), k) + + evaluation.check_ref_knn_with_draws(Dref, Iref, Dnew, Inew, k) + + deserialized_index = 
faiss.deserialize_index( + faiss.serialize_index(cpu_index)) + + gpu_index = faiss.index_cpu_to_gpu(res, 0, deserialized_index) + Dnew2, Inew2 = gpu_index.search(fp16_queries, k, numeric_type=faiss.Float16) + + evaluation.check_ref_knn_with_draws(Dnew2, Inew2, Dnew, Inew, k) + + def test_interop_L2(self): + self.do_interop(faiss.METRIC_L2) + + def test_interop_IP(self): + self.do_interop(faiss.METRIC_INNER_PRODUCT) diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp index 775e8b951c..2da075227e 100644 --- a/faiss/impl/index_read.cpp +++ b/faiss/impl/index_read.cpp @@ -1118,6 +1118,7 @@ Index* read_index(IOReader* f, int io_flags) { auto idx_hnsw_cagra = dynamic_cast(idxhnsw); READ1(idx_hnsw_cagra->base_level_only); READ1(idx_hnsw_cagra->num_base_level_search_entrypoints); + READ1(idx_hnsw_cagra->numeric_type_); } read_HNSW(&idxhnsw->hnsw, f); idxhnsw->storage = read_index(f, io_flags); diff --git a/faiss/impl/index_write.cpp b/faiss/impl/index_write.cpp index 5b65454fe3..02bb3a03a3 100644 --- a/faiss/impl/index_write.cpp +++ b/faiss/impl/index_write.cpp @@ -781,6 +781,7 @@ void write_index(const Index* idx, IOWriter* f, int io_flags) { auto idx_hnsw_cagra = dynamic_cast(idxhnsw); WRITE1(idx_hnsw_cagra->base_level_only); WRITE1(idx_hnsw_cagra->num_base_level_search_entrypoints); + WRITE1(idx_hnsw_cagra->numeric_type_); } write_HNSW(&idxhnsw->hnsw, f); if (io_flags & IO_FLAG_SKIP_STORAGE) { diff --git a/faiss/python/class_wrappers.py b/faiss/python/class_wrappers.py index 2491aa8914..51d8f570cb 100644 --- a/faiss/python/class_wrappers.py +++ b/faiss/python/class_wrappers.py @@ -211,7 +211,7 @@ def replacement_build(self, x, graph): def handle_Index(the_class): - def replacement_add(self, x): + def replacement_add(self, x, numeric_type = faiss.Float32): """Adds vectors to the index. The index must be trained before vectors can be added to it. The vectors are implicitly numbered in sequence. 
When `n` vectors are @@ -226,7 +226,12 @@ def replacement_add(self, x): n, d = x.shape assert d == self.d - x = np.ascontiguousarray(x, dtype='float32') - self.add_c(n, swig_ptr(x)) + if numeric_type == faiss.Float32: + x = np.ascontiguousarray(x, dtype='float32') + self.add_c(n, swig_ptr(x)) + else: + x = np.ascontiguousarray(x, dtype='float16') + self.add_c(n, swig_ptr(x), faiss.Float16) def replacement_add_with_ids(self, x, ids): @@ -282,7 +285,7 @@ def replacement_assign(self, x, k, labels=None): self.assign_c(n, swig_ptr(x), swig_ptr(labels), k) return labels - def replacement_train(self, x): + def replacement_train(self, x, numeric_type = faiss.Float32): """Trains the index on a representative set of vectors. The index must be trained before vectors can be added to it. @@ -294,10 +297,14 @@ def replacement_train(self, x): """ n, d = x.shape assert d == self.d - x = np.ascontiguousarray(x, dtype='float32') - self.train_c(n, swig_ptr(x)) + if numeric_type == faiss.Float32: + x = np.ascontiguousarray(x, dtype='float32') + self.train_c(n, swig_ptr(x)) + else: + x = np.ascontiguousarray(x, dtype='float16') + self.train_c(n, swig_ptr(x), faiss.Float16) - def replacement_search(self, x, k, *, params=None, D=None, I=None): + def replacement_search(self, x, k, *, params=None, D=None, I=None, numeric_type = faiss.Float32): """Find the k nearest neighbors of the set of vectors x in the index.
Parameters @@ -325,7 +333,10 @@ def replacement_search(self, x, k, *, params=None, D=None, I=None): """ n, d = x.shape - x = np.ascontiguousarray(x, dtype='float32') + if numeric_type == faiss.Float32: + x = np.ascontiguousarray(x, dtype='float32') + else: + x = np.ascontiguousarray(x, dtype='float16') assert d == self.d assert k > 0 @@ -340,7 +351,10 @@ def replacement_search(self, x, k, *, params=None, D=None, I=None): else: assert I.shape == (n, k) - self.search_c(n, swig_ptr(x), k, swig_ptr(D), swig_ptr(I), params) + if numeric_type == faiss.Float32: + self.search_c(n, swig_ptr(x), k, swig_ptr(D), swig_ptr(I), params) + else: + self.search_c(n, swig_ptr(x), faiss.Float16, k, swig_ptr(D), swig_ptr(I), params) return D, I def replacement_search_and_reconstruct(self, x, k, *, params=None, D=None, I=None, R=None):