diff --git a/faiss/Index.h b/faiss/Index.h index 2474f08be4..95af05df74 100644 --- a/faiss/Index.h +++ b/faiss/Index.h @@ -11,6 +11,8 @@ #define FAISS_INDEX_H #include +#include + #include #include #include @@ -56,6 +58,23 @@ struct IDSelector; struct RangeSearchResult; struct DistanceComputer; +enum NumericType { + Float32, + Float16, +}; + +inline size_t get_numeric_type_size(NumericType numeric_type) { + switch (numeric_type) { + case NumericType::Float32: + return 4; + case NumericType::Float16: + return 2; + default: + FAISS_THROW_MSG( + "Unknown Numeric Type. Only supports Float32, Float16"); + } +} + /** Parent class for the optional search paramenters. * * Sub-classes with additional search parameters should inherit this class. @@ -107,6 +126,14 @@ struct Index { */ virtual void train(idx_t n, const float* x); + virtual void train(idx_t n, const void* x, NumericType numeric_type) { + if (numeric_type == NumericType::Float32) { + train(n, static_cast(x)); + } else { + FAISS_THROW_MSG("Index::train: unsupported numeric type"); + } + } + /** Add n vectors of dimension d to the index. * * Vectors are implicitly assigned labels ntotal .. ntotal + n - 1 @@ -117,6 +144,14 @@ struct Index { */ virtual void add(idx_t n, const float* x) = 0; + virtual void add(idx_t n, const void* x, NumericType numeric_type) { + if (numeric_type == NumericType::Float32) { + add(n, static_cast(x)); + } else { + FAISS_THROW_MSG("Index::add: unsupported numeric type"); + } + } + /** Same as add, but stores xids instead of sequential ids. 
* * The default implementation fails with an assertion, as it is @@ -127,6 +162,17 @@ struct Index { * @param xids if non-null, ids to store for the vectors (size n) */ virtual void add_with_ids(idx_t n, const float* x, const idx_t* xids); + virtual void add_with_ids( + idx_t n, + const void* x, + NumericType numeric_type, + const idx_t* xids) { + if (numeric_type == NumericType::Float32) { + add_with_ids(n, static_cast(x), xids); + } else { + FAISS_THROW_MSG("Index::add_with_ids: unsupported numeric type"); + } + } /** query n vectors of dimension d to the index. * @@ -147,6 +193,26 @@ struct Index { idx_t* labels, const SearchParameters* params = nullptr) const = 0; + virtual void search( + idx_t n, + const void* x, + NumericType numeric_type, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params = nullptr) const { + if (numeric_type == NumericType::Float32) { + search(n, + static_cast(x), + k, + distances, + labels, + params); + } else { + FAISS_THROW_MSG("Index::search: unsupported numeric type"); + } + } + /** query n vectors of dimension d to the index. * * return all vectors with distance < radius. Note that many diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp index 5983e9d831..1ee15f4484 100644 --- a/faiss/IndexHNSW.cpp +++ b/faiss/IndexHNSW.cpp @@ -19,6 +19,7 @@ #include #include +#include "faiss/Index.h" #include #include @@ -893,15 +894,31 @@ IndexHNSWCagra::IndexHNSWCagra() { is_trained = true; } -IndexHNSWCagra::IndexHNSWCagra(int d, int M, MetricType metric) - : IndexHNSW( - (metric == METRIC_L2) - ? 
static_cast(new IndexFlatL2(d)) - : static_cast(new IndexFlatIP(d)), - M) { +IndexHNSWCagra::IndexHNSWCagra( + int d, + int M, + MetricType metric, + NumericType numeric_type) + : IndexHNSW(d, M, metric) { FAISS_THROW_IF_NOT_MSG( ((metric == METRIC_L2) || (metric == METRIC_INNER_PRODUCT)), "unsupported metric type for IndexHNSWCagra"); + numeric_type_ = numeric_type; + if (numeric_type == NumericType::Float32) { + // Use flat storage with full precision for fp32 + storage = (metric == METRIC_L2) + ? static_cast(new IndexFlatL2(d)) + : static_cast(new IndexFlatIP(d)); + } else if (numeric_type == NumericType::Float16) { + auto qtype = ScalarQuantizer::QT_fp16; + storage = new IndexScalarQuantizer(d, qtype, metric); + } else { + FAISS_THROW_MSG( + "Unsupported numeric_type: only F16 and F32 are supported for IndexHNSWCagra"); + } + + metric_arg = storage->metric_arg; + own_fields = true; is_trained = true; init_level0 = true; @@ -967,4 +984,12 @@ void IndexHNSWCagra::search( } } +faiss::NumericType IndexHNSWCagra::get_numeric_type() const { + return numeric_type_; +} + +void IndexHNSWCagra::set_numeric_type(faiss::NumericType numeric_type) { + numeric_type_ = numeric_type; +} + } // namespace faiss diff --git a/faiss/IndexHNSW.h b/faiss/IndexHNSW.h index 2d983b3c16..c6e80df462 100644 --- a/faiss/IndexHNSW.h +++ b/faiss/IndexHNSW.h @@ -10,6 +10,7 @@ #pragma once #include +#include "faiss/Index.h" #include #include @@ -170,7 +171,11 @@ struct IndexHNSW2Level : IndexHNSW { struct IndexHNSWCagra : IndexHNSW { IndexHNSWCagra(); - IndexHNSWCagra(int d, int M, MetricType metric = METRIC_L2); + IndexHNSWCagra( + int d, + int M, + MetricType metric = METRIC_L2, + NumericType numeric_type = NumericType::Float32); /// When set to true, the index is immutable. 
/// This option is used to copy the knn graph from GpuIndexCagra @@ -195,6 +200,10 @@ struct IndexHNSWCagra : IndexHNSW { float* distances, idx_t* labels, const SearchParameters* params = nullptr) const override; + + faiss::NumericType get_numeric_type() const; + void set_numeric_type(faiss::NumericType numeric_type); + NumericType numeric_type_; }; } // namespace faiss diff --git a/faiss/gpu/GpuCloner.cpp b/faiss/gpu/GpuCloner.cpp index 575ee2e0a5..faea7519d9 100644 --- a/faiss/gpu/GpuCloner.cpp +++ b/faiss/gpu/GpuCloner.cpp @@ -95,6 +95,9 @@ Index* ToCPUCloner::clone_Index(const Index* index) { #if defined USE_NVIDIA_CUVS else if (auto icg = dynamic_cast(index)) { IndexHNSWCagra* res = new IndexHNSWCagra(); + if (icg->get_numeric_type() == faiss::NumericType::Float16) { + res->base_level_only = true; + } icg->copyTo(res); return res; } @@ -236,7 +239,7 @@ Index* ToGpuCloner::clone_Index(const Index* index) { config.device = device; GpuIndexCagra* res = new GpuIndexCagra(provider, icg->d, icg->metric_type, config); - res->copyFrom(icg); + res->copyFrom(icg, icg->get_numeric_type()); return res; } #endif diff --git a/faiss/gpu/GpuIndex.cu b/faiss/gpu/GpuIndex.cu index 033cb9aaa9..fb157c84a7 100644 --- a/faiss/gpu/GpuIndex.cu +++ b/faiss/gpu/GpuIndex.cu @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -109,12 +110,20 @@ size_t GpuIndex::getMinPagingSize() const { return minPagedSize_; } +void GpuIndex::add(idx_t n, const void* x, NumericType numeric_type) { + add_with_ids(n, x, numeric_type, nullptr); +} + void GpuIndex::add(idx_t n, const float* x) { // Pass to add_with_ids - add_with_ids(n, x, nullptr); + add(n, x, NumericType::Float32); } -void GpuIndex::add_with_ids(idx_t n, const float* x, const idx_t* ids) { +void GpuIndex::add_with_ids( + idx_t n, + const void* x, + NumericType numeric_type, + const idx_t* ids) { DeviceScope scope(config_.device); FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained"); @@ -134,18 +143,33 @@ 
void GpuIndex::add_with_ids(idx_t n, const float* x, const idx_t* ids) { } } - addPaged_(n, x, ids ? ids : generatedIds.data()); + addPaged_(n, x, numeric_type, ids ? ids : generatedIds.data()); } -void GpuIndex::addPaged_(idx_t n, const float* x, const idx_t* ids) { - if (n > 0) { - idx_t totalSize = n * this->d * sizeof(float); +void GpuIndex::add_with_ids(idx_t n, const float* x, const idx_t* ids) { + add_with_ids(n, static_cast(x), NumericType::Float32, ids); +} + +void GpuIndex::addPaged_( + idx_t n, + const void* x, + NumericType numeric_type, + const idx_t* ids) { + if (n <= 0) { + return; + } + + auto dispatch = [&](auto dummy_type) { + using data_t = decltype(dummy_type); + const data_t* typed_x = reinterpret_cast(x); + + idx_t totalSize = n * this->d * sizeof(data_t); if (!should_use_cuvs(config_) && (totalSize > kAddPageSize || n > kAddVecSize)) { // How many vectors fit into kAddPageSize? idx_t maxNumVecsForPageSize = - kAddPageSize / (this->d * sizeof(float)); + kAddPageSize / (this->d * sizeof(data_t)); // Always add at least 1 vector, if we have huge vectors maxNumVecsForPageSize = std::max(maxNumVecsForPageSize, idx_t(1)); @@ -155,16 +179,35 @@ void GpuIndex::addPaged_(idx_t n, const float* x, const idx_t* ids) { for (idx_t i = 0; i < n; i += tileSize) { auto curNum = std::min(tileSize, n - i); - - addPage_(curNum, x + i * this->d, ids ? ids + i : nullptr); + addPage_( + curNum, + static_cast(typed_x + i * this->d), + numeric_type, + ids ? 
ids + i : nullptr); } } else { - addPage_(n, x, ids); + addPage_(n, static_cast(typed_x), numeric_type, ids); } + }; + + if (numeric_type == NumericType::Float32) { + dispatch(float{}); + } else if (numeric_type == NumericType::Float16) { + dispatch(half{}); + } else { + FAISS_THROW_MSG("GpuIndex::addPaged_: Unsupported numeric type"); } } -void GpuIndex::addPage_(idx_t n, const float* x, const idx_t* ids) { +void GpuIndex::addPaged_(idx_t n, const float* x, const idx_t* ids) { + addPaged_(n, static_cast(x), NumericType::Float32, ids); +} + +void GpuIndex::addPage_( + idx_t n, + const void* x, + NumericType numeric_type, + const idx_t* ids) { // At this point, `x` can be resident on CPU or GPU, and `ids` may be // resident on CPU, GPU or may be null. // @@ -172,27 +215,51 @@ void GpuIndex::addPage_(idx_t n, const float* x, const idx_t* ids) { // GPU. auto stream = resources_->getDefaultStreamCurrentDevice(); - auto vecs = toDeviceTemporary( - resources_.get(), - config_.device, - const_cast(x), - stream, - {n, this->d}); + auto dispatch = [&](auto dummy_type) { + using data_t = decltype(dummy_type); - if (ids) { - auto indices = toDeviceTemporary( + auto vecs = toDeviceTemporary( resources_.get(), config_.device, - const_cast(ids), + const_cast(reinterpret_cast(x)), stream, - {n}); + {n, this->d}); + + if (ids) { + auto indices = toDeviceTemporary( + resources_.get(), + config_.device, + const_cast(ids), + stream, + {n}); + + addImpl_( + n, + static_cast(vecs.data()), + numeric_type, + ids ? indices.data() : nullptr); + } else { + addImpl_( + n, + static_cast(vecs.data()), + numeric_type, + nullptr); + } + }; - addImpl_(n, vecs.data(), ids ? 
indices.data() : nullptr); + if (numeric_type == NumericType::Float32) { + dispatch(float{}); + } else if (numeric_type == NumericType::Float16) { + dispatch(half{}); } else { - addImpl_(n, vecs.data(), nullptr); + FAISS_THROW_MSG("GpuIndex::addPage_: Unsupported numeric type"); } } +void GpuIndex::addPage_(idx_t n, const float* x, const idx_t* ids) { + addPage_(n, static_cast(x), NumericType::Float32, ids); +} + void GpuIndex::assign(idx_t n, const float* x, idx_t* labels, idx_t k) const { DeviceScope scope(config_.device); FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained"); @@ -212,7 +279,8 @@ void GpuIndex::assign(idx_t n, const float* x, idx_t* labels, idx_t k) const { void GpuIndex::search( idx_t n, - const float* x, + const void* x, + NumericType numeric_type, idx_t k, float* distances, idx_t* labels, @@ -253,17 +321,31 @@ void GpuIndex::search( // -> GPU. // Currently, we don't handle the case where the output data won't // fit on the GPU (e.g., n * k is too large for the GPU memory). 
- size_t dataSize = (size_t)n * this->d * sizeof(float); + size_t dataSize = + (size_t)n * this->d * get_numeric_type_size(numeric_type); if (dataSize >= minPagedSize_) { searchFromCpuPaged_( - n, x, k, outDistances.data(), outLabels.data(), params); + n, + x, + numeric_type, + k, + outDistances.data(), + outLabels.data(), + params); usePaged = true; } } if (!usePaged) { - searchNonPaged_(n, x, k, outDistances.data(), outLabels.data(), params); + searchNonPaged_( + n, + x, + numeric_type, + k, + outDistances.data(), + outLabels.data(), + params); } // Copy back if necessary @@ -271,6 +353,22 @@ void GpuIndex::search( fromDevice(outLabels, labels, stream); } +void GpuIndex::search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params) const { + search(n, + static_cast(x), + NumericType::Float32, + k, + distances, + labels, + params); +} + void GpuIndex::search_and_reconstruct( idx_t n, const float* x, @@ -285,7 +383,8 @@ void GpuIndex::search_and_reconstruct( void GpuIndex::searchNonPaged_( idx_t n, - const float* x, + const void* x, + NumericType numeric_type, int k, float* outDistancesData, idx_t* outIndicesData, @@ -294,49 +393,103 @@ void GpuIndex::searchNonPaged_( // Make sure arguments are on the device we desire; use temporary // memory allocations to move it if necessary - auto vecs = toDeviceTemporary( - resources_.get(), - config_.device, - const_cast(x), - stream, - {n, this->d}); - - searchImpl_(n, vecs.data(), k, outDistancesData, outIndicesData, params); + if (numeric_type == NumericType::Float32) { + auto vecs = toDeviceTemporary( + resources_.get(), + config_.device, + const_cast(static_cast(x)), + stream, + {n, this->d}); + + searchImpl_( + n, vecs.data(), k, outDistancesData, outIndicesData, params); + } else if (numeric_type == NumericType::Float16) { + auto vecs = toDeviceTemporary( + resources_.get(), + config_.device, + const_cast(static_cast(x)), + stream, + {n, this->d}); + + 
searchImpl_( + n, + static_cast(vecs.data()), + numeric_type, + k, + outDistancesData, + outIndicesData, + params); + } else { + FAISS_THROW_MSG("GpuIndex::search: Unsupported numeric type"); + } } -void GpuIndex::searchFromCpuPaged_( +void GpuIndex::searchNonPaged_( idx_t n, const float* x, int k, float* outDistancesData, idx_t* outIndicesData, const SearchParameters* params) const { + searchNonPaged_( + n, + static_cast(x), + NumericType::Float32, + k, + outDistancesData, + outIndicesData, + params); +} + +void GpuIndex::searchFromCpuPaged_( + idx_t n, + const void* x, + NumericType numeric_type, + int k, + float* outDistancesData, + idx_t* outIndicesData, + const SearchParameters* params) const { Tensor outDistances(outDistancesData, {n, k}); Tensor outIndices(outIndicesData, {n, k}); // Is pinned memory available? auto pinnedAlloc = resources_->getPinnedMemory(); idx_t pageSizeInVecs = - ((pinnedAlloc.second / 2) / (sizeof(float) * this->d)); + ((pinnedAlloc.second / 2) / + (get_numeric_type_size(numeric_type) * this->d)); if (!pinnedAlloc.first || pageSizeInVecs < 1) { // Just page without overlapping copy with compute idx_t batchSize = utils::nextHighestPowerOf2( - (kNonPinnedPageSize / (sizeof(float) * this->d))); + (kNonPinnedPageSize / + (get_numeric_type_size(numeric_type) * this->d))); for (idx_t cur = 0; cur < n; cur += batchSize) { auto num = std::min(batchSize, n - cur); auto outDistancesSlice = outDistances.narrowOutermost(cur, num); auto outIndicesSlice = outIndices.narrowOutermost(cur, num); - - searchNonPaged_( - num, - x + cur * this->d, - k, - outDistancesSlice.data(), - outIndicesSlice.data(), - params); + if (numeric_type == NumericType::Float32) { + searchNonPaged_( + num, + static_cast( + static_cast(x) + cur * this->d), + numeric_type, + k, + outDistancesSlice.data(), + outIndicesSlice.data(), + params); + } else if (numeric_type == NumericType::Float16) { + searchNonPaged_( + num, + static_cast( + static_cast(x) + cur * this->d), + 
numeric_type, + k, + outDistancesSlice.data(), + outIndicesSlice.data(), + params); + } } return; @@ -359,127 +512,162 @@ void GpuIndex::searchFromCpuPaged_( auto defaultStream = resources_->getDefaultStream(config_.device); auto copyStream = resources_->getAsyncCopyStream(config_.device); - float* bufPinnedA = (float*)pinnedAlloc.first; - float* bufPinnedB = bufPinnedA + (size_t)pageSizeInVecs * this->d; - float* bufPinned[2] = {bufPinnedA, bufPinnedB}; - - // Reserve space on the GPU for the destination of the pinned buffer - // copy - DeviceTensor bufGpuA( - resources_.get(), - makeTempAlloc(AllocType::Other, defaultStream), - {pageSizeInVecs, this->d}); - DeviceTensor bufGpuB( - resources_.get(), - makeTempAlloc(AllocType::Other, defaultStream), - {pageSizeInVecs, this->d}); - DeviceTensor* bufGpus[2] = {&bufGpuA, &bufGpuB}; - - // Copy completion events for the pinned buffers - std::unique_ptr eventPinnedCopyDone[2]; - - // Execute completion events for the GPU buffers - std::unique_ptr eventGpuExecuteDone[2]; - - // All offsets are in terms of number of vectors - - // Current start offset for buffer 1 - idx_t cur1 = 0; - idx_t cur1BufIndex = 0; - - // Current start offset for buffer 2 - idx_t cur2 = -1; - idx_t cur2BufIndex = 0; - - // Current start offset for buffer 3 - idx_t cur3 = -1; - idx_t cur3BufIndex = 0; - - while (cur3 < n) { - // Start async pinned -> GPU copy first (buf 2) - if (cur2 != -1 && cur2 < n) { - // Copy pinned to GPU - auto numToCopy = std::min(pageSizeInVecs, n - cur2); - - // Make sure any previous execution has completed before continuing - auto& eventPrev = eventGpuExecuteDone[cur2BufIndex]; - if (eventPrev.get()) { - eventPrev->streamWaitOnEvent(copyStream); - } - - CUDA_VERIFY(cudaMemcpyAsync( - bufGpus[cur2BufIndex]->data(), - bufPinned[cur2BufIndex], - numToCopy * this->d * sizeof(float), - cudaMemcpyHostToDevice, - copyStream)); - - // Mark a completion event in this stream - eventPinnedCopyDone[cur2BufIndex].reset(new 
CudaEvent(copyStream)); - - // We pick up from here - cur3 = cur2; - cur2 += numToCopy; - cur2BufIndex = (cur2BufIndex == 0) ? 1 : 0; - } - - if (cur3 != idx_t(-1) && cur3 < n) { - // Process on GPU - auto numToProcess = std::min(pageSizeInVecs, n - cur3); - - // Make sure the previous copy has completed before continuing - auto& eventPrev = eventPinnedCopyDone[cur3BufIndex]; - FAISS_ASSERT(eventPrev.get()); + auto dispatch = [&](auto dummy_type) { + using data_t = decltype(dummy_type); - eventPrev->streamWaitOnEvent(defaultStream); + data_t* bufPinnedA = (data_t*)pinnedAlloc.first; + data_t* bufPinnedB = bufPinnedA + (size_t)pageSizeInVecs * this->d; + data_t* bufPinned[2] = {bufPinnedA, bufPinnedB}; - // Create tensor wrappers - // DeviceTensor input(bufGpus[cur3BufIndex]->data(), - // {numToProcess, this->d}); - auto outDistancesSlice = - outDistances.narrowOutermost(cur3, numToProcess); - auto outIndicesSlice = - outIndices.narrowOutermost(cur3, numToProcess); - - searchImpl_( - numToProcess, - bufGpus[cur3BufIndex]->data(), - k, - outDistancesSlice.data(), - outIndicesSlice.data(), - params); - - // Create completion event - eventGpuExecuteDone[cur3BufIndex].reset( - new CudaEvent(defaultStream)); - - // We pick up from here - cur3BufIndex = (cur3BufIndex == 0) ? 
1 : 0; - cur3 += numToProcess; - } - - if (cur1 < n) { - // Copy CPU mem to CPU pinned - auto numToCopy = std::min(pageSizeInVecs, n - cur1); - - // Make sure any previous copy has completed before continuing - auto& eventPrev = eventPinnedCopyDone[cur1BufIndex]; - if (eventPrev.get()) { - eventPrev->cpuWaitOnEvent(); + // Reserve space on the GPU for the destination of the pinned buffer + // copy + DeviceTensor bufGpuA( + resources_.get(), + makeTempAlloc(AllocType::Other, defaultStream), + {pageSizeInVecs, this->d}); + DeviceTensor bufGpuB( + resources_.get(), + makeTempAlloc(AllocType::Other, defaultStream), + {pageSizeInVecs, this->d}); + DeviceTensor* bufGpus[2] = {&bufGpuA, &bufGpuB}; + + // Copy completion events for the pinned buffers + std::unique_ptr eventPinnedCopyDone[2]; + + // Execute completion events for the GPU buffers + std::unique_ptr eventGpuExecuteDone[2]; + + // All offsets are in terms of number of vectors + + // Current start offset for buffer 1 + idx_t cur1 = 0; + idx_t cur1BufIndex = 0; + + // Current start offset for buffer 2 + idx_t cur2 = -1; + idx_t cur2BufIndex = 0; + + // Current start offset for buffer 3 + idx_t cur3 = -1; + idx_t cur3BufIndex = 0; + + while (cur3 < n) { + // Start async pinned -> GPU copy first (buf 2) + if (cur2 != -1 && cur2 < n) { + // Copy pinned to GPU + auto numToCopy = std::min(pageSizeInVecs, n - cur2); + + // Make sure any previous execution has completed before + // continuing + auto& eventPrev = eventGpuExecuteDone[cur2BufIndex]; + if (eventPrev.get()) { + eventPrev->streamWaitOnEvent(copyStream); + } + + CUDA_VERIFY(cudaMemcpyAsync( + bufGpus[cur2BufIndex]->data(), + bufPinned[cur2BufIndex], + numToCopy * this->d * sizeof(data_t), + cudaMemcpyHostToDevice, + copyStream)); + + // Mark a completion event in this stream + eventPinnedCopyDone[cur2BufIndex].reset( + new CudaEvent(copyStream)); + + // We pick up from here + cur3 = cur2; + cur2 += numToCopy; + cur2BufIndex = (cur2BufIndex == 0) ? 
1 : 0; } - memcpy(bufPinned[cur1BufIndex], - x + cur1 * this->d, - numToCopy * this->d * sizeof(float)); + if (cur3 != idx_t(-1) && cur3 < n) { + // Process on GPU + auto numToProcess = std::min(pageSizeInVecs, n - cur3); + + // Make sure the previous copy has completed before continuing + auto& eventPrev = eventPinnedCopyDone[cur3BufIndex]; + FAISS_ASSERT(eventPrev.get()); + + eventPrev->streamWaitOnEvent(defaultStream); + + // Create tensor wrappers + // DeviceTensor + // input(bufGpus[cur3BufIndex]->data(), + // {numToProcess, this->d}); + auto outDistancesSlice = + outDistances.narrowOutermost(cur3, numToProcess); + auto outIndicesSlice = + outIndices.narrowOutermost(cur3, numToProcess); + + searchImpl_( + numToProcess, + static_cast(bufGpus[cur3BufIndex]->data()), + numeric_type, + k, + outDistancesSlice.data(), + outIndicesSlice.data(), + params); + + // Create completion event + eventGpuExecuteDone[cur3BufIndex].reset( + new CudaEvent(defaultStream)); + + // We pick up from here + cur3BufIndex = (cur3BufIndex == 0) ? 1 : 0; + cur3 += numToProcess; + } - // We pick up from here - cur2 = cur1; - cur1 += numToCopy; - cur1BufIndex = (cur1BufIndex == 0) ? 1 : 0; + if (cur1 < n) { + // Copy CPU mem to CPU pinned + auto numToCopy = std::min(pageSizeInVecs, n - cur1); + + // Make sure any previous copy has completed before continuing + auto& eventPrev = eventPinnedCopyDone[cur1BufIndex]; + if (eventPrev.get()) { + eventPrev->cpuWaitOnEvent(); + } + + memcpy(bufPinned[cur1BufIndex], + static_cast(x) + cur1 * this->d, + numToCopy * this->d * + get_numeric_type_size(numeric_type)); + + // We pick up from here + cur2 = cur1; + cur1 += numToCopy; + cur1BufIndex = (cur1BufIndex == 0) ? 
1 : 0; + } } + }; + + if (numeric_type == NumericType::Float32) { + dispatch(float{}); + } else if (numeric_type == NumericType::Float16) { + dispatch(half{}); + } else { + FAISS_THROW_MSG( + "GpuIndex::searchFromCpuPaged_: Unsupported numeric type"); } } +void GpuIndex::searchFromCpuPaged_( + idx_t n, + const float* x, + int k, + float* outDistancesData, + idx_t* outIndicesData, + const SearchParameters* params) const { + searchFromCpuPaged_( + n, + static_cast(x), + NumericType::Float32, + k, + outDistancesData, + outIndicesData, + params); +} + void GpuIndex::compute_residual(const float* x, float* residual, idx_t key) const { FAISS_THROW_MSG("compute_residual not implemented for this type of index"); diff --git a/faiss/gpu/GpuIndex.h b/faiss/gpu/GpuIndex.h index 33fd37158e..bdeb362d31 100644 --- a/faiss/gpu/GpuIndex.h +++ b/faiss/gpu/GpuIndex.h @@ -77,11 +77,17 @@ class GpuIndex : public faiss::Index { /// as needed /// Handles paged adds if the add set is too large; calls addInternal_ void add(idx_t, const float* x) override; + void add(idx_t, const void* x, NumericType numeric_type) override; /// `x` and `ids` can be resident on the CPU or any GPU; copies are /// performed as needed /// Handles paged adds if the add set is too large; calls addInternal_ void add_with_ids(idx_t n, const float* x, const idx_t* ids) override; + void add_with_ids( + idx_t n, + const void* x, + NumericType numeric_type, + const idx_t* xids) override; /// `x` and `labels` can be resident on the CPU or any GPU; copies are /// performed as needed @@ -97,6 +103,14 @@ class GpuIndex : public faiss::Index { float* distances, idx_t* labels, const SearchParameters* params = nullptr) const override; + void search( + idx_t n, + const void* x, + NumericType numeric_type, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params = nullptr) const override; /// `x`, `distances` and `labels` and `recons` can be resident on the CPU or /// any GPU; copies are performed as 
needed @@ -125,9 +139,23 @@ protected: /// Copy what we need from the CPU equivalent void copyFrom(const faiss::Index* index); + void copyFrom(const faiss::Index* index, NumericType numeric_type) { + if (numeric_type == NumericType::Float32) { + copyFrom(index); + } else { + FAISS_THROW_MSG("GpuIndex::copyFrom: unsupported numeric type"); + } + } /// Copy what we have to the CPU equivalent void copyTo(faiss::Index* index) const; + void copyTo(faiss::Index* index, NumericType numeric_type) const { + if (numeric_type == NumericType::Float32) { + copyTo(index); + } else { + FAISS_THROW_MSG("GpuIndex::copyTo: unsupported numeric type"); + } + } /// Does addImpl_ require IDs? If so, and no IDs are provided, we will /// generate them sequentially based on the order in which the IDs are added @@ -137,6 +165,18 @@ class GpuIndex : public faiss::Index { /// All data is guaranteed to be resident on our device virtual void addImpl_(idx_t n, const float* x, const idx_t* ids) = 0; + virtual void addImpl_( + idx_t n, + const void* x, + NumericType numeric_type, + const idx_t* ids) { + if (numeric_type == NumericType::Float32) { + addImpl_(n, static_cast<const float*>(x), ids); + } else { + FAISS_THROW_MSG("GpuIndex::addImpl_: unsupported numeric type"); + } + }; + /// Overridden to actually perform the search /// All data is guaranteed to be resident on our device virtual void searchImpl_( @@ -147,13 +187,44 @@ class GpuIndex : public faiss::Index { idx_t n, const float* x, int k, float* distances, idx_t* labels, const SearchParameters* params) const = 0; + virtual void searchImpl_( + idx_t n, + const void* x, + NumericType numeric_type, + int k, + float* distances, + idx_t* labels, + const SearchParameters* params) const { + if (numeric_type == NumericType::Float32) { + searchImpl_( + n, + static_cast<const float*>(x), + k, + distances, + labels, + params); + } else { + FAISS_THROW_MSG("GpuIndex::searchImpl_: unsupported numeric type"); + } + } + private: /// Handles paged 
adds if the add set is too large, passes to /// addImpl_ to actually perform the add for the current page void addPaged_(idx_t n, const float* x, const idx_t* ids); + void addPaged_( + idx_t n, + const void* x, + NumericType numeric_type, + const idx_t* ids); /// Calls addImpl_ for a single page of GPU-resident data void addPage_(idx_t n, const float* x, const idx_t* ids); + void addPage_( + idx_t n, + const void* x, + NumericType numeric_type, + const idx_t* ids); /// Calls searchImpl_ for a single page of GPU-resident data void searchNonPaged_( @@ -164,6 +235,15 @@ class GpuIndex : public faiss::Index { idx_t* outIndicesData, const SearchParameters* params) const; + void searchNonPaged_( + idx_t n, + const void* x, + NumericType numeric_type, + int k, + float* outDistancesData, + idx_t* outIndicesData, + const SearchParameters* params) const; + /// Calls searchImpl_ for a single page of GPU-resident data, /// handling paging of the data and copies from the CPU void searchFromCpuPaged_( @@ -173,6 +253,14 @@ class GpuIndex : public faiss::Index { float* outDistancesData, idx_t* outIndicesData, const SearchParameters* params) const; + void searchFromCpuPaged_( + idx_t n, + const void* x, + NumericType numeric_type, + int k, + float* outDistancesData, + idx_t* outIndicesData, + const SearchParameters* params) const; protected: /// Manages streams, cuBLAS handles and scratch memory for devices diff --git a/faiss/gpu/GpuIndexCagra.cu b/faiss/gpu/GpuIndexCagra.cu index e5ff7344cc..6bc4bc1cf5 100644 --- a/faiss/gpu/GpuIndexCagra.cu +++ b/faiss/gpu/GpuIndexCagra.cu @@ -27,6 +27,7 @@ #include #include #include +#include namespace faiss { namespace gpu { @@ -41,14 +42,18 @@ GpuIndexCagra::GpuIndexCagra( this->is_trained = false; } -void GpuIndexCagra::train(idx_t n, const float* x) { +void GpuIndexCagra::train(idx_t n, const void* x, NumericType numeric_type) { + numeric_type_ = numeric_type; + bool index_is_initialized = !std::holds_alternative(index_); + DeviceScope 
scope(config_.device); if (this->is_trained) { - FAISS_ASSERT(index_); + FAISS_ASSERT(index_is_initialized); return; } - FAISS_ASSERT(!index_); + // CuvsCagra not initialized + FAISS_ASSERT(!index_is_initialized); std::optional ivf_pq_params = std::nullopt; @@ -82,30 +87,61 @@ void GpuIndexCagra::train(idx_t n, const float* x) { ivf_pq_search_params->preferred_shmem_carveout = cagraConfig_.ivf_pq_search_params->preferred_shmem_carveout; } - index_ = std::make_shared( - this->resources_.get(), - this->d, - cagraConfig_.intermediate_graph_degree, - cagraConfig_.graph_degree, - static_cast(cagraConfig_.build_algo), - cagraConfig_.nn_descent_niter, - cagraConfig_.store_dataset, - this->metric_type, - this->metric_arg, - INDICES_64_BIT, - ivf_pq_params, - ivf_pq_search_params, - cagraConfig_.refine_rate, - cagraConfig_.guarantee_connectivity); - - index_->train(n, x); + + if (numeric_type == NumericType::Float32) { + index_ = std::make_shared>( + this->resources_.get(), + this->d, + cagraConfig_.intermediate_graph_degree, + cagraConfig_.graph_degree, + static_cast(cagraConfig_.build_algo), + cagraConfig_.nn_descent_niter, + cagraConfig_.store_dataset, + this->metric_type, + this->metric_arg, + INDICES_64_BIT, + ivf_pq_params, + ivf_pq_search_params, + cagraConfig_.refine_rate, + cagraConfig_.guarantee_connectivity); + std::get>>(index_)->train( + n, static_cast(x)); + } else if (numeric_type == NumericType::Float16) { + index_ = std::make_shared>( + this->resources_.get(), + this->d, + cagraConfig_.intermediate_graph_degree, + cagraConfig_.graph_degree, + static_cast(cagraConfig_.build_algo), + cagraConfig_.nn_descent_niter, + cagraConfig_.store_dataset, + this->metric_type, + this->metric_arg, + INDICES_64_BIT, + ivf_pq_params, + ivf_pq_search_params, + cagraConfig_.refine_rate, + cagraConfig_.guarantee_connectivity); + std::get>>(index_)->train( + n, static_cast(x)); + } else { + FAISS_THROW_MSG("GpuIndexCagra::train unsupported data type"); + } this->is_trained = 
true; this->ntotal = n; } +void GpuIndexCagra::train(idx_t n, const float* x) { + train(n, static_cast(x), NumericType::Float32); +} + +void GpuIndexCagra::add(idx_t n, const void* x, NumericType numeric_type) { + train(n, x, numeric_type); +} + void GpuIndexCagra::add(idx_t n, const float* x) { - train(n, x); + add(n, x, NumericType::Float32); } bool GpuIndexCagra::addImplRequiresIDs_() const { @@ -118,17 +154,19 @@ void GpuIndexCagra::addImpl_(idx_t n, const float* x, const idx_t* ids) { void GpuIndexCagra::searchImpl_( idx_t n, - const float* x, + const void* x, + NumericType numeric_type, int k, float* distances, idx_t* labels, const SearchParameters* search_params) const { - FAISS_ASSERT(this->is_trained && index_); + FAISS_ASSERT( + this->is_trained && + !std::holds_alternative(index_)); FAISS_ASSERT(n > 0); - - Tensor queries(const_cast(x), {n, this->d}); - Tensor outDistances(distances, {n, k}); - Tensor outLabels(const_cast(labels), {n, k}); + FAISS_THROW_IF_NOT_MSG( + numeric_type == numeric_type_, + "Inconsistent numeric type for train and search"); SearchParametersCagra* params; if (search_params) { @@ -138,41 +176,90 @@ void GpuIndexCagra::searchImpl_( params = new SearchParametersCagra{}; } - index_->search( - queries, - k, - outDistances, - outLabels, - params->max_queries, - params->itopk_size, - params->max_iterations, - static_cast(params->algo), - params->team_size, - params->search_width, - params->min_iterations, - params->thread_block_size, - static_cast(params->hashmap_mode), - params->hashmap_min_bitlen, - params->hashmap_max_fill_rate, - params->num_random_samplings, - params->seed); + Tensor outDistances(distances, {n, k}); + Tensor outLabels(const_cast(labels), {n, k}); + + if (numeric_type == NumericType::Float32) { + Tensor queries( + const_cast(static_cast(x)), {n, this->d}); + + std::get>>(index_)->search( + queries, + k, + outDistances, + outLabels, + params->max_queries, + params->itopk_size, + params->max_iterations, + 
static_cast(params->algo), + params->team_size, + params->search_width, + params->min_iterations, + params->thread_block_size, + static_cast(params->hashmap_mode), + params->hashmap_min_bitlen, + params->hashmap_max_fill_rate, + params->num_random_samplings, + params->seed); + + } else if (numeric_type == NumericType::Float16) { + Tensor queries( + const_cast(static_cast(x)), {n, this->d}); + + std::get>>(index_)->search( + queries, + k, + outDistances, + outLabels, + params->max_queries, + params->itopk_size, + params->max_iterations, + static_cast(params->algo), + params->team_size, + params->search_width, + params->min_iterations, + params->thread_block_size, + static_cast(params->hashmap_mode), + params->hashmap_min_bitlen, + params->hashmap_max_fill_rate, + params->num_random_samplings, + params->seed); + } else { + FAISS_THROW_MSG("GpuIndexCagra::searchImpl_ unsupported data type"); + } if (not search_params) { delete params; } } -void GpuIndexCagra::copyFrom(const faiss::IndexHNSWCagra* index) { +void GpuIndexCagra::searchImpl_( + idx_t n, + const float* x, + int k, + float* distances, + idx_t* labels, + const SearchParameters* search_params) const { + searchImpl_( + n, + static_cast(x), + NumericType::Float32, + k, + distances, + labels, + search_params); +} + +void GpuIndexCagra::copyFrom( + const faiss::IndexHNSWCagra* index, + NumericType numeric_type) { FAISS_ASSERT(index); + numeric_type_ = numeric_type; DeviceScope scope(config_.device); GpuIndex::copyFrom(index); - auto base_index = dynamic_cast(index->storage); - FAISS_ASSERT(base_index); - auto distances = base_index->get_xb(); - auto hnsw = index->hnsw; // copy level 0 to a dense knn graph matrix std::vector knn_graph; @@ -189,23 +276,51 @@ void GpuIndexCagra::copyFrom(const faiss::IndexHNSWCagra* index) { } } - index_ = std::make_shared( - this->resources_.get(), - this->d, - index->ntotal, - hnsw.nb_neighbors(0), - distances, - knn_graph.data(), - this->metric_type, - this->metric_arg, - 
INDICES_64_BIT); + if (numeric_type == NumericType::Float32) { + auto base_index = dynamic_cast(index->storage); + FAISS_ASSERT(base_index); + auto dataset = base_index->get_xb(); + + index_ = std::make_shared>( + this->resources_.get(), + this->d, + index->ntotal, + hnsw.nb_neighbors(0), + dataset, + knn_graph.data(), + this->metric_type, + this->metric_arg, + INDICES_64_BIT); + } else if (numeric_type == NumericType::Float16) { + auto base_index = dynamic_cast(index->storage); + FAISS_ASSERT(base_index); + auto dataset = (half*)base_index->codes.data(); + + index_ = std::make_shared>( + this->resources_.get(), + this->d, + index->ntotal, + hnsw.nb_neighbors(0), + dataset, + knn_graph.data(), + this->metric_type, + this->metric_arg, + INDICES_64_BIT); + } else { + FAISS_THROW_MSG("GpuIndexCagra::copyFrom unsupported data type"); + } this->is_trained = true; } -void GpuIndexCagra::copyTo(faiss::IndexHNSWCagra* index) const { - FAISS_ASSERT(index_ && this->is_trained && index); +void GpuIndexCagra::copyFrom(const faiss::IndexHNSWCagra* index) { + copyFrom(index, NumericType::Float32); +} +void GpuIndexCagra::copyTo(faiss::IndexHNSWCagra* index) const { + FAISS_ASSERT( + !std::holds_alternative(index_) && + this->is_trained && index); DeviceScope scope(config_.device); // @@ -215,18 +330,38 @@ void GpuIndexCagra::copyTo(faiss::IndexHNSWCagra* index) const { // This needs to be zeroed out as this implementation adds vectors to the // cpuIndex instead of copying fields index->ntotal = 0; + index->set_numeric_type(numeric_type_); + + idx_t graph_degree; + + if (numeric_type_ == NumericType::Float32) { + graph_degree = std::get>>(index_) + ->get_knngraph_degree(); + } else if (numeric_type_ == NumericType::Float16) { + graph_degree = std::get>>(index_) + ->get_knngraph_degree(); + } else { + FAISS_THROW_MSG("GpuIndexCagra::copyTo unsupported data type"); + } - auto graph_degree = index_->get_knngraph_degree(); auto M = graph_degree / 2; if (index->storage and 
index->own_fields) { delete index->storage; } - if (this->metric_type == METRIC_L2) { - index->storage = new IndexFlatL2(index->d); - } else if (this->metric_type == METRIC_INNER_PRODUCT) { - index->storage = new IndexFlatIP(index->d); + // storage depends on numerictype + if (numeric_type_ == NumericType::Float32) { + if (this->metric_type == METRIC_L2) { + index->storage = new IndexFlatL2(index->d); + } else if (this->metric_type == METRIC_INNER_PRODUCT) { + index->storage = new IndexFlatIP(index->d); + } + } else if (numeric_type_ == NumericType::Float16) { + auto qtype = ScalarQuantizer::QT_fp16; + index->storage = + new IndexScalarQuantizer(index->d, qtype, this->metric_type); } + index->own_fields = true; index->keep_max_size_level0 = true; index->hnsw.reset(); @@ -235,32 +370,69 @@ void GpuIndexCagra::copyTo(faiss::IndexHNSWCagra* index) const { index->hnsw.set_default_probas(M, 1.0 / log(M)); auto n_train = this->ntotal; - float* train_dataset; - auto dataset = index_->get_training_dataset(); bool allocation = false; - if (getDeviceForAddress(dataset) >= 0) { - train_dataset = new float[n_train * index->d]; - allocation = true; - raft::copy( - train_dataset, - dataset, - n_train * index->d, - this->resources_->getRaftHandleCurrentDevice().get_stream()); - } else { - train_dataset = const_cast(dataset); - } - // turn off as level 0 is copied from CAGRA graph - index->init_level0 = false; - if (!index->base_level_only) { - index->add(n_train, train_dataset); - } else { - index->hnsw.prepare_level_tab(n_train, false); - index->storage->add(n_train, train_dataset); - index->ntotal = n_train; - } - if (allocation) { - delete[] train_dataset; + if (numeric_type_ == NumericType::Float32) { + float* train_dataset; + const float* dataset = + std::get>>(index_) + ->get_training_dataset(); + if (getDeviceForAddress(dataset) >= 0) { + train_dataset = new float[n_train * index->d]; + allocation = true; + raft::copy( + train_dataset, + dataset, + n_train * index->d, + 
this->resources_->getRaftHandleCurrentDevice() + .get_stream()); + } else { + train_dataset = const_cast(dataset); + } + + // turn off as level 0 is copied from CAGRA graph + index->init_level0 = false; + if (!index->base_level_only) { + index->add(n_train, train_dataset); + } else { + index->hnsw.prepare_level_tab(n_train, false); + index->storage->add(n_train, train_dataset); + index->ntotal = n_train; + } + if (allocation) { + delete[] train_dataset; + } + } else if (numeric_type_ == NumericType::Float16) { + half* train_dataset; + const half* dataset = std::get>>(index_) + ->get_training_dataset(); + if (getDeviceForAddress(dataset) >= 0) { + train_dataset = new half[n_train * index->d]; + allocation = true; + raft::copy( + train_dataset, + dataset, + n_train * index->d, + this->resources_->getRaftHandleCurrentDevice() + .get_stream()); + } else { + train_dataset = const_cast(dataset); + } + + index->init_level0 = false; + if (!index->base_level_only) { + FAISS_THROW_MSG( + "Only base level copy is supported for FP16 types in GpuIndexCagra::copyTo"); + } else { + index->hnsw.prepare_level_tab(n_train, false); + index->storage->add_sa_codes( + n_train, (uint8_t*)train_dataset, nullptr); + index->ntotal = n_train; + } + + if (allocation) { + delete[] train_dataset; + } } auto graph = get_knngraph(); @@ -281,8 +453,18 @@ void GpuIndexCagra::copyTo(faiss::IndexHNSWCagra* index) const { void GpuIndexCagra::reset() { DeviceScope scope(config_.device); - if (index_) { - index_->reset(); + if (!std::holds_alternative(index_)) { + std::visit( + [](auto& index_ptr) { + using IndexPtrT = std::decay_t; + if constexpr (std::is_same_v) { + FAISS_THROW_MSG( + "CuvsCagra not initialized when calling GpuIndexCagra::reset"); + } else { + return index_ptr->reset(); + } + }, + index_); this->ntotal = 0; this->is_trained = false; } else { @@ -291,9 +473,26 @@ void GpuIndexCagra::reset() { } std::vector GpuIndexCagra::get_knngraph() const { - FAISS_ASSERT(index_ && 
this->is_trained); + FAISS_ASSERT( + !std::holds_alternative(index_) && + this->is_trained); + + return std::visit( + [](auto&& index_ptr) -> std::vector { + using IndexPtrT = std::decay_t; + + if constexpr (std::is_same_v) { + FAISS_THROW_MSG( + "CuvsCagra not initialized when calling GpuIndexCagra::get_knngraph"); + } else { + return index_ptr->get_knngraph(); + } + }, + index_); +} - return index_->get_knngraph(); +faiss::NumericType GpuIndexCagra::get_numeric_type() const { + return numeric_type_; } } // namespace gpu diff --git a/faiss/gpu/GpuIndexCagra.h b/faiss/gpu/GpuIndexCagra.h index bbf74cf5ee..cf4a706e7d 100644 --- a/faiss/gpu/GpuIndexCagra.h +++ b/faiss/gpu/GpuIndexCagra.h @@ -27,6 +27,9 @@ #include #include +#include +#include "faiss/Index.h" + namespace faiss { struct IndexHNSWCagra; } @@ -34,6 +37,7 @@ struct IndexHNSWCagra; namespace faiss { namespace gpu { +template class CuvsCagra; enum class graph_build_algo { @@ -252,6 +256,7 @@ struct GpuIndexCagra : public GpuIndex { /// the base dataset. Use this function when you want to add vectors with /// ids. Ref: https://github.com/facebookresearch/faiss/issues/4107 void add(idx_t n, const float* x) override; + void add(idx_t n, const void* x, NumericType numeric_type) override; /// Trains CAGRA based on the given vector data. /// NB: The use of the train function here is to build the CAGRA graph on @@ -259,10 +264,12 @@ struct GpuIndexCagra : public GpuIndex { /// of vectors (without IDs) to the index. There is no external quantizer to /// be trained here. 
void train(idx_t n, const float* x) override; + void train(idx_t n, const void* x, NumericType numeric_type) override; /// Initialize ourselves from the given CPU index; will overwrite /// all data in ourselves void copyFrom(const faiss::IndexHNSWCagra* index); + void copyFrom(const faiss::IndexHNSWCagra* index, NumericType numeric_type); /// Copy ourselves to the given CPU index; will overwrite all data /// in the index instance @@ -272,6 +279,8 @@ struct GpuIndexCagra : public GpuIndex { std::vector get_knngraph() const; + faiss::NumericType get_numeric_type() const; + protected: bool addImplRequiresIDs_() const override; @@ -285,12 +294,26 @@ struct GpuIndexCagra : public GpuIndex { float* distances, idx_t* labels, const SearchParameters* search_params) const override; + void searchImpl_( + idx_t n, + const void* x, + NumericType numeric_type, + int k, + float* distances, + idx_t* labels, + const SearchParameters* search_params) const override; /// Our configuration options const GpuIndexCagraConfig cagraConfig_; + faiss::NumericType numeric_type_; + /// Instance that we own; contains the inverted lists - std::shared_ptr index_; + std::variant< + std::monostate, + std::shared_ptr>, + std::shared_ptr>> + index_; }; } // namespace gpu diff --git a/faiss/gpu/impl/CuvsCagra.cu b/faiss/gpu/impl/CuvsCagra.cu index 22009d2852..acac6bcbbb 100644 --- a/faiss/gpu/impl/CuvsCagra.cu +++ b/faiss/gpu/impl/CuvsCagra.cu @@ -34,7 +34,8 @@ namespace faiss { namespace gpu { -CuvsCagra::CuvsCagra( +template +CuvsCagra::CuvsCagra( GpuResources* resources, int dim, idx_t intermediate_graph_degree, @@ -83,12 +84,13 @@ CuvsCagra::CuvsCagra( reset(); } -CuvsCagra::CuvsCagra( +template +CuvsCagra::CuvsCagra( GpuResources* resources, int dim, idx_t n, int graph_degree, - const float* distances, + const data_t* dataset, const idx_t* knn_graph, faiss::MetricType metric, float metricArg, @@ -104,18 +106,18 @@ CuvsCagra::CuvsCagra( indicesOptions == faiss::gpu::INDICES_64_BIT, "only 
INDICES_64_BIT is supported for cuVS CAGRA index"); - auto distances_on_gpu = getDeviceForAddress(distances) >= 0; + auto dataset_on_gpu = getDeviceForAddress(dataset) >= 0; auto knn_graph_on_gpu = getDeviceForAddress(knn_graph) >= 0; - FAISS_ASSERT(distances_on_gpu == knn_graph_on_gpu); + FAISS_ASSERT(dataset_on_gpu == knn_graph_on_gpu); - storage_ = distances; + storage_ = dataset; n_ = n; const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); - if (distances_on_gpu && knn_graph_on_gpu) { + if (dataset_on_gpu && knn_graph_on_gpu) { raft_handle.sync_stream(); // Copying to host so that cuvs::neighbors::cagra::index // creates an owning copy of the knn graph on device @@ -126,17 +128,16 @@ CuvsCagra::CuvsCagra( thrust::device_ptr(knn_graph + (n * graph_degree)), knn_graph_copy.data_handle()); - auto distances_mds = - raft::make_device_matrix_view( - distances, n, dim); + auto dataset_mds = raft::make_device_matrix_view( + dataset, n, dim); cuvs_index = std::make_shared< - cuvs::neighbors::cagra::index>( + cuvs::neighbors::cagra::index>( raft_handle, metricFaissToCuvs(metric_, false), - distances_mds, + dataset_mds, raft::make_const_mdspan(knn_graph_copy.view())); - } else if (!distances_on_gpu && !knn_graph_on_gpu) { + } else if (!dataset_on_gpu && !knn_graph_on_gpu) { // copy idx_t (int64_t) host knn_graph to uint32_t host knn_graph auto knn_graph_copy = raft::make_host_matrix(n, graph_degree); @@ -145,22 +146,23 @@ CuvsCagra::CuvsCagra( knn_graph + (n * graph_degree), knn_graph_copy.data_handle()); - auto distances_mds = raft::make_host_matrix_view( - distances, n, dim); + auto dataset_mds = raft::make_host_matrix_view( + dataset, n, dim); cuvs_index = std::make_shared< - cuvs::neighbors::cagra::index>( + cuvs::neighbors::cagra::index>( raft_handle, metricFaissToCuvs(metric_, false), - distances_mds, + dataset_mds, raft::make_const_mdspan(knn_graph_copy.view())); } else { FAISS_THROW_MSG( - "distances and knn_graph must both be in 
device or host memory"); + "dataset and knn_graph must both be in device or host memory"); } } -void CuvsCagra::train(idx_t n, const float* x) { +template +void CuvsCagra::train(idx_t n, const data_t* x) { storage_ = x; n_ = n; @@ -193,24 +195,25 @@ void CuvsCagra::train(idx_t n, const float* x) { } if (getDeviceForAddress(x) >= 0) { - auto dataset = - raft::make_device_matrix_view(x, n, dim_); + auto dataset = raft::make_device_matrix_view( + x, n, dim_); cuvs_index = std::make_shared< - cuvs::neighbors::cagra::index>( + cuvs::neighbors::cagra::index>( cuvs::neighbors::cagra::build( raft_handle, index_params_, dataset)); } else { auto dataset = - raft::make_host_matrix_view(x, n, dim_); + raft::make_host_matrix_view(x, n, dim_); cuvs_index = std::make_shared< - cuvs::neighbors::cagra::index>( + cuvs::neighbors::cagra::index>( cuvs::neighbors::cagra::build( raft_handle, index_params_, dataset)); } } -void CuvsCagra::search( - Tensor& queries, +template +void CuvsCagra::search( + Tensor& queries, int k, Tensor& outDistances, Tensor& outIndices, @@ -239,18 +242,18 @@ void CuvsCagra::search( if (!store_dataset_) { if (getDeviceForAddress(storage_) >= 0) { - auto dataset = raft::make_device_matrix_view( + auto dataset = raft::make_device_matrix_view( storage_, n_, dim_); cuvs_index->update_dataset(raft_handle, dataset); } else { - auto dataset = raft::make_host_matrix_view( + auto dataset = raft::make_host_matrix_view( storage_, n_, dim_); cuvs_index->update_dataset(raft_handle, dataset); } store_dataset_ = true; } - auto queries_view = raft::make_device_matrix_view( + auto queries_view = raft::make_device_matrix_view( queries.data(), numQueries, cols); auto distances_view = raft::make_device_matrix_view( outDistances.data(), numQueries, k_); @@ -291,16 +294,19 @@ void CuvsCagra::search( indices_view.data_handle()); } -void CuvsCagra::reset() { +template +void CuvsCagra::reset() { cuvs_index.reset(); } -idx_t CuvsCagra::get_knngraph_degree() const { +template +idx_t 
CuvsCagra::get_knngraph_degree() const { FAISS_ASSERT(cuvs_index); return static_cast(cuvs_index->graph_degree()); } -std::vector CuvsCagra::get_knngraph() const { +template +std::vector CuvsCagra::get_knngraph() const { FAISS_ASSERT(cuvs_index); const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); @@ -322,9 +328,12 @@ std::vector CuvsCagra::get_knngraph() const { return host_graph; } -const float* CuvsCagra::get_training_dataset() const { +template +const data_t* CuvsCagra::get_training_dataset() const { return storage_; } +template class CuvsCagra; +template class CuvsCagra; } // namespace gpu } // namespace faiss diff --git a/faiss/gpu/impl/CuvsCagra.cuh b/faiss/gpu/impl/CuvsCagra.cuh index 5202c4251c..a10e9fba61 100644 --- a/faiss/gpu/impl/CuvsCagra.cuh +++ b/faiss/gpu/impl/CuvsCagra.cuh @@ -45,6 +45,7 @@ enum class cagra_hash_mode { HASH, SMALL, AUTO }; namespace gpu { +template class CuvsCagra { public: CuvsCagra( @@ -70,7 +71,7 @@ class CuvsCagra { int dim, idx_t n, int graph_degree, - const float* distances, + const data_t* dataset, const idx_t* knn_graph, faiss::MetricType metric, float metricArg, @@ -78,10 +79,10 @@ class CuvsCagra { ~CuvsCagra() = default; - void train(idx_t n, const float* x); + void train(idx_t n, const data_t* x); void search( - Tensor& queries, + Tensor& queries, int k, Tensor& outDistances, Tensor& outIndices, @@ -105,14 +106,14 @@ class CuvsCagra { std::vector get_knngraph() const; - const float* get_training_dataset() const; + const data_t* get_training_dataset() const; private: /// Collection of GPU resources that we use GpuResources* resources_; /// Training dataset - const float* storage_; + const data_t* storage_; int n_; /// Expected dimensionality of the vectors @@ -147,9 +148,8 @@ class CuvsCagra { bool guarantee_connectivity_ = false; /// Instance of trained cuVS CAGRA index - std::shared_ptr> cuvs_index{ + std::shared_ptr> cuvs_index{ nullptr}; }; - } // namespace gpu } // namespace faiss 
diff --git a/faiss/gpu/test/TestGpuIndexCagra.cu b/faiss/gpu/test/TestGpuIndexCagra.cu index 16b94703c4..31dcfc03c9 100644 --- a/faiss/gpu/test/TestGpuIndexCagra.cu +++ b/faiss/gpu/test/TestGpuIndexCagra.cu @@ -21,6 +21,7 @@ * limitations under the License. */ +#include #include #include #include @@ -195,6 +196,142 @@ TEST(TestGpuIndexCagra, Float32_Query_IP) { queryTest(faiss::METRIC_INNER_PRODUCT, 0.98); } +void queryTestFP16(faiss::MetricType metric, double expected_recall) { + for (int tries = 0; tries < 5; ++tries) { + Options opt; + if (opt.buildAlgo == faiss::gpu::graph_build_algo::NN_DESCENT && + metric == faiss::METRIC_INNER_PRODUCT) { + continue; + } + + std::vector trainVecs = + faiss::gpu::randVecs(opt.numTrain, opt.dim); + if (metric == faiss::METRIC_INNER_PRODUCT) { + faiss::fvec_renorm_L2(opt.numTrain, opt.dim, trainVecs.data()); + } + + // train cpu index + faiss::IndexHNSWFlat cpuIndex(opt.dim, opt.graphDegree / 2, metric); + cpuIndex.hnsw.efConstruction = opt.k * 2; + cpuIndex.add(opt.numTrain, trainVecs.data()); + + // train gpu index + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexCagraConfig config; + config.device = opt.device; + config.graph_degree = opt.graphDegree; + config.intermediate_graph_degree = opt.intermediateGraphDegree; + config.build_algo = opt.buildAlgo; + + faiss::gpu::GpuIndexCagra gpuIndex(&res, cpuIndex.d, metric, config); + + // Create half vector + std::vector<__half> trainVecs_half(trainVecs.size()); + + for (size_t i = 0; i < trainVecs.size(); ++i) { + trainVecs_half[i] = __float2half(trainVecs[i]); + } + + gpuIndex.train( + opt.numTrain, + static_cast(trainVecs_half.data()), + faiss::NumericType::Float16); + + // query + auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + if (metric == faiss::METRIC_INNER_PRODUCT) { + faiss::fvec_renorm_L2(opt.numQuery, opt.dim, queryVecs.data()); + } + + std::vector refDistance(opt.numQuery * opt.k, 0); + std::vector 
refIndices(opt.numQuery * opt.k, -1); + faiss::SearchParametersHNSW cpuSearchParams; + cpuSearchParams.efSearch = opt.k * 2; + cpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + refDistance.data(), + refIndices.data(), + &cpuSearchParams); + + // test quality of searches + auto gpuRes = res.getResources(); + auto devAlloc = faiss::gpu::makeDevAlloc( + faiss::gpu::AllocType::FlatData, + gpuRes->getDefaultStreamCurrentDevice()); + faiss::gpu::DeviceTensor testDistance( + gpuRes.get(), devAlloc, {opt.numQuery, opt.k}); + faiss::gpu::DeviceTensor testIndices( + gpuRes.get(), devAlloc, {opt.numQuery, opt.k}); + // Create half vector + std::vector<__half> queryVecs_half(queryVecs.size()); + + for (size_t i = 0; i < queryVecs.size(); ++i) { + queryVecs_half[i] = __float2half(queryVecs[i]); + } + gpuIndex.search( + opt.numQuery, + queryVecs_half.data(), + faiss::NumericType::Float16, + opt.k, + testDistance.data(), + testIndices.data()); + + auto refDistanceDev = faiss::gpu::toDeviceTemporary( + gpuRes.get(), + refDistance, + gpuRes->getDefaultStreamCurrentDevice()); + auto refIndicesDev = faiss::gpu::toDeviceTemporary( + gpuRes.get(), + refIndices, + gpuRes->getDefaultStreamCurrentDevice()); + + auto raft_handle = gpuRes->getRaftHandleCurrentDevice(); + + auto ref_dis_mds = raft::make_device_matrix_view( + refDistanceDev.data(), opt.numQuery, opt.k); + auto ref_dis_mds_opt = + std::optional>( + ref_dis_mds); + auto ref_ind_mds = + raft::make_device_matrix_view( + refIndicesDev.data(), opt.numQuery, opt.k); + + auto test_dis_mds = raft::make_device_matrix_view( + testDistance.data(), opt.numQuery, opt.k); + auto test_dis_mds_opt = + std::optional>( + test_dis_mds); + + auto test_ind_mds = + raft::make_device_matrix_view( + testIndices.data(), opt.numQuery, opt.k); + + double scalar_init = 0; + auto recall_score = raft::make_host_scalar(scalar_init); + + raft::stats::neighborhood_recall( + raft_handle, + test_ind_mds, + ref_ind_mds, + recall_score.view(), + 
test_dis_mds_opt, + ref_dis_mds_opt); + ASSERT_TRUE(*recall_score.data_handle() > expected_recall); + } +} + +TEST(TestGpuIndexCagra, Float16_Query_L2) { + queryTestFP16(faiss::METRIC_L2, 0.98); +} + +TEST(TestGpuIndexCagra, Float16_Query_IP) { + queryTestFP16(faiss::METRIC_INNER_PRODUCT, 0.98); +} + void copyToTest( faiss::MetricType metric, double expected_recall, @@ -351,6 +488,158 @@ TEST(TestGpuIndexCagra, Float32_CopyTo_IP_BaseLevelOnly) { copyToTest(faiss::METRIC_INNER_PRODUCT, 0.98, true); } +void copyToTestFP16( + faiss::MetricType metric, + double expected_recall, + bool base_level_only) { + for (int tries = 0; tries < 5; ++tries) { + Options opt; + if (opt.buildAlgo == faiss::gpu::graph_build_algo::NN_DESCENT && + metric == faiss::METRIC_INNER_PRODUCT) { + continue; + } + + std::vector trainVecs = + faiss::gpu::randVecs(opt.numTrain, opt.dim); + if (metric == faiss::METRIC_INNER_PRODUCT) { + faiss::fvec_renorm_L2(opt.numTrain, opt.dim, trainVecs.data()); + } + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + if (metric == faiss::METRIC_INNER_PRODUCT) { + faiss::fvec_renorm_L2(opt.numAdd, opt.dim, addVecs.data()); + } + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + // train gpu index and copy to cpu index + faiss::gpu::GpuIndexCagraConfig config; + config.device = opt.device; + config.graph_degree = opt.graphDegree; + config.intermediate_graph_degree = opt.intermediateGraphDegree; + config.build_algo = opt.buildAlgo; + + // Create half vector + std::vector<__half> trainVecs_half(trainVecs.size()); + + for (size_t i = 0; i < trainVecs.size(); ++i) { + trainVecs_half[i] = __float2half(trainVecs[i]); + } + + faiss::gpu::GpuIndexCagra gpuIndex(&res, opt.dim, metric, config); + gpuIndex.train( + opt.numTrain, + static_cast(trainVecs_half.data()), + faiss::NumericType::Float16); + + faiss::IndexHNSWCagra copiedCpuIndex( + opt.dim, + opt.graphDegree / 2, + metric, + faiss::NumericType::Float16); + 
copiedCpuIndex.base_level_only = base_level_only; + gpuIndex.copyTo(&copiedCpuIndex); + copiedCpuIndex.hnsw.efConstruction = opt.k * 2; + + // train cpu index + faiss::IndexHNSWFlat cpuIndex(opt.dim, opt.graphDegree / 2, metric); + cpuIndex.hnsw.efConstruction = opt.k * 2; + cpuIndex.add(opt.numTrain, trainVecs.data()); + + // query indexes + auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + if (metric == faiss::METRIC_INNER_PRODUCT) { + faiss::fvec_renorm_L2(opt.numQuery, opt.dim, queryVecs.data()); + } + + std::vector refDistance(opt.numQuery * opt.k, 0); + std::vector refIndices(opt.numQuery * opt.k, -1); + faiss::SearchParametersHNSW cpuSearchParams; + cpuSearchParams.efSearch = opt.k * 2; + cpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + refDistance.data(), + refIndices.data(), + &cpuSearchParams); + + std::vector copyRefDistance(opt.numQuery * opt.k, 0); + std::vector copyRefIndices(opt.numQuery * opt.k, -1); + faiss::SearchParametersHNSW cpuSearchParamstwo; + cpuSearchParamstwo.efSearch = opt.k * 2; + copiedCpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + copyRefDistance.data(), + copyRefIndices.data(), + &cpuSearchParamstwo); + + // test quality of search + auto gpuRes = res.getResources(); + + auto refDistanceDev = faiss::gpu::toDeviceTemporary( + gpuRes.get(), + refDistance, + gpuRes->getDefaultStreamCurrentDevice()); + auto refIndicesDev = faiss::gpu::toDeviceTemporary( + gpuRes.get(), + refIndices, + gpuRes->getDefaultStreamCurrentDevice()); + + auto copyRefDistanceDev = faiss::gpu::toDeviceTemporary( + gpuRes.get(), + copyRefDistance, + gpuRes->getDefaultStreamCurrentDevice()); + auto copyRefIndicesDev = faiss::gpu::toDeviceTemporary( + gpuRes.get(), + copyRefIndices, + gpuRes->getDefaultStreamCurrentDevice()); + + auto raft_handle = gpuRes->getRaftHandleCurrentDevice(); + + auto ref_dis_mds = raft::make_device_matrix_view( + refDistanceDev.data(), opt.numQuery, opt.k); + auto ref_dis_mds_opt = + 
std::optional>( + ref_dis_mds); + auto ref_ind_mds = + raft::make_device_matrix_view( + refIndicesDev.data(), opt.numQuery, opt.k); + + auto copy_ref_dis_mds = raft::make_device_matrix_view( + copyRefDistanceDev.data(), opt.numQuery, opt.k); + auto copy_ref_dis_mds_opt = + std::optional>( + copy_ref_dis_mds); + auto copy_ref_ind_mds = + raft::make_device_matrix_view( + copyRefIndicesDev.data(), opt.numQuery, opt.k); + + double scalar_init = 0; + auto recall_score = raft::make_host_scalar(scalar_init); + + raft::stats::neighborhood_recall( + raft_handle, + copy_ref_ind_mds, + ref_ind_mds, + recall_score.view(), + copy_ref_dis_mds_opt, + ref_dis_mds_opt); + ASSERT_TRUE(*recall_score.data_handle() > expected_recall); + } +} + +// For fp16, only base level copy is supported +TEST(TestGpuIndexCagra, Float16_CopyTo_L2_BaseLevelOnly) { + copyToTestFP16(faiss::METRIC_L2, 0.98, true); +} + +TEST(TestGpuIndexCagra, Float16_CopyTo_IP_BaseLevelOnly) { + copyToTestFP16(faiss::METRIC_INNER_PRODUCT, 0.98, true); +} + void copyFromTest(faiss::MetricType metric, double expected_recall) { for (int tries = 0; tries < 5; ++tries) { Options opt; @@ -465,6 +754,147 @@ TEST(TestGpuIndexCagra, Float32_CopyFrom_IP) { copyFromTest(faiss::METRIC_INNER_PRODUCT, 0.98); } +void copyFromTestFP16(faiss::MetricType metric, double expected_recall) { + for (int tries = 0; tries < 5; ++tries) { + Options opt; + if (opt.buildAlgo == faiss::gpu::graph_build_algo::NN_DESCENT && + metric == faiss::METRIC_INNER_PRODUCT) { + continue; + } + + std::vector trainVecs = + faiss::gpu::randVecs(opt.numTrain, opt.dim); + if (metric == faiss::METRIC_INNER_PRODUCT) { + faiss::fvec_renorm_L2(opt.numTrain, opt.dim, trainVecs.data()); + } + + // train cpu index + faiss::IndexHNSWCagra cpuIndex( + opt.dim, + opt.graphDegree / 2, + metric, + faiss::NumericType::Float16); + cpuIndex.hnsw.efConstruction = opt.k * 2; + cpuIndex.add(opt.numTrain, trainVecs.data()); + + faiss::gpu::StandardGpuResources res; + 
res.noTempMemory(); + + // convert to gpu index + faiss::gpu::GpuIndexCagra copiedGpuIndex(&res, cpuIndex.d, metric); + copiedGpuIndex.copyFrom(&cpuIndex, faiss::NumericType::Float16); + + // train gpu index + faiss::gpu::GpuIndexCagraConfig config; + config.device = opt.device; + config.graph_degree = opt.graphDegree; + config.intermediate_graph_degree = opt.intermediateGraphDegree; + config.build_algo = opt.buildAlgo; + + // faiss::gpu::GpuIndexCagra gpuIndex(&res, opt.dim, metric, config); + // gpuIndex.train(opt.numTrain, trainVecs.data()); + + faiss::gpu::GpuIndexCagra gpuIndex(&res, cpuIndex.d, metric, config); + + // Create half vector + std::vector<__half> trainVecs_half(trainVecs.size()); + + for (size_t i = 0; i < trainVecs.size(); ++i) { + trainVecs_half[i] = __float2half(trainVecs[i]); + } + + gpuIndex.train( + opt.numTrain, + static_cast(trainVecs_half.data()), + faiss::NumericType::Float16); + + // query + auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + if (metric == faiss::METRIC_INNER_PRODUCT) { + faiss::fvec_renorm_L2(opt.numQuery, opt.dim, queryVecs.data()); + } + + // Create half vector + std::vector<__half> queryVecs_half(queryVecs.size()); + + for (size_t i = 0; i < queryVecs.size(); ++i) { + queryVecs_half[i] = __float2half(queryVecs[i]); + } + + auto gpuRes = res.getResources(); + auto devAlloc = faiss::gpu::makeDevAlloc( + faiss::gpu::AllocType::FlatData, + gpuRes->getDefaultStreamCurrentDevice()); + faiss::gpu::DeviceTensor copyTestDistance( + gpuRes.get(), devAlloc, {opt.numQuery, opt.k}); + faiss::gpu::DeviceTensor copyTestIndices( + gpuRes.get(), devAlloc, {opt.numQuery, opt.k}); + copiedGpuIndex.search( + opt.numQuery, + queryVecs_half.data(), + faiss::NumericType::Float16, + opt.k, + copyTestDistance.data(), + copyTestIndices.data()); + + faiss::gpu::DeviceTensor testDistance( + gpuRes.get(), devAlloc, {opt.numQuery, opt.k}); + faiss::gpu::DeviceTensor testIndices( + gpuRes.get(), devAlloc, {opt.numQuery, opt.k}); + 
gpuIndex.search( + opt.numQuery, + queryVecs_half.data(), + faiss::NumericType::Float16, + opt.k, + testDistance.data(), + testIndices.data()); + + // test quality of searches + auto raft_handle = gpuRes->getRaftHandleCurrentDevice(); + + auto test_dis_mds = raft::make_device_matrix_view( + testDistance.data(), opt.numQuery, opt.k); + auto test_dis_mds_opt = + std::optional>( + test_dis_mds); + + auto test_ind_mds = + raft::make_device_matrix_view( + testIndices.data(), opt.numQuery, opt.k); + + auto copy_test_dis_mds = + raft::make_device_matrix_view( + copyTestDistance.data(), opt.numQuery, opt.k); + auto copy_test_dis_mds_opt = + std::optional>( + copy_test_dis_mds); + + auto copy_test_ind_mds = + raft::make_device_matrix_view( + copyTestIndices.data(), opt.numQuery, opt.k); + + double scalar_init = 0; + auto recall_score = raft::make_host_scalar(scalar_init); + + raft::stats::neighborhood_recall( + raft_handle, + copy_test_ind_mds, + test_ind_mds, + recall_score.view(), + copy_test_dis_mds_opt, + test_dis_mds_opt); + ASSERT_TRUE(*recall_score.data_handle() > expected_recall); + } +} + +TEST(TestGpuIndexCagra, Float16_CopyFrom_L2) { + copyFromTestFP16(faiss::METRIC_L2, 0.98); +} + +TEST(TestGpuIndexCagra, Float16_CopyFrom_IP) { + copyFromTestFP16(faiss::METRIC_INNER_PRODUCT, 0.98); +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); diff --git a/faiss/gpu/test/test_cagra.py b/faiss/gpu/test/test_cagra.py index 164f700e9f..9c9297c888 100644 --- a/faiss/gpu/test/test_cagra.py +++ b/faiss/gpu/test/test_cagra.py @@ -8,7 +8,7 @@ import faiss from faiss.contrib import datasets, evaluation - +import numpy as np @unittest.skipIf( "CUVS" not in faiss.get_compile_options(), @@ -42,6 +42,40 @@ def test_compute_GT_L2(self): def test_compute_GT_IP(self): self.do_compute_GT(faiss.METRIC_INNER_PRODUCT) +@unittest.skipIf( + "CUVS" not in faiss.get_compile_options(), + "only if cuVS is compiled in") +class TestComputeGTFP16(unittest.TestCase): + + def 
do_compute_GT(self, metric): + d = 64 + k = 12 + ds = datasets.SyntheticDataset(d, 0, 10000, 100) + Dref, Iref = faiss.knn(ds.get_queries(), ds.get_database(), k, metric) + + res = faiss.StandardGpuResources() + + # attempt to set custom IVF-PQ params + cagraIndexConfig = faiss.GpuIndexCagraConfig() + cagraIndexIVFPQConfig = faiss.IVFPQBuildCagraConfig() + cagraIndexIVFPQConfig.kmeans_trainset_fraction = 0.1 + cagraIndexConfig.ivf_pq_params = cagraIndexIVFPQConfig + cagraIndexConfig.build_algo = faiss.graph_build_algo_IVF_PQ + + index = faiss.GpuIndexCagra(res, d, metric, cagraIndexConfig) + fp16_data = ds.get_database().astype(np.float16) + index.train(fp16_data, faiss.Float16) + fp16_queries = ds.get_queries().astype(np.float16) + Dnew, Inew = index.search(fp16_queries, k, numeric_type=faiss.Float16) + + evaluation.check_ref_knn_with_draws(Dref, Iref, Dnew, Inew, k) + + def test_compute_GT_L2(self): + self.do_compute_GT(faiss.METRIC_L2) + + def test_compute_GT_IP(self): + self.do_compute_GT(faiss.METRIC_INNER_PRODUCT) + @unittest.skipIf( "CUVS" not in faiss.get_compile_options(), "only if cuVS is compiled in") @@ -76,3 +110,40 @@ def test_interop_L2(self): def test_interop_IP(self): self.do_interop(faiss.METRIC_INNER_PRODUCT) + +@unittest.skipIf( + "CUVS" not in faiss.get_compile_options(), + "only if cuVS is compiled in") +class TestInteropFP16(unittest.TestCase): + + def do_interop(self, metric): + d = 64 + k = 12 + ds = datasets.SyntheticDataset(d, 0, 10000, 100) + + res = faiss.StandardGpuResources() + + index = faiss.GpuIndexCagra(res, d, metric) + fp16_data = ds.get_database().astype(np.float16) + index.train(fp16_data, faiss.Float16) + fp16_queries = ds.get_queries().astype(np.float16) + Dnew, Inew = index.search(fp16_queries, k, numeric_type=faiss.Float16) + + cpu_index = faiss.index_gpu_to_cpu(index) + Dref, Iref = cpu_index.search(ds.get_queries(), k) + + evaluation.check_ref_knn_with_draws(Dref, Iref, Dnew, Inew, k) + + deserialized_index = 
faiss.deserialize_index( + faiss.serialize_index(cpu_index)) + + gpu_index = faiss.index_cpu_to_gpu(res, 0, deserialized_index) + Dnew2, Inew2 = gpu_index.search(fp16_queries, k, numeric_type=faiss.Float16) + + evaluation.check_ref_knn_with_draws(Dnew2, Inew2, Dnew, Inew, k) + + def test_interop_L2(self): + self.do_interop(faiss.METRIC_L2) + + def test_interop_IP(self): + self.do_interop(faiss.METRIC_INNER_PRODUCT) diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp index 775e8b951c..2da075227e 100644 --- a/faiss/impl/index_read.cpp +++ b/faiss/impl/index_read.cpp @@ -1118,6 +1118,7 @@ Index* read_index(IOReader* f, int io_flags) { auto idx_hnsw_cagra = dynamic_cast(idxhnsw); READ1(idx_hnsw_cagra->base_level_only); READ1(idx_hnsw_cagra->num_base_level_search_entrypoints); + READ1(idx_hnsw_cagra->numeric_type_); } read_HNSW(&idxhnsw->hnsw, f); idxhnsw->storage = read_index(f, io_flags); diff --git a/faiss/impl/index_write.cpp b/faiss/impl/index_write.cpp index 5b65454fe3..02bb3a03a3 100644 --- a/faiss/impl/index_write.cpp +++ b/faiss/impl/index_write.cpp @@ -781,6 +781,7 @@ void write_index(const Index* idx, IOWriter* f, int io_flags) { auto idx_hnsw_cagra = dynamic_cast(idxhnsw); WRITE1(idx_hnsw_cagra->base_level_only); WRITE1(idx_hnsw_cagra->num_base_level_search_entrypoints); + WRITE1(idx_hnsw_cagra->numeric_type_); } write_HNSW(&idxhnsw->hnsw, f); if (io_flags & IO_FLAG_SKIP_STORAGE) { diff --git a/faiss/python/class_wrappers.py b/faiss/python/class_wrappers.py index 2491aa8914..51d8f570cb 100644 --- a/faiss/python/class_wrappers.py +++ b/faiss/python/class_wrappers.py @@ -211,7 +211,7 @@ def replacement_build(self, x, graph): def handle_Index(the_class): - def replacement_add(self, x): + def replacement_add(self, x, numeric_type = faiss.Float32): """Adds vectors to the index. The index must be trained before vectors can be added to it. The vectors are implicitly numbered in sequence. 
When `n` vectors are @@ -226,7 +226,12 @@ def replacement_add(self, x): n, d = x.shape assert d == self.d - x = np.ascontiguousarray(x, dtype='float32') - self.add_c(n, swig_ptr(x)) + if numeric_type == faiss.Float32: + x = np.ascontiguousarray(x, dtype='float32') + self.add_c(n, swig_ptr(x)) + else: + x = np.ascontiguousarray(x, dtype='float16') + self.add_c(n, swig_ptr(x), faiss.Float16) def replacement_add_with_ids(self, x, ids): @@ -282,7 +285,7 @@ def replacement_assign(self, x, k, labels=None): self.assign_c(n, swig_ptr(x), swig_ptr(labels), k) return labels - def replacement_train(self, x): + def replacement_train(self, x, numeric_type = faiss.Float32): """Trains the index on a representative set of vectors. The index must be trained before vectors can be added to it. @@ -294,10 +297,14 @@ def replacement_train(self, x): """ n, d = x.shape assert d == self.d - x = np.ascontiguousarray(x, dtype='float32') - self.train_c(n, swig_ptr(x)) + if numeric_type == faiss.Float32: + x = np.ascontiguousarray(x, dtype='float32') + self.train_c(n, swig_ptr(x)) + else: + x = np.ascontiguousarray(x, dtype='float16') + self.train_c(n, swig_ptr(x), faiss.Float16) - def replacement_search(self, x, k, *, params=None, D=None, I=None): + def replacement_search(self, x, k, *, params=None, D=None, I=None, numeric_type = faiss.Float32): """Find the k nearest neighbors of the set of vectors x in the index.
Parameters @@ -325,7 +333,10 @@ def replacement_search(self, x, k, *, params=None, D=None, I=None): """ n, d = x.shape - x = np.ascontiguousarray(x, dtype='float32') + if numeric_type == faiss.Float32: + x = np.ascontiguousarray(x, dtype='float32') + else: + x = np.ascontiguousarray(x, dtype='float16') assert d == self.d assert k > 0 @@ -340,7 +351,10 @@ def replacement_search(self, x, k, *, params=None, D=None, I=None): else: assert I.shape == (n, k) - self.search_c(n, swig_ptr(x), k, swig_ptr(D), swig_ptr(I), params) + if numeric_type == faiss.Float32: + self.search_c(n, swig_ptr(x), k, swig_ptr(D), swig_ptr(I), params) + else: + self.search_c(n, swig_ptr(x), faiss.Float16, k, swig_ptr(D), swig_ptr(I), params) return D, I def replacement_search_and_reconstruct(self, x, k, *, params=None, D=None, I=None, R=None):