Add HNSWPQ4Bits example (#176)

Patrick-H-Chen · rootlesstree · web-flow · commit 6ebc640742a6 · 2022-09-15T11:13:47.000-07:00
Co-authored-by: Patrick &lt;pitodogo@gmail.com&gt;
diff --git a/examples/ann-hnsw-pq4bits/Makefile b/examples/ann-hnsw-pq4bits/Makefile
@@ -0,0 +1,11 @@
+CXX=g++
+CXXFLAGS=-fopenmp -O3 -std=c++14 -fPIC -DNDEBUG -Wall -g -lblas
+EXTRA_INCLUDE_FLAGS=-I../../pecos/core/
+ARCHFLAG=-march=native 
+
+all: go
+
+go: example.cpp
+	${CXX} -o go ${CXXFLAGS} example.cpp -I. ${EXTRA_INCLUDE_FLAGS} ${ARCHFLAG}
+clean:
+	rm -rf *.so *.o go
diff --git a/examples/ann-hnsw-pq4bits/README.md b/examples/ann-hnsw-pq4bits/README.md
@@ -0,0 +1,75 @@
+
+
+## Notice
+- Currently we only support L2 distance with 4 Bits Product Quantization. 
+- We are working on extending to angular and ip distance measures.
+
+## Install Prerequisite
+
+To run this project, prerequisite is the same as building PECOS.
+
+* For Ubuntu (18.04, 20.04):
+``` bash
+sudo apt-get update && sudo apt-get install -y build-essential git python3 python3-distutils python3-venv
+```
+* For Amazon Linux 2 Image:
+``` bash
+sudo yum -y install python3 python3-devel python3-distutils python3-venv && sudo yum -y groupinstall 'Development Tools' 
+```
+One needs to install at least one BLAS library to compile PECOS, e.g. `OpenBLAS`:
+* For Ubuntu (18.04, 20.04):
+``` bash
+sudo apt-get install -y libopenblas-dev
+```
+* For Amazon Linux 2 Image and AMI:
+``` bash
+sudo amazon-linux-extras install epel -y
+sudo yum install openblas-devel -y
+```
+
+## Prepare Data
+
+Get the exemplar sift-128-eucldiean dataset
+
+```bash 
+wget https://archive.org/download/pecos-dataset/ann-benchmarks/sift-euclidean-128.tar.gz
+``` 
+
+Extract the dataset
+
+```bash 
+tar -xf sift-euclidean-128.tar.gz
+``` 
+
+The prepared dataset consists of 3 .npy files : X.trn.npy (training data),  X.tst.npy (testing data) and Y.tst.npy (10 Nearest neighbors in training data of test data).
+
+## Compile the source code
+
+```bash 
+Make clean go
+``` 
+
+a runnable named "go" will be generated.
+
+## Running the compiled runnable
+
+the runnable take arguments in the following form :
+```bash 
+./go data_folder model_folder space M efC #threads efs num_rerank sub_dimension
+``` 
+
+data_folder is the place where 3 npy files stored. model_folder is the place to store the trained model. If a saved model is found, we will load the model instead of training a new one. space denotes the distance measure to use. Currently, we only support L2. M is the maximal edge connection used in HNSW. efC is the Maximal connecting edges during construction used in HNSW. #threads is the number of threads to build the graph. Up to now, these hypaer-parameters relate to the construction, and they will be used to name the trained model directory. efs is the search queue size in the inference step. num_rerank is the number of points in the queue that we will further rerank again using original features instead of quantized distance. sub_dimension is the dimension of each subspace in Product Quantization. If sub_dimension is set to 0, it will use default scheme. That is, if original data dimension <= 400, we use sub_dimension == 1, otherwise we use sub_dimension == 2.
+ 
+Here, we provide an example of command executing the runnable : 
+
+```bash 
+./go sift-euclidean-128 sift-euclidean-128 l2 8 500 24 10 10 0
+``` 
+
+## Experiment
+
+The compiled source code in example.cpp already repeats the inference 10 times. So to evaluate under ann-benchmark protocol, we could simply use python to iterate hyper-parameters and record done results. 
+
+```bash 
+python run.py
+``` 
diff --git a/examples/ann-hnsw-pq4bits/example.cpp b/examples/ann-hnsw-pq4bits/example.cpp
@@ -0,0 +1,127 @@
+#include <iostream>
+#include <string>
+#include <unordered_set>
+#include "utils/matrix.hpp"
+#include "utils/scipy_loader.hpp"
+#include "ann/hnsw.hpp"
+
+
+
+class StopW {
+    std::chrono::steady_clock::time_point time_begin;
+public:
+    StopW() {
+        time_begin = std::chrono::steady_clock::now();
+    }
+
+    float getElapsedTimeMicro() {
+        std::chrono::steady_clock::time_point time_end = std::chrono::steady_clock::now();
+        return (std::chrono::duration_cast<std::chrono::microseconds>(time_end - time_begin).count());
+    }
+
+    void reset() {
+        time_begin = std::chrono::steady_clock::now();
+    }
+};
+
+
+int num_rerank;
+int sub_dimension;
+using pecos::ann::index_type;
+
+typedef float32_t value_type;
+typedef uint64_t mem_index_type;
+typedef pecos::NpyArray<value_type> scipy_npy_t;
+
+
+auto npy_to_drm = [](scipy_npy_t& X_npy) -> pecos::drm_t {
+    pecos::drm_t X;
+    X.rows = X_npy.shape[0];
+    X.cols = X_npy.shape[1];
+    X.val = X_npy.array.data();
+    return X;
+};
+
+
+template<typename MAT, typename feat_vec_t>
+void run_dense(std::string data_dir , char* model_path, index_type M, index_type efC, index_type max_level, int threads, int efs) {
+    // data prepare
+    scipy_npy_t X_trn_npy(data_dir + "/X.trn.npy");
+    scipy_npy_t X_tst_npy(data_dir + "/X.tst.npy");
+    scipy_npy_t Y_tst_npy(data_dir + "/Y.tst.npy");
+    auto X_trn = npy_to_drm(X_trn_npy);
+    auto X_tst = npy_to_drm(X_tst_npy);
+    auto Y_tst = npy_to_drm(Y_tst_npy);
+    // model prepare
+    index_type topk = 10;
+    pecos::ann::HNSWProductQuantizer4Bits<float, feat_vec_t> indexer;
+    FILE* fp = fopen(model_path, "rb");
+    if (!fp) {
+        // if subspace_dimension is set to 0, it will use default scheme. That is,
+        // if dimension <= 400, we use subspace_dimension 1, otherwise we use 2.
+        indexer.train(X_trn, M, efC, 0, 200, threads, max_level);
+        std::cout<< "After train" <<std::endl;
+        indexer.save(model_path);
+        std::cout<< "After save" <<std::endl;
+        indexer.load(model_path);
+    } else {
+        indexer.load(model_path);
+        fclose(fp);
+    }
+
+    // prepare searcher for inference
+    index_type num_data = X_tst.rows;
+    auto searcher = indexer.create_searcher();
+    searcher.prepare_inference();
+
+
+    double latency = std::numeric_limits<double>::max();
+    // REPEAT 10 times and report the best result
+    for (int repeat = 0; repeat < 10; repeat++) {
+        double inner_latency = 0.0;
+        for (index_type idx = 0; idx < num_data; idx++) {
+            StopW stopw = StopW();
+            auto ret_pairs = indexer.predict_single(X_tst.get_row(idx), efs, topk, searcher, num_rerank);
+            double ss = stopw.getElapsedTimeMicro();
+            inner_latency += ss;
+        }
+        latency = std::min(latency, inner_latency);
+    }
+    // inference and calculate recalls
+    double recall = 0.0;
+    for (index_type idx = 0; idx < num_data; idx++) {
+        auto ret_pairs = indexer.predict_single(X_tst.get_row(idx), efs, topk, searcher, num_rerank);
+        std::unordered_set<pecos::csr_t::index_type> true_indices;
+
+        for (auto k = 0u; k < topk; k++) {
+            true_indices.insert(Y_tst.get_row(idx).val[k]);  // assume Y_tst is ascendingly sorted by distance
+        }
+        for (auto dist_idx_pair : ret_pairs) {
+            if (true_indices.find(dist_idx_pair.node_id) != true_indices.end()) {
+                recall += 1.0;
+            }
+        }
+    }
+    recall = recall / num_data / topk;
+    latency = latency / num_data / 1000.;
+    std::cout<< recall << " : " << 1.0 / latency * 1e3 << "," <<std::endl;
+}
+
+int main(int argc, char** argv) {
+    std::string data_dir = argv[1];
+    std::string model_dir = argv[2];
+    std::string space_name = argv[3];
+    index_type M = (index_type) atoi(argv[4]);
+    index_type efC = (index_type) atoi(argv[5]);
+    int threads = atoi(argv[6]);
+    int efs = atoi(argv[7]);
+    num_rerank = atoi(argv[8]);
+    sub_dimension = atoi(argv[9]);
+    index_type max_level = 8;
+    char model_path[2048];
+    sprintf(model_path, "%s/pecos.%s.M-%d_efC-%d_t-%d_d-%d.bin", model_dir.c_str(), space_name.c_str(), M, efC, threads, sub_dimension);
+    // currently only support l2
+    if (space_name.compare("l2") == 0) {
+        run_dense<pecos::drm_t, pecos::ann::FeatVecDenseL2Simd<float>>(data_dir, model_path, M, efC, max_level, threads, efs);
+    }
+}
diff --git a/examples/ann-hnsw-pq4bits/run.py b/examples/ann-hnsw-pq4bits/run.py
@@ -0,0 +1,9 @@
+import os
+cmd = "./go sift-euclidean-128 sift-euclidean-128 l2 %d 500 24 %d %d 0"
+for args in [8, 16, 24, 36, 48, 64, 96]:
+    for efs in [10, 20, 40, 80, 120, 200, 400]:
+        os.system(cmd % (args, efs, efs))
+        if efs - 20 > 0:
+            os.system(cmd % (args, efs, 20))
+        if efs - 50 > 0:
+            os.system(cmd % (args, efs, 50))