diff --git a/pecos/core/ann/hnsw.hpp b/pecos/core/ann/hnsw.hpp index 8b53e1f..f9954b0 100644 --- a/pecos/core/ann/hnsw.hpp +++ b/pecos/core/ann/hnsw.hpp @@ -11,24 +11,29 @@ * and limitations under the License. */ +#include +#include #include #include +#include +#include +#include #include #include -#include -#include #include -#include #include -#include +#include +#include + + +#include "ann/feat_vectors.hpp" +#include "ann/quantizer.hpp" +#include "third_party/nlohmann_json/json.hpp" +#include "utils/file_util.hpp" #include "utils/matrix.hpp" #include "utils/random.hpp" -#include "utils/file_util.hpp" #include "utils/type_util.hpp" -#include "ann/feat_vectors.hpp" -#include "third_party/nlohmann_json/json.hpp" - namespace pecos { @@ -101,12 +106,12 @@ namespace ann { pecos::file_util::fput_multiple(&node_mem_size, 1, fp); size_t sz = mem_start_of_node.size(); pecos::file_util::fput_multiple(&sz, 1, fp); - if(sz) { + if (sz) { pecos::file_util::fput_multiple(&mem_start_of_node[0], sz, fp); } sz = buffer.size(); pecos::file_util::fput_multiple(&sz, 1, fp); - if(sz) { + if (sz) { pecos::file_util::fput_multiple(&buffer[0], sz, fp); } } @@ -119,12 +124,12 @@ namespace ann { size_t sz = 0; pecos::file_util::fget_multiple(&sz, 1, fp); mem_start_of_node.resize(sz); - if(sz) { + if (sz) { pecos::file_util::fget_multiple(&mem_start_of_node[0], sz, fp); } pecos::file_util::fget_multiple(&sz, 1, fp); buffer.resize(sz); - if(sz) { + if (sz) { pecos::file_util::fget_multiple(&buffer[0], sz, fp); } } @@ -136,17 +141,17 @@ namespace ann { this->max_degree = max_degree; mem_start_of_node.resize(num_node + 1); mem_start_of_node[0] = 0; - for(size_t i = 0; i < num_node; i++) { + for (size_t i = 0; i < num_node; i++) { const feat_vec_t& xi(feat_mat.get_row(i)); mem_start_of_node[i + 1] = mem_start_of_node[i] + neighborhood_memory_size() + xi.memory_size(); } buffer.resize(mem_start_of_node[num_node], 0); - if(feat_vec_t::is_fixed_size::value) { + if (feat_vec_t::is_fixed_size::value) { node_mem_size = buffer.size() / num_node; } // get_node_feat_ptr must appear after memory allocation (buffer.resize()) - for(size_t i = 0; i < num_node; i++) { + for (size_t i = 0; i < num_node; i++) { const feat_vec_t& xi(feat_mat.get_row(i)); xi.copy_to(get_node_feat_ptr(i)); } @@ -165,7 +170,7 @@ namespace ann { } inline const void* get_node_feat_ptr(index_type node_id) const { - if(feat_vec_t::is_fixed_size::value) { + if (feat_vec_t::is_fixed_size::value) { return &buffer[node_id * (mem_index_type) node_mem_size + neighborhood_memory_size()]; } else { return &buffer[mem_start_of_node[node_id] + neighborhood_memory_size()]; @@ -178,7 +183,7 @@ namespace ann { inline const NeighborHood get_neighborhood(index_type node_id, index_type dummy_level_id=0) const { const index_type *neighborhood_ptr = nullptr; - if(feat_vec_t::is_fixed_size::value) { + if (feat_vec_t::is_fixed_size::value) { neighborhood_ptr = reinterpret_cast(&buffer.data()[node_id * (mem_index_type) node_mem_size]); } else { neighborhood_ptr = reinterpret_cast(&buffer[mem_start_of_node[node_id]]); @@ -203,7 +208,7 @@ namespace ann { pecos::file_util::fput_multiple(&level_mem_size, 1, fp); size_t sz = buffer.size(); pecos::file_util::fput_multiple(&sz, 1, fp); - if(sz) { + if (sz) { pecos::file_util::fput_multiple(&buffer[0], sz, fp); } } @@ -217,7 +222,7 @@ namespace ann { size_t sz = 0; pecos::file_util::fget_multiple(&sz, 1, fp); buffer.resize(sz); - if(sz) { + if (sz) { pecos::file_util::fget_multiple(&buffer[0], sz, fp); } } @@ -238,6 +243,148 @@ namespace ann { } }; + template + struct GraphProductQuantizer4Bits : GraphBase { + typedef FeatVec_T feat_vec_t; + ProductQuantizer4Bits quantizer; + index_type num_node; + // code_dimension is number of 4 bits code used to encode a data point in GraphPQ4Bits + // code_dimension can be different from parameter num_local_codebooks in quantizer + // as we might adjust code_dimension to make it divisble by 4. More details can be + // found in pad_parameters function of ann/quantizer_impl/x86.hpp + size_t code_dimension; + // code_offset helps to locate memory position containing neighboring codes + size_t code_offset; + size_t node_mem_size; + index_type max_degree; + std::vector mem_start_of_node; + std::vector buffer; + + void save(FILE *fp) const { + pecos::file_util::fput_multiple(&num_node, 1, fp); + pecos::file_util::fput_multiple(&code_dimension, 1, fp); + pecos::file_util::fput_multiple(&code_offset, 1, fp); + pecos::file_util::fput_multiple(&node_mem_size, 1, fp); + pecos::file_util::fput_multiple(&max_degree, 1, fp); + size_t sz = mem_start_of_node.size(); + pecos::file_util::fput_multiple(&sz, 1, fp); + if (sz) { + pecos::file_util::fput_multiple(&mem_start_of_node[0], sz, fp); + } + sz = buffer.size(); + pecos::file_util::fput_multiple(&sz, 1, fp); + if (sz) { + pecos::file_util::fput_multiple(&buffer[0], sz, fp); + } + quantizer.save(fp); + fclose(fp); + } + + void load(FILE *fp) { + pecos::file_util::fget_multiple(&num_node, 1, fp); + pecos::file_util::fget_multiple(&code_dimension, 1, fp); + pecos::file_util::fget_multiple(&code_offset, 1, fp); + pecos::file_util::fget_multiple(&node_mem_size, 1, fp); + pecos::file_util::fget_multiple(&max_degree, 1, fp); + size_t sz = 0; + pecos::file_util::fget_multiple(&sz, 1, fp); + mem_start_of_node.resize(sz); + if (sz) { + pecos::file_util::fget_multiple(&mem_start_of_node[0], sz, fp); + } + pecos::file_util::fget_multiple(&sz, 1, fp); + buffer.resize(sz); + if (sz) { + pecos::file_util::fget_multiple(&buffer[0], sz, fp); + } + + quantizer.load(fp); + + fclose(fp); + } + + + void build_quantizer(const pecos::drm_t& X_trn, index_type subspace_dimension, index_type sub_sample_points) { + size_t code_dimension = X_trn.cols; + if (subspace_dimension == 0) { + if (code_dimension >= 400) { + code_dimension = code_dimension % 2 == 0 ? code_dimension / 2 : code_dimension / 2 + 1; + } + } else { + code_dimension = code_dimension / subspace_dimension; + } + // currently, we don't support padding 0 on X_trn, so the cols of X_trn must be divisible by subspace_dimension. + // otherwise, we will throw error in quantizer.train(). + quantizer.train(X_trn, code_dimension, sub_sample_points); + quantizer.pack_codebook_for_inference(); + this->code_dimension = code_dimension; + } + + void build_graph(GraphL0& G) { + max_degree = G.max_degree; + quantizer.pad_parameters(max_degree, code_dimension); + num_node = G.num_node; + size_t num_of_local_centroids = quantizer.num_of_local_centroids; + size_t neighbor_size = (1 + max_degree) * sizeof(index_type); + code_offset = neighbor_size; + + std::vector> X_trn_codes(num_node, std::vector (code_dimension, 0)); + for (size_t i = 0 ; i < num_node ; i++) { + quantizer.encode(G.get_node_feat(i).val, X_trn_codes[i].data()); + } + + node_mem_size = neighbor_size + max_degree * code_dimension / 2; + mem_start_of_node.resize(num_node + 1); + mem_start_of_node[0] = 0; + + for (size_t i = 0; i < num_node; i++) { + mem_start_of_node[i + 1] = mem_start_of_node[i] + node_mem_size; + } + + buffer.resize(mem_start_of_node[num_node], 0); + + for (size_t i = 0; i < num_node; i++) { + std::vector> neighbor_codes(max_degree, std::vector (code_dimension, 0)); + + memcpy(&buffer[mem_start_of_node[i]], &G.buffer[G.mem_start_of_node[i]], (1 + G.max_degree) * sizeof(index_type)); + + index_type size = *reinterpret_cast(&G.buffer[G.mem_start_of_node[i]]); + for (index_type j = 0; j < size; j++) { + index_type member = *reinterpret_cast(&G.buffer[G.mem_start_of_node[i] + sizeof(index_type) + j * sizeof(index_type)]); + memcpy(neighbor_codes[j].data(), X_trn_codes[member].data(), code_dimension); + } + + index_type processed_num_of_neighbors = 0; + std::vector group_transposed_graph_codes(max_degree / 2 * code_dimension, 0); + + while (processed_num_of_neighbors < size) { + std::vector group_code(num_of_local_centroids / 2 * code_dimension, 0); + + for (index_type k = 0; k < code_dimension; k++) { + for (index_type j = 0; j < num_of_local_centroids; j += 2) { + uint8_t obj = neighbor_codes[processed_num_of_neighbors + j][k]; + obj += (neighbor_codes[processed_num_of_neighbors + j + 1][k] << 4); + group_code[k * num_of_local_centroids / 2 + j / 2] = obj; + } + } + memcpy(&group_transposed_graph_codes[processed_num_of_neighbors * code_dimension / 2], &group_code[0], num_of_local_centroids * code_dimension / 2); + processed_num_of_neighbors += num_of_local_centroids; + } + memcpy(&buffer[mem_start_of_node[i] + (1 + max_degree) * sizeof(index_type)], group_transposed_graph_codes.data(), max_degree * code_dimension / 2); + } + } + + inline const char* get_neighbor_codes(index_type node_id) const { + return &buffer[mem_start_of_node[node_id] + code_offset]; + } + inline const NeighborHood get_neighborhood(index_type node_id, index_type dummy_level_id=0) const { + const index_type *neighborhood_ptr = nullptr; + neighborhood_ptr = reinterpret_cast(&buffer.data()[node_id * (mem_index_type) node_mem_size]); + return NeighborHood((void*)neighborhood_ptr); + } + }; + + template struct SetOfVistedNodes { T init_token, curr_token; @@ -255,7 +402,7 @@ namespace ann { // amortized time complexity is O(num_nodes / std::numeric_limits::max()) void reset() { curr_token += 1; - if(curr_token == init_token) { + if (curr_token == init_token) { std::fill(buffer.begin(), buffer.end(), init_token); curr_token = init_token + 1; } @@ -351,9 +498,9 @@ namespace ann { // scalar variables index_type num_node; - index_type maxM; // max number of out-degree for level l=1,...,L - index_type maxM0; // max number of out-degree for level l=0 - index_type efC; // size of priority queue for construction time + index_type maxM; // max number of out-degree for level l=1,...,L + index_type maxM0; // max number of out-degree for level l=0 + index_type efC; // size of priority queue for construction time index_type max_level; index_type init_node; @@ -367,7 +514,7 @@ namespace ann { static nlohmann::json load_config(const std::string& filepath) { std::ifstream loadfile(filepath); std::string json_str; - if(loadfile.is_open()) { + if (loadfile.is_open()) { json_str.assign( std::istreambuf_iterator(loadfile), std::istreambuf_iterator() @@ -378,7 +525,7 @@ namespace ann { auto j_param = nlohmann::json::parse(json_str); std::string hnsw_t_cur = pecos::type_util::full_name(); std::string hnsw_t_inp = j_param["hnsw_t"]; - if(hnsw_t_cur != hnsw_t_inp) { + if (hnsw_t_cur != hnsw_t_inp) { throw std::invalid_argument("Inconsistent HNSW_T: hnsw_t_cur = " + hnsw_t_cur + " hnsw_t_cur = " + hnsw_t_inp); } return j_param; @@ -399,7 +546,7 @@ namespace ann { } }; std::ofstream savefile(filepath, std::ofstream::trunc); - if(savefile.is_open()) { + if (savefile.is_open()) { savefile << j_params.dump(4); savefile.close(); } else { @@ -408,8 +555,8 @@ namespace ann { } void save(const std::string& model_dir) const { - if(mkdir(model_dir.c_str(), 0777) == -1) { - if(errno != EEXIST) { + if (mkdir(model_dir.c_str(), 0777) == -1) { + if (errno != EEXIST) { throw std::runtime_error("Unable to create save folder at " + model_dir); } } @@ -432,7 +579,7 @@ namespace ann { std::string version = config.find("version") != config.end() ? config["version"] : "not found"; std::string index_path = model_dir + "/index.bin"; FILE *fp = fopen(index_path.c_str(), "rb"); - if(version == "v1.0") { + if (version == "v1.0") { pecos::file_util::fget_multiple(&num_node, 1, fp); pecos::file_util::fget_multiple(&maxM, 1, fp); pecos::file_util::fget_multiple(&maxM0, 1, fp); @@ -449,7 +596,7 @@ namespace ann { // Algorithm 4 of HNSW paper void get_neighbors_heuristic(max_heap_t &top_candidates, const index_type M) { - if(top_candidates.size() < M) { return; } + if (top_candidates.size() < M) { return; } min_heap_t queue_closest; std::vector return_list; @@ -459,7 +606,7 @@ namespace ann { } while (queue_closest.size()) { - if(return_list.size() >= M) { + if (return_list.size() >= M) { break; } auto curent_pair = queue_closest.top(); @@ -467,22 +614,22 @@ namespace ann { queue_closest.pop(); bool good = true; - for(auto& second_pair : return_list) { + for (auto& second_pair : return_list) { dist_t curdist = feat_vec_t::distance( graph_l0.get_node_feat(second_pair.node_id), graph_l0.get_node_feat(curent_pair.node_id) ); - if(curdist < dist_to_query) { + if (curdist < dist_to_query) { good = false; break; } } - if(good) { + if (good) { return_list.push_back(curent_pair); } } - for(auto& curent_pair : return_list) { + for (auto& curent_pair : return_list) { top_candidates.emplace(curent_pair); } } @@ -493,7 +640,7 @@ namespace ann { index_type mutually_connect(index_type src_node_id, max_heap_t &top_candidates, index_type level, std::vector* mtx_nodes=nullptr) { index_type Mcurmax = level ? this->maxM : this->maxM0; get_neighbors_heuristic(top_candidates, this->maxM); - if(top_candidates.size() > this->maxM) { + if (top_candidates.size() > this->maxM) { throw std::runtime_error("Should be not be more than M_ candidates returned by the heuristic"); } @@ -505,7 +652,7 @@ namespace ann { } GraphBase *G; - if(level == 0) { + if (level == 0) { G = &graph_l0; } else { G = &graph_l1; @@ -513,18 +660,18 @@ namespace ann { auto add_link = [&](index_type src, index_type dst) { std::unique_lock* lock_src = nullptr; - if(!lock_free) { + if (!lock_free) { lock_src = new std::unique_lock(mtx_nodes->at(src)); } auto neighbors = G->get_neighborhood(src, level); - if(neighbors.degree() > Mcurmax) + if (neighbors.degree() > Mcurmax) throw std::runtime_error("Bad value of size of neighbors for this src node"); - if(src == dst) + if (src == dst) throw std::runtime_error("Trying to connect an element to itself"); - if(neighbors.degree() < Mcurmax) { + if (neighbors.degree() < Mcurmax) { neighbors.push_back(dst); } else { // finding the "weakest" element to replace it with the new one @@ -535,7 +682,7 @@ namespace ann { // Heuristic: max_heap_t candidates; candidates.emplace(d_max, dst); - for(auto& dst : neighbors) { + for (auto& dst : neighbors) { dist_t dist_j = feat_vec_t::distance( graph_l0.get_node_feat(src), graph_l0.get_node_feat(dst) @@ -553,12 +700,12 @@ namespace ann { } } - if(!lock_free) { + if (!lock_free) { delete lock_src; } }; - for(auto& dst : selected_neighbors) { + for (auto& dst : selected_neighbors) { add_link(src_node_id, dst); add_link(dst, src_node_id); } @@ -571,7 +718,6 @@ namespace ann { // if max_level_upper_bound >= 0, the number of lavels in the hierarchical part is upper bounded by the give number template void train(const MAT_T &X_trn, index_type M, index_type efC, int threads=1, int max_level_upper_bound=-1) { - // workspace to store thread-local variables struct workspace_t { HNSW& hnsw; @@ -581,7 +727,7 @@ namespace ann { std::vector searchers; workspace_t(HNSW& hnsw, int threads=1): hnsw(hnsw), mtx_nodes(hnsw.num_node), node2level(hnsw.num_node) { - for(int i = 0; i < threads; i++) { + for (int i = 0; i < threads; i++) { searchers.emplace_back(Searcher(&hnsw)); } } @@ -599,7 +745,7 @@ namespace ann { // obtain the global lock as we might need to change max_level and init_node std::unique_lock* lock_global = nullptr; - if(query_level > hnsw.max_level) { + if (query_level > hnsw.max_level) { lock_global = new std::unique_lock(ws.mtx_global); } @@ -610,29 +756,29 @@ namespace ann { const feat_vec_t& query_feat = graph_l0.get_node_feat(query_id); bool is_first_node = (query_id == 0); - if(is_first_node) { + if (is_first_node) { hnsw.init_node = query_id; hnsw.max_level = query_level; } else { // find entrypoint with efS = 1 from level = local max_level to 1. - if(query_level < max_level) { + if (query_level < max_level) { dist_t curr_dist = feat_vec_t::distance( query_feat, graph_l0.get_node_feat(curr_node) ); - for(auto level = max_level; level > query_level; level--) { + for (auto level = max_level; level > query_level; level--) { bool changed = true; while (changed) { changed = false; std::unique_lock lock_node(ws.mtx_nodes[curr_node]); auto neighbors = graph_l1.get_neighborhood(curr_node, level); - for(auto& next_node : neighbors) { + for (auto& next_node : neighbors) { dist_t next_dist = feat_vec_t::distance( query_feat, graph_l0.get_node_feat(next_node) ); - if(next_dist < curr_dist) { + if (next_dist < curr_dist) { curr_dist = next_dist; curr_node = next_node; changed = true; @@ -641,31 +787,31 @@ namespace ann { } } } - if(lock_free) { - for(auto level = std::min(query_level, max_level); ; level--) { + if (lock_free) { + for (auto level = std::min(query_level, max_level); ; level--) { auto& top_candidates = search_level(query_feat, curr_node, this->efC, level, searcher, &ws.mtx_nodes); curr_node = mutually_connect(query_id, top_candidates, level, &ws.mtx_nodes); - if(level == 0) { break; } + if (level == 0) { break; } } } else { - for(auto level = std::min(query_level, max_level); ; level--) { + for (auto level = std::min(query_level, max_level); ; level--) { auto& top_candidates = search_level(query_feat, curr_node, this->efC, level, searcher, &ws.mtx_nodes); curr_node = mutually_connect(query_id, top_candidates, level, &ws.mtx_nodes); - if(level == 0) { break; } + if (level == 0) { break; } } } // if(query_level > ws.node2level[hnsw.enterpoint_id]) // used in nmslib. - if(query_level > hnsw.max_level) { // used in hnswlib. + if (query_level > hnsw.max_level) { // used in hnswlib. hnsw.max_level = query_level; hnsw.init_node = query_id; } } - if(lock_global != nullptr) { + if (lock_global != nullptr) { delete lock_global; } - }; // end of add_point + }; // end of add_point this->num_node = X_trn.rows; this->maxM = M; @@ -679,13 +825,13 @@ namespace ann { // pre-compute level for each node auto& node2level = ws.node2level; node2level.resize(num_node); - const float mult_l = 1.0 / log(1.0 * this->maxM); // m_l in Sec 4.1 of the HNSW paper + const float mult_l = 1.0 / log(1.0 * this->maxM); // m_l in Sec 4.1 of the HNSW paper random_number_generator<> rng; - for(index_type node_id = 0; node_id < num_node; node_id++) { + for (index_type node_id = 0; node_id < num_node; node_id++) { // line 4 of Algorithm 1 in HNSW paper node2level[node_id] = (index_type)(-log(rng.uniform(0.0, 1.0)) * mult_l); // if max_level_upper_bound is given, we cap the the level - if(max_level_upper_bound >= 0) { + if (max_level_upper_bound >= 0) { node2level[node_id] = std::min(node2level[node_id], (index_type)max_level_upper_bound); } } @@ -700,7 +846,7 @@ namespace ann { bool lock_free = (threads == 1); #pragma omp parallel for schedule(dynamic, 1) - for(index_type node_id = 0; node_id < num_node; node_id++) { + for (index_type node_id = 0; node_id < num_node; node_id++) { int thread_id = omp_get_thread_num(); add_point(node_id, ws, thread_id, lock_free); } @@ -712,31 +858,31 @@ namespace ann { auto& queue = ws.searchers[thread_id].cand_queue; const auto &src = graph_l0.get_node_feat(node_id); - for(index_type level = 0; level <= ws.node2level[node_id]; level++) { + for (index_type level = 0; level <= ws.node2level[node_id]; level++) { GraphBase *G; - if(level == 0) { + if (level == 0) { G = &graph_l0; } else { G = &graph_l1; } auto neighbors = G->get_neighborhood(node_id, level); - if(neighbors.degree() == 0) { + if (neighbors.degree() == 0) { return; } queue.clear(); - for(index_type j = 0; j < neighbors.degree(); j++) { + for (index_type j = 0; j < neighbors.degree(); j++) { const auto& dst = graph_l0.get_node_feat(neighbors[j]); queue.emplace_back(feat_vec_t::distance(src, dst), neighbors[j]); } std::sort(queue.begin(), queue.end()); - for(index_type j = 0; j < neighbors.degree(); j++) { + for (index_type j = 0; j < neighbors.degree(); j++) { neighbors[j] = queue[j].node_id; } } }; #pragma omp parallel for schedule(dynamic, 1) - for(index_type node_id = 0; node_id < num_node; node_id++) { + for (index_type node_id = 0; node_id < num_node; node_id++) { int thread_id = omp_get_thread_num(); sort_neighbors_for_node(node_id, ws, thread_id); } @@ -758,14 +904,13 @@ namespace ann { dist_t topk_ub_dist = feat_vec_t::distance( query, - graph_l0.get_node_feat(init_node) - ); + graph_l0.get_node_feat(init_node)); topk_queue.emplace(topk_ub_dist, init_node); cand_queue.emplace(topk_ub_dist, init_node); searcher.mark_visited(init_node); const GraphBase *G; - if(level == 0) { + if (level == 0) { G = &graph_l0; } else { G = &graph_l1; @@ -774,39 +919,39 @@ namespace ann { // Best First Search loop while (!cand_queue.empty()) { pair_t cand_pair = cand_queue.top(); - if(cand_pair.dist > topk_ub_dist) { + if (cand_pair.dist > topk_ub_dist) { break; } cand_queue.pop(); index_type cand_node = cand_pair.node_id; std::unique_lock* lock_node = nullptr; - if(!lock_free){ + if (!lock_free) { lock_node = new std::unique_lock(mtx_nodes->at(cand_node)); } // visiting neighbors of candidate node const auto neighbors = G->get_neighborhood(cand_node, level); - if(neighbors.degree() != 0) { + if (neighbors.degree() != 0) { graph_l0.prefetch_node_feat(neighbors[0]); index_type max_j = neighbors.degree() - 1; - for(index_type j = 0; j <= max_j; j++) { + for (index_type j = 0; j <= max_j; j++) { graph_l0.prefetch_node_feat(neighbors[std::min(j + 1, max_j)]); auto next_node = neighbors[j]; - if(!searcher.is_visited(next_node)) { + if (!searcher.is_visited(next_node)) { searcher.mark_visited(next_node); dist_t next_lb_dist; next_lb_dist = feat_vec_t::distance( query, graph_l0.get_node_feat(next_node) ); - if(topk_queue.size() < efS || next_lb_dist < topk_ub_dist) { + if (topk_queue.size() < efS || next_lb_dist < topk_ub_dist) { cand_queue.emplace(next_lb_dist, next_node); graph_l0.prefetch_node_feat(cand_queue.top().node_id); topk_queue.emplace(next_lb_dist, next_node); - if(topk_queue.size() > efS) { + if (topk_queue.size() > efS) { topk_queue.pop(); } - if(!topk_queue.empty()) { + if (!topk_queue.empty()) { topk_ub_dist = topk_queue.top().dist; } } @@ -814,7 +959,7 @@ namespace ann { } } - if(!lock_free){ + if (!lock_free) { delete lock_node; } } @@ -831,22 +976,22 @@ namespace ann { query, G0.get_node_feat(init_node) ); - for(index_type curr_level = this->max_level; curr_level >= 1; curr_level--) { + for (index_type curr_level = this->max_level; curr_level >= 1; curr_level--) { bool changed = true; while (changed) { changed = false; const auto neighbors = G1.get_neighborhood(curr_node, curr_level); - if(neighbors.degree() != 0) { + if (neighbors.degree() != 0) { graph_l0.prefetch_node_feat(neighbors[0]); index_type max_j = neighbors.degree() - 1; - for(index_type j = 0; j <= max_j; j++) { + for (index_type j = 0; j <= max_j; j++) { graph_l0.prefetch_node_feat(neighbors[std::min(j + 1, max_j)]); auto next_node = neighbors[j]; dist_t next_dist = feat_vec_t::distance( query, G0.get_node_feat(next_node) ); - if(next_dist < curr_dist) { + if (next_dist < curr_dist) { curr_dist = next_dist; curr_node = next_node; changed = true; @@ -858,7 +1003,7 @@ namespace ann { // generalized search_level for level=0 for efS >= 1 searcher.search_level(query, curr_node, std::max(efS, topk), 0); auto& topk_queue = searcher.topk_queue; - if(topk < efS) { + if (topk < efS) { // remove extra when efS > topk while (topk_queue.size() > topk) { topk_queue.pop(); @@ -870,5 +1015,360 @@ namespace ann { }; -} // end of namespace ann -} // end of namespace pecos + template + struct HNSWProductQuantizer4Bits { + typedef FeatVec_T feat_vec_t; + typedef Pair pair_t; + typedef heap_t> max_heap_t; + typedef heap_t> min_heap_t; + + + // scalar variables + index_type num_node; + index_type maxM; // max number of out-degree for level l=1,...,L + index_type maxM0; // max number of out-degree for level l=0 + index_type efC; // size of priority queue for construction time + index_type max_level; + index_type init_node; + index_type subspace_dimension; // dimension of each subspace in Product Quantization + index_type sub_sample_points; // number of sub-sampled points used to build quantizer subspace centors. + + GraphL0 feature_vec; // feature vectors only + GraphL1 graph_l1; // neighborhood graphs from level 1 and above + GraphProductQuantizer4Bits graph_l0_pq4; // Productquantized4Bits neighborhood graph built from graph_l0 + HNSWProductQuantizer4Bits() { + std::string space_type = pecos::type_util::full_name(); + if (space_type != "pecos::ann::FeatVecDenseL2Simd") { + throw std::runtime_error("Currently, we only support L2 distance with float type."); + } + } + ~HNSWProductQuantizer4Bits() {} + struct Searcher : SetOfVistedNodes { + typedef SetOfVistedNodes set_of_visited_nodes_t; + typedef HNSWProductQuantizer4Bits hnswpq4_t; + typedef heap_t> max_heap_t; + typedef heap_t> min_heap_t; + + const hnswpq4_t* hnsw; + max_heap_t topk_queue; + min_heap_t cand_queue; + alignas(64) std::vector lut; + alignas(64) std::vector appx_dist; + float scale; + float bias; + + Searcher(const hnswpq4_t* _hnsw=nullptr): + SetOfVistedNodes(_hnsw? _hnsw->num_node : 0), + hnsw(_hnsw) + {} + + void reset() { + set_of_visited_nodes_t::reset(); + topk_queue.clear(); + cand_queue.clear(); + } + + void prepare_inference() { + auto num_of_local_centroids = hnsw->graph_l0_pq4.quantizer.num_of_local_centroids; + auto max_degree = hnsw->graph_l0_pq4.max_degree; + auto num_local_codebooks = hnsw->graph_l0_pq4.quantizer.num_local_codebooks; + + // When using AVX512f, we have 16 centroids per local codebook, and each of it uses 8 bits to represent quantized + // distance value. Thus,m we will have 128 bits to load 1 set of local codebooks. Thus, a loadu_si512 will load + // 512 / 128 == 4 local codebooks at a time. Thus, the lookup table size needs to be adjusted (padding 0) if + // if num_local_codebooks is not divisible by 4. + size_t adjusted_num_local_codebooks = num_local_codebooks % 4 == 0 ? num_local_codebooks : (num_local_codebooks / 4 + 1) * 4; + + // Similarly, we have to parse every 16 neighbors at a time to maximally leverage avx512f. + // Thus, we have to prepare result array which is multiple of 16 to make sure the SIMD + // will not touch unavailable memory + size_t adjusted_max_degree = max_degree % 16 == 0 ? max_degree : ((max_degree / 16) + 1) * 16; + + lut.resize(num_of_local_centroids * adjusted_num_local_codebooks, 0); + appx_dist.resize(adjusted_max_degree, 0); + } + void setup_lut(float* query) { + hnsw->graph_l0_pq4.quantizer.setup_lut(query, lut.data(), scale, bias); + } + void approximate_distance(size_t neighbor_size, const char* neighbor_codes) { + // pass searcher to group_distance + hnsw->graph_l0_pq4.quantizer.approximate_neighbor_group_distance(neighbor_size, appx_dist.data(), neighbor_codes, lut.data(), scale, bias); + } + + max_heap_t& search_level(const feat_vec_t& query, index_type init_node, index_type efS, index_type level) { + return hnsw->search_level(query, init_node, efS, level, *this); + } + + max_heap_t& predict_single(const feat_vec_t& query, index_type efS, index_type topk, index_type num_rerank) { + return hnsw->predict_single(query, efS, topk, *this, num_rerank); + } + }; + + Searcher create_searcher() const { + return Searcher(this); + } + + + static nlohmann::json load_config(const std::string& filepath) { + std::ifstream loadfile(filepath); + std::string json_str; + if (loadfile.is_open()) { + json_str.assign( + std::istreambuf_iterator(loadfile), + std::istreambuf_iterator()); + } else { + throw std::runtime_error("Unable to open config file at " + filepath); + } + auto j_param = nlohmann::json::parse(json_str); + std::string hnsw_t_cur = pecos::type_util::full_name(); + std::string hnsw_t_inp = j_param["hnsw_t"]; + if (hnsw_t_cur != hnsw_t_inp) { + throw std::invalid_argument("Inconsistent HNSW_T: hnsw_t_cur = " + hnsw_t_cur + " hnsw_t_cur = " + hnsw_t_inp); + } + return j_param; + } + + void save_config(const std::string& filepath) const { + nlohmann::json j_params = { + {"hnsw_t", pecos::type_util::full_name()}, + {"version", "v1.0"}, + {"train_params", { + {"num_node", this->num_node}, + {"subspace_dimension", this->subspace_dimension}, + {"sub_sample_points", this->sub_sample_points}, + {"maxM", this->maxM}, + {"maxM0", this->maxM0}, + {"efC", this->efC}, + {"max_level", this->max_level}, + {"init_node", this->init_node} + } + } + }; + std::ofstream savefile(filepath, std::ofstream::trunc); + if (savefile.is_open()) { + savefile << j_params.dump(4); + savefile.close(); + } else { + throw std::runtime_error("Unable to save config file to " + filepath); + } + } + + void save(const std::string& model_dir) const { + if (mkdir(model_dir.c_str(), 0777) == -1) { + if (errno != EEXIST) { + throw std::runtime_error("Unable to create save folder at " + model_dir); + } + } + save_config(model_dir + "/config.json"); + std::string index_path = model_dir + "/index.bin"; + FILE *fp = fopen(index_path.c_str(), "wb"); + pecos::file_util::fput_multiple(&num_node, 1, fp); + pecos::file_util::fput_multiple(&maxM, 1, fp); + pecos::file_util::fput_multiple(&maxM0, 1, fp); + pecos::file_util::fput_multiple(&efC, 1, fp); + pecos::file_util::fput_multiple(&max_level, 1, fp); + pecos::file_util::fput_multiple(&init_node, 1, fp); + pecos::file_util::fput_multiple(&subspace_dimension, 1, fp); + pecos::file_util::fput_multiple(&sub_sample_points, 1, fp); + feature_vec.save(fp); + graph_l1.save(fp); + graph_l0_pq4.save(fp); + fclose(fp); + } + + void load(const std::string& model_dir) { + auto config = load_config(model_dir + "/config.json"); + std::string version = config.find("version") != config.end() ? config["version"] : "not found"; + std::string index_path = model_dir + "/index.bin"; + FILE *fp = fopen(index_path.c_str(), "rb"); + if (version == "v1.0") { + pecos::file_util::fget_multiple(&num_node, 1, fp); + pecos::file_util::fget_multiple(&maxM, 1, fp); + pecos::file_util::fget_multiple(&maxM0, 1, fp); + pecos::file_util::fget_multiple(&efC, 1, fp); + pecos::file_util::fget_multiple(&max_level, 1, fp); + pecos::file_util::fget_multiple(&init_node, 1, fp); + pecos::file_util::fget_multiple(&subspace_dimension, 1, fp); + pecos::file_util::fget_multiple(&sub_sample_points, 1, fp); + feature_vec.load(fp); + graph_l1.load(fp); + graph_l0_pq4.load(fp); + } else { + throw std::runtime_error("Unable to load this binary with version = " + version); + } + fclose(fp); + } + + template + void train( + const MAT_T &X_trn, + index_type M, + index_type efC, + index_type subspace_dimension=0, + index_type sub_sample_points=0, + int threads=1, + int max_level_upper_bound=-1 + ) { + HNSW* hnsw = new HNSW(); + hnsw->train(X_trn, M, efC, threads, max_level_upper_bound); + this->num_node = hnsw->num_node; + this->maxM = hnsw->maxM; + this->maxM0 = hnsw->maxM0; + this->efC = hnsw->efC; + this->max_level = hnsw->max_level; + this->init_node = hnsw->init_node; + this->subspace_dimension = subspace_dimension; + this->sub_sample_points = sub_sample_points; + + graph_l1.num_node = hnsw->graph_l1.num_node; + graph_l1.max_level = hnsw->graph_l1.max_level; + graph_l1.max_degree = hnsw->graph_l1.max_degree; + graph_l1.node_mem_size = hnsw->graph_l1.node_mem_size; + graph_l1.level_mem_size = hnsw->graph_l1.level_mem_size; + graph_l1.buffer.resize(hnsw->graph_l1.buffer.size()); + memcpy(graph_l1.buffer.data(), hnsw->graph_l1.buffer.data(), hnsw->graph_l1.buffer.size() * sizeof(index_type)); + + graph_l0_pq4.build_quantizer(X_trn, subspace_dimension, sub_sample_points); + graph_l0_pq4.build_graph(hnsw->graph_l0); + delete hnsw; + feature_vec.init(X_trn, -1); + } + + + max_heap_t& predict_single(const feat_vec_t& query, index_type efS, index_type topk, Searcher& searcher, index_type num_rerank) const { + index_type curr_node = this->init_node; + auto &G1 = graph_l1; + auto &G0 = feature_vec; + // specialized search_level for level l=1,...,L because its faster for efS=1 + dist_t curr_dist = feat_vec_t::distance( + query, + G0.get_node_feat(init_node) + ); + for (index_type curr_level = this->max_level; curr_level >= 1; curr_level--) { + bool changed = true; + while (changed) { + changed = false; + const auto neighbors = G1.get_neighborhood(curr_node, curr_level); + if (neighbors.degree() != 0) { + feature_vec.prefetch_node_feat(neighbors[0]); + index_type max_j = neighbors.degree() - 1; + for (index_type j = 0; j <= max_j; j++) { + feature_vec.prefetch_node_feat(neighbors[std::min(j + 1, max_j)]); + auto next_node = neighbors[j]; + dist_t next_dist = feat_vec_t::distance( + query, + G0.get_node_feat(next_node) + ); + if (next_dist < curr_dist) { + curr_dist = next_dist; + curr_node = next_node; + changed = true; + } + } + } + } + } + // generalized search_level for level=0 for efS >= 1 + searcher.search_level(query, curr_node, std::max(efS, topk), 0); + auto& topk_queue = searcher.topk_queue; + + + if (num_rerank > 0) { + index_type t_size = topk_queue.size() > num_rerank ? topk_queue.size() - num_rerank : 0; + for (index_type i = 0; i < t_size; i++) { + topk_queue.pop(); + } + for (auto i = topk_queue.begin(); i != topk_queue.end(); ++i) { + feature_vec.prefetch_node_feat((*(i + 1)).node_id); + pair_t cand_pair = (*i); + dist_t next_dist = feat_vec_t::distance( + query, + G0.get_node_feat(cand_pair.node_id) + ); + (*i).dist = next_dist; + } + std::sort(topk_queue.begin(), topk_queue.end()); + if (topk_queue.size() > topk) { + topk_queue.resize(topk); + } + return searcher.topk_queue; + } + + + + if (topk < efS) { + // remove extra when efS > topk + while (topk_queue.size() > topk) { + topk_queue.pop(); + } + } + std::sort_heap(topk_queue.begin(), topk_queue.end()); + return topk_queue; + } + + max_heap_t& search_level( + const feat_vec_t& query, + index_type init_node, + index_type efS, + index_type level, + Searcher& searcher, + std::vector* mtx_nodes=nullptr + ) const { + const auto *G0Q = &graph_l0_pq4; + searcher.reset(); + searcher.setup_lut(query.val); + max_heap_t& topk_queue = searcher.topk_queue; + min_heap_t& cand_queue = searcher.cand_queue; + + dist_t topk_ub_dist = feat_vec_t::distance( + query, + feature_vec.get_node_feat(init_node) + ); + topk_queue.emplace(topk_ub_dist, init_node); + cand_queue.emplace(topk_ub_dist, init_node); + searcher.mark_visited(init_node); + + while (!cand_queue.empty()) { + pair_t cand_pair = cand_queue.top(); + if (cand_pair.dist > topk_ub_dist) { + break; + } + cand_queue.pop(); + + index_type cand_node = cand_pair.node_id; + + // visiting neighbors of candidate node + const auto neighbors = G0Q->get_neighborhood(cand_node, level); + if (neighbors.degree() != 0) { + index_type max_j = neighbors.degree() - 1; + + searcher.approximate_distance(max_j + 1, G0Q->get_neighbor_codes(cand_node)); + for (index_type j = 0; j <= max_j; j++) { + auto next_node = neighbors[j]; + dist_t next_lb_dist = searcher.appx_dist[j]; + + if (topk_queue.size() < efS || next_lb_dist < topk_ub_dist) { + if (!searcher.is_visited(next_node)) { + searcher.mark_visited(next_node); + cand_queue.emplace(next_lb_dist, next_node); + topk_queue.emplace(next_lb_dist, next_node); + if (topk_queue.size() > efS) { + topk_queue.pop(); + } + if (!topk_queue.empty()) { + topk_ub_dist = topk_queue.top().dist; + } + } + } + } + } + + } + + + + return topk_queue; + } + }; +} // end of namespace ann +} // end of namespace pecos diff --git a/pecos/core/ann/quantizer.hpp b/pecos/core/ann/quantizer.hpp new file mode 100644 index 0000000..5e4f12a --- /dev/null +++ b/pecos/core/ann/quantizer.hpp @@ -0,0 +1,22 @@ +/* + * Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance + * with the License. A copy of the License is located at + * + * http://aws.amazon.com/apache2.0/ + * + * or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES + * OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + */ + + +#if defined(__x86_64__) || defined(__amd64__) + #include "quantizer_impl/x86.hpp" +#elif defined(__aarch64__) + #include "quantizer_impl/aarch64.hpp" +#else + #include "quantizer_impl/default.hpp" +#endif + diff --git a/pecos/core/ann/quantizer_impl/aarch64.hpp b/pecos/core/ann/quantizer_impl/aarch64.hpp new file mode 100644 index 0000000..d44571c --- /dev/null +++ b/pecos/core/ann/quantizer_impl/aarch64.hpp @@ -0,0 +1,16 @@ +/* + * Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance + * with the License. A copy of the License is located at + * + * http://aws.amazon.com/apache2.0/ + * + * or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES + * OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + */ + + +#pragma once +#include "default.hpp" diff --git a/pecos/core/ann/quantizer_impl/common.hpp b/pecos/core/ann/quantizer_impl/common.hpp new file mode 100644 index 0000000..c319bd3 --- /dev/null +++ b/pecos/core/ann/quantizer_impl/common.hpp @@ -0,0 +1,243 @@ +/* + * Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance + * with the License. A copy of the License is located at + * + * http://aws.amazon.com/apache2.0/ + * + * or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES + * OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + */ + +#include +#include +#include +#include "utils/clustering.hpp" + +namespace pecos { + +namespace ann { + + + struct ProductQuantizer4BitsBase { + // num_of_local_centroids denotes number of cluster centers used in quantization + // In 4 Bit case, it's a fixed to be 16 + const size_t num_of_local_centroids = 16; + // num_local_codebooks denotes number of local codebooks we have or in other words, + // number of subspace we have in Product Quantization. + // Supposedly, num_local_codebooks * local_dimension equals dimension of original data vector + index_type num_local_codebooks; + // local dimension denotes the dimensionality of subspace in Product Quantization + int local_dimension; + std::vector global_centroid; + std::vector local_codebooks; + std::vector original_local_codebooks; + + inline void save(FILE* fp) const { + pecos::file_util::fput_multiple(&num_local_codebooks, 1, fp); + pecos::file_util::fput_multiple(&local_dimension, 1, fp); + size_t sz = global_centroid.size(); + pecos::file_util::fput_multiple(&sz, 1, fp); + if (sz) { + pecos::file_util::fput_multiple(&global_centroid[0], sz, fp); + } + sz = original_local_codebooks.size(); + pecos::file_util::fput_multiple(&sz, 1, fp); + if (sz) { + pecos::file_util::fput_multiple(&original_local_codebooks[0], sz, fp); + } + sz = local_codebooks.size(); + pecos::file_util::fput_multiple(&sz, 1, fp); + if (sz) { + pecos::file_util::fput_multiple(&local_codebooks[0], sz, fp); + } + } + + inline void load(FILE* fp) { + pecos::file_util::fget_multiple(&num_local_codebooks, 1, fp); + pecos::file_util::fget_multiple(&local_dimension, 1, fp); + size_t sz = 0; + pecos::file_util::fget_multiple(&sz, 1, fp); + global_centroid.resize(sz); + if (sz) { + pecos::file_util::fget_multiple(&global_centroid[0], sz, fp); + } + pecos::file_util::fget_multiple(&sz, 1, fp); + original_local_codebooks.resize(sz); + if (sz) { + pecos::file_util::fget_multiple(&original_local_codebooks[0], sz, fp); + } + pecos::file_util::fget_multiple(&sz, 1, fp); + local_codebooks.resize(sz); + if (sz) { + pecos::file_util::fget_multiple(&local_codebooks[0], sz, fp); + } + } + + inline void pack_codebook_for_inference_default() { + local_codebooks = original_local_codebooks; + } + + inline void pad_parameters_default(index_type& max_degree, size_t& code_dimension) {} + + inline void approximate_neighbor_group_distance_default(size_t neighbor_size, float* ds, const char* neighbor_codes, uint8_t* lut_ptr, float scale, float bias) const { + index_type num_groups = neighbor_size % 16 == 0 ? neighbor_size / 16 : neighbor_size / 16 + 1; + + std::vector d(num_of_local_centroids); + int ptr = 0; + + const uint8_t *localID = reinterpret_cast(neighbor_codes); + for (index_type iters = 0; iters < num_groups; iters++) { + memset(d.data(), 0, sizeof(uint32_t) * num_of_local_centroids); + uint8_t* local_lut_ptr = lut_ptr; + for (index_type i = 0; i < num_local_codebooks; i++) { + for (size_t k = 0; k < num_of_local_centroids; k++) { + uint8_t obj = *localID; + if (k % 2 == 0) { + obj &= 0x0f; + } else { + obj >>= 4; + localID++; + } + d[k] += *(local_lut_ptr + obj); + } + + local_lut_ptr += num_of_local_centroids; + } + for (size_t k = 0; k < num_of_local_centroids; k++) { + ds[k + ptr] = d[k] * scale + bias; + } + ptr += num_of_local_centroids; + } + } + + inline void setup_lut_default(float* query, uint8_t* lut_ptr, float& scale, float& bias) const { + float min = std::numeric_limits::max(); + float max = std::numeric_limits::min(); + // first iteration to calculate raw distance and max,min values for quantized lut + std::vector raw_dist(num_local_codebooks * num_of_local_centroids, 0); + std::vector qs(local_dimension); + for (index_type d = 0; d < num_local_codebooks; d++) { + for (int j = 0; j < local_dimension; j++) { + qs[j] = query[d * local_dimension + j] - global_centroid[d * local_dimension + j]; + } + for (size_t k = 0; k < num_of_local_centroids; k++) { + float tmp_v = 0; + for (int j = 0; j < local_dimension; j++) { + float v = (qs[j] - local_codebooks[d * num_of_local_centroids * local_dimension + k * local_dimension + j]); + tmp_v += (v * v); + } + raw_dist[d * num_of_local_centroids + k] = tmp_v; + max = std::max(max, tmp_v); + min = std::min(min, tmp_v); + } + } + + bias = min; + scale = (max - min) / 255.0; + // second iteration to calculate quantized distnace and put it into lut + for (index_type d = 0; d < num_local_codebooks; d++) { + for (size_t k = 0; k < num_of_local_centroids; k++) { + lut_ptr[d * num_of_local_centroids + k] = std::round((raw_dist[d * num_of_local_centroids + k] - bias) / scale); + } + } + } + + inline void encode(float* query, uint8_t* codes) { + for (index_type d = 0; d < num_local_codebooks; d++) { + std::vector results; + for (size_t k = 0; k < num_of_local_centroids; k++) { + float v = 0; + for (int j = 0; j < local_dimension; j++) { + float tmp_v = original_local_codebooks[d * num_of_local_centroids * local_dimension + k * local_dimension + j] + - (query[d * local_dimension + j] - global_centroid[d * local_dimension + j]); + v += (tmp_v * tmp_v); + } + results.push_back(v); + } + std::vector::iterator argmin_result = std::min_element(results.begin(), results.end()); + codes[d] = std::distance(results.begin(), argmin_result); + } + } + + inline void compute_centroids(pecos::drm_t& X, int dsub, size_t ksub, index_type *assign, float *centroids, int threads=1) { + // zero initialization for later do_axpy + memset(centroids, 0, ksub * dsub * sizeof(*centroids)); + std::vector centroids_size(ksub); + #pragma omp parallel num_threads(threads) + { + // each thread takes care of [c_l, c_r) + int rank = omp_get_thread_num(); + size_t c_l = (ksub * rank) / threads; + size_t c_r = (ksub * (rank + 1)) / threads; + for (size_t i = 0; i < X.rows; i++) { + auto ci = assign[i]; + if (ci >= c_l && ci < c_r) { + float* y = centroids + ci * dsub; + const auto& xi = X.get_row(i); + pecos::do_axpy(1.0, xi.val, y, dsub); + centroids_size[ci] += 1; + } + } + // normalize center vector + for (size_t ci = c_l; ci < c_r; ci++) { + float* y = centroids + ci * dsub; + pecos::do_scale(1.0 / centroids_size[ci], y, dsub); + } + } + } + + inline void train(const pecos::drm_t& X_trn, index_type num_local_codebooks, size_t sub_sample_points=0, int seed=0, size_t max_iter=10, int threads=32) { + size_t dimension = X_trn.cols; + if (dimension % num_local_codebooks != 0) { + throw std::runtime_error("Original dimension must be divided by subspace dimension"); + } + this->num_local_codebooks = num_local_codebooks; + local_dimension = dimension / num_local_codebooks; + index_type n_data = X_trn.rows; + if (sub_sample_points == 0) { + sub_sample_points = n_data; + } + + std::vector centroids; + original_local_codebooks.resize(num_local_codebooks * num_of_local_centroids * dimension); + global_centroid.resize(dimension, 0); + + std::vector xslice(sub_sample_points * local_dimension); + for (index_type m = 0; m < num_local_codebooks; m++) { + std::vector indices(n_data, 0); + std::iota(indices.data(), indices.data() + n_data, 0); + std::random_shuffle(indices.data(), indices.data() + n_data); + for (size_t i = 0; i < sub_sample_points; i++) { + size_t index = indices[i]; + std::memcpy(xslice.data() + i * local_dimension, X_trn.val + index * dimension + m * local_dimension, local_dimension * sizeof(float)); + } + pecos::drm_t Xsub; + Xsub.rows = sub_sample_points; + Xsub.cols = local_dimension; + Xsub.val = xslice.data(); + + // fit HLT or flat-Kmeans for each sub-space + std::vector assignments(sub_sample_points); + pecos::clustering::Tree hlt(std::log2(num_of_local_centroids)); + hlt.run_clustering( + Xsub, + 0, + seed, + assignments.data(), + max_iter, + threads); + + compute_centroids(Xsub, local_dimension, num_of_local_centroids, assignments.data(), + &original_local_codebooks[m * num_of_local_centroids * local_dimension], threads); + } + } + + }; + + +} // end of namespace ann +} // end of namespace pecos + diff --git a/pecos/core/ann/quantizer_impl/default.hpp b/pecos/core/ann/quantizer_impl/default.hpp new file mode 100644 index 0000000..12b6e4b --- /dev/null +++ b/pecos/core/ann/quantizer_impl/default.hpp @@ -0,0 +1,42 @@ +/* + * Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance + * with the License. A copy of the License is located at + * + * http://aws.amazon.com/apache2.0/ + * + * or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES + * OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + */ + + +#pragma once +#include "common.hpp" + +namespace pecos { + +namespace ann { + + struct ProductQuantizer4Bits : ProductQuantizer4BitsBase { + void pack_codebook_for_inference() { + pack_codebook_for_inference_default(); + } + + void pad_parameters(index_type& max_degree, size_t& code_dimension) { + pad_parameters_default(max_degree, code_dimension); + } + + inline void approximate_neighbor_group_distance(size_t neighbor_size, float* ds, const char* neighbor_codes, uint8_t* lut_ptr, float scale, float bias) const { + approximate_neighbor_group_distance_default(neighbor_size, ds, neighbor_codes, lut_ptr, scale, bias); + } + + inline void setup_lut(float* query, uint8_t* lut_ptr, float& scale, float& bias) const { + setup_lut_default(query, lut_ptr, scale, bias); + } + }; + +} // end of namespace ann +} // end of namespace pecos + diff --git a/pecos/core/ann/quantizer_impl/x86.hpp b/pecos/core/ann/quantizer_impl/x86.hpp new file mode 100644 index 0000000..3f32ddf --- /dev/null +++ b/pecos/core/ann/quantizer_impl/x86.hpp @@ -0,0 +1,228 @@ +/* + * Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance + * with the License. A copy of the License is located at + * + * http://aws.amazon.com/apache2.0/ + * + * or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES + * OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions + * and limitations under the License. + */ + +#pragma once +#if defined(__GNUC__) +#define PORTABLE_ALIGN32 __attribute__((aligned(32))) +#else +#define PORTABLE_ALIGN32 __declspec(align(32)) +#endif + +#pragma once +#ifndef NO_MANUAL_VECTORIZATION +#ifdef __SSE__ +#define USE_SSE +#ifdef __AVX__ +#define USE_AVX +#endif +#endif +#endif + +#if defined(USE_AVX) || defined(USE_SSE) +#ifdef _MSC_VER +#include +#include +#else +#include +#endif +#endif + +#include +#include +#include +#pragma once +#include "common.hpp" +#include "utils/clustering.hpp" + +namespace pecos { + +namespace ann { + + struct ProductQuantizer4Bits : ProductQuantizer4BitsBase { + + __attribute__((__target__("default"))) + void pack_codebook_for_inference() { + pack_codebook_for_inference_default(); + } + + __attribute__((__target__("avx512f"))) + void pack_codebook_for_inference() { + local_codebooks.resize(original_local_codebooks.size(), 0); + for (index_type i = 0; i < num_local_codebooks; i++) { + for (size_t j = 0; j < num_of_local_centroids; j++) { + for (int k = 0; k < local_dimension; k++) { + local_codebooks[i * num_of_local_centroids * local_dimension + k * num_of_local_centroids + j] + = original_local_codebooks[i * num_of_local_centroids * local_dimension + j * local_dimension + k]; + } + } + } + } + + __attribute__((__target__("default"))) + void pad_parameters(index_type& max_degree, size_t& code_dimension) { + pad_parameters_default(max_degree, code_dimension); + } + + __attribute__((__target__("avx512f"))) + void pad_parameters(index_type& max_degree, size_t& code_dimension) { + // When using AVX512f, we have 16 centroids per local codebook, and each of it uses 8 bits to represent quantized + // distance value. Thus, we will have 128 bits to load 1 set of local codebooks. Thus, a loadu_si512 will load + // 512 / 128 == 4 local codebooks at a time. Thus, will need to be adjust the code_dimension to make it divisible + // by 4. If it's not, we have to pad 0 to extended dimensions. + code_dimension = code_dimension % 4 == 0 ? code_dimension : (code_dimension / 4 + 1) * 4; + + // AVX512f will process 16 operations in parallel, and due to memory layout issues, to achieve fast memory load, + // we have to process all dimensions of 16 members first then move to next 16 neighbors. Therefore, we have to make + // sure the max_degree is a multiple of 16 so there will be no segmentation faults when reading codes from graph. + max_degree = max_degree % 16 == 0 ? max_degree : (max_degree / 16 + 1) * 16; + } + + __attribute__((__target__("avx512f"))) + inline void approximate_neighbor_group_distance(size_t neighbor_size, float* ds, const char* neighbor_codes, uint8_t* lut_ptr, float scale, float bias) const { + // When using AVX512f, we have 16 centroids per local codebook, and each of it uses 8 bits to represent quantized + // distance value. Thus, we will have 128 bits to load 1 set of local codebooks. Thus, a loadu_si512 will load + // 512 / 128 == 4 local codebooks at a time. Thus, number of loadu_si512 perfomred on a group will need to be + // adjusted baed on if num_local_codebooks is divisible by 4. + index_type num_dimension_block = num_local_codebooks % 4 == 0 ? num_local_codebooks / 4 : num_local_codebooks / 4 + 1; + + // Similarly, we have to parse every 16 neighbors at a time to maximally leverage avx512f. + // Thus we can also calculate the number of group iterations based on if neighbor_size id + // divisible by 16. + index_type num_groups = neighbor_size % 16 == 0 ? neighbor_size / 16 : neighbor_size / 16 + 1; + + const uint8_t *localID = reinterpret_cast(neighbor_codes); + float* d = ds; + + for (index_type iters = 0; iters < num_groups; iters++) { + uint8_t *local_lut_ptr = lut_ptr; + __m512i sum_result = _mm512_setzero_si512(); + + for (index_type r = 0; r < num_dimension_block; r++) { + __m512i lookup_table = _mm512_loadu_si512((__m512i const*)local_lut_ptr); + // Each time, avx512f will load 4 x 16 entries from lookup table. We + // prefech the position which will be accessed 8 rounds later. + _mm_prefetch(&localID[0] + 64 * 8, _MM_HINT_T0); + __m512i packedobj = _mm512_cvtepu8_epi16(_mm256_loadu_si256((__m256i const*)localID)); + __m512i mask512x0F = _mm512_set1_epi16(0x000f); // #32 of 16 bits mask + __m512i mask512xF0 = _mm512_set1_epi16(0x00f0); + __m512i lo = _mm512_and_si512(packedobj, mask512x0F); + __m512i hi = _mm512_slli_epi16(_mm512_and_si512(packedobj, mask512xF0), 4); + __m512i obj = _mm512_or_si512(lo, hi); + __m512i vtmp = _mm512_shuffle_epi8(lookup_table, obj); + + sum_result = _mm512_adds_epu16(sum_result, _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(vtmp, 0))); + sum_result = _mm512_adds_epu16(sum_result, _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(vtmp, 1))); + + // Essentially, avx512 will process 4 x 16 entries of lookup table at a time + // Thus, we want to move 64 positions after a round of process. + // The reason why neighbor id only moves 32 positions is due to the fact that + // 2 neighbor code is saved in a byte, and will be expanded via avx512f operations. + // So we only need to move 32 positions and effectively extract 64 codes. + local_lut_ptr += 64; + localID += 32; + } + + __m512i lo = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(sum_result, 0)); + __m512i hi = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(sum_result, 1)); + __m512 distance = _mm512_cvtepi32_ps(_mm512_add_epi32(lo, hi)); + + __m512 _scale = _mm512_set1_ps(scale); + __m512 _bias = _mm512_set1_ps(bias); + distance = _mm512_mul_ps(distance, _scale); + distance = _mm512_add_ps(distance, _bias); + + _mm512_storeu_ps(d, distance); + d += 16; + } + } + + __attribute__((__target__("default"))) + inline void approximate_neighbor_group_distance(size_t neighbor_size, float* ds, const char* neighbor_codes, uint8_t* lut_ptr, float scale, float bias) const { + approximate_neighbor_group_distance_default(neighbor_size, ds, neighbor_codes, lut_ptr, scale, bias); + } + + __attribute__((__target__("default"))) + inline void setup_lut(float* query, uint8_t* lut_ptr, float& scale, float& bias) const { + setup_lut_default(query, lut_ptr, scale, bias); + } + + __attribute__((__target__("avx512f"))) + inline void setup_lut(float* query, uint8_t* lut_ptr, float& scale, float& bias) const { + int original_dimension = num_local_codebooks * local_dimension; + + // avx512f can load 16 float32 in parallel. Thus we need to calculate + // how many round of computation needed to get redisual vector + int global_blocks = original_dimension % 16 == 0 ? original_dimension / 16 : original_dimension / 16 + 1; + int extend_dimension = global_blocks * 16; + std::vector centered_query(extend_dimension, 0); + float min = std::numeric_limits::max(); + float max = std::numeric_limits::min(); + + // first iteration to calculate raw distance and max,min values for quantized lut + const float *globalID = global_centroid.data(); + const float *localID = local_codebooks.data(); + const float *query_ptr = ¢ered_query[0]; + std::vector raw_dist(num_local_codebooks * num_of_local_centroids, 0); + + // AVX512 on centering query + for (int i = 0; i < global_blocks; i++) { + __m512 q = _mm512_loadu_ps(query); + __m512 g = _mm512_loadu_ps(globalID); + _mm512_storeu_ps(¢ered_query[i * 16], _mm512_sub_ps(q, g)); + // AVX512 read 16 floats at a time so query and globalID will move 16 positions after a round + query += 16; + globalID += 16; + } + + for (index_type d = 0; d < num_local_codebooks; d++) { + __m512 tmp_v = _mm512_setzero_ps(); + // prefect data used in the next round. It's currently decided by experience and observance of good empirical + // results. The best prefetch position could be determined by a more complete empirical study. + _mm_prefetch(localID + local_dimension * 16, _MM_HINT_T0); + _mm_prefetch(raw_dist.data() + d * num_of_local_centroids, _MM_HINT_T0); + + for (int j = 0; j < local_dimension; j++) { + __m512 q = _mm512_set1_ps(*query_ptr++); + __m512 l = _mm512_loadu_ps(localID); + __m512 v = _mm512_sub_ps(q, l); + v = _mm512_mul_ps(v, v); + tmp_v = _mm512_add_ps(tmp_v, v); + // AVX512 read 16 floats at a time so locaiID will move 16 positions after a round + localID += 16; + } + _mm512_storeu_ps(&raw_dist[d * num_of_local_centroids], tmp_v); + max = std::max(max, _mm512_reduce_max_ps(tmp_v)); + min = std::min(min, _mm512_reduce_min_ps(tmp_v)); + } + + bias = min; + scale = (max - min) / 255.0; + __m512 _scale = _mm512_set1_ps(scale); + __m512 _bias = _mm512_set1_ps(bias); + auto *raw_ptr = raw_dist.data(); + for (index_type d = 0; d < num_local_codebooks; d++) { + __m512 raw_table = _mm512_loadu_ps(raw_ptr); + raw_table = _mm512_sub_ps(raw_table, _bias); + raw_table = _mm512_div_ps(raw_table, _scale); + raw_table = _mm512_roundscale_ps(raw_table, _MM_FROUND_TO_NEAREST_INT); + _mm_storeu_si128(reinterpret_cast<__m128i*>(lut_ptr), _mm512_cvtepi32_epi8(_mm512_cvtps_epi32(raw_table))); + // AVX512 read 16 floats at a time so raw distance table and final lookup table will move 16 positions after a round + raw_ptr += 16; + lut_ptr += 16; + } + } + }; + +} // end of namespace ann +} // end of namespace pecos +