diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt index f88907d397..291b225cd7 100644 --- a/faiss/CMakeLists.txt +++ b/faiss/CMakeLists.txt @@ -72,6 +72,7 @@ set(FAISS_SRC impl/pq4_fast_scan.cpp impl/pq4_fast_scan_search_1.cpp impl/pq4_fast_scan_search_qbs.cpp + impl/residual_quantizer_encode_steps.cpp impl/io.cpp impl/lattice_Zn.cpp impl/NNDescent.cpp @@ -171,6 +172,7 @@ set(FAISS_HEADERS impl/lattice_Zn.h impl/platform_macros.h impl/pq4_fast_scan.h + impl/residual_quantizer_encode_steps.h impl/simd_result_handlers.h impl/code_distance/code_distance.h impl/code_distance/code_distance-generic.h diff --git a/faiss/IndexAdditiveQuantizer.cpp b/faiss/IndexAdditiveQuantizer.cpp index 153b766e4a..ffd4c524f6 100644 --- a/faiss/IndexAdditiveQuantizer.cpp +++ b/faiss/IndexAdditiveQuantizer.cpp @@ -5,9 +5,6 @@ * LICENSE file in the root directory of this source tree. */ -// quiet the noise -// clang-format off - #include #include @@ -21,7 +18,6 @@ #include #include - namespace faiss { /************************************************************************************** @@ -29,15 +25,13 @@ namespace faiss { **************************************************************************************/ IndexAdditiveQuantizer::IndexAdditiveQuantizer( - idx_t d, - AdditiveQuantizer* aq, - MetricType metric): - IndexFlatCodes(aq->code_size, d, metric), aq(aq) -{ + idx_t d, + AdditiveQuantizer* aq, + MetricType metric) + : IndexFlatCodes(aq->code_size, d, metric), aq(aq) { FAISS_THROW_IF_NOT(metric == METRIC_INNER_PRODUCT || metric == METRIC_L2); } - namespace { /************************************************************ @@ -45,21 +39,22 @@ namespace { ************************************************************/ template -struct AQDistanceComputerDecompress: FlatCodesDistanceComputer { +struct AQDistanceComputerDecompress : FlatCodesDistanceComputer { std::vector tmp; - const AdditiveQuantizer & aq; + const AdditiveQuantizer& aq; VectorDistance vd; size_t d; - AQDistanceComputerDecompress(const IndexAdditiveQuantizer &iaq, VectorDistance vd): - FlatCodesDistanceComputer(iaq.codes.data(), iaq.code_size), - tmp(iaq.d * 2), - aq(*iaq.aq), - vd(vd), - d(iaq.d) - {} + AQDistanceComputerDecompress( + const IndexAdditiveQuantizer& iaq, + VectorDistance vd) + : FlatCodesDistanceComputer(iaq.codes.data(), iaq.code_size), + tmp(iaq.d * 2), + aq(*iaq.aq), + vd(vd), + d(iaq.d) {} - const float *q; + const float* q; void set_query(const float* x) final { q = x; } @@ -70,7 +65,7 @@ struct AQDistanceComputerDecompress: FlatCodesDistanceComputer { return vd(tmp.data(), tmp.data() + d); } - float distance_to_code(const uint8_t *code) final { + float distance_to_code(const uint8_t* code) final { aq.decode(code, tmp.data(), 1); return vd(q, tmp.data()); } @@ -78,19 +73,17 @@ struct AQDistanceComputerDecompress: FlatCodesDistanceComputer { virtual ~AQDistanceComputerDecompress() {} }; - -template -struct AQDistanceComputerLUT: FlatCodesDistanceComputer { +template +struct AQDistanceComputerLUT : FlatCodesDistanceComputer { std::vector LUT; - const AdditiveQuantizer & aq; + const AdditiveQuantizer& aq; size_t d; - explicit AQDistanceComputerLUT(const IndexAdditiveQuantizer &iaq): - FlatCodesDistanceComputer(iaq.codes.data(), iaq.code_size), - LUT(iaq.aq->total_codebook_size + iaq.d * 2), - aq(*iaq.aq), - d(iaq.d) - {} + explicit AQDistanceComputerLUT(const IndexAdditiveQuantizer& iaq) + : FlatCodesDistanceComputer(iaq.codes.data(), iaq.code_size), + LUT(iaq.aq->total_codebook_size + iaq.d * 2), + aq(*iaq.aq), + d(iaq.d) {} float bias; void set_query(const float* x) final { @@ -104,26 +97,23 @@ struct AQDistanceComputerLUT: FlatCodesDistanceComputer { } float symmetric_dis(idx_t i, idx_t j) final { - float *tmp = LUT.data(); + float* tmp = LUT.data(); aq.decode(codes + i * d, tmp, 1); aq.decode(codes + j * d, tmp + d, 1); return fvec_L2sqr(tmp, tmp + d, d); } - float distance_to_code(const uint8_t *code) final { + float distance_to_code(const uint8_t* code) final { return bias + aq.compute_1_distance_LUT(code, LUT.data()); } virtual ~AQDistanceComputerLUT() {} }; - - /************************************************************ * scanning implementation for search ************************************************************/ - template void search_with_decompress( const IndexAdditiveQuantizer& ir, @@ -133,11 +123,11 @@ void search_with_decompress( const uint8_t* codes = ir.codes.data(); size_t ntotal = ir.ntotal; size_t code_size = ir.code_size; - const AdditiveQuantizer *aq = ir.aq; + const AdditiveQuantizer* aq = ir.aq; using SingleResultHandler = typename ResultHandler::SingleResultHandler; -#pragma omp parallel for if(res.nq > 100) +#pragma omp parallel for if (res.nq > 100) for (int64_t q = 0; q < res.nq; q++) { SingleResultHandler resi(res); resi.begin(q); @@ -152,13 +142,12 @@ void search_with_decompress( } } -template +template void search_with_LUT( const IndexAdditiveQuantizer& ir, const float* xq, - ResultHandler& res) -{ - const AdditiveQuantizer & aq = *ir.aq; + ResultHandler& res) { + const AdditiveQuantizer& aq = *ir.aq; const uint8_t* codes = ir.codes.data(); size_t ntotal = ir.ntotal; size_t code_size = aq.code_size; @@ -166,38 +155,34 @@ void search_with_LUT( size_t d = ir.d; using SingleResultHandler = typename ResultHandler::SingleResultHandler; - std::unique_ptr LUT(new float[nq * aq.total_codebook_size]); + std::unique_ptr LUT(new float[nq * aq.total_codebook_size]); aq.compute_LUT(nq, xq, LUT.get()); -#pragma omp parallel for if(nq > 100) +#pragma omp parallel for if (nq > 100) for (int64_t q = 0; q < nq; q++) { SingleResultHandler resi(res); resi.begin(q); std::vector tmp(aq.d); - const float *LUT_q = LUT.get() + aq.total_codebook_size * q; + const float* LUT_q = LUT.get() + aq.total_codebook_size * q; float bias = 0; - if (!is_IP) { // the LUT function returns ||y||^2 - 2 * , need to add ||x||^2 + if (!is_IP) { // the LUT function returns ||y||^2 - 2 * , need to + // add ||x||^2 bias = fvec_norm_L2sqr(xq + q * d, d); } for (size_t i = 0; i < ntotal; i++) { float dis = aq.compute_1_distance_LUT( - codes + i * code_size, - LUT_q - ); + codes + i * code_size, LUT_q); resi.add_result(dis + bias, i); } resi.end(); } - } - } // anonymous namespace - -FlatCodesDistanceComputer * IndexAdditiveQuantizer::get_FlatCodesDistanceComputer() const { - +FlatCodesDistanceComputer* IndexAdditiveQuantizer:: + get_FlatCodesDistanceComputer() const { if (aq->search_type == AdditiveQuantizer::ST_decompress) { if (metric_type == METRIC_L2) { using VD = VectorDistance; @@ -212,34 +197,36 @@ FlatCodesDistanceComputer * IndexAdditiveQuantizer::get_FlatCodesDistanceCompute } } else { if (metric_type == METRIC_INNER_PRODUCT) { - return new AQDistanceComputerLUT(*this); + return new AQDistanceComputerLUT< + true, + AdditiveQuantizer::ST_LUT_nonorm>(*this); } else { - switch(aq->search_type) { -#define DISPATCH(st) \ - case AdditiveQuantizer::st: \ - return new AQDistanceComputerLUT (*this);\ - break; - DISPATCH(ST_norm_float) - DISPATCH(ST_LUT_nonorm) - DISPATCH(ST_norm_qint8) - DISPATCH(ST_norm_qint4) - DISPATCH(ST_norm_cqint4) - case AdditiveQuantizer::ST_norm_cqint8: - case AdditiveQuantizer::ST_norm_lsq2x4: - case AdditiveQuantizer::ST_norm_rq2x4: - return new AQDistanceComputerLUT (*this);\ - break; + switch (aq->search_type) { +#define DISPATCH(st) \ + case AdditiveQuantizer::st: \ + return new AQDistanceComputerLUT(*this); \ + break; + DISPATCH(ST_norm_float) + DISPATCH(ST_LUT_nonorm) + DISPATCH(ST_norm_qint8) + DISPATCH(ST_norm_qint4) + DISPATCH(ST_norm_cqint4) + case AdditiveQuantizer::ST_norm_cqint8: + case AdditiveQuantizer::ST_norm_lsq2x4: + case AdditiveQuantizer::ST_norm_rq2x4: + return new AQDistanceComputerLUT< + false, + AdditiveQuantizer::ST_norm_cqint8>(*this); + break; #undef DISPATCH - default: - FAISS_THROW_FMT("search type %d not supported", aq->search_type); + default: + FAISS_THROW_FMT( + "search type %d not supported", aq->search_type); } } } } - - - void IndexAdditiveQuantizer::search( idx_t n, const float* x, @@ -247,8 +234,8 @@ void IndexAdditiveQuantizer::search( float* distances, idx_t* labels, const SearchParameters* params) const { - - FAISS_THROW_IF_NOT_MSG(!params, "search params not supported for this index"); + FAISS_THROW_IF_NOT_MSG( + !params, "search params not supported for this index"); if (aq->search_type == AdditiveQuantizer::ST_decompress) { if (metric_type == METRIC_L2) { @@ -264,45 +251,46 @@ void IndexAdditiveQuantizer::search( } } else { if (metric_type == METRIC_INNER_PRODUCT) { - HeapResultHandler > rh(n, distances, labels, k); - search_with_LUT (*this, x, rh); + HeapResultHandler> rh(n, distances, labels, k); + search_with_LUT( + *this, x, rh); } else { - HeapResultHandler > rh(n, distances, labels, k); - switch(aq->search_type) { -#define DISPATCH(st) \ - case AdditiveQuantizer::st: \ - search_with_LUT (*this, x, rh);\ - break; - DISPATCH(ST_norm_float) - DISPATCH(ST_LUT_nonorm) - DISPATCH(ST_norm_qint8) - DISPATCH(ST_norm_qint4) - DISPATCH(ST_norm_cqint4) - case AdditiveQuantizer::ST_norm_cqint8: - case AdditiveQuantizer::ST_norm_lsq2x4: - case AdditiveQuantizer::ST_norm_rq2x4: - search_with_LUT (*this, x, rh); - break; + HeapResultHandler> rh(n, distances, labels, k); + switch (aq->search_type) { +#define DISPATCH(st) \ + case AdditiveQuantizer::st: \ + search_with_LUT(*this, x, rh); \ + break; + DISPATCH(ST_norm_float) + DISPATCH(ST_LUT_nonorm) + DISPATCH(ST_norm_qint8) + DISPATCH(ST_norm_qint4) + DISPATCH(ST_norm_cqint4) + case AdditiveQuantizer::ST_norm_cqint8: + case AdditiveQuantizer::ST_norm_lsq2x4: + case AdditiveQuantizer::ST_norm_rq2x4: + search_with_LUT( + *this, x, rh); + break; #undef DISPATCH - default: - FAISS_THROW_FMT("search type %d not supported", aq->search_type); + default: + FAISS_THROW_FMT( + "search type %d not supported", aq->search_type); } } - } } -void IndexAdditiveQuantizer::sa_encode(idx_t n, const float* x, uint8_t* bytes) const { +void IndexAdditiveQuantizer::sa_encode(idx_t n, const float* x, uint8_t* bytes) + const { return aq->compute_codes(x, bytes, n); } -void IndexAdditiveQuantizer::sa_decode(idx_t n, const uint8_t* bytes, float* x) const { +void IndexAdditiveQuantizer::sa_decode(idx_t n, const uint8_t* bytes, float* x) + const { return aq->decode(bytes, x, n); } - - - /************************************************************************************** * IndexResidualQuantizer **************************************************************************************/ @@ -313,8 +301,11 @@ IndexResidualQuantizer::IndexResidualQuantizer( size_t nbits, ///< number of bit per subvector index MetricType metric, Search_type_t search_type) - : IndexResidualQuantizer(d, std::vector(M, nbits), metric, search_type) { -} + : IndexResidualQuantizer( + d, + std::vector(M, nbits), + metric, + search_type) {} IndexResidualQuantizer::IndexResidualQuantizer( int d, @@ -326,14 +317,14 @@ IndexResidualQuantizer::IndexResidualQuantizer( is_trained = false; } -IndexResidualQuantizer::IndexResidualQuantizer() : IndexResidualQuantizer(0, 0, 0) {} +IndexResidualQuantizer::IndexResidualQuantizer() + : IndexResidualQuantizer(0, 0, 0) {} void IndexResidualQuantizer::train(idx_t n, const float* x) { rq.train(n, x); is_trained = true; } - /************************************************************************************** * IndexLocalSearchQuantizer **************************************************************************************/ @@ -344,31 +335,33 @@ IndexLocalSearchQuantizer::IndexLocalSearchQuantizer( size_t nbits, ///< number of bit per subvector index MetricType metric, Search_type_t search_type) - : IndexAdditiveQuantizer(d, &lsq, metric), lsq(d, M, nbits, search_type) { + : IndexAdditiveQuantizer(d, &lsq, metric), + lsq(d, M, nbits, search_type) { code_size = lsq.code_size; is_trained = false; } -IndexLocalSearchQuantizer::IndexLocalSearchQuantizer() : IndexLocalSearchQuantizer(0, 0, 0) {} +IndexLocalSearchQuantizer::IndexLocalSearchQuantizer() + : IndexLocalSearchQuantizer(0, 0, 0) {} void IndexLocalSearchQuantizer::train(idx_t n, const float* x) { lsq.train(n, x); is_trained = true; } - /************************************************************************************** * IndexProductResidualQuantizer **************************************************************************************/ IndexProductResidualQuantizer::IndexProductResidualQuantizer( - int d, ///< dimensionality of the input vectors + int d, ///< dimensionality of the input vectors size_t nsplits, ///< number of residual quantizers - size_t Msub, ///< number of subquantizers per RQ - size_t nbits, ///< number of bit per subvector index + size_t Msub, ///< number of subquantizers per RQ + size_t nbits, ///< number of bit per subvector index MetricType metric, Search_type_t search_type) - : IndexAdditiveQuantizer(d, &prq, metric), prq(d, nsplits, Msub, nbits, search_type) { + : IndexAdditiveQuantizer(d, &prq, metric), + prq(d, nsplits, Msub, nbits, search_type) { code_size = prq.code_size; is_trained = false; } @@ -381,19 +374,19 @@ void IndexProductResidualQuantizer::train(idx_t n, const float* x) { is_trained = true; } - /************************************************************************************** * IndexProductLocalSearchQuantizer **************************************************************************************/ IndexProductLocalSearchQuantizer::IndexProductLocalSearchQuantizer( - int d, ///< dimensionality of the input vectors + int d, ///< dimensionality of the input vectors size_t nsplits, ///< number of local search quantizers - size_t Msub, ///< number of subquantizers per LSQ - size_t nbits, ///< number of bit per subvector index + size_t Msub, ///< number of subquantizers per LSQ + size_t nbits, ///< number of bit per subvector index MetricType metric, Search_type_t search_type) - : IndexAdditiveQuantizer(d, &plsq, metric), plsq(d, nsplits, Msub, nbits, search_type) { + : IndexAdditiveQuantizer(d, &plsq, metric), + plsq(d, nsplits, Msub, nbits, search_type) { code_size = plsq.code_size; is_trained = false; } @@ -406,17 +399,15 @@ void IndexProductLocalSearchQuantizer::train(idx_t n, const float* x) { is_trained = true; } - /************************************************************************************** * AdditiveCoarseQuantizer **************************************************************************************/ AdditiveCoarseQuantizer::AdditiveCoarseQuantizer( - idx_t d, - AdditiveQuantizer* aq, - MetricType metric): - Index(d, metric), aq(aq) -{} + idx_t d, + AdditiveQuantizer* aq, + MetricType metric) + : Index(d, metric), aq(aq) {} void AdditiveCoarseQuantizer::add(idx_t, const float*) { FAISS_THROW_MSG("not applicable"); @@ -430,17 +421,16 @@ void AdditiveCoarseQuantizer::reset() { FAISS_THROW_MSG("not applicable"); } - void AdditiveCoarseQuantizer::train(idx_t n, const float* x) { if (verbose) { - printf("AdditiveCoarseQuantizer::train: training on %zd vectors\n", size_t(n)); + printf("AdditiveCoarseQuantizer::train: training on %zd vectors\n", + size_t(n)); } size_t norms_size = sizeof(float) << aq->tot_bits; - FAISS_THROW_IF_NOT_MSG ( - norms_size <= aq->max_mem_distances, - "the RCQ norms matrix will become too large, please reduce the number of quantization steps" - ); + FAISS_THROW_IF_NOT_MSG( + norms_size <= aq->max_mem_distances, + "the RCQ norms matrix will become too large, please reduce the number of quantization steps"); aq->train(n, x); is_trained = true; @@ -448,7 +438,8 @@ void AdditiveCoarseQuantizer::train(idx_t n, const float* x) { if (metric_type == METRIC_L2) { if (verbose) { - printf("AdditiveCoarseQuantizer::train: computing centroid norms for %zd centroids\n", size_t(ntotal)); + printf("AdditiveCoarseQuantizer::train: computing centroid norms for %zd centroids\n", + size_t(ntotal)); } // this is not necessary for the residualcoarsequantizer when // using beam search. We'll see if the memory overhead is too high @@ -463,16 +454,15 @@ void AdditiveCoarseQuantizer::search( idx_t k, float* distances, idx_t* labels, - const SearchParameters * params) const { - - FAISS_THROW_IF_NOT_MSG(!params, "search params not supported for this index"); + const SearchParameters* params) const { + FAISS_THROW_IF_NOT_MSG( + !params, "search params not supported for this index"); if (metric_type == METRIC_INNER_PRODUCT) { aq->knn_centroids_inner_product(n, x, k, distances, labels); } else if (metric_type == METRIC_L2) { FAISS_THROW_IF_NOT(centroid_norms.size() == ntotal); - aq->knn_centroids_L2( - n, x, k, distances, labels, centroid_norms.data()); + aq->knn_centroids_L2(n, x, k, distances, labels, centroid_norms.data()); } } @@ -481,7 +471,7 @@ void AdditiveCoarseQuantizer::search( **************************************************************************************/ ResidualCoarseQuantizer::ResidualCoarseQuantizer( - int d, ///< dimensionality of the input vectors + int d, ///< dimensionality of the input vectors const std::vector& nbits, MetricType metric) : AdditiveCoarseQuantizer(d, &rq, metric), rq(d, nbits) { @@ -496,21 +486,30 @@ ResidualCoarseQuantizer::ResidualCoarseQuantizer( MetricType metric) : ResidualCoarseQuantizer(d, std::vector(M, nbits), metric) {} -ResidualCoarseQuantizer::ResidualCoarseQuantizer(): ResidualCoarseQuantizer(0, 0, 0) {} - - +ResidualCoarseQuantizer::ResidualCoarseQuantizer() + : ResidualCoarseQuantizer(0, 0, 0) {} void ResidualCoarseQuantizer::set_beam_factor(float new_beam_factor) { beam_factor = new_beam_factor; if (new_beam_factor > 0) { FAISS_THROW_IF_NOT(new_beam_factor >= 1.0); + if (rq.codebook_cross_products.size() == 0) { + rq.compute_codebook_tables(); + } return; - } else if (metric_type == METRIC_L2 && ntotal != centroid_norms.size()) { - if (verbose) { - printf("AdditiveCoarseQuantizer::train: computing centroid norms for %zd centroids\n", size_t(ntotal)); + } else { + // new_beam_factor = -1: exhaustive computation. + // Does not use the cross_products + rq.codebook_cross_products.resize(0); + // but the centroid norms are necessary! + if (metric_type == METRIC_L2 && ntotal != centroid_norms.size()) { + if (verbose) { + printf("AdditiveCoarseQuantizer::train: computing centroid norms for %zd centroids\n", + size_t(ntotal)); + } + centroid_norms.resize(ntotal); + aq->compute_centroid_norms(centroid_norms.data()); } - centroid_norms.resize(ntotal); - aq->compute_centroid_norms(centroid_norms.data()); } } @@ -520,13 +519,15 @@ void ResidualCoarseQuantizer::search( idx_t k, float* distances, idx_t* labels, - const SearchParameters * params_in - ) const { - + const SearchParameters* params_in) const { float beam_factor = this->beam_factor; if (params_in) { - auto params = dynamic_cast(params_in); - FAISS_THROW_IF_NOT_MSG(params, "need SearchParametersResidualCoarseQuantizer parameters"); + auto params = + dynamic_cast( + params_in); + FAISS_THROW_IF_NOT_MSG( + params, + "need SearchParametersResidualCoarseQuantizer parameters"); beam_factor = params->beam_factor; } @@ -571,6 +572,7 @@ void ResidualCoarseQuantizer::search( rq.refine_beam( n, 1, x, beam_size, codes.data(), nullptr, beam_distances.data()); + // pack int32 table #pragma omp parallel for if (n > 4000) for (idx_t i = 0; i < n; i++) { memcpy(distances + i * k, @@ -590,7 +592,8 @@ void ResidualCoarseQuantizer::search( } } -void ResidualCoarseQuantizer::initialize_from(const ResidualCoarseQuantizer &other) { +void ResidualCoarseQuantizer::initialize_from( + const ResidualCoarseQuantizer& other) { FAISS_THROW_IF_NOT(rq.M <= other.rq.M); rq.initialize_from(other.rq); set_beam_factor(other.beam_factor); @@ -598,7 +601,6 @@ void ResidualCoarseQuantizer::initialize_from(const ResidualCoarseQuantizer &oth ntotal = (idx_t)1 << aq->tot_bits; } - /************************************************************************************** * LocalSearchCoarseQuantizer **************************************************************************************/ @@ -613,12 +615,8 @@ LocalSearchCoarseQuantizer::LocalSearchCoarseQuantizer( is_trained = false; } - LocalSearchCoarseQuantizer::LocalSearchCoarseQuantizer() { aq = &lsq; } - - - } // namespace faiss diff --git a/faiss/impl/AdditiveQuantizer.cpp b/faiss/impl/AdditiveQuantizer.cpp index ff6eb4a98a..c39d870e6d 100644 --- a/faiss/impl/AdditiveQuantizer.cpp +++ b/faiss/impl/AdditiveQuantizer.cpp @@ -370,6 +370,8 @@ void AdditiveQuantizer::compute_LUT( namespace { +/* compute inner products of one query with all centroids, given a look-up + * table of all inner producst with codebook entries */ void compute_inner_prod_with_LUT( const AdditiveQuantizer& aq, const float* LUT, diff --git a/faiss/impl/AdditiveQuantizer.h b/faiss/impl/AdditiveQuantizer.h index 289d205299..054b5c7677 100644 --- a/faiss/impl/AdditiveQuantizer.h +++ b/faiss/impl/AdditiveQuantizer.h @@ -49,11 +49,13 @@ struct AdditiveQuantizer : Quantizer { /// encode a norm into norm_bits bits uint64_t encode_norm(float norm) const; + /// encode norm by non-uniform scalar quantization uint32_t encode_qcint( - float x) const; ///< encode norm by non-uniform scalar quantization + float x) const; + /// decode norm by non-uniform scalar quantization float decode_qcint(uint32_t c) - const; ///< decode norm by non-uniform scalar quantization + const; /// Encodes how search is performed and how vectors are encoded enum Search_type_t { diff --git a/faiss/impl/ResidualQuantizer.cpp b/faiss/impl/ResidualQuantizer.cpp index fa8248a60a..2158567859 100644 --- a/faiss/impl/ResidualQuantizer.cpp +++ b/faiss/impl/ResidualQuantizer.cpp @@ -16,17 +16,12 @@ #include #include -#include #include -#include +#include #include #include #include -#include - -#include - extern "C" { // general matrix multiplication @@ -125,157 +120,9 @@ void ResidualQuantizer::initialize_from( } } -void beam_search_encode_step( - size_t d, - size_t K, - const float* cent, /// size (K, d) - size_t n, - size_t beam_size, - const float* residuals, /// size (n, beam_size, d) - size_t m, - const int32_t* codes, /// size (n, beam_size, m) - size_t new_beam_size, - int32_t* new_codes, /// size (n, new_beam_size, m + 1) - float* new_residuals, /// size (n, new_beam_size, d) - float* new_distances, /// size (n, new_beam_size) - Index* assign_index, - ApproxTopK_mode_t approx_topk_mode) { - // we have to fill in the whole output matrix - FAISS_THROW_IF_NOT(new_beam_size <= beam_size * K); - - std::vector cent_distances; - std::vector cent_ids; - - if (assign_index) { - // search beam_size distances per query - FAISS_THROW_IF_NOT(assign_index->d == d); - cent_distances.resize(n * beam_size * new_beam_size); - cent_ids.resize(n * beam_size * new_beam_size); - if (assign_index->ntotal != 0) { - // then we assume the codebooks are already added to the index - FAISS_THROW_IF_NOT(assign_index->ntotal == K); - } else { - assign_index->add(K, cent); - } - - // printf("beam_search_encode_step -- mem usage %zd\n", - // get_mem_usage_kb()); - assign_index->search( - n * beam_size, - residuals, - new_beam_size, - cent_distances.data(), - cent_ids.data()); - } else { - // do one big distance computation - cent_distances.resize(n * beam_size * K); - pairwise_L2sqr( - d, n * beam_size, residuals, K, cent, cent_distances.data()); - } - InterruptCallback::check(); - -#pragma omp parallel for if (n > 100) - for (int64_t i = 0; i < n; i++) { - const int32_t* codes_i = codes + i * m * beam_size; - int32_t* new_codes_i = new_codes + i * (m + 1) * new_beam_size; - const float* residuals_i = residuals + i * d * beam_size; - float* new_residuals_i = new_residuals + i * d * new_beam_size; - - float* new_distances_i = new_distances + i * new_beam_size; - using C = CMax; - - if (assign_index) { - const float* cent_distances_i = - cent_distances.data() + i * beam_size * new_beam_size; - const idx_t* cent_ids_i = - cent_ids.data() + i * beam_size * new_beam_size; - - // here we could be a tad more efficient by merging sorted arrays - for (int i = 0; i < new_beam_size; i++) { - new_distances_i[i] = C::neutral(); - } - std::vector perm(new_beam_size, -1); - heap_addn( - new_beam_size, - new_distances_i, - perm.data(), - cent_distances_i, - nullptr, - beam_size * new_beam_size); - heap_reorder(new_beam_size, new_distances_i, perm.data()); - - for (int j = 0; j < new_beam_size; j++) { - int js = perm[j] / new_beam_size; - int ls = cent_ids_i[perm[j]]; - if (m > 0) { - memcpy(new_codes_i, codes_i + js * m, sizeof(*codes) * m); - } - new_codes_i[m] = ls; - new_codes_i += m + 1; - fvec_sub( - d, - residuals_i + js * d, - cent + ls * d, - new_residuals_i); - new_residuals_i += d; - } - - } else { - const float* cent_distances_i = - cent_distances.data() + i * beam_size * K; - // then we have to select the best results - for (int i = 0; i < new_beam_size; i++) { - new_distances_i[i] = C::neutral(); - } - std::vector perm(new_beam_size, -1); - -#define HANDLE_APPROX(NB, BD) \ - case ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B##NB##_D##BD: \ - HeapWithBuckets::bs_addn( \ - beam_size, \ - K, \ - cent_distances_i, \ - new_beam_size, \ - new_distances_i, \ - perm.data()); \ - break; - - switch (approx_topk_mode) { - HANDLE_APPROX(8, 3) - HANDLE_APPROX(8, 2) - HANDLE_APPROX(16, 2) - HANDLE_APPROX(32, 2) - default: - heap_addn( - new_beam_size, - new_distances_i, - perm.data(), - cent_distances_i, - nullptr, - beam_size * K); - } - heap_reorder(new_beam_size, new_distances_i, perm.data()); - -#undef HANDLE_APPROX - - for (int j = 0; j < new_beam_size; j++) { - int js = perm[j] / K; - int ls = perm[j] % K; - if (m > 0) { - memcpy(new_codes_i, codes_i + js * m, sizeof(*codes) * m); - } - new_codes_i[m] = ls; - new_codes_i += m + 1; - fvec_sub( - d, - residuals_i + js * d, - cent + ls * d, - new_residuals_i); - new_residuals_i += d; - } - } - } -} +/**************************************************************** + * Training + ****************************************************************/ void ResidualQuantizer::train(size_t n, const float* x) { codebooks.resize(d * codebook_offsets.back()); @@ -568,180 +415,11 @@ size_t ResidualQuantizer::memory_per_point(int beam_size) const { return mem; } -// a namespace full of preallocated buffers -namespace { - -// Preallocated memory chunk for refine_beam_mp() call -struct RefineBeamMemoryPool { - std::vector new_codes; - std::vector new_residuals; +/**************************************************************** + * Encoding + ****************************************************************/ - std::vector residuals; - std::vector codes; - std::vector distances; -}; - -// Preallocated memory chunk for refine_beam_LUT_mp() call -struct RefineBeamLUTMemoryPool { - std::vector new_codes; - std::vector new_distances; - - std::vector codes; - std::vector distances; -}; - -// this is for use_beam_LUT == 0 in compute_codes_add_centroids_mp_lut0() call -struct ComputeCodesAddCentroidsLUT0MemoryPool { - std::vector codes; - std::vector norms; - std::vector distances; - std::vector residuals; - RefineBeamMemoryPool refine_beam_pool; -}; - -// this is for use_beam_LUT == 1 in compute_codes_add_centroids_mp_lut1() call -struct ComputeCodesAddCentroidsLUT1MemoryPool { - std::vector codes; - std::vector distances; - std::vector query_norms; - std::vector query_cp; - std::vector residuals; - RefineBeamLUTMemoryPool refine_beam_lut_pool; -}; - -} // namespace - -// forward declaration -void refine_beam_mp( - const ResidualQuantizer& rq, - size_t n, - size_t beam_size, - const float* x, - int out_beam_size, - int32_t* out_codes, - float* out_residuals, - float* out_distances, - RefineBeamMemoryPool& pool); - -// forward declaration -void refine_beam_LUT_mp( - const ResidualQuantizer& rq, - size_t n, - const float* query_norms, // size n - const float* query_cp, // - int out_beam_size, - int32_t* out_codes, - float* out_distances, - RefineBeamLUTMemoryPool& pool); - -// this is for use_beam_LUT == 0 -void compute_codes_add_centroids_mp_lut0( - const ResidualQuantizer& rq, - const float* x, - uint8_t* codes_out, - size_t n, - const float* centroids, - ComputeCodesAddCentroidsLUT0MemoryPool& pool) { - pool.codes.resize(rq.max_beam_size * rq.M * n); - pool.distances.resize(rq.max_beam_size * n); - - pool.residuals.resize(rq.max_beam_size * n * rq.d); - - refine_beam_mp( - rq, - n, - 1, - x, - rq.max_beam_size, - pool.codes.data(), - pool.residuals.data(), - pool.distances.data(), - pool.refine_beam_pool); - - if (rq.search_type == ResidualQuantizer::ST_norm_float || - rq.search_type == ResidualQuantizer::ST_norm_qint8 || - rq.search_type == ResidualQuantizer::ST_norm_qint4) { - pool.norms.resize(n); - // recover the norms of reconstruction as - // || original_vector - residual ||^2 - for (size_t i = 0; i < n; i++) { - pool.norms[i] = fvec_L2sqr( - x + i * rq.d, - pool.residuals.data() + i * rq.max_beam_size * rq.d, - rq.d); - } - } - - // pack only the first code of the beam - // (hence the ld_codes=M * max_beam_size) - rq.pack_codes( - n, - pool.codes.data(), - codes_out, - rq.M * rq.max_beam_size, - (pool.norms.size() > 0) ? pool.norms.data() : nullptr, - centroids); -} - -// use_beam_LUT == 1 -void compute_codes_add_centroids_mp_lut1( - const ResidualQuantizer& rq, - const float* x, - uint8_t* codes_out, - size_t n, - const float* centroids, - ComputeCodesAddCentroidsLUT1MemoryPool& pool) { - // - pool.codes.resize(rq.max_beam_size * rq.M * n); - pool.distances.resize(rq.max_beam_size * n); - - FAISS_THROW_IF_NOT_MSG( - rq.codebook_cross_products.size() == - rq.total_codebook_size * rq.total_codebook_size, - "call compute_codebook_tables first"); - - pool.query_norms.resize(n); - fvec_norms_L2sqr(pool.query_norms.data(), x, rq.d, n); - - pool.query_cp.resize(n * rq.total_codebook_size); - { - FINTEGER ti = rq.total_codebook_size, di = rq.d, ni = n; - float zero = 0, one = 1; - sgemm_("Transposed", - "Not transposed", - &ti, - &ni, - &di, - &one, - rq.codebooks.data(), - &di, - x, - &di, - &zero, - pool.query_cp.data(), - &ti); - } - - refine_beam_LUT_mp( - rq, - n, - pool.query_norms.data(), - pool.query_cp.data(), - rq.max_beam_size, - pool.codes.data(), - pool.distances.data(), - pool.refine_beam_lut_pool); - - // pack only the first code of the beam - // (hence the ld_codes=M * max_beam_size) - rq.pack_codes( - n, - pool.codes.data(), - codes_out, - rq.M * rq.max_beam_size, - nullptr, - centroids); -} +using namespace rq_encode_steps; void ResidualQuantizer::compute_codes_add_centroids( const float* x, @@ -769,11 +447,6 @@ void ResidualQuantizer::compute_codes_add_centroids( cent = centroids + i0 * d; } - // compute_codes_add_centroids( - // x + i0 * d, - // codes_out + i0 * code_size, - // i1 - i0, - // cent); if (use_beam_LUT == 0) { compute_codes_add_centroids_mp_lut0( *this, @@ -794,147 +467,6 @@ void ResidualQuantizer::compute_codes_add_centroids( } } -void refine_beam_mp( - const ResidualQuantizer& rq, - size_t n, - size_t beam_size, - const float* x, - int out_beam_size, - int32_t* out_codes, - float* out_residuals, - float* out_distances, - RefineBeamMemoryPool& pool) { - int cur_beam_size = beam_size; - - double t0 = getmillisecs(); - - // find the max_beam_size - int max_beam_size = 0; - { - int tmp_beam_size = cur_beam_size; - for (int m = 0; m < rq.M; m++) { - int K = 1 << rq.nbits[m]; - int new_beam_size = std::min(tmp_beam_size * K, out_beam_size); - tmp_beam_size = new_beam_size; - - if (max_beam_size < new_beam_size) { - max_beam_size = new_beam_size; - } - } - } - - // preallocate buffers - pool.new_codes.resize(n * max_beam_size * (rq.M + 1)); - pool.new_residuals.resize(n * max_beam_size * rq.d); - - pool.codes.resize(n * max_beam_size * (rq.M + 1)); - pool.distances.resize(n * max_beam_size); - pool.residuals.resize(n * rq.d * max_beam_size); - - for (size_t i = 0; i < n * rq.d * beam_size; i++) { - pool.residuals[i] = x[i]; - } - - // set up pointers to buffers - int32_t* __restrict codes_ptr = pool.codes.data(); - float* __restrict residuals_ptr = pool.residuals.data(); - - int32_t* __restrict new_codes_ptr = pool.new_codes.data(); - float* __restrict new_residuals_ptr = pool.new_residuals.data(); - - // index - std::unique_ptr assign_index; - if (rq.assign_index_factory) { - assign_index.reset((*rq.assign_index_factory)(rq.d)); - } else { - assign_index.reset(new IndexFlatL2(rq.d)); - } - - // main loop - size_t codes_size = 0; - size_t distances_size = 0; - size_t residuals_size = 0; - - for (int m = 0; m < rq.M; m++) { - int K = 1 << rq.nbits[m]; - - const float* __restrict codebooks_m = - rq.codebooks.data() + rq.codebook_offsets[m] * rq.d; - - const int new_beam_size = std::min(cur_beam_size * K, out_beam_size); - - codes_size = n * new_beam_size * (m + 1); - residuals_size = n * new_beam_size * rq.d; - distances_size = n * new_beam_size; - - beam_search_encode_step( - rq.d, - K, - codebooks_m, - n, - cur_beam_size, - // residuals.data(), - residuals_ptr, - m, - // codes.data(), - codes_ptr, - new_beam_size, - // new_codes.data(), - new_codes_ptr, - // new_residuals.data(), - new_residuals_ptr, - pool.distances.data(), - assign_index.get(), - rq.approx_topk_mode); - - assign_index->reset(); - - std::swap(codes_ptr, new_codes_ptr); - std::swap(residuals_ptr, new_residuals_ptr); - - cur_beam_size = new_beam_size; - - if (rq.verbose) { - float sum_distances = 0; - // for (int j = 0; j < distances.size(); j++) { - // sum_distances += distances[j]; - // } - for (int j = 0; j < distances_size; j++) { - sum_distances += pool.distances[j]; - } - - printf("[%.3f s] encode stage %d, %d bits, " - "total error %g, beam_size %d\n", - (getmillisecs() - t0) / 1000, - m, - int(rq.nbits[m]), - sum_distances, - cur_beam_size); - } - } - - if (out_codes) { - // memcpy(out_codes, codes.data(), codes.size() * sizeof(codes[0])); - memcpy(out_codes, codes_ptr, codes_size * sizeof(*codes_ptr)); - } - if (out_residuals) { - // memcpy(out_residuals, - // residuals.data(), - // residuals.size() * sizeof(residuals[0])); - memcpy(out_residuals, - residuals_ptr, - residuals_size * sizeof(*residuals_ptr)); - } - if (out_distances) { - // memcpy(out_distances, - // distances.data(), - // distances.size() * sizeof(distances[0])); - memcpy(out_distances, - pool.distances.data(), - distances_size * sizeof(pool.distances[0])); - } -} - void ResidualQuantizer::refine_beam( size_t n, size_t beam_size, @@ -987,533 +519,6 @@ void ResidualQuantizer::compute_codebook_tables() { } } -namespace { - -template -void accum_and_store_tab( - const size_t m_offset, - const float* const __restrict codebook_cross_norms, - const uint64_t* const __restrict codebook_offsets, - const int32_t* const __restrict codes_i, - const size_t b, - const size_t ldc, - const size_t K, - float* const __restrict output) { - // load pointers into registers - const float* cbs[M]; - for (size_t ij = 0; ij < M; ij++) { - const size_t code = static_cast(codes_i[b * m_offset + ij]); - cbs[ij] = &codebook_cross_norms[(codebook_offsets[ij] + code) * ldc]; - } - - // do accumulation in registers using SIMD. - // It is possible that compiler may be smart enough so that - // this manual SIMD unrolling might be unneeded. -#if defined(__AVX2__) || defined(__aarch64__) - const size_t K8 = (K / (8 * NK)) * (8 * NK); - - // process in chunks of size (8 * NK) floats - for (size_t kk = 0; kk < K8; kk += 8 * NK) { - simd8float32 regs[NK]; - for (size_t ik = 0; ik < NK; ik++) { - regs[ik].loadu(cbs[0] + kk + ik * 8); - } - - for (size_t ij = 1; ij < M; ij++) { - for (size_t ik = 0; ik < NK; ik++) { - regs[ik] += simd8float32(cbs[ij] + kk + ik * 8); - } - } - - // write the result - for (size_t ik = 0; ik < NK; ik++) { - regs[ik].storeu(output + kk + ik * 8); - } - } -#else - const size_t K8 = 0; -#endif - - // process leftovers - for (size_t kk = K8; kk < K; kk++) { - float reg = cbs[0][kk]; - for (size_t ij = 1; ij < M; ij++) { - reg += cbs[ij][kk]; - } - output[b * K + kk] = reg; - } -} - -template -void accum_and_add_tab( - const size_t m_offset, - const float* const __restrict codebook_cross_norms, - const uint64_t* const __restrict codebook_offsets, - const int32_t* const __restrict codes_i, - const size_t b, - const size_t ldc, - const size_t K, - float* const __restrict output) { - // load pointers into registers - const float* cbs[M]; - for (size_t ij = 0; ij < M; ij++) { - const size_t code = static_cast(codes_i[b * m_offset + ij]); - cbs[ij] = &codebook_cross_norms[(codebook_offsets[ij] + code) * ldc]; - } - - // do accumulation in registers using SIMD. - // It is possible that compiler may be smart enough so that - // this manual SIMD unrolling might be unneeded. -#if defined(__AVX2__) || defined(__aarch64__) - const size_t K8 = (K / (8 * NK)) * (8 * NK); - - // process in chunks of size (8 * NK) floats - for (size_t kk = 0; kk < K8; kk += 8 * NK) { - simd8float32 regs[NK]; - for (size_t ik = 0; ik < NK; ik++) { - regs[ik].loadu(cbs[0] + kk + ik * 8); - } - - for (size_t ij = 1; ij < M; ij++) { - for (size_t ik = 0; ik < NK; ik++) { - regs[ik] += simd8float32(cbs[ij] + kk + ik * 8); - } - } - - // write the result - for (size_t ik = 0; ik < NK; ik++) { - simd8float32 existing(output + kk + ik * 8); - existing += regs[ik]; - existing.storeu(output + kk + ik * 8); - } - } -#else - const size_t K8 = 0; -#endif - - // process leftovers - for (size_t kk = K8; kk < K; kk++) { - float reg = cbs[0][kk]; - for (size_t ij = 1; ij < M; ij++) { - reg += cbs[ij][kk]; - } - output[b * K + kk] += reg; - } -} - -template -void accum_and_finalize_tab( - const float* const __restrict codebook_cross_norms, - const uint64_t* const __restrict codebook_offsets, - const int32_t* const __restrict codes_i, - const size_t b, - const size_t ldc, - const size_t K, - const float* const __restrict distances_i, - const float* const __restrict cd_common, - float* const __restrict output) { - // load pointers into registers - const float* cbs[M]; - for (size_t ij = 0; ij < M; ij++) { - const size_t code = static_cast(codes_i[b * M + ij]); - cbs[ij] = &codebook_cross_norms[(codebook_offsets[ij] + code) * ldc]; - } - - // do accumulation in registers using SIMD. - // It is possible that compiler may be smart enough so that - // this manual SIMD unrolling might be unneeded. -#if defined(__AVX2__) || defined(__aarch64__) - const size_t K8 = (K / (8 * NK)) * (8 * NK); - - // process in chunks of size (8 * NK) floats - for (size_t kk = 0; kk < K8; kk += 8 * NK) { - simd8float32 regs[NK]; - for (size_t ik = 0; ik < NK; ik++) { - regs[ik].loadu(cbs[0] + kk + ik * 8); - } - - for (size_t ij = 1; ij < M; ij++) { - for (size_t ik = 0; ik < NK; ik++) { - regs[ik] += simd8float32(cbs[ij] + kk + ik * 8); - } - } - - simd8float32 two(2.0f); - for (size_t ik = 0; ik < NK; ik++) { - // cent_distances[b * K + k] = distances_i[b] + cd_common[k] - // + 2 * dp[k]; - - simd8float32 common_v(cd_common + kk + ik * 8); - common_v = fmadd(two, regs[ik], common_v); - - common_v += simd8float32(distances_i[b]); - common_v.storeu(output + b * K + kk + ik * 8); - } - } -#else - const size_t K8 = 0; -#endif - - // process leftovers - for (size_t kk = K8; kk < K; kk++) { - float reg = cbs[0][kk]; - for (size_t ij = 1; ij < M; ij++) { - reg += cbs[ij][kk]; - } - - output[b * K + kk] = distances_i[b] + cd_common[kk] + 2 * reg; - } -} - -} // namespace - -void beam_search_encode_step_tab( - size_t K, - size_t n, - size_t beam_size, // input sizes - const float* codebook_cross_norms, // size K * ldc - size_t ldc, // >= K - const uint64_t* codebook_offsets, // m - const float* query_cp, // size n * ldqc - size_t ldqc, // >= K - const float* cent_norms_i, // size K - size_t m, - const int32_t* codes, // n * beam_size * m - const float* distances, // n * beam_size - size_t new_beam_size, - int32_t* new_codes, // n * new_beam_size * (m + 1) - float* new_distances, // n * new_beam_size - ApproxTopK_mode_t approx_topk_mode) // -{ - FAISS_THROW_IF_NOT(ldc >= K); - -#pragma omp parallel for if (n > 100) schedule(dynamic) - for (int64_t i = 0; i < n; i++) { - std::vector cent_distances(beam_size * K); - std::vector cd_common(K); - - const int32_t* codes_i = codes + i * m * beam_size; - const float* query_cp_i = query_cp + i * ldqc; - const float* distances_i = distances + i * beam_size; - - for (size_t k = 0; k < K; k++) { - cd_common[k] = cent_norms_i[k] - 2 * query_cp_i[k]; - } - - /* - // This is the baseline implementation. Its primary flaw - // that it writes way too many info to the temporary buffer - // called dp. - // - // This baseline code is kept intentionally because it is easy to - // understand what an optimized version optimizes exactly. - // - for (size_t b = 0; b < beam_size; b++) { - std::vector dp(K); - - for (size_t m1 = 0; m1 < m; m1++) { - size_t c = codes_i[b * m + m1]; - const float* cb = - &codebook_cross_norms[(codebook_offsets[m1] + c) * ldc]; - fvec_add(K, cb, dp.data(), dp.data()); - } - - for (size_t k = 0; k < K; k++) { - cent_distances[b * K + k] = - distances_i[b] + cd_common[k] + 2 * dp[k]; - } - } - */ - - // An optimized implementation that avoids using a temporary buffer - // and does the accumulation in registers. - - // Compute a sum of NK AQ codes. -#define ACCUM_AND_FINALIZE_TAB(NK) \ - case NK: \ - for (size_t b = 0; b < beam_size; b++) { \ - accum_and_finalize_tab( \ - codebook_cross_norms, \ - codebook_offsets, \ - codes_i, \ - b, \ - ldc, \ - K, \ - distances_i, \ - cd_common.data(), \ - cent_distances.data()); \ - } \ - break; - - // this version contains many switch-case scenarios, but - // they won't affect branch predictor. - switch (m) { - case 0: - // trivial case - for (size_t b = 0; b < beam_size; b++) { - for (size_t k = 0; k < K; k++) { - cent_distances[b * K + k] = - distances_i[b] + cd_common[k]; - } - } - break; - - ACCUM_AND_FINALIZE_TAB(1) - ACCUM_AND_FINALIZE_TAB(2) - ACCUM_AND_FINALIZE_TAB(3) - ACCUM_AND_FINALIZE_TAB(4) - ACCUM_AND_FINALIZE_TAB(5) - ACCUM_AND_FINALIZE_TAB(6) - ACCUM_AND_FINALIZE_TAB(7) - - default: { - // m >= 8 case. - - // A temporary buffer has to be used due to the lack of - // registers. But we'll try to accumulate up to 8 AQ codes in - // registers and issue a single write operation to the buffer, - // while the baseline does no accumulation. So, the number of - // write operations to the temporary buffer is reduced 8x. - - // allocate a temporary buffer - std::vector dp(K); - - for (size_t b = 0; b < beam_size; b++) { - // Initialize it. Compute a sum of first 8 AQ codes - // because m >= 8 . - accum_and_store_tab<8, 4>( - m, - codebook_cross_norms, - codebook_offsets, - codes_i, - b, - ldc, - K, - dp.data()); - -#define ACCUM_AND_ADD_TAB(NK) \ - case NK: \ - accum_and_add_tab( \ - m, \ - codebook_cross_norms, \ - codebook_offsets + im, \ - codes_i + im, \ - b, \ - ldc, \ - K, \ - dp.data()); \ - break; - - // accumulate up to 8 additional AQ codes into - // a temporary buffer - for (size_t im = 8; im < ((m + 7) / 8) * 8; im += 8) { - size_t m_left = m - im; - if (m_left > 8) { - m_left = 8; - } - - switch (m_left) { - ACCUM_AND_ADD_TAB(1) - ACCUM_AND_ADD_TAB(2) - ACCUM_AND_ADD_TAB(3) - ACCUM_AND_ADD_TAB(4) - ACCUM_AND_ADD_TAB(5) - ACCUM_AND_ADD_TAB(6) - ACCUM_AND_ADD_TAB(7) - ACCUM_AND_ADD_TAB(8) - } - } - - // done. finalize the result - for (size_t k = 0; k < K; k++) { - cent_distances[b * K + k] = - distances_i[b] + cd_common[k] + 2 * dp[k]; - } - } - } - } - - // the optimized implementation ends here - - using C = CMax; - int32_t* new_codes_i = new_codes + i * (m + 1) * new_beam_size; - float* new_distances_i = new_distances + i * new_beam_size; - - const float* cent_distances_i = cent_distances.data(); - - // then we have to select the best results - for (int i = 0; i < new_beam_size; i++) { - new_distances_i[i] = C::neutral(); - } - std::vector perm(new_beam_size, -1); - -#define HANDLE_APPROX(NB, BD) \ - case ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B##NB##_D##BD: \ - HeapWithBuckets::bs_addn( \ - beam_size, \ - K, \ - cent_distances_i, \ - new_beam_size, \ - new_distances_i, \ - perm.data()); \ - break; - - switch (approx_topk_mode) { - HANDLE_APPROX(8, 3) - HANDLE_APPROX(8, 2) - HANDLE_APPROX(16, 2) - HANDLE_APPROX(32, 2) - default: - heap_addn( - new_beam_size, - new_distances_i, - perm.data(), - cent_distances_i, - nullptr, - beam_size * K); - break; - } - - heap_reorder(new_beam_size, new_distances_i, perm.data()); - -#undef HANDLE_APPROX - - for (int j = 0; j < new_beam_size; j++) { - int js = perm[j] / K; - int ls = perm[j] % K; - if (m > 0) { - memcpy(new_codes_i, codes_i + js * m, sizeof(*codes) * m); - } - new_codes_i[m] = ls; - new_codes_i += m + 1; - } - } -} - -// -void refine_beam_LUT_mp( - const ResidualQuantizer& rq, - size_t n, - const float* query_norms, // size n - const float* query_cp, // - int out_beam_size, - int32_t* out_codes, - float* out_distances, - RefineBeamLUTMemoryPool& pool) { - int beam_size = 1; - - double t0 = getmillisecs(); - - // find the max_beam_size - int max_beam_size = 0; - { - int tmp_beam_size = beam_size; - for (int m = 0; m < rq.M; m++) { - int K = 1 << rq.nbits[m]; - int new_beam_size = std::min(tmp_beam_size * K, out_beam_size); - tmp_beam_size = new_beam_size; - - if (max_beam_size < new_beam_size) { - max_beam_size = new_beam_size; - } - } - } - - // preallocate buffers - pool.new_codes.resize(n * max_beam_size * (rq.M + 1)); - pool.new_distances.resize(n * max_beam_size); - - pool.codes.resize(n * max_beam_size * (rq.M + 1)); - pool.distances.resize(n * max_beam_size); - - for (size_t i = 0; i < n; i++) { - pool.distances[i] = query_norms[i]; - } - - // set up pointers to buffers - int32_t* __restrict new_codes_ptr = pool.new_codes.data(); - float* __restrict new_distances_ptr = pool.new_distances.data(); - - int32_t* __restrict codes_ptr = pool.codes.data(); - float* __restrict distances_ptr = pool.distances.data(); - - // main loop - size_t codes_size = 0; - size_t distances_size = 0; - for (int m = 0; m < rq.M; m++) { - int K = 1 << rq.nbits[m]; - - // it is guaranteed that (new_beam_size <= than max_beam_size) == - // true - int new_beam_size = std::min(beam_size * K, out_beam_size); - - // std::vector new_codes(n * new_beam_size * (m + 1)); - // std::vector new_distances(n * new_beam_size); - - codes_size = n * new_beam_size * (m + 1); - distances_size = n * new_beam_size; - - beam_search_encode_step_tab( - K, - n, - beam_size, - rq.codebook_cross_products.data() + rq.codebook_offsets[m], - rq.total_codebook_size, - rq.codebook_offsets.data(), - query_cp + rq.codebook_offsets[m], - rq.total_codebook_size, - rq.cent_norms.data() + rq.codebook_offsets[m], - m, - // codes.data(), - codes_ptr, - // distances.data(), - distances_ptr, - new_beam_size, - // new_codes.data(), - new_codes_ptr, - // new_distances.data() - new_distances_ptr, - rq.approx_topk_mode); - - // codes.swap(new_codes); - std::swap(codes_ptr, new_codes_ptr); - // distances.swap(new_distances); - std::swap(distances_ptr, new_distances_ptr); - - beam_size = new_beam_size; - - if (rq.verbose) { - float sum_distances = 0; - // for (int j = 0; j < distances.size(); j++) { - // sum_distances += distances[j]; - // } - for (int j = 0; j < distances_size; j++) { - sum_distances += distances_ptr[j]; - } - printf("[%.3f s] encode stage %d, %d bits, " - "total error %g, beam_size %d\n", - (getmillisecs() - t0) / 1000, - m, - int(rq.nbits[m]), - sum_distances, - beam_size); - } - } - - if (out_codes) { - // memcpy(out_codes, codes.data(), codes.size() * sizeof(codes[0])); - memcpy(out_codes, codes_ptr, codes_size * sizeof(*codes_ptr)); - } - if (out_distances) { - // memcpy(out_distances, - // distances.data(), - // distances.size() * sizeof(distances[0])); - memcpy(out_distances, - distances_ptr, - distances_size * sizeof(*distances_ptr)); - } -} - void ResidualQuantizer::refine_beam_LUT( size_t n, const float* query_norms, // size n diff --git a/faiss/impl/ResidualQuantizer.h b/faiss/impl/ResidualQuantizer.h index 042f96d232..b439841d10 100644 --- a/faiss/impl/ResidualQuantizer.h +++ b/faiss/impl/ResidualQuantizer.h @@ -144,9 +144,7 @@ struct ResidualQuantizer : AdditiveQuantizer { */ size_t memory_per_point(int beam_size = -1) const; - /** Cross products used in codebook tables - * - * These are used to keep trak of norms of centroids. + /** Cross products used in codebook tables used for beam_LUT = 1 */ void compute_codebook_tables(); @@ -157,60 +155,4 @@ struct ResidualQuantizer : AdditiveQuantizer { std::vector cent_norms; }; -/** Encode a residual by sampling from a centroid table. - * - * This is a single encoding step the residual quantizer. - * It allows low-level access to the encoding function, exposed mainly for unit - * tests. - * - * @param n number of vectors to hanlde - * @param residuals vectors to encode, size (n, beam_size, d) - * @param cent centroids, size (K, d) - * @param beam_size input beam size - * @param m size of the codes for the previous encoding steps - * @param codes code array for the previous steps of the beam (n, - * beam_size, m) - * @param new_beam_size output beam size (should be <= K * beam_size) - * @param new_codes output codes, size (n, new_beam_size, m + 1) - * @param new_residuals output residuals, size (n, new_beam_size, d) - * @param new_distances output distances, size (n, new_beam_size) - * @param assign_index if non-NULL, will be used to perform assignment - */ -void beam_search_encode_step( - size_t d, - size_t K, - const float* cent, - size_t n, - size_t beam_size, - const float* residuals, - size_t m, - const int32_t* codes, - size_t new_beam_size, - int32_t* new_codes, - float* new_residuals, - float* new_distances, - Index* assign_index = nullptr, - ApproxTopK_mode_t approx_topk = ApproxTopK_mode_t::EXACT_TOPK); - -/** Encode a set of vectors using their dot products with the codebooks - * - */ -void beam_search_encode_step_tab( - size_t K, - size_t n, - size_t beam_size, // input sizes - const float* codebook_cross_norms, // size K * ldc - size_t ldc, // >= K - const uint64_t* codebook_offsets, // m - const float* query_cp, // size n * ldqc - size_t ldqc, // >= K - const float* cent_norms_i, // size K - size_t m, - const int32_t* codes, // n * beam_size * m - const float* distances, // n * beam_size - size_t new_beam_size, - int32_t* new_codes, // n * new_beam_size * (m + 1) - float* new_distances, // n * new_beam_size - ApproxTopK_mode_t approx_topk = ApproxTopK_mode_t::EXACT_TOPK); - }; // namespace faiss diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp index 423b22a9cc..4802e60ede 100644 --- a/faiss/impl/index_read.cpp +++ b/faiss/impl/index_read.cpp @@ -292,11 +292,17 @@ static void read_AdditiveQuantizer(AdditiveQuantizer* aq, IOReader* f) { aq->set_derived_values(); } -static void read_ResidualQuantizer(ResidualQuantizer* rq, IOReader* f) { +static void read_ResidualQuantizer( + ResidualQuantizer* rq, + IOReader* f, + int io_flags) { read_AdditiveQuantizer(rq, f); READ1(rq->train_type); READ1(rq->max_beam_size); - if (!(rq->train_type & ResidualQuantizer::Skip_codebook_tables)) { + if ((rq->train_type & ResidualQuantizer::Skip_codebook_tables) || + (io_flags & IO_FLAG_SKIP_PRECOMPUTE_TABLE)) { + // don't precompute the tables + } else { rq->compute_codebook_tables(); } } @@ -325,12 +331,13 @@ static void read_ProductAdditiveQuantizer( static void read_ProductResidualQuantizer( ProductResidualQuantizer* prq, - IOReader* f) { + IOReader* f, + int io_flags) { read_ProductAdditiveQuantizer(prq, f); for (size_t i = 0; i < prq->nsplits; i++) { auto rq = new ResidualQuantizer(); - read_ResidualQuantizer(rq, f); + read_ResidualQuantizer(rq, f, io_flags); prq->quantizers.push_back(rq); } } @@ -601,7 +608,7 @@ Index* read_index(IOReader* f, int io_flags) { if (h == fourcc("IxRQ")) { read_ResidualQuantizer_old(&idxr->rq, f); } else { - read_ResidualQuantizer(&idxr->rq, f); + read_ResidualQuantizer(&idxr->rq, f, io_flags); } READ1(idxr->code_size); READVECTOR(idxr->codes); @@ -616,7 +623,7 @@ Index* read_index(IOReader* f, int io_flags) { } else if (h == fourcc("IxPR")) { auto idxpr = new IndexProductResidualQuantizer(); read_index_header(idxpr, f); - read_ProductResidualQuantizer(&idxpr->prq, f); + read_ProductResidualQuantizer(&idxpr->prq, f, io_flags); READ1(idxpr->code_size); READVECTOR(idxpr->codes); idx = idxpr; @@ -630,8 +637,13 @@ Index* read_index(IOReader* f, int io_flags) { } else if (h == fourcc("ImRQ")) { ResidualCoarseQuantizer* idxr = new ResidualCoarseQuantizer(); read_index_header(idxr, f); - read_ResidualQuantizer(&idxr->rq, f); + read_ResidualQuantizer(&idxr->rq, f, io_flags); READ1(idxr->beam_factor); + if (io_flags & IO_FLAG_SKIP_PRECOMPUTE_TABLE) { + // then we force the beam factor to -1 + // which skips the table precomputation. + idxr->beam_factor = -1; + } idxr->set_beam_factor(idxr->beam_factor); idx = idxr; } else if ( @@ -656,13 +668,14 @@ Index* read_index(IOReader* f, int io_flags) { if (is_LSQ) { read_LocalSearchQuantizer((LocalSearchQuantizer*)idxaqfs->aq, f); } else if (is_RQ) { - read_ResidualQuantizer((ResidualQuantizer*)idxaqfs->aq, f); + read_ResidualQuantizer( + (ResidualQuantizer*)idxaqfs->aq, f, io_flags); } else if (is_PLSQ) { read_ProductLocalSearchQuantizer( (ProductLocalSearchQuantizer*)idxaqfs->aq, f); } else { read_ProductResidualQuantizer( - (ProductResidualQuantizer*)idxaqfs->aq, f); + (ProductResidualQuantizer*)idxaqfs->aq, f, io_flags); } READ1(idxaqfs->implem); @@ -704,13 +717,13 @@ Index* read_index(IOReader* f, int io_flags) { if (is_LSQ) { read_LocalSearchQuantizer((LocalSearchQuantizer*)ivaqfs->aq, f); } else if (is_RQ) { - read_ResidualQuantizer((ResidualQuantizer*)ivaqfs->aq, f); + read_ResidualQuantizer((ResidualQuantizer*)ivaqfs->aq, f, io_flags); } else if (is_PLSQ) { read_ProductLocalSearchQuantizer( (ProductLocalSearchQuantizer*)ivaqfs->aq, f); } else { read_ProductResidualQuantizer( - (ProductResidualQuantizer*)ivaqfs->aq, f); + (ProductResidualQuantizer*)ivaqfs->aq, f, io_flags); } READ1(ivaqfs->by_residual); @@ -832,13 +845,13 @@ Index* read_index(IOReader* f, int io_flags) { if (is_LSQ) { read_LocalSearchQuantizer((LocalSearchQuantizer*)iva->aq, f); } else if (is_RQ) { - read_ResidualQuantizer((ResidualQuantizer*)iva->aq, f); + read_ResidualQuantizer((ResidualQuantizer*)iva->aq, f, io_flags); } else if (is_PLSQ) { read_ProductLocalSearchQuantizer( (ProductLocalSearchQuantizer*)iva->aq, f); } else { read_ProductResidualQuantizer( - (ProductResidualQuantizer*)iva->aq, f); + (ProductResidualQuantizer*)iva->aq, f, io_flags); } READ1(iva->by_residual); READ1(iva->use_precomputed_table); diff --git a/faiss/impl/residual_quantizer_encode_steps.cpp b/faiss/impl/residual_quantizer_encode_steps.cpp new file mode 100644 index 0000000000..f66285d9a5 --- /dev/null +++ b/faiss/impl/residual_quantizer_encode_steps.cpp @@ -0,0 +1,955 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +extern "C" { + +// general matrix multiplication +int sgemm_( + const char* transa, + const char* transb, + FINTEGER* m, + FINTEGER* n, + FINTEGER* k, + const float* alpha, + const float* a, + FINTEGER* lda, + const float* b, + FINTEGER* ldb, + float* beta, + float* c, + FINTEGER* ldc); +} + +namespace faiss { + +/******************************************************************** + * Basic routines + ********************************************************************/ + +namespace { + +template +void accum_and_store_tab( + const size_t m_offset, + const float* const __restrict codebook_cross_norms, + const uint64_t* const __restrict codebook_offsets, + const int32_t* const __restrict codes_i, + const size_t b, + const size_t ldc, + const size_t K, + float* const __restrict output) { + // load pointers into registers + const float* cbs[M]; + for (size_t ij = 0; ij < M; ij++) { + const size_t code = static_cast(codes_i[b * m_offset + ij]); + cbs[ij] = &codebook_cross_norms[(codebook_offsets[ij] + code) * ldc]; + } + + // do accumulation in registers using SIMD. + // It is possible that compiler may be smart enough so that + // this manual SIMD unrolling might be unneeded. +#if defined(__AVX2__) || defined(__aarch64__) + const size_t K8 = (K / (8 * NK)) * (8 * NK); + + // process in chunks of size (8 * NK) floats + for (size_t kk = 0; kk < K8; kk += 8 * NK) { + simd8float32 regs[NK]; + for (size_t ik = 0; ik < NK; ik++) { + regs[ik].loadu(cbs[0] + kk + ik * 8); + } + + for (size_t ij = 1; ij < M; ij++) { + for (size_t ik = 0; ik < NK; ik++) { + regs[ik] += simd8float32(cbs[ij] + kk + ik * 8); + } + } + + // write the result + for (size_t ik = 0; ik < NK; ik++) { + regs[ik].storeu(output + kk + ik * 8); + } + } +#else + const size_t K8 = 0; +#endif + + // process leftovers + for (size_t kk = K8; kk < K; kk++) { + float reg = cbs[0][kk]; + for (size_t ij = 1; ij < M; ij++) { + reg += cbs[ij][kk]; + } + output[b * K + kk] = reg; + } +} + +template +void accum_and_add_tab( + const size_t m_offset, + const float* const __restrict codebook_cross_norms, + const uint64_t* const __restrict codebook_offsets, + const int32_t* const __restrict codes_i, + const size_t b, + const size_t ldc, + const size_t K, + float* const __restrict output) { + // load pointers into registers + const float* cbs[M]; + for (size_t ij = 0; ij < M; ij++) { + const size_t code = static_cast(codes_i[b * m_offset + ij]); + cbs[ij] = &codebook_cross_norms[(codebook_offsets[ij] + code) * ldc]; + } + + // do accumulation in registers using SIMD. + // It is possible that compiler may be smart enough so that + // this manual SIMD unrolling might be unneeded. +#if defined(__AVX2__) || defined(__aarch64__) + const size_t K8 = (K / (8 * NK)) * (8 * NK); + + // process in chunks of size (8 * NK) floats + for (size_t kk = 0; kk < K8; kk += 8 * NK) { + simd8float32 regs[NK]; + for (size_t ik = 0; ik < NK; ik++) { + regs[ik].loadu(cbs[0] + kk + ik * 8); + } + + for (size_t ij = 1; ij < M; ij++) { + for (size_t ik = 0; ik < NK; ik++) { + regs[ik] += simd8float32(cbs[ij] + kk + ik * 8); + } + } + + // write the result + for (size_t ik = 0; ik < NK; ik++) { + simd8float32 existing(output + kk + ik * 8); + existing += regs[ik]; + existing.storeu(output + kk + ik * 8); + } + } +#else + const size_t K8 = 0; +#endif + + // process leftovers + for (size_t kk = K8; kk < K; kk++) { + float reg = cbs[0][kk]; + for (size_t ij = 1; ij < M; ij++) { + reg += cbs[ij][kk]; + } + output[b * K + kk] += reg; + } +} + +template +void accum_and_finalize_tab( + const float* const __restrict codebook_cross_norms, + const uint64_t* const __restrict codebook_offsets, + const int32_t* const __restrict codes_i, + const size_t b, + const size_t ldc, + const size_t K, + const float* const __restrict distances_i, + const float* const __restrict cd_common, + float* const __restrict output) { + // load pointers into registers + const float* cbs[M]; + for (size_t ij = 0; ij < M; ij++) { + const size_t code = static_cast(codes_i[b * M + ij]); + cbs[ij] = &codebook_cross_norms[(codebook_offsets[ij] + code) * ldc]; + } + + // do accumulation in registers using SIMD. + // It is possible that compiler may be smart enough so that + // this manual SIMD unrolling might be unneeded. +#if defined(__AVX2__) || defined(__aarch64__) + const size_t K8 = (K / (8 * NK)) * (8 * NK); + + // process in chunks of size (8 * NK) floats + for (size_t kk = 0; kk < K8; kk += 8 * NK) { + simd8float32 regs[NK]; + for (size_t ik = 0; ik < NK; ik++) { + regs[ik].loadu(cbs[0] + kk + ik * 8); + } + + for (size_t ij = 1; ij < M; ij++) { + for (size_t ik = 0; ik < NK; ik++) { + regs[ik] += simd8float32(cbs[ij] + kk + ik * 8); + } + } + + simd8float32 two(2.0f); + for (size_t ik = 0; ik < NK; ik++) { + // cent_distances[b * K + k] = distances_i[b] + cd_common[k] + // + 2 * dp[k]; + + simd8float32 common_v(cd_common + kk + ik * 8); + common_v = fmadd(two, regs[ik], common_v); + + common_v += simd8float32(distances_i[b]); + common_v.storeu(output + b * K + kk + ik * 8); + } + } +#else + const size_t K8 = 0; +#endif + + // process leftovers + for (size_t kk = K8; kk < K; kk++) { + float reg = cbs[0][kk]; + for (size_t ij = 1; ij < M; ij++) { + reg += cbs[ij][kk]; + } + + output[b * K + kk] = distances_i[b] + cd_common[kk] + 2 * reg; + } +} + +} // anonymous namespace + +/******************************************************************** + * Single encoding step + ********************************************************************/ + +void beam_search_encode_step( + size_t d, + size_t K, + const float* cent, /// size (K, d) + size_t n, + size_t beam_size, + const float* residuals, /// size (n, beam_size, d) + size_t m, + const int32_t* codes, /// size (n, beam_size, m) + size_t new_beam_size, + int32_t* new_codes, /// size (n, new_beam_size, m + 1) + float* new_residuals, /// size (n, new_beam_size, d) + float* new_distances, /// size (n, new_beam_size) + Index* assign_index, + ApproxTopK_mode_t approx_topk_mode) { + // we have to fill in the whole output matrix + FAISS_THROW_IF_NOT(new_beam_size <= beam_size * K); + + std::vector cent_distances; + std::vector cent_ids; + + if (assign_index) { + // search beam_size distances per query + FAISS_THROW_IF_NOT(assign_index->d == d); + cent_distances.resize(n * beam_size * new_beam_size); + cent_ids.resize(n * beam_size * new_beam_size); + if (assign_index->ntotal != 0) { + // then we assume the codebooks are already added to the index + FAISS_THROW_IF_NOT(assign_index->ntotal == K); + } else { + assign_index->add(K, cent); + } + + // printf("beam_search_encode_step -- mem usage %zd\n", + // get_mem_usage_kb()); + assign_index->search( + n * beam_size, + residuals, + new_beam_size, + cent_distances.data(), + cent_ids.data()); + } else { + // do one big distance computation + cent_distances.resize(n * beam_size * K); + pairwise_L2sqr( + d, n * beam_size, residuals, K, cent, cent_distances.data()); + } + InterruptCallback::check(); + +#pragma omp parallel for if (n > 100) + for (int64_t i = 0; i < n; i++) { + const int32_t* codes_i = codes + i * m * beam_size; + int32_t* new_codes_i = new_codes + i * (m + 1) * new_beam_size; + const float* residuals_i = residuals + i * d * beam_size; + float* new_residuals_i = new_residuals + i * d * new_beam_size; + + float* new_distances_i = new_distances + i * new_beam_size; + using C = CMax; + + if (assign_index) { + const float* cent_distances_i = + cent_distances.data() + i * beam_size * new_beam_size; + const idx_t* cent_ids_i = + cent_ids.data() + i * beam_size * new_beam_size; + + // here we could be a tad more efficient by merging sorted arrays + for (int i = 0; i < new_beam_size; i++) { + new_distances_i[i] = C::neutral(); + } + std::vector perm(new_beam_size, -1); + heap_addn( + new_beam_size, + new_distances_i, + perm.data(), + cent_distances_i, + nullptr, + beam_size * new_beam_size); + heap_reorder(new_beam_size, new_distances_i, perm.data()); + + for (int j = 0; j < new_beam_size; j++) { + int js = perm[j] / new_beam_size; + int ls = cent_ids_i[perm[j]]; + if (m > 0) { + memcpy(new_codes_i, codes_i + js * m, sizeof(*codes) * m); + } + new_codes_i[m] = ls; + new_codes_i += m + 1; + fvec_sub( + d, + residuals_i + js * d, + cent + ls * d, + new_residuals_i); + new_residuals_i += d; + } + + } else { + const float* cent_distances_i = + cent_distances.data() + i * beam_size * K; + // then we have to select the best results + for (int i = 0; i < new_beam_size; i++) { + new_distances_i[i] = C::neutral(); + } + std::vector perm(new_beam_size, -1); + +#define HANDLE_APPROX(NB, BD) \ + case ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B##NB##_D##BD: \ + HeapWithBuckets::bs_addn( \ + beam_size, \ + K, \ + cent_distances_i, \ + new_beam_size, \ + new_distances_i, \ + perm.data()); \ + break; + + switch (approx_topk_mode) { + HANDLE_APPROX(8, 3) + HANDLE_APPROX(8, 2) + HANDLE_APPROX(16, 2) + HANDLE_APPROX(32, 2) + default: + heap_addn( + new_beam_size, + new_distances_i, + perm.data(), + cent_distances_i, + nullptr, + beam_size * K); + } + heap_reorder(new_beam_size, new_distances_i, perm.data()); + +#undef HANDLE_APPROX + + for (int j = 0; j < new_beam_size; j++) { + int js = perm[j] / K; + int ls = perm[j] % K; + if (m > 0) { + memcpy(new_codes_i, codes_i + js * m, sizeof(*codes) * m); + } + new_codes_i[m] = ls; + new_codes_i += m + 1; + fvec_sub( + d, + residuals_i + js * d, + cent + ls * d, + new_residuals_i); + new_residuals_i += d; + } + } + } +} + +// exposed in the faiss namespace +void beam_search_encode_step_tab( + size_t K, + size_t n, + size_t beam_size, // input sizes + const float* codebook_cross_norms, // size K * ldc + size_t ldc, // >= K + const uint64_t* codebook_offsets, // m + const float* query_cp, // size n * ldqc + size_t ldqc, // >= K + const float* cent_norms_i, // size K + size_t m, + const int32_t* codes, // n * beam_size * m + const float* distances, // n * beam_size + size_t new_beam_size, + int32_t* new_codes, // n * new_beam_size * (m + 1) + float* new_distances, // n * new_beam_size + ApproxTopK_mode_t approx_topk_mode) // +{ + FAISS_THROW_IF_NOT(ldc >= K); + +#pragma omp parallel for if (n > 100) schedule(dynamic) + for (int64_t i = 0; i < n; i++) { + std::vector cent_distances(beam_size * K); + std::vector cd_common(K); + + const int32_t* codes_i = codes + i * m * beam_size; + const float* query_cp_i = query_cp + i * ldqc; + const float* distances_i = distances + i * beam_size; + + for (size_t k = 0; k < K; k++) { + cd_common[k] = cent_norms_i[k] - 2 * query_cp_i[k]; + } + + /* + // This is the baseline implementation. Its primary flaw + // that it writes way too many info to the temporary buffer + // called dp. + // + // This baseline code is kept intentionally because it is easy to + // understand what an optimized version optimizes exactly. + // + for (size_t b = 0; b < beam_size; b++) { + std::vector dp(K); + + for (size_t m1 = 0; m1 < m; m1++) { + size_t c = codes_i[b * m + m1]; + const float* cb = + &codebook_cross_norms[(codebook_offsets[m1] + c) * ldc]; + fvec_add(K, cb, dp.data(), dp.data()); + } + + for (size_t k = 0; k < K; k++) { + cent_distances[b * K + k] = + distances_i[b] + cd_common[k] + 2 * dp[k]; + } + } + */ + + // An optimized implementation that avoids using a temporary buffer + // and does the accumulation in registers. + + // Compute a sum of NK AQ codes. +#define ACCUM_AND_FINALIZE_TAB(NK) \ + case NK: \ + for (size_t b = 0; b < beam_size; b++) { \ + accum_and_finalize_tab( \ + codebook_cross_norms, \ + codebook_offsets, \ + codes_i, \ + b, \ + ldc, \ + K, \ + distances_i, \ + cd_common.data(), \ + cent_distances.data()); \ + } \ + break; + + // this version contains many switch-case scenarios, but + // they won't affect branch predictor. + switch (m) { + case 0: + // trivial case + for (size_t b = 0; b < beam_size; b++) { + for (size_t k = 0; k < K; k++) { + cent_distances[b * K + k] = + distances_i[b] + cd_common[k]; + } + } + break; + + ACCUM_AND_FINALIZE_TAB(1) + ACCUM_AND_FINALIZE_TAB(2) + ACCUM_AND_FINALIZE_TAB(3) + ACCUM_AND_FINALIZE_TAB(4) + ACCUM_AND_FINALIZE_TAB(5) + ACCUM_AND_FINALIZE_TAB(6) + ACCUM_AND_FINALIZE_TAB(7) + + default: { + // m >= 8 case. + + // A temporary buffer has to be used due to the lack of + // registers. But we'll try to accumulate up to 8 AQ codes in + // registers and issue a single write operation to the buffer, + // while the baseline does no accumulation. So, the number of + // write operations to the temporary buffer is reduced 8x. + + // allocate a temporary buffer + std::vector dp(K); + + for (size_t b = 0; b < beam_size; b++) { + // Initialize it. Compute a sum of first 8 AQ codes + // because m >= 8 . + accum_and_store_tab<8, 4>( + m, + codebook_cross_norms, + codebook_offsets, + codes_i, + b, + ldc, + K, + dp.data()); + +#define ACCUM_AND_ADD_TAB(NK) \ + case NK: \ + accum_and_add_tab( \ + m, \ + codebook_cross_norms, \ + codebook_offsets + im, \ + codes_i + im, \ + b, \ + ldc, \ + K, \ + dp.data()); \ + break; + + // accumulate up to 8 additional AQ codes into + // a temporary buffer + for (size_t im = 8; im < ((m + 7) / 8) * 8; im += 8) { + size_t m_left = m - im; + if (m_left > 8) { + m_left = 8; + } + + switch (m_left) { + ACCUM_AND_ADD_TAB(1) + ACCUM_AND_ADD_TAB(2) + ACCUM_AND_ADD_TAB(3) + ACCUM_AND_ADD_TAB(4) + ACCUM_AND_ADD_TAB(5) + ACCUM_AND_ADD_TAB(6) + ACCUM_AND_ADD_TAB(7) + ACCUM_AND_ADD_TAB(8) + } + } + + // done. finalize the result + for (size_t k = 0; k < K; k++) { + cent_distances[b * K + k] = + distances_i[b] + cd_common[k] + 2 * dp[k]; + } + } + } + } + + // the optimized implementation ends here + + using C = CMax; + int32_t* new_codes_i = new_codes + i * (m + 1) * new_beam_size; + float* new_distances_i = new_distances + i * new_beam_size; + + const float* cent_distances_i = cent_distances.data(); + + // then we have to select the best results + for (int i = 0; i < new_beam_size; i++) { + new_distances_i[i] = C::neutral(); + } + std::vector perm(new_beam_size, -1); + +#define HANDLE_APPROX(NB, BD) \ + case ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B##NB##_D##BD: \ + HeapWithBuckets::bs_addn( \ + beam_size, \ + K, \ + cent_distances_i, \ + new_beam_size, \ + new_distances_i, \ + perm.data()); \ + break; + + switch (approx_topk_mode) { + HANDLE_APPROX(8, 3) + HANDLE_APPROX(8, 2) + HANDLE_APPROX(16, 2) + HANDLE_APPROX(32, 2) + default: + heap_addn( + new_beam_size, + new_distances_i, + perm.data(), + cent_distances_i, + nullptr, + beam_size * K); + break; + } + + heap_reorder(new_beam_size, new_distances_i, perm.data()); + +#undef HANDLE_APPROX + + for (int j = 0; j < new_beam_size; j++) { + int js = perm[j] / K; + int ls = perm[j] % K; + if (m > 0) { + memcpy(new_codes_i, codes_i + js * m, sizeof(*codes) * m); + } + new_codes_i[m] = ls; + new_codes_i += m + 1; + } + } +} + +/******************************************************************** + * Multiple encoding steps + ********************************************************************/ + +namespace rq_encode_steps { + +void refine_beam_mp( + const ResidualQuantizer& rq, + size_t n, + size_t beam_size, + const float* x, + int out_beam_size, + int32_t* out_codes, + float* out_residuals, + float* out_distances, + RefineBeamMemoryPool& pool) { + int cur_beam_size = beam_size; + + double t0 = getmillisecs(); + + // find the max_beam_size + int max_beam_size = 0; + { + int tmp_beam_size = cur_beam_size; + for (int m = 0; m < rq.M; m++) { + int K = 1 << rq.nbits[m]; + int new_beam_size = std::min(tmp_beam_size * K, out_beam_size); + tmp_beam_size = new_beam_size; + + if (max_beam_size < new_beam_size) { + max_beam_size = new_beam_size; + } + } + } + + // preallocate buffers + pool.new_codes.resize(n * max_beam_size * (rq.M + 1)); + pool.new_residuals.resize(n * max_beam_size * rq.d); + + pool.codes.resize(n * max_beam_size * (rq.M + 1)); + pool.distances.resize(n * max_beam_size); + pool.residuals.resize(n * rq.d * max_beam_size); + + for (size_t i = 0; i < n * rq.d * beam_size; i++) { + pool.residuals[i] = x[i]; + } + + // set up pointers to buffers + int32_t* __restrict codes_ptr = pool.codes.data(); + float* __restrict residuals_ptr = pool.residuals.data(); + + int32_t* __restrict new_codes_ptr = pool.new_codes.data(); + float* __restrict new_residuals_ptr = pool.new_residuals.data(); + + // index + std::unique_ptr assign_index; + if (rq.assign_index_factory) { + assign_index.reset((*rq.assign_index_factory)(rq.d)); + } else { + assign_index.reset(new IndexFlatL2(rq.d)); + } + + // main loop + size_t codes_size = 0; + size_t distances_size = 0; + size_t residuals_size = 0; + + for (int m = 0; m < rq.M; m++) { + int K = 1 << rq.nbits[m]; + + const float* __restrict codebooks_m = + rq.codebooks.data() + rq.codebook_offsets[m] * rq.d; + + const int new_beam_size = std::min(cur_beam_size * K, out_beam_size); + + codes_size = n * new_beam_size * (m + 1); + residuals_size = n * new_beam_size * rq.d; + distances_size = n * new_beam_size; + + beam_search_encode_step( + rq.d, + K, + codebooks_m, + n, + cur_beam_size, + residuals_ptr, + m, + codes_ptr, + new_beam_size, + new_codes_ptr, + new_residuals_ptr, + pool.distances.data(), + assign_index.get(), + rq.approx_topk_mode); + + assign_index->reset(); + + std::swap(codes_ptr, new_codes_ptr); + std::swap(residuals_ptr, new_residuals_ptr); + + cur_beam_size = new_beam_size; + + if (rq.verbose) { + float sum_distances = 0; + for (int j = 0; j < distances_size; j++) { + sum_distances += pool.distances[j]; + } + + printf("[%.3f s] encode stage %d, %d bits, " + "total error %g, beam_size %d\n", + (getmillisecs() - t0) / 1000, + m, + int(rq.nbits[m]), + sum_distances, + cur_beam_size); + } + } + + if (out_codes) { + memcpy(out_codes, codes_ptr, codes_size * sizeof(*codes_ptr)); + } + if (out_residuals) { + memcpy(out_residuals, + residuals_ptr, + residuals_size * sizeof(*residuals_ptr)); + } + if (out_distances) { + memcpy(out_distances, + pool.distances.data(), + distances_size * sizeof(pool.distances[0])); + } +} + +void refine_beam_LUT_mp( + const ResidualQuantizer& rq, + size_t n, + const float* query_norms, // size n + const float* query_cp, // + int out_beam_size, + int32_t* out_codes, + float* out_distances, + RefineBeamLUTMemoryPool& pool) { + int beam_size = 1; + + double t0 = getmillisecs(); + + // find the max_beam_size + int max_beam_size = 0; + { + int tmp_beam_size = beam_size; + for (int m = 0; m < rq.M; m++) { + int K = 1 << rq.nbits[m]; + int new_beam_size = std::min(tmp_beam_size * K, out_beam_size); + tmp_beam_size = new_beam_size; + + if (max_beam_size < new_beam_size) { + max_beam_size = new_beam_size; + } + } + } + + // preallocate buffers + pool.new_codes.resize(n * max_beam_size * (rq.M + 1)); + pool.new_distances.resize(n * max_beam_size); + + pool.codes.resize(n * max_beam_size * (rq.M + 1)); + pool.distances.resize(n * max_beam_size); + + for (size_t i = 0; i < n; i++) { + pool.distances[i] = query_norms[i]; + } + + // set up pointers to buffers + int32_t* __restrict new_codes_ptr = pool.new_codes.data(); + float* __restrict new_distances_ptr = pool.new_distances.data(); + + int32_t* __restrict codes_ptr = pool.codes.data(); + float* __restrict distances_ptr = pool.distances.data(); + + // main loop + size_t codes_size = 0; + size_t distances_size = 0; + for (int m = 0; m < rq.M; m++) { + int K = 1 << rq.nbits[m]; + + // it is guaranteed that (new_beam_size <= max_beam_size) + int new_beam_size = std::min(beam_size * K, out_beam_size); + + codes_size = n * new_beam_size * (m + 1); + distances_size = n * new_beam_size; + + beam_search_encode_step_tab( + K, + n, + beam_size, + rq.codebook_cross_products.data() + rq.codebook_offsets[m], + rq.total_codebook_size, + rq.codebook_offsets.data(), + query_cp + rq.codebook_offsets[m], + rq.total_codebook_size, + rq.cent_norms.data() + rq.codebook_offsets[m], + m, + codes_ptr, + distances_ptr, + new_beam_size, + new_codes_ptr, + new_distances_ptr, + rq.approx_topk_mode); + + std::swap(codes_ptr, new_codes_ptr); + std::swap(distances_ptr, new_distances_ptr); + + beam_size = new_beam_size; + + if (rq.verbose) { + float sum_distances = 0; + for (int j = 0; j < distances_size; j++) { + sum_distances += distances_ptr[j]; + } + printf("[%.3f s] encode stage %d, %d bits, " + "total error %g, beam_size %d\n", + (getmillisecs() - t0) / 1000, + m, + int(rq.nbits[m]), + sum_distances, + beam_size); + } + } + + if (out_codes) { + memcpy(out_codes, codes_ptr, codes_size * sizeof(*codes_ptr)); + } + if (out_distances) { + memcpy(out_distances, + distances_ptr, + distances_size * sizeof(*distances_ptr)); + } +} + +// this is for use_beam_LUT == 0 +void compute_codes_add_centroids_mp_lut0( + const ResidualQuantizer& rq, + const float* x, + uint8_t* codes_out, + size_t n, + const float* centroids, + ComputeCodesAddCentroidsLUT0MemoryPool& pool) { + pool.codes.resize(rq.max_beam_size * rq.M * n); + pool.distances.resize(rq.max_beam_size * n); + + pool.residuals.resize(rq.max_beam_size * n * rq.d); + + refine_beam_mp( + rq, + n, + 1, + x, + rq.max_beam_size, + pool.codes.data(), + pool.residuals.data(), + pool.distances.data(), + pool.refine_beam_pool); + + if (rq.search_type == ResidualQuantizer::ST_norm_float || + rq.search_type == ResidualQuantizer::ST_norm_qint8 || + rq.search_type == ResidualQuantizer::ST_norm_qint4) { + pool.norms.resize(n); + // recover the norms of reconstruction as + // || original_vector - residual ||^2 + for (size_t i = 0; i < n; i++) { + pool.norms[i] = fvec_L2sqr( + x + i * rq.d, + pool.residuals.data() + i * rq.max_beam_size * rq.d, + rq.d); + } + } + + // pack only the first code of the beam + // (hence the ld_codes=M * max_beam_size) + rq.pack_codes( + n, + pool.codes.data(), + codes_out, + rq.M * rq.max_beam_size, + (pool.norms.size() > 0) ? pool.norms.data() : nullptr, + centroids); +} + +// use_beam_LUT == 1 +void compute_codes_add_centroids_mp_lut1( + const ResidualQuantizer& rq, + const float* x, + uint8_t* codes_out, + size_t n, + const float* centroids, + ComputeCodesAddCentroidsLUT1MemoryPool& pool) { + // + pool.codes.resize(rq.max_beam_size * rq.M * n); + pool.distances.resize(rq.max_beam_size * n); + + FAISS_THROW_IF_NOT_MSG( + rq.codebook_cross_products.size() == + rq.total_codebook_size * rq.total_codebook_size, + "call compute_codebook_tables first"); + + pool.query_norms.resize(n); + fvec_norms_L2sqr(pool.query_norms.data(), x, rq.d, n); + + pool.query_cp.resize(n * rq.total_codebook_size); + { + FINTEGER ti = rq.total_codebook_size, di = rq.d, ni = n; + float zero = 0, one = 1; + sgemm_("Transposed", + "Not transposed", + &ti, + &ni, + &di, + &one, + rq.codebooks.data(), + &di, + x, + &di, + &zero, + pool.query_cp.data(), + &ti); + } + + refine_beam_LUT_mp( + rq, + n, + pool.query_norms.data(), + pool.query_cp.data(), + rq.max_beam_size, + pool.codes.data(), + pool.distances.data(), + pool.refine_beam_lut_pool); + + // pack only the first code of the beam + // (hence the ld_codes=M * max_beam_size) + rq.pack_codes( + n, + pool.codes.data(), + codes_out, + rq.M * rq.max_beam_size, + nullptr, + centroids); +} + +} // namespace rq_encode_steps + +} // namespace faiss diff --git a/faiss/impl/residual_quantizer_encode_steps.h b/faiss/impl/residual_quantizer_encode_steps.h new file mode 100644 index 0000000000..add101b477 --- /dev/null +++ b/faiss/impl/residual_quantizer_encode_steps.h @@ -0,0 +1,172 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include +#include + +namespace faiss { + +/******************************************************************** + * Single step of encoding + ********************************************************************/ + +/** Encode a residual by sampling from a centroid table. + * + * This is a single encoding step the residual quantizer. + * It allows low-level access to the encoding function, exposed mainly for unit + * tests. + * + * @param n number of vectors to hanlde + * @param residuals vectors to encode, size (n, beam_size, d) + * @param cent centroids, size (K, d) + * @param beam_size input beam size + * @param m size of the codes for the previous encoding steps + * @param codes code array for the previous steps of the beam (n, + * beam_size, m) + * @param new_beam_size output beam size (should be <= K * beam_size) + * @param new_codes output codes, size (n, new_beam_size, m + 1) + * @param new_residuals output residuals, size (n, new_beam_size, d) + * @param new_distances output distances, size (n, new_beam_size) + * @param assign_index if non-NULL, will be used to perform assignment + */ +void beam_search_encode_step( + size_t d, + size_t K, + const float* cent, + size_t n, + size_t beam_size, + const float* residuals, + size_t m, + const int32_t* codes, + size_t new_beam_size, + int32_t* new_codes, + float* new_residuals, + float* new_distances, + Index* assign_index = nullptr, + ApproxTopK_mode_t approx_topk = ApproxTopK_mode_t::EXACT_TOPK); + +/** Encode a set of vectors using their dot products with the codebooks + * + * @param K number of vectors in the codebook + * @param n nb of vectors to encode + * @param beam_size input beam size + * @param codebook_cross_norms inner product of this codebook with the m + * previously encoded codebooks + * @param codebook_offsets offsets into codebook_cross_norms for each + * previous codebook + * @param query_cp dot products of query vectors with ??? + * @param cent_norms_i norms of centroids + */ +void beam_search_encode_step_tab( + size_t K, + size_t n, + size_t beam_size, // input sizes + const float* codebook_cross_norms, // size K * ldc + size_t ldc, // >= K + const uint64_t* codebook_offsets, // m + const float* query_cp, // size n * ldqc + size_t ldqc, // >= K + const float* cent_norms_i, // size K + size_t m, + const int32_t* codes, // n * beam_size * m + const float* distances, // n * beam_size + size_t new_beam_size, + int32_t* new_codes, // n * new_beam_size * (m + 1) + float* new_distances, // n * new_beam_size + ApproxTopK_mode_t approx_topk = ApproxTopK_mode_t::EXACT_TOPK); + +/******************************************************************** + * Multiple encoding steps + ********************************************************************/ + +struct ResidualQuantizer; + +namespace rq_encode_steps { + +// Preallocated memory chunk for refine_beam_mp() call +struct RefineBeamMemoryPool { + std::vector new_codes; + std::vector new_residuals; + + std::vector residuals; + std::vector codes; + std::vector distances; +}; + +void refine_beam_mp( + const ResidualQuantizer& rq, + size_t n, + size_t beam_size, + const float* x, + int out_beam_size, + int32_t* out_codes, + float* out_residuals, + float* out_distances, + RefineBeamMemoryPool& pool); + +// Preallocated memory chunk for refine_beam_LUT_mp() call +struct RefineBeamLUTMemoryPool { + std::vector new_codes; + std::vector new_distances; + + std::vector codes; + std::vector distances; +}; + +void refine_beam_LUT_mp( + const ResidualQuantizer& rq, + size_t n, + const float* query_norms, // size n + const float* query_cp, // + int out_beam_size, + int32_t* out_codes, + float* out_distances, + RefineBeamLUTMemoryPool& pool); + +// this is for use_beam_LUT == 0 in compute_codes_add_centroids_mp_lut0() call +struct ComputeCodesAddCentroidsLUT0MemoryPool { + std::vector codes; + std::vector norms; + std::vector distances; + std::vector residuals; + RefineBeamMemoryPool refine_beam_pool; +}; + +void compute_codes_add_centroids_mp_lut0( + const ResidualQuantizer& rq, + const float* x, + uint8_t* codes_out, + size_t n, + const float* centroids, + ComputeCodesAddCentroidsLUT0MemoryPool& pool); + +// this is for use_beam_LUT == 1 in compute_codes_add_centroids_mp_lut1() call +struct ComputeCodesAddCentroidsLUT1MemoryPool { + std::vector codes; + std::vector distances; + std::vector query_norms; + std::vector query_cp; + std::vector residuals; + RefineBeamLUTMemoryPool refine_beam_lut_pool; +}; + +void compute_codes_add_centroids_mp_lut1( + const ResidualQuantizer& rq, + const float* x, + uint8_t* codes_out, + size_t n, + const float* centroids, + ComputeCodesAddCentroidsLUT1MemoryPool& pool); + +} // namespace rq_encode_steps + +} // namespace faiss diff --git a/faiss/python/__init__.py b/faiss/python/__init__.py index d650033096..427cb31625 100644 --- a/faiss/python/__init__.py +++ b/faiss/python/__init__.py @@ -298,10 +298,10 @@ def serialize_index(index): return vector_to_array(writer.data) -def deserialize_index(data): +def deserialize_index(data, io_flags=0): reader = VectorIOReader() copy_array_to_vector(data, reader.data) - return read_index(reader) + return read_index(reader, io_flags) def serialize_index_binary(index): diff --git a/faiss/python/swigfaiss.swig b/faiss/python/swigfaiss.swig index 852690622b..3d6f94604a 100644 --- a/faiss/python/swigfaiss.swig +++ b/faiss/python/swigfaiss.swig @@ -137,6 +137,8 @@ typedef uint64_t size_t; #include #include #include +#include + #include #include #include @@ -415,6 +417,7 @@ void gpu_sync_all_devices() %include %include %include +%include %include %include %include diff --git a/tests/test_residual_quantizer.py b/tests/test_residual_quantizer.py index bb853c09d5..e2330fcb9e 100644 --- a/tests/test_residual_quantizer.py +++ b/tests/test_residual_quantizer.py @@ -642,15 +642,13 @@ def test_rcq_LUT(self): np.testing.assert_array_almost_equal(CDref, CDnew, decimal=5) np.testing.assert_array_equal(CIref, CInew) - def test_norms_oom(self): - "check if allocating too large norms tables raises an exception" - index = faiss.index_factory(32, "RQ20x8") - try: - index.train(np.zeros((100, 32), dtype="float32")) - except RuntimeError: - pass # ok - else: - self.assertFalse() + # check that you can load the index without computing the tables + quantizer.set_beam_factor(2.0) + self.assertNotEqual(quantizer.rq.codebook_cross_products.size(), 0) + quantizer3 = faiss.deserialize_index( + faiss.serialize_index(quantizer), faiss.IO_FLAG_SKIP_PRECOMPUTE_TABLE) + self.assertEqual(quantizer3.rq.codebook_cross_products.size(), 0) + CD3, CI3 = quantizer3.search(ds.get_queries(), 10) ###########################################################