diff --git a/faiss/IndexFastScan.cpp b/faiss/IndexFastScan.cpp index 138840ff34..ac0672cd99 100644 --- a/faiss/IndexFastScan.cpp +++ b/faiss/IndexFastScan.cpp @@ -323,7 +323,7 @@ void IndexFastScan::search_dispatch_implem( } } else { // explicitly slice over threads -#pragma omp parallel for num_threads(nt) +#pragma omp parallel for num_threads(num_omp_threads) for (int slice = 0; slice < nt; slice++) { idx_t i0 = n * slice / nt; idx_t i1 = n * (slice + 1) / nt; diff --git a/faiss/IndexIVF.cpp b/faiss/IndexIVF.cpp index 4e70220b37..3efd5b8fea 100644 --- a/faiss/IndexIVF.cpp +++ b/faiss/IndexIVF.cpp @@ -1006,7 +1006,7 @@ void IndexIVF::search_and_reconstruct( labels, true /* store_pairs */, params); -#pragma omp parallel for if (n * k > 1000) +#pragma omp parallel for if (n * k > 1000) num_threads(num_omp_threads) for (idx_t ij = 0; ij < n * k; ij++) { idx_t key = labels[ij]; float* reconstructed = recons + ij * d; @@ -1068,7 +1068,7 @@ void IndexIVF::search_and_return_codes( code_size_1 += coarse_code_size(); } -#pragma omp parallel for if (n * k > 1000) +#pragma omp parallel for if (n * k > 1000) num_threads(num_omp_threads) for (idx_t ij = 0; ij < n * k; ij++) { idx_t key = labels[ij]; uint8_t* code1 = codes + ij * code_size_1; diff --git a/faiss/IndexIVFFastScan.cpp b/faiss/IndexIVFFastScan.cpp index 2ea0b3fba9..735c04fca6 100644 --- a/faiss/IndexIVFFastScan.cpp +++ b/faiss/IndexIVFFastScan.cpp @@ -640,7 +640,7 @@ void IndexIVFFastScan::range_search_dispatch_implem( } else { // explicitly slice over threads int nslice = compute_search_nslice(this, n, cq.nprobe); -#pragma omp parallel +#pragma omp parallel num_threads(num_omp_threads) { RangeSearchPartialResult pres(&rres); diff --git a/faiss/impl/PolysemousTraining.cpp b/faiss/impl/PolysemousTraining.cpp index ad1ce52401..06f99340c1 100644 --- a/faiss/impl/PolysemousTraining.cpp +++ b/faiss/impl/PolysemousTraining.cpp @@ -779,7 +779,7 @@ void PolysemousTraining::optimize_reproduce_distances( nt); } -#pragma omp 
parallel for num_threads(nt) +#pragma omp parallel for num_threads(num_omp_threads) for (int m = 0; m < pq.M; m++) { std::vector<double> dis_table; diff --git a/faiss/impl/ProductQuantizer.cpp b/faiss/impl/ProductQuantizer.cpp index 76aab28916..12f0bf91b7 100644 --- a/faiss/impl/ProductQuantizer.cpp +++ b/faiss/impl/ProductQuantizer.cpp @@ -313,7 +313,7 @@ void ProductQuantizer::decode(const uint8_t* code, float* x) const { } void ProductQuantizer::decode(const uint8_t* code, float* x, size_t n) const { -#pragma omp parallel for if (n > 100) +#pragma omp parallel for if (n > 100) num_threads(num_omp_threads) for (int64_t i = 0; i < n; i++) { this->decode(code + code_size * i, x + d * i); } } diff --git a/faiss/impl/residual_quantizer_encode_steps.cpp b/faiss/impl/residual_quantizer_encode_steps.cpp index 8db6f9e5f7..812a8a7af3 100644 --- a/faiss/impl/residual_quantizer_encode_steps.cpp +++ b/faiss/impl/residual_quantizer_encode_steps.cpp @@ -275,7 +275,7 @@ void beam_search_encode_step( } InterruptCallback::check(); -#pragma omp parallel for if (n > 100) +#pragma omp parallel for if (n > 100) num_threads(num_omp_threads) for (int64_t i = 0; i < n; i++) { const int32_t* codes_i = codes + i * m * beam_size; int32_t* new_codes_i = new_codes + i * (m + 1) * new_beam_size; @@ -399,7 +399,7 @@ void beam_search_encode_step_tab( { FAISS_THROW_IF_NOT(ldc >= K); -#pragma omp parallel for if (n > 100) schedule(dynamic) +#pragma omp parallel for if (n > 100) schedule(dynamic) num_threads(num_omp_threads) for (int64_t i = 0; i < n; i++) { std::vector<float> cent_distances(beam_size * K); std::vector<float> cd_common(K); diff --git a/faiss/utils/distances.cpp b/faiss/utils/distances.cpp index 7d43c5d850..ea89237484 100644 --- a/faiss/utils/distances.cpp +++ b/faiss/utils/distances.cpp @@ -146,7 +146,7 @@ void exhaustive_inner_product_seq( FAISS_ASSERT(use_sel == (sel != nullptr)); -#pragma omp parallel num_threads(nt) +#pragma omp parallel num_threads(num_omp_threads) { SingleResultHandler resi(res);
#pragma omp for @@ -183,7 +183,7 @@ void exhaustive_L2sqr_seq( FAISS_ASSERT(use_sel == (sel != nullptr)); -#pragma omp parallel num_threads(nt) +#pragma omp parallel num_threads(num_omp_threads) { SingleResultHandler resi(res); #pragma omp for diff --git a/faiss/utils/hamming.cpp b/faiss/utils/hamming.cpp index 22b0c73442..9700e4350f 100644 --- a/faiss/utils/hamming.cpp +++ b/faiss/utils/hamming.cpp @@ -293,7 +293,7 @@ void hamming_range_search( int radius, size_t code_size, RangeSearchResult* res) { -#pragma omp parallel +#pragma omp parallel num_threads(num_omp_threads) { RangeSearchPartialResult pres(res); @@ -687,7 +687,7 @@ void pack_bitstrings( uint8_t* packed, size_t code_size) { FAISS_THROW_IF_NOT(code_size >= (M * nbit + 7) / 8); -#pragma omp parallel for if (n > 1000) +#pragma omp parallel for if (n > 1000) num_threads(num_omp_threads) for (int64_t i = 0; i < n; i++) { const int32_t* in = unpacked + i * M; uint8_t* out = packed + i * code_size; @@ -710,7 +710,7 @@ void pack_bitstrings( totbit += nbit[j]; } FAISS_THROW_IF_NOT(code_size >= (totbit + 7) / 8); -#pragma omp parallel for if (n > 1000) +#pragma omp parallel for if (n > 1000) num_threads(num_omp_threads) for (int64_t i = 0; i < n; i++) { const int32_t* in = unpacked + i * M; uint8_t* out = packed + i * code_size; @@ -729,7 +729,7 @@ void unpack_bitstrings( size_t code_size, int32_t* unpacked) { FAISS_THROW_IF_NOT(code_size >= (M * nbit + 7) / 8); -#pragma omp parallel for if (n > 1000) +#pragma omp parallel for if (n > 1000) num_threads(num_omp_threads) for (int64_t i = 0; i < n; i++) { const uint8_t* in = packed + i * code_size; int32_t* out = unpacked + i * M; @@ -752,7 +752,7 @@ void unpack_bitstrings( totbit += nbit[j]; } FAISS_THROW_IF_NOT(code_size >= (totbit + 7) / 8); -#pragma omp parallel for if (n > 1000) +#pragma omp parallel for if (n > 1000) num_threads(num_omp_threads) for (int64_t i = 0; i < n; i++) { const uint8_t* in = packed + i * code_size; int32_t* out = unpacked + i * M; diff 
--git a/faiss/utils/sorting.cpp b/faiss/utils/sorting.cpp index b1021d85c9..ba57bad866 100644 --- a/faiss/utils/sorting.cpp +++ b/faiss/utils/sorting.cpp @@ -61,7 +61,7 @@ void parallel_merge( s2s[nt - 1].i1 = s2.i1; // not sure parallel actually helps here -#pragma omp parallel for num_threads(nt) +#pragma omp parallel for num_threads(num_omp_threads) for (int t = 0; t < nt; t++) { s1s[t].i0 = s1.i0 + s1.len() * t / nt; s1s[t].i1 = s1.i0 + s1.len() * (t + 1) / nt; @@ -93,7 +93,7 @@ void parallel_merge( assert(sws[nt - 1].i1 == s1.i1); // do the actual merging -#pragma omp parallel for num_threads(nt) +#pragma omp parallel for num_threads(num_omp_threads) for (int t = 0; t < nt; t++) { SegmentS sw = sws[t]; SegmentS s1t = s1s[t]; @@ -176,7 +176,7 @@ void fvec_argsort_parallel(size_t n, const float* vals, size_t* perm) { int sub_nt = nseg % 2 == 0 ? nt : nt - 1; int sub_nseg1 = nseg / 2; -#pragma omp parallel for num_threads(nseg1) +#pragma omp parallel for num_threads(num_omp_threads) for (int s = 0; s < nseg; s += 2) { if (s + 1 == nseg) { // otherwise isolated segment memcpy(permB + segs[s].i0, @@ -257,7 +257,7 @@ void bucket_sort_parallel( int64_t* perm, int nt_in) { memset(lims, 0, sizeof(*lims) * (vmax + 1)); -#pragma omp parallel num_threads(nt_in) +#pragma omp parallel num_threads(num_omp_threads) { int nt = omp_get_num_threads(); // might be different from nt_in int rank = omp_get_thread_num(); @@ -483,7 +483,7 @@ void bucket_sort_inplace_parallel( nbucket); // DON'T use std::vector that cannot be accessed // safely from multiple threads!!! -#pragma omp parallel num_threads(nt_in) +#pragma omp parallel num_threads(num_omp_threads) { int nt = omp_get_num_threads(); // might be different from nt_in (?) 
int rank = omp_get_thread_num(); @@ -709,7 +709,7 @@ inline int64_t hash_function(int64_t x) { void hashtable_int64_to_int64_init(int log2_capacity, int64_t* tab) { size_t capacity = (size_t)1 << log2_capacity; -#pragma omp parallel for +#pragma omp parallel for num_threads(num_omp_threads) for (int64_t i = 0; i < capacity; i++) { tab[2 * i] = -1; tab[2 * i + 1] = -1; @@ -729,7 +729,7 @@ void hashtable_int64_to_int64_add( int log2_nbucket = log2_capacity_to_log2_nbucket(log2_capacity); size_t nbucket = (size_t)1 << log2_nbucket; -#pragma omp parallel for +#pragma omp parallel for num_threads(num_omp_threads) for (int64_t i = 0; i < n; i++) { hk[i] = hash_function(keys[i]) & mask; bucket_no[i] = hk[i] >> (log2_capacity - log2_nbucket); @@ -746,7 +746,7 @@ void hashtable_int64_to_int64_add( omp_get_max_threads()); int num_errors = 0; -#pragma omp parallel for reduction(+ : num_errors) +#pragma omp parallel for reduction(+ : num_errors) num_threads(num_omp_threads) for (int64_t bucket = 0; bucket < nbucket; bucket++) { size_t k0 = bucket << (log2_capacity - log2_nbucket); size_t k1 = (bucket + 1) << (log2_capacity - log2_nbucket); @@ -793,7 +793,7 @@ void hashtable_int64_to_int64_lookup( int64_t mask = capacity - 1; int log2_nbucket = log2_capacity_to_log2_nbucket(log2_capacity); -#pragma omp parallel for +#pragma omp parallel for num_threads(num_omp_threads) for (int64_t i = 0; i < n; i++) { int64_t k = keys[i]; int64_t hk = hash_function(k) & mask; diff --git a/faiss/utils/utils.cpp b/faiss/utils/utils.cpp index 241c509ee4..c8dcf82995 100644 --- a/faiss/utils/utils.cpp +++ b/faiss/utils/utils.cpp @@ -455,7 +455,7 @@ void bvecs_checksum(size_t n, size_t d, const uint8_t* a, uint64_t* cs) { // so below codes only accept n <= std::numeric_limits<ssize_t>::max() using ssize_t = std::make_signed<std::size_t>::type; const ssize_t size = n; -#pragma omp parallel for if (size > 1000) +#pragma omp parallel for if (size > 1000) num_threads(num_omp_threads) for (ssize_t i_ = 0; i_ < size; i_++) {
const auto i = static_cast<std::size_t>(i_); cs[i] = bvec_checksum(d, a + i * d);