diff --git a/faiss/IndexFastScan.cpp b/faiss/IndexFastScan.cpp index 138840ff34..ac0672cd99 100644 --- a/faiss/IndexFastScan.cpp +++ b/faiss/IndexFastScan.cpp @@ -323,7 +323,7 @@ void IndexFastScan::search_dispatch_implem( } } else { // explicitly slice over threads -#pragma omp parallel for num_threads(nt) +#pragma omp parallel for num_threads(num_omp_threads) for (int slice = 0; slice < nt; slice++) { idx_t i0 = n * slice / nt; idx_t i1 = n * (slice + 1) / nt; diff --git a/faiss/IndexIVF.cpp b/faiss/IndexIVF.cpp index 4e70220b37..3efd5b8fea 100644 --- a/faiss/IndexIVF.cpp +++ b/faiss/IndexIVF.cpp @@ -1006,7 +1006,7 @@ void IndexIVF::search_and_reconstruct( labels, true /* store_pairs */, params); -#pragma omp parallel for if (n * k > 1000) +#pragma omp parallel for if (n * k > 1000) num_threads(num_omp_threads) for (idx_t ij = 0; ij < n * k; ij++) { idx_t key = labels[ij]; float* reconstructed = recons + ij * d; @@ -1068,7 +1068,7 @@ void IndexIVF::search_and_return_codes( code_size_1 += coarse_code_size(); } -#pragma omp parallel for if (n * k > 1000) +#pragma omp parallel for if (n * k > 1000) num_threads(num_omp_threads) for (idx_t ij = 0; ij < n * k; ij++) { idx_t key = labels[ij]; uint8_t* code1 = codes + ij * code_size_1; diff --git a/faiss/IndexIVFFastScan.cpp b/faiss/IndexIVFFastScan.cpp index 2ea0b3fba9..735c04fca6 100644 --- a/faiss/IndexIVFFastScan.cpp +++ b/faiss/IndexIVFFastScan.cpp @@ -640,7 +640,7 @@ void IndexIVFFastScan::range_search_dispatch_implem( } else { // explicitly slice over threads int nslice = compute_search_nslice(this, n, cq.nprobe); -#pragma omp parallel +#pragma omp parallel num_threads(num_omp_threads) { RangeSearchPartialResult pres(&rres); diff --git a/faiss/impl/PolysemousTraining.cpp b/faiss/impl/PolysemousTraining.cpp index ad1ce52401..06f99340c1 100644 --- a/faiss/impl/PolysemousTraining.cpp +++ b/faiss/impl/PolysemousTraining.cpp @@ -779,7 +779,7 @@ void PolysemousTraining::optimize_reproduce_distances( nt); } -#pragma omp 
parallel for num_threads(nt) +#pragma omp parallel for num_threads(num_omp_threads) for (int m = 0; m < pq.M; m++) { std::vector<double> dis_table; diff --git a/faiss/impl/ProductQuantizer.cpp b/faiss/impl/ProductQuantizer.cpp index 76aab28916..12f0bf91b7 100644 --- a/faiss/impl/ProductQuantizer.cpp +++ b/faiss/impl/ProductQuantizer.cpp @@ -313,7 +313,7 @@ void ProductQuantizer::decode(const uint8_t* code, float* x) const { } void ProductQuantizer::decode(const uint8_t* code, float* x, size_t n) const { -#pragma omp parallel for if (n > 100) +#pragma omp parallel for if (n > 100) num_threads(num_omp_threads) for (int64_t i = 0; i < n; i++) { this->decode(code + code_size * i, x + d * i); } } diff --git a/faiss/impl/residual_quantizer_encode_steps.cpp b/faiss/impl/residual_quantizer_encode_steps.cpp index 8db6f9e5f7..812a8a7af3 100644 --- a/faiss/impl/residual_quantizer_encode_steps.cpp +++ b/faiss/impl/residual_quantizer_encode_steps.cpp @@ -275,7 +275,7 @@ void beam_search_encode_step( } InterruptCallback::check(); -#pragma omp parallel for if (n > 100) +#pragma omp parallel for if (n > 100) num_threads(num_omp_threads) for (int64_t i = 0; i < n; i++) { const int32_t* codes_i = codes + i * m * beam_size; int32_t* new_codes_i = new_codes + i * (m + 1) * new_beam_size; @@ -399,7 +399,7 @@ void beam_search_encode_step_tab( { FAISS_THROW_IF_NOT(ldc >= K); -#pragma omp parallel for if (n > 100) schedule(dynamic) +#pragma omp parallel for if (n > 100) schedule(dynamic) num_threads(num_omp_threads) for (int64_t i = 0; i < n; i++) { std::vector<float> cent_distances(beam_size * K); std::vector<float> cd_common(K); diff --git a/faiss/utils/distances.cpp b/faiss/utils/distances.cpp index 7d43c5d850..ea89237484 100644 --- a/faiss/utils/distances.cpp +++ b/faiss/utils/distances.cpp @@ -146,7 +146,7 @@ void exhaustive_inner_product_seq( FAISS_ASSERT(use_sel == (sel != nullptr)); -#pragma omp parallel num_threads(nt) +#pragma omp parallel num_threads(num_omp_threads) { SingleResultHandler resi(res);
#pragma omp for @@ -183,7 +183,7 @@ void exhaustive_L2sqr_seq( FAISS_ASSERT(use_sel == (sel != nullptr)); -#pragma omp parallel num_threads(nt) +#pragma omp parallel num_threads(num_omp_threads) { SingleResultHandler resi(res); #pragma omp for diff --git a/faiss/utils/hamming.cpp b/faiss/utils/hamming.cpp index 22b0c73442..9700e4350f 100644 --- a/faiss/utils/hamming.cpp +++ b/faiss/utils/hamming.cpp @@ -293,7 +293,7 @@ void hamming_range_search( int radius, size_t code_size, RangeSearchResult* res) { -#pragma omp parallel +#pragma omp parallel num_threads(num_omp_threads) { RangeSearchPartialResult pres(res); @@ -687,7 +687,7 @@ void pack_bitstrings( uint8_t* packed, size_t code_size) { FAISS_THROW_IF_NOT(code_size >= (M * nbit + 7) / 8); -#pragma omp parallel for if (n > 1000) +#pragma omp parallel for if (n > 1000) num_threads(num_omp_threads) for (int64_t i = 0; i < n; i++) { const int32_t* in = unpacked + i * M; uint8_t* out = packed + i * code_size; @@ -710,7 +710,7 @@ void pack_bitstrings( totbit += nbit[j]; } FAISS_THROW_IF_NOT(code_size >= (totbit + 7) / 8); -#pragma omp parallel for if (n > 1000) +#pragma omp parallel for if (n > 1000) num_threads(num_omp_threads) for (int64_t i = 0; i < n; i++) { const int32_t* in = unpacked + i * M; uint8_t* out = packed + i * code_size; @@ -729,7 +729,7 @@ void unpack_bitstrings( size_t code_size, int32_t* unpacked) { FAISS_THROW_IF_NOT(code_size >= (M * nbit + 7) / 8); -#pragma omp parallel for if (n > 1000) +#pragma omp parallel for if (n > 1000) num_threads(num_omp_threads) for (int64_t i = 0; i < n; i++) { const uint8_t* in = packed + i * code_size; int32_t* out = unpacked + i * M; @@ -752,7 +752,7 @@ void unpack_bitstrings( totbit += nbit[j]; } FAISS_THROW_IF_NOT(code_size >= (totbit + 7) / 8); -#pragma omp parallel for if (n > 1000) +#pragma omp parallel for if (n > 1000) num_threads(num_omp_threads) for (int64_t i = 0; i < n; i++) { const uint8_t* in = packed + i * code_size; int32_t* out = unpacked + i * M; diff 
--git a/faiss/utils/sorting.cpp b/faiss/utils/sorting.cpp index b1021d85c9..ba57bad866 100644 --- a/faiss/utils/sorting.cpp +++ b/faiss/utils/sorting.cpp @@ -61,7 +61,7 @@ void parallel_merge( s2s[nt - 1].i1 = s2.i1; // not sure parallel actually helps here -#pragma omp parallel for num_threads(nt) +#pragma omp parallel for num_threads(num_omp_threads) for (int t = 0; t < nt; t++) { s1s[t].i0 = s1.i0 + s1.len() * t / nt; s1s[t].i1 = s1.i0 + s1.len() * (t + 1) / nt; @@ -93,7 +93,7 @@ void parallel_merge( assert(sws[nt - 1].i1 == s1.i1); // do the actual merging -#pragma omp parallel for num_threads(nt) +#pragma omp parallel for num_threads(num_omp_threads) for (int t = 0; t < nt; t++) { SegmentS sw = sws[t]; SegmentS s1t = s1s[t]; @@ -176,7 +176,7 @@ void fvec_argsort_parallel(size_t n, const float* vals, size_t* perm) { int sub_nt = nseg % 2 == 0 ? nt : nt - 1; int sub_nseg1 = nseg / 2; -#pragma omp parallel for num_threads(nseg1) +#pragma omp parallel for num_threads(num_omp_threads) for (int s = 0; s < nseg; s += 2) { if (s + 1 == nseg) { // otherwise isolated segment memcpy(permB + segs[s].i0, @@ -257,7 +257,7 @@ void bucket_sort_parallel( int64_t* perm, int nt_in) { memset(lims, 0, sizeof(*lims) * (vmax + 1)); -#pragma omp parallel num_threads(nt_in) +#pragma omp parallel num_threads(num_omp_threads) { int nt = omp_get_num_threads(); // might be different from nt_in int rank = omp_get_thread_num(); @@ -483,7 +483,7 @@ void bucket_sort_inplace_parallel( nbucket); // DON'T use std::vector that cannot be accessed // safely from multiple threads!!! -#pragma omp parallel num_threads(nt_in) +#pragma omp parallel num_threads(num_omp_threads) { int nt = omp_get_num_threads(); // might be different from nt_in (?) 
int rank = omp_get_thread_num(); @@ -709,7 +709,7 @@ inline int64_t hash_function(int64_t x) { void hashtable_int64_to_int64_init(int log2_capacity, int64_t* tab) { size_t capacity = (size_t)1 << log2_capacity; -#pragma omp parallel for +#pragma omp parallel for num_threads(num_omp_threads) for (int64_t i = 0; i < capacity; i++) { tab[2 * i] = -1; tab[2 * i + 1] = -1; @@ -729,7 +729,7 @@ void hashtable_int64_to_int64_add( int log2_nbucket = log2_capacity_to_log2_nbucket(log2_capacity); size_t nbucket = (size_t)1 << log2_nbucket; -#pragma omp parallel for +#pragma omp parallel for num_threads(num_omp_threads) for (int64_t i = 0; i < n; i++) { hk[i] = hash_function(keys[i]) & mask; bucket_no[i] = hk[i] >> (log2_capacity - log2_nbucket); @@ -746,7 +746,7 @@ void hashtable_int64_to_int64_add( omp_get_max_threads()); int num_errors = 0; -#pragma omp parallel for reduction(+ : num_errors) +#pragma omp parallel for reduction(+ : num_errors) num_threads(num_omp_threads) for (int64_t bucket = 0; bucket < nbucket; bucket++) { size_t k0 = bucket << (log2_capacity - log2_nbucket); size_t k1 = (bucket + 1) << (log2_capacity - log2_nbucket); @@ -793,7 +793,7 @@ void hashtable_int64_to_int64_lookup( int64_t mask = capacity - 1; int log2_nbucket = log2_capacity_to_log2_nbucket(log2_capacity); -#pragma omp parallel for +#pragma omp parallel for num_threads(num_omp_threads) for (int64_t i = 0; i < n; i++) { int64_t k = keys[i]; int64_t hk = hash_function(k) & mask; diff --git a/faiss/utils/utils.cpp b/faiss/utils/utils.cpp index 241c509ee4..c8dcf82995 100644 --- a/faiss/utils/utils.cpp +++ b/faiss/utils/utils.cpp @@ -455,7 +455,7 @@ void bvecs_checksum(size_t n, size_t d, const uint8_t* a, uint64_t* cs) { // so below codes only accept n <= std::numeric_limits<ssize_t>::max() using ssize_t = std::make_signed<std::size_t>::type; const ssize_t size = n; -#pragma omp parallel for if (size > 1000) +#pragma omp parallel for if (size > 1000) num_threads(num_omp_threads) for (ssize_t i_ = 0; i_ < size; i_++) {
const auto i = static_cast<std::size_t>(i_); cs[i] = bvec_checksum(d, a + i * d);