|
11 | 11 | #include "xss-common-qsort.h" |
12 | 12 | #include "xss-network-keyvaluesort.hpp" |
13 | 13 |
|
| 14 | +#if defined(XSS_USE_OPENMP) && defined(_OPENMP) |
| 15 | +#define XSS_COMPILE_OPENMP |
| 16 | +#include <omp.h> |
| 17 | +#endif |
| 18 | + |
14 | 19 | /* |
15 | 20 | * Partition one ZMM register based on the pivot and return the index of the |
16 | 21 | * last element that is less than or equal to the pivot. |
@@ -393,7 +398,7 @@ X86_SIMD_SORT_INLINE void kvsort_(type1_t *keys, |
393 | 398 | arrsize_t pivot_index = kvpartition_unrolled<vtype1, vtype2, 4>( |
394 | 399 | keys, indexes, left, right + 1, pivot, &smallest, &biggest); |
395 | 400 |
|
396 | | -#if defined(XSS_USE_OPENMP) && defined(_OPENMP) |
| 401 | +#ifdef XSS_COMPILE_OPENMP |
397 | 402 | if (pivot != smallest) { |
398 | 403 | bool parallel_left = (pivot_index - left) > task_threshold; |
399 | 404 | if (parallel_left) { |
@@ -534,18 +539,28 @@ X86_SIMD_SORT_INLINE void xss_qsort_kv( |
534 | 539 | UNUSED(hasnan); |
535 | 540 | } |
536 | 541 |
|
537 | | -#if defined(XSS_USE_OPENMP) && defined(_OPENMP) |
| 542 | +#ifdef XSS_COMPILE_OPENMP |
| 543 | + |
538 | 544 | bool use_parallel = arrsize > 10000; |
539 | | - arrsize_t task_threshold = std::max((arrsize_t)10000, arrsize / 100); |
| 545 | + |
540 | 546 | if (use_parallel) { |
541 | | -#pragma omp parallel |
| 547 | + // This thread limit was determined experimentally; it may be better for it to be the number of physical cores on the system |
| 548 | + constexpr int thread_limit = 8; |
| 549 | + int thread_count = std::min(thread_limit, omp_get_max_threads()); |
| 550 | + arrsize_t task_threshold |
| 551 | + = std::max((arrsize_t)10000, arrsize / 100); |
| 552 | + |
| 553 | + // We use omp parallel and then omp single to set up the threads that will run the omp task calls in kvsort_ |
| 554 | + // The omp single prevents multiple threads from running the initial kvsort_ simultaneously and causing problems |
| 555 | + // Note that we do not use the if(...) clause built into OpenMP, because it causes a performance regression for small arrays |
| 556 | +#pragma omp parallel num_threads(thread_count) |
542 | 557 | #pragma omp single |
543 | 558 | kvsort_<keytype, valtype>( |
544 | 559 | keys, indexes, 0, arrsize - 1, maxiters, task_threshold); |
545 | 560 | } |
546 | 561 | else { |
547 | 562 | kvsort_<keytype, valtype>( |
548 | | - keys, indexes, 0, arrsize - 1, maxiters, task_threshold); |
| 563 | + keys, indexes, 0, arrsize - 1, maxiters, 0); |
549 | 564 | } |
550 | 565 | #else |
551 | 566 | kvsort_<keytype, valtype>(keys, indexes, 0, arrsize - 1, maxiters, 0); |
|
0 commit comments