1111#include " xss-common-qsort.h"
1212#include " xss-network-keyvaluesort.hpp"
1313
14+ #if defined(XSS_USE_OPENMP) && defined(_OPENMP)
15+ #define XSS_COMPILE_OPENMP
16+ #include < omp.h>
17+ #endif
18+
1419/*
1520 * Partition one ZMM register based on the pivot and return the index of the
1621 * last element that is less than or equal to the pivot.
@@ -366,7 +371,8 @@ X86_SIMD_SORT_INLINE void kvsort_(type1_t *keys,
366371 type2_t *indexes,
367372 arrsize_t left,
368373 arrsize_t right,
369- int max_iters)
374+ int max_iters,
375+ arrsize_t task_threshold)
370376{
371377 /*
372378 * Resort to std::sort if quicksort isn't making any progress
@@ -391,14 +397,61 @@ X86_SIMD_SORT_INLINE void kvsort_(type1_t *keys,
391397 type1_t biggest = vtype1::type_min ();
392398 arrsize_t pivot_index = kvpartition_unrolled<vtype1, vtype2, 4 >(
393399 keys, indexes, left, right + 1 , pivot, &smallest, &biggest);
400+
401+ #ifdef XSS_COMPILE_OPENMP
402+ if (pivot != smallest) {
403+ bool parallel_left = (pivot_index - left) > task_threshold;
404+ if (parallel_left) {
405+ #pragma omp task
406+ kvsort_<vtype1, vtype2>(keys,
407+ indexes,
408+ left,
409+ pivot_index - 1 ,
410+ max_iters - 1 ,
411+ task_threshold);
412+ }
413+ else {
414+ kvsort_<vtype1, vtype2>(keys,
415+ indexes,
416+ left,
417+ pivot_index - 1 ,
418+ max_iters - 1 ,
419+ task_threshold);
420+ }
421+ }
422+ if (pivot != biggest) {
423+ bool parallel_right = (right - pivot_index) > task_threshold;
424+
425+ if (parallel_right) {
426+ #pragma omp task
427+ kvsort_<vtype1, vtype2>(keys,
428+ indexes,
429+ pivot_index,
430+ right,
431+ max_iters - 1 ,
432+ task_threshold);
433+ }
434+ else {
435+ kvsort_<vtype1, vtype2>(keys,
436+ indexes,
437+ pivot_index,
438+ right,
439+ max_iters - 1 ,
440+ task_threshold);
441+ }
442+ }
443+ #else
444+ UNUSED (task_threshold);
445+
394446 if (pivot != smallest) {
395447 kvsort_<vtype1, vtype2>(
396- keys, indexes, left, pivot_index - 1 , max_iters - 1 );
448+ keys, indexes, left, pivot_index - 1 , max_iters - 1 , 0 );
397449 }
398450 if (pivot != biggest) {
399451 kvsort_<vtype1, vtype2>(
400- keys, indexes, pivot_index, right, max_iters - 1 );
452+ keys, indexes, pivot_index, right, max_iters - 1 , 0 );
401453 }
454+ #endif
402455}
403456
404457template <typename vtype1,
@@ -486,7 +539,33 @@ X86_SIMD_SORT_INLINE void xss_qsort_kv(
486539 UNUSED (hasnan);
487540 }
488541
489- kvsort_<keytype, valtype>(keys, indexes, 0 , arrsize - 1 , maxiters);
542+ #ifdef XSS_COMPILE_OPENMP
543+
544+ bool use_parallel = arrsize > 10000 ;
545+
546+ if (use_parallel) {
547+ // This thread limit was determined experimentally; it may be better for it to be the number of physical cores on the system
548+ constexpr int thread_limit = 8 ;
549+ int thread_count = std::min (thread_limit, omp_get_max_threads ());
550+ arrsize_t task_threshold
551+ = std::max ((arrsize_t )10000 , arrsize / 100 );
552+
553+ // We use omp parallel and then omp single to setup the threads that will run the omp task calls in kvsort_
554+ // The omp single prevents multiple threads from running the initial kvsort_ simultaneously and causing problems
555+ // Note that we do not use the if(...) clause built into OpenMP, because it causes a performance regression for small arrays
556+ #pragma omp parallel num_threads(thread_count)
557+ #pragma omp single
558+ kvsort_<keytype, valtype>(
559+ keys, indexes, 0 , arrsize - 1 , maxiters, task_threshold);
560+ }
561+ else {
562+ kvsort_<keytype, valtype>(
563+ keys, indexes, 0 , arrsize - 1 , maxiters, 0 );
564+ }
565+ #else
566+ kvsort_<keytype, valtype>(keys, indexes, 0 , arrsize - 1 , maxiters, 0 );
567+ #endif
568+
490569 replace_inf_with_nan (keys, arrsize, nan_count);
491570
492571 if (descending) {
0 commit comments