@@ -384,21 +384,25 @@ X86_SIMD_SORT_INLINE type_t get_pivot_64bit(type_t *arr,
384384 const int64_t left,
385385 const int64_t right)
386386{
387- // median of 8
387+ // median of 8x8 elements
388388 int64_t size = (right - left) / 8 ;
389389 using zmm_t = typename vtype::zmm_t ;
390- __m512i rand_index = _mm512_set_epi64 (left + size,
391- left + 2 * size,
392- left + 3 * size,
393- left + 4 * size,
394- left + 5 * size,
395- left + 6 * size,
396- left + 7 * size,
397- left + 8 * size);
398- zmm_t rand_vec = vtype::template i64gather<sizeof (type_t )>(rand_index, arr);
390+ zmm_t v[8 ];
391+ for (int64_t ii = 0 ; ii < 8 ; ++ii) {
392+ v[ii] = vtype::loadu (arr + left + ii*size);
393+ }
394+ COEX<vtype>(v[0 ], v[1 ]); COEX<vtype>(v[2 ], v[3 ]); /* step 1 */
395+ COEX<vtype>(v[4 ], v[5 ]); COEX<vtype>(v[6 ], v[7 ]);
396+ COEX<vtype>(v[0 ], v[2 ]); COEX<vtype>(v[1 ], v[3 ]); /* step 2 */
397+ COEX<vtype>(v[4 ], v[6 ]); COEX<vtype>(v[5 ], v[7 ]);
398+ COEX<vtype>(v[0 ], v[4 ]); COEX<vtype>(v[1 ], v[2 ]); /* step 3 */
399+ COEX<vtype>(v[5 ], v[6 ]); COEX<vtype>(v[3 ], v[7 ]);
400+ COEX<vtype>(v[1 ], v[5 ]); COEX<vtype>(v[2 ], v[6 ]); /* step 4 */
401+ COEX<vtype>(v[3 ], v[5 ]); COEX<vtype>(v[2 ], v[4 ]); /* step 5 */
402+ COEX<vtype>(v[3 ], v[4 ]); /* step 6 */
399403 // pivot will never be a nan, since there are no nan's!
400- zmm_t sort = sort_zmm_64bit<vtype>(rand_vec );
404+ zmm_t sort = sort_zmm_64bit<vtype>(v[ 3 ] );
401405 return ((type_t *)&sort)[4 ];
402406}
403407
404- #endif
408+ #endif
0 commit comments