@@ -269,11 +269,13 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
269269 type_t *biggest)
270270{
271271 const int num_unroll = 8 ;
272- if (right - left <= 2 *num_unroll*vtype::numlanes) {
273- return partition_avx512<vtype>(arr, left, right, pivot, smallest, biggest);
272+ if (right - left <= 2 * num_unroll * vtype::numlanes) {
273+ return partition_avx512<vtype>(
274+ arr, left, right, pivot, smallest, biggest);
274275 }
275276 /* make array length divisible by 8*vtype::numlanes , shortening the array */
276- for (int32_t i = ((right - left) % (num_unroll*vtype::numlanes)); i > 0 ; --i) {
277+ for (int32_t i = ((right - left) % (num_unroll * vtype::numlanes)); i > 0 ;
278+ --i) {
277279 *smallest = std::min (*smallest, arr[left], comparison_func<vtype>);
278280 *biggest = std::max (*biggest, arr[left], comparison_func<vtype>);
279281 if (!comparison_func<vtype>(arr[left], pivot)) {
@@ -295,17 +297,18 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
295297 // We will now have atleast 16 registers worth of data to process:
296298 // left and right vtype::numlanes values are partitioned at the end
297299 zmm_t vec_left[num_unroll], vec_right[num_unroll];
298- #pragma GCC unroll 8
300+ #pragma GCC unroll 8
299301 for (int ii = 0 ; ii < num_unroll; ++ii) {
300- vec_left[ii] = vtype::loadu (arr + left + vtype::numlanes*ii);
301- vec_right[ii] = vtype::loadu (arr + (right - vtype::numlanes*(num_unroll-ii)));
302+ vec_left[ii] = vtype::loadu (arr + left + vtype::numlanes * ii);
303+ vec_right[ii] = vtype::loadu (
304+ arr + (right - vtype::numlanes * (num_unroll - ii)));
302305 }
303306 // store points of the vectors
304307 int64_t r_store = right - vtype::numlanes;
305308 int64_t l_store = left;
306309 // indices for loading the elements
307- left += num_unroll* vtype::numlanes;
308- right -= num_unroll* vtype::numlanes;
310+ left += num_unroll * vtype::numlanes;
311+ right -= num_unroll * vtype::numlanes;
309312 while (right - left != 0 ) {
310313 zmm_t curr_vec[num_unroll];
311314 /*
@@ -314,57 +317,59 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
314317 * otherwise from the left side
315318 */
316319 if ((r_store + vtype::numlanes) - right < left - l_store) {
317- right -= num_unroll* vtype::numlanes;
318- #pragma GCC unroll 8
320+ right -= num_unroll * vtype::numlanes;
321+ #pragma GCC unroll 8
319322 for (int ii = 0 ; ii < num_unroll; ++ii) {
320- curr_vec[ii] = vtype::loadu (arr + right + ii* vtype::numlanes);
323+ curr_vec[ii] = vtype::loadu (arr + right + ii * vtype::numlanes);
321324 }
322325 }
323326 else {
324- #pragma GCC unroll 8
327+ #pragma GCC unroll 8
325328 for (int ii = 0 ; ii < num_unroll; ++ii) {
326- curr_vec[ii] = vtype::loadu (arr + left + ii* vtype::numlanes);
329+ curr_vec[ii] = vtype::loadu (arr + left + ii * vtype::numlanes);
327330 }
328- left += num_unroll* vtype::numlanes;
331+ left += num_unroll * vtype::numlanes;
329332 }
330- // partition the current vector and save it on both sides of the array
331- #pragma GCC unroll 8
333+ // partition the current vector and save it on both sides of the array
334+ #pragma GCC unroll 8
332335 for (int ii = 0 ; ii < num_unroll; ++ii) {
333336 int32_t amount_ge_pivot
334337 = partition_vec<vtype>(arr,
335338 l_store,
336339 r_store + vtype::numlanes,
337340 curr_vec[ii],
338341 pivot_vec,
339- &min_vec,pick
342+ &min_vec,
340343 &max_vec);
341344 l_store += (vtype::numlanes - amount_ge_pivot);
342345 r_store -= amount_ge_pivot;
343346 }
344347 }
345348
346- /* partition and save vec_left[8] and vec_right[8] */
347- #pragma GCC unroll 8
349+ /* partition and save vec_left[8] and vec_right[8] */
350+ #pragma GCC unroll 8
348351 for (int ii = 0 ; ii < num_unroll; ++ii) {
349- int32_t amount_ge_pivot = partition_vec<vtype>(arr,
350- l_store,
351- r_store + vtype::numlanes,
352- vec_left[ii],
353- pivot_vec,
354- &min_vec,
355- &max_vec);
352+ int32_t amount_ge_pivot
353+ = partition_vec<vtype>(arr,
354+ l_store,
355+ r_store + vtype::numlanes,
356+ vec_left[ii],
357+ pivot_vec,
358+ &min_vec,
359+ &max_vec);
356360 l_store += (vtype::numlanes - amount_ge_pivot);
357361 r_store -= amount_ge_pivot;
358362 }
359- #pragma GCC unroll 8
363+ #pragma GCC unroll 8
360364 for (int ii = 0 ; ii < num_unroll; ++ii) {
361- int32_t amount_ge_pivot = partition_vec<vtype>(arr,
362- l_store,
363- r_store + vtype::numlanes,
364- vec_right[ii],
365- pivot_vec,
366- &min_vec,
367- &max_vec);
365+ int32_t amount_ge_pivot
366+ = partition_vec<vtype>(arr,
367+ l_store,
368+ r_store + vtype::numlanes,
369+ vec_right[ii],
370+ pivot_vec,
371+ &min_vec,
372+ &max_vec);
368373 l_store += (vtype::numlanes - amount_ge_pivot);
369374 r_store -= amount_ge_pivot;
370375 }
0 commit comments