From ffa6fb4fc18ae047b8a8e91dbab83f8b3da5ce52 Mon Sep 17 00:00:00 2001 From: "Wesierski, Lukasz" Date: Thu, 27 Jun 2024 20:24:27 +0000 Subject: [PATCH] Fix for reduction For some work-group sizes, the current implementation of reduction does not work correctly. The implementation in the repo assumes that all work-items will be executed, even if the work-group size isn't a multiple of the SIMD width. This change restores the SLM+barrier for the final calculation of the reduction (at the cost of a performance degradation). TODO: Remove the SLM+barrier and force execution of the whole reduction built-in function with NoMask at the asm level. (cherry picked from commit c147d7f520f2a021663d913f69438bbe0366c464) --- IGC/BiFModule/Implementation/group.cl | 58 +++++++++++++++------------ 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/IGC/BiFModule/Implementation/group.cl b/IGC/BiFModule/Implementation/group.cl index 1e64eded4cad..49efcf1034d6 100644 --- a/IGC/BiFModule/Implementation/group.cl +++ b/IGC/BiFModule/Implementation/group.cl @@ -2484,33 +2484,41 @@ type __builtin_IB_WorkGroupReduce_##func##_##type_abbr(type X) } \ SPIRV_BUILTIN(ControlBarrier, _i32_i32_i32, )(Workgroup, 0, AcquireRelease | WorkgroupMemory); \ \ - type low_data; \ - type high_data; \ - type reduce; \ - if (sg_size == 32) /* SIMD32 */ \ + if(sg_id == 0) \ { \ - low_data = sg_lid < values_num ? scratch[sg_lid] : identity; \ - high_data = sg_lid + 32 < values_num ? scratch[sg_lid + 32] : identity; \ - /* 64 (from 64) elements reduces to 32 */ \ - reduce = op(low_data, high_data); \ - } \ - else if(sg_size == 16) /* SIMD16 */ \ - { \ - low_data = sg_lid < values_num ? scratch[sg_lid] : identity; \ - type mid_low_data = sg_lid + 16 < values_num ? scratch[sg_lid + 16] : identity; \ - type mid_high_data = sg_lid + 32 < values_num ? scratch[sg_lid + 32] : identity; \ - high_data = sg_lid + 32 + 16 < values_num ? 
scratch[sg_lid + 32 + 16] : identity; \ - /* 32 first part (from 64) elements reduces to 16 */ \ - low_data = op(low_data, mid_low_data); \ - /* 32 second part (from 64) elements reduces to 16 */ \ - high_data = op(mid_high_data, high_data); \ - /* 64 (from 64) elements reduces to 16 */ \ - reduce = op(low_data, high_data); \ - } \ - /* SIMD8 is not available on PVC */ \ + type low_data; \ + type high_data; \ + type reduce; \ \ - sg_x = SPIRV_BUILTIN(Group##func, _i32_i32_##type_abbr, )(Subgroup, GroupOperationReduce, reduce); \ - return sg_x; \ + if (sg_size == 32) /* SIMD32 */ \ + { \ + low_data = sg_lid < values_num ? scratch[sg_lid] : identity; \ + high_data = sg_lid + 32 < values_num ? scratch[sg_lid + 32] : identity; \ + /* 64 (from 64) elements reduces to 32 */ \ + reduce = op(low_data, high_data); \ + } \ + else if(sg_size == 16) /* SIMD16 */ \ + { \ + low_data = sg_lid < values_num ? scratch[sg_lid] : identity; \ + type mid_low_data = sg_lid + 16 < values_num ? scratch[sg_lid + 16] : identity; \ + type mid_high_data = sg_lid + 32 < values_num ? scratch[sg_lid + 32] : identity; \ + high_data = sg_lid + 32 + 16 < values_num ? scratch[sg_lid + 32 + 16] : identity; \ + /* 32 first part (from 64) elements reduces to 16 */ \ + low_data = op(low_data, mid_low_data); \ + /* 32 second part (from 64) elements reduces to 16 */ \ + high_data = op(mid_high_data, high_data); \ + /* 64 (from 64) elements reduces to 16 */ \ + reduce = op(low_data, high_data); \ + } \ + /* SIMD8 is not available on PVC */ \ + \ + sg_x = SPIRV_BUILTIN(Group##func, _i32_i32_##type_abbr, )(Subgroup, GroupOperationReduce, reduce); \ + if (sg_lid == 0) { \ + scratch[0] = sg_x; \ + } \ + } \ + SPIRV_BUILTIN(ControlBarrier, _i32_i32_i32, )(Workgroup, 0, AcquireRelease | WorkgroupMemory); \ + return scratch[0]; \ } \ } \ else \