From d372e048a45e80d47bf6058f0ebbd974123b95f8 Mon Sep 17 00:00:00 2001 From: Soft Lattice Date: Wed, 11 Mar 2026 17:34:35 -0400 Subject: [PATCH] Optimized copy shader bank read/writes --- .../renderer_rd/shaders/effects/copy.glsl | 167 +++++++++++------- 1 file changed, 103 insertions(+), 64 deletions(-) diff --git a/servers/rendering/renderer_rd/shaders/effects/copy.glsl b/servers/rendering/renderer_rd/shaders/effects/copy.glsl index e831b12cd7af..94e71d918933 100644 --- a/servers/rendering/renderer_rd/shaders/effects/copy.glsl +++ b/servers/rendering/renderer_rd/shaders/effects/copy.glsl @@ -4,6 +4,8 @@ #VERSION_DEFINES +#extension GL_KHR_shader_subgroup_ballot : enable + #include "../oct_inc.glsl" layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in; @@ -95,94 +97,131 @@ void main() { #ifdef MODE_GAUSSIAN_BLUR +#ifdef MODE_GLOW + const vec3 tonemap_col = vec3(0.299, 0.587, 0.114) / max(params.glow_luminance_cap, 6.0); +#endif + + const uint num_subgroups = gl_NumSubgroups; + const uint subgroup_size = (gl_WorkGroupSize.x * gl_WorkGroupSize.y) / num_subgroups; + // First pass copy texture into 16x16 local memory for every 8x8 thread block - vec2 quad_center_uv = clamp(vec2(params.section.xy + gl_GlobalInvocationID.xy + gl_LocalInvocationID.xy - 3.5) / params.section.zw, vec2(0.5 / params.section.zw), vec2(1.0 - 1.5 / params.section.zw)); - uint dest_index = gl_LocalInvocationID.x * 2 + gl_LocalInvocationID.y * 2 * 16; - local_cache[dest_index] = textureLod(source_color, quad_center_uv, 0); - local_cache[dest_index + 1] = textureLod(source_color, quad_center_uv + vec2(1.0 / params.section.z, 0.0), 0); - local_cache[dest_index + 16] = textureLod(source_color, quad_center_uv + vec2(0.0, 1.0 / params.section.w), 0); - local_cache[dest_index + 16 + 1] = textureLod(source_color, quad_center_uv + vec2(1.0 / params.section.zw), 0); + // To avoid bank conflicts, linear index "i" in the 16x16 grid will be placed at + // i_write according to the equation: + // i_write = i ^ ((i & shuffle_mask) >> 1) + + // Compute optimal shuffle mask for the number of subgroups + const uint shuffle_mask = (0x70u / num_subgroups) & 0x70u; + + const uvec2 group_top_left = gl_WorkGroupID.xy * gl_WorkGroupSize.xy; + const uint linear_write_offset = gl_SubgroupInvocationID + gl_SubgroupID * ((16u * 16u) / num_subgroups); + +// Each subgroup fetches contiguous memory in the 16x16 block +#pragma unroll 4u + for (uint b = 0u; b < 4u; b++) { + // Compute the linear offset of the work item + const uint linear_index = linear_write_offset + (b * subgroup_size); + // Extract (x,y) coordinate of sub block + const uint xi = linear_index & 0xfu; + const uint yi = linear_index >> 4u; + // Fetch pixel value + const vec2 fetch_uv = clamp( + vec2(params.section.xy + group_top_left + vec2(xi, yi) - 3.5) / params.section.zw, + vec2(0.5 / params.section.zw), vec2(1.0 - 0.5 / params.section.zw)); + + // Shuffle write index to avoid bank conflicts during horizontal blur pass + const uint store_index = linear_index ^ ((linear_index & shuffle_mask) >> 1u); + vec4 color = textureLod(source_color, fetch_uv, 0.); #ifdef MODE_GLOW - if (bool(params.flags & FLAG_GLOW_FIRST_PASS)) { // Tonemap initial samples to reduce weight of fireflies: https://graphicrants.blogspot.com/2013/12/tone-mapping.html - vec3 tonemap_col = vec3(0.299, 0.587, 0.114) / max(params.glow_luminance_cap, 6.0); - local_cache[dest_index] /= 1.0 + dot(local_cache[dest_index].rgb, tonemap_col); - local_cache[dest_index + 1] /= 1.0 + dot(local_cache[dest_index + 1].rgb, tonemap_col); - local_cache[dest_index + 16] /= 1.0 + dot(local_cache[dest_index + 16].rgb, tonemap_col); - local_cache[dest_index + 16 + 1] /= 1.0 + dot(local_cache[dest_index + 16 + 1].rgb, tonemap_col); + color = bool(params.flags & FLAG_GLOW_FIRST_PASS) ? color / (1.0 + dot(color.rgb, tonemap_col)) : color; +#endif // MODE_GLOW + + // Store in shuffled index + local_cache[store_index] = color; } - const float kernel[5] = { 0.2024, 0.1790, 0.1240, 0.0672, 0.0285 }; + +#ifdef MODE_GLOW +#define KERNEL_LENGTH 9u + const uint kernel_offset = 0u; + const float kernel[9] = { 0.0285, 0.0672, 0.1240, 0.1790, 0.2024, 0.1790, 0.1240, 0.0672, 0.0285 }; #else - // Simpler blur uses SIGMA2 for the gaussian kernel for a stronger effect. - const float kernel[4] = { 0.214607, 0.189879, 0.131514, 0.071303 }; +// Simpler blur uses SIGMA2 for the gaussian kernel for a stronger effect. +#define KERNEL_LENGTH 7u + const uint kernel_offset = 1u; + const float kernel[7] = { 0.071303, 0.131514, 0.189879, 0.214607, 0.189879, 0.131514, 0.071303 }; #endif - memoryBarrierShared(); - barrier(); - - // Horizontal pass. Needs to copy into 8x16 chunk of local memory so vertical pass has full resolution - uint read_index = gl_LocalInvocationID.x + gl_LocalInvocationID.y * 32 + 4; - vec4 color_top = vec4(0.0); - color_top += local_cache[read_index] * kernel[0]; - color_top += local_cache[read_index + 1] * kernel[1]; - color_top += local_cache[read_index + 2] * kernel[2]; - color_top += local_cache[read_index + 3] * kernel[3]; - color_top += local_cache[read_index - 1] * kernel[1]; - color_top += local_cache[read_index - 2] * kernel[2]; - color_top += local_cache[read_index - 3] * kernel[3]; -#ifdef MODE_GLOW - color_top += local_cache[read_index + 4] * kernel[4]; - color_top += local_cache[read_index - 4] * kernel[4]; -#endif // MODE_GLOW - vec4 color_bottom = vec4(0.0); - color_bottom += local_cache[read_index + 16] * kernel[0]; - color_bottom += local_cache[read_index + 1 + 16] * kernel[1]; - color_bottom += local_cache[read_index + 2 + 16] * kernel[2]; - color_bottom += local_cache[read_index + 3 + 16] * kernel[3]; - color_bottom += local_cache[read_index - 1 + 16] * kernel[1]; - color_bottom += local_cache[read_index - 2 + 16] * kernel[2]; - color_bottom += local_cache[read_index - 3 + 16] * kernel[3]; -#ifdef MODE_GLOW - color_bottom += local_cache[read_index + 4 + 16] * kernel[4]; - color_bottom += local_cache[read_index - 4 + 16] * kernel[4]; -#endif // MODE_GLOW + // Only need to wait on horizontal pass if subgroups fetch less than 2 rows + if (subgroup_size < 8u) { + barrier(); + } else { + subgroupBarrier(); + } + + // Linear index of first computed element in output 16x8 temp_cache (all kernels start on "left") + const uint linear_start_0 = gl_SubgroupInvocationID + gl_SubgroupID * (2u * subgroup_size); - // rotate samples to take advantage of cache coherency - uint write_index = gl_LocalInvocationID.y * 2 + gl_LocalInvocationID.x * 16; + vec4 color_0 = vec4(0.); + // Compute corresponding 16x8 position in the 16x16 local_cache by promoting index at 8-bit + const uint start_0 = ((linear_start_0 & 0xf8u) << 1u) + (linear_start_0 & 0x7u) + kernel_offset; - temp_cache[write_index] = color_top; - temp_cache[write_index + 1] = color_bottom; +#pragma unroll KERNEL_LENGTH + for (uint k = 0u; k < KERNEL_LENGTH; k++) { + const uint linear_index = start_0 + k; + // Shuffle linear index to get stored location + const uint read_index = linear_index ^ ((linear_index & shuffle_mask) >> 1u); + // Accumulate horizontal pass + color_0 += local_cache[read_index] * kernel[k]; + } + + // Stride by subgroup size + const uint linear_start_1 = linear_start_0 + subgroup_size; + vec4 color_1 = vec4(0.); + // Promote 8-bit for second pass + const uint start_1 = ((linear_start_1 & 0xf8u) << 1u) + (linear_start_1 & 0x7u) + kernel_offset; + +#pragma unroll KERNEL_LENGTH + for (uint k = 0u; k < KERNEL_LENGTH; k++) { + const uint linear_index = start_1 + k; + // Shuffle linear index to get stored location + const uint read_index = linear_index ^ ((linear_index & shuffle_mask) >> 1u); + // Accumulate second horizontal pass + color_1 += local_cache[read_index] * kernel[k]; + } - memoryBarrierShared(); - barrier(); + // Store values at linear 16x8 position + // Memory is stored and fetched contiguously within subgroups, no risk of bank conflicts + temp_cache[linear_start_0] = color_0; + temp_cache[linear_start_1] = color_1; + + // Only need to wait on vertical pass if more than 1 subgroup is present + if (num_subgroups > 1u) { + barrier(); + } else { + subgroupBarrier(); + } // If destination outside of texture, can stop doing work now if (any(greaterThanEqual(pos, params.section.zw))) { return; } - // Vertical pass - uint index = gl_LocalInvocationID.y + gl_LocalInvocationID.x * 16 + 4; + // Vertical pass memory is already contiguous + const uint result_start_index = gl_LocalInvocationID.x + (gl_LocalInvocationID.y + kernel_offset) * 8u; vec4 color = vec4(0.0); - color += temp_cache[index] * kernel[0]; - color += temp_cache[index + 1] * kernel[1]; - color += temp_cache[index + 2] * kernel[2]; - color += temp_cache[index + 3] * kernel[3]; - color += temp_cache[index - 1] * kernel[1]; - color += temp_cache[index - 2] * kernel[2]; - color += temp_cache[index - 3] * kernel[3]; -#ifdef MODE_GLOW - color += temp_cache[index + 4] * kernel[4]; - color += temp_cache[index - 4] * kernel[4]; -#endif // MODE_GLOW +// Compute the vertical pass for the 16x8 elements +#pragma unroll KERNEL_LENGTH + for (uint k = 0; k < KERNEL_LENGTH; k++) { + color += temp_cache[result_start_index + 8u * k] * kernel[k]; + } #ifdef MODE_GLOW if (bool(params.flags & FLAG_GLOW_FIRST_PASS)) { // Undo tonemap to restore range: https://graphicrants.blogspot.com/2013/12/tone-mapping.html - color /= 1.0 - dot(color.rgb, vec3(0.299, 0.587, 0.114) / max(params.glow_luminance_cap, 6.0)); + color /= 1.0 - dot(color.rgb, tonemap_col); } color *= params.glow_strength;