From d372e048a45e80d47bf6058f0ebbd974123b95f8 Mon Sep 17 00:00:00 2001
From: Soft Lattice <softlatticegames@gmail.com>
Date: Wed, 11 Mar 2026 17:34:35 -0400
Subject: [PATCH] Optimized copy shader bank read/writes

---
 .../renderer_rd/shaders/effects/copy.glsl     | 167 +++++++++++-------
 1 file changed, 103 insertions(+), 64 deletions(-)

diff --git a/servers/rendering/renderer_rd/shaders/effects/copy.glsl b/servers/rendering/renderer_rd/shaders/effects/copy.glsl
index e831b12cd7af..94e71d918933 100644
--- a/servers/rendering/renderer_rd/shaders/effects/copy.glsl
+++ b/servers/rendering/renderer_rd/shaders/effects/copy.glsl
@@ -4,6 +4,8 @@
 
 #VERSION_DEFINES
 
+#extension GL_KHR_shader_subgroup_basic : enable
+
 #include "../oct_inc.glsl"
 
 layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
@@ -95,94 +97,131 @@ void main() {
 
 #ifdef MODE_GAUSSIAN_BLUR
 
+#ifdef MODE_GLOW
+	const vec3 tonemap_col = vec3(0.299, 0.587, 0.114) / max(params.glow_luminance_cap, 6.0);
+#endif
+
+	const uint num_subgroups = gl_NumSubgroups;
+	const uint subgroup_size = (gl_WorkGroupSize.x * gl_WorkGroupSize.y) / num_subgroups;
+
 	// First pass copy texture into 16x16 local memory for every 8x8 thread block
-	vec2 quad_center_uv = clamp(vec2(params.section.xy + gl_GlobalInvocationID.xy + gl_LocalInvocationID.xy - 3.5) / params.section.zw, vec2(0.5 / params.section.zw), vec2(1.0 - 1.5 / params.section.zw));
-	uint dest_index = gl_LocalInvocationID.x * 2 + gl_LocalInvocationID.y * 2 * 16;
 
-	local_cache[dest_index] = textureLod(source_color, quad_center_uv, 0);
-	local_cache[dest_index + 1] = textureLod(source_color, quad_center_uv + vec2(1.0 / params.section.z, 0.0), 0);
-	local_cache[dest_index + 16] = textureLod(source_color, quad_center_uv + vec2(0.0, 1.0 / params.section.w), 0);
-	local_cache[dest_index + 16 + 1] = textureLod(source_color, quad_center_uv + vec2(1.0 / params.section.zw), 0);
+	// To avoid bank conflicts, linear index "i" in the 16x16 grid will be placed at
+	// i_write according to the equation:
+	// i_write = i ^ ((i & shuffle_mask) >> 1)
+
+	// Compute optimal shuffle mask for the number of subgroups
+	const uint shuffle_mask = (0x70u / num_subgroups) & 0x70u;
+
+	const uvec2 group_top_left = gl_WorkGroupID.xy * gl_WorkGroupSize.xy;
+	const uint linear_write_offset = gl_SubgroupInvocationID + gl_SubgroupID * ((16u * 16u) / num_subgroups);
+
+// Each subgroup fetches contiguous memory in the 16x16 block
+#pragma unroll 4u
+	for (uint b = 0u; b < 4u; b++) {
+		// Compute the linear offset of the work item
+		const uint linear_index = linear_write_offset + (b * subgroup_size);
+		// Extract (x,y) coordinate of sub block
+		const uint xi = linear_index & 0xfu;
+		const uint yi = linear_index >> 4u;
+		// Fetch pixel value
+		const vec2 fetch_uv = clamp(
+				vec2(params.section.xy + group_top_left + vec2(xi, yi) - 3.5) / params.section.zw,
+				vec2(0.5 / params.section.zw), vec2(1.0 - 0.5 / params.section.zw));
+
+		// Shuffle write index to avoid bank conflicts during horizontal blur pass
+		const uint store_index = linear_index ^ ((linear_index & shuffle_mask) >> 1u);
+		vec4 color = textureLod(source_color, fetch_uv, 0.);
 
 #ifdef MODE_GLOW
-	if (bool(params.flags & FLAG_GLOW_FIRST_PASS)) {
 		// Tonemap initial samples to reduce weight of fireflies: https://graphicrants.blogspot.com/2013/12/tone-mapping.html
-		vec3 tonemap_col = vec3(0.299, 0.587, 0.114) / max(params.glow_luminance_cap, 6.0);
-		local_cache[dest_index] /= 1.0 + dot(local_cache[dest_index].rgb, tonemap_col);
-		local_cache[dest_index + 1] /= 1.0 + dot(local_cache[dest_index + 1].rgb, tonemap_col);
-		local_cache[dest_index + 16] /= 1.0 + dot(local_cache[dest_index + 16].rgb, tonemap_col);
-		local_cache[dest_index + 16 + 1] /= 1.0 + dot(local_cache[dest_index + 16 + 1].rgb, tonemap_col);
+		color = bool(params.flags & FLAG_GLOW_FIRST_PASS) ? color / (1.0 + dot(color.rgb, tonemap_col)) : color;
+#endif // MODE_GLOW
+
+		// Store in shuffled index
+		local_cache[store_index] = color;
 	}
-	const float kernel[5] = { 0.2024, 0.1790, 0.1240, 0.0672, 0.0285 };
+
+#ifdef MODE_GLOW
+#define KERNEL_LENGTH 9u
+	const uint kernel_offset = 0u;
+	const float kernel[9] = { 0.0285, 0.0672, 0.1240, 0.1790, 0.2024, 0.1790, 0.1240, 0.0672, 0.0285 };
 #else
-	// Simpler blur uses SIGMA2 for the gaussian kernel for a stronger effect.
-	const float kernel[4] = { 0.214607, 0.189879, 0.131514, 0.071303 };
+// Simpler blur uses SIGMA2 for the gaussian kernel for a stronger effect.
+#define KERNEL_LENGTH 7u
+	const uint kernel_offset = 1u;
+	const float kernel[7] = { 0.071303, 0.131514, 0.189879, 0.214607, 0.189879, 0.131514, 0.071303 };
 #endif
-	memoryBarrierShared();
-	barrier();
-
-	// Horizontal pass. Needs to copy into 8x16 chunk of local memory so vertical pass has full resolution
-	uint read_index = gl_LocalInvocationID.x + gl_LocalInvocationID.y * 32 + 4;
-	vec4 color_top = vec4(0.0);
-	color_top += local_cache[read_index] * kernel[0];
-	color_top += local_cache[read_index + 1] * kernel[1];
-	color_top += local_cache[read_index + 2] * kernel[2];
-	color_top += local_cache[read_index + 3] * kernel[3];
-	color_top += local_cache[read_index - 1] * kernel[1];
-	color_top += local_cache[read_index - 2] * kernel[2];
-	color_top += local_cache[read_index - 3] * kernel[3];
-#ifdef MODE_GLOW
-	color_top += local_cache[read_index + 4] * kernel[4];
-	color_top += local_cache[read_index - 4] * kernel[4];
-#endif // MODE_GLOW
 
-	vec4 color_bottom = vec4(0.0);
-	color_bottom += local_cache[read_index + 16] * kernel[0];
-	color_bottom += local_cache[read_index + 1 + 16] * kernel[1];
-	color_bottom += local_cache[read_index + 2 + 16] * kernel[2];
-	color_bottom += local_cache[read_index + 3 + 16] * kernel[3];
-	color_bottom += local_cache[read_index - 1 + 16] * kernel[1];
-	color_bottom += local_cache[read_index - 2 + 16] * kernel[2];
-	color_bottom += local_cache[read_index - 3 + 16] * kernel[3];
-#ifdef MODE_GLOW
-	color_bottom += local_cache[read_index + 4 + 16] * kernel[4];
-	color_bottom += local_cache[read_index - 4 + 16] * kernel[4];
-#endif // MODE_GLOW
+	// A workgroup-wide barrier is only needed before the horizontal pass if a subgroup fetches fewer than 2 rows; otherwise each subgroup reads back only rows it fetched itself
+	if (subgroup_size < 8u) {
+		barrier();
+	} else {
+		subgroupBarrier();
+	}
+
+	// Linear index of first computed element in output 16x8 temp_cache (all kernels start on "left")
+	const uint linear_start_0 = gl_SubgroupInvocationID + gl_SubgroupID * (2u * subgroup_size);
 
-	// rotate samples to take advantage of cache coherency
-	uint write_index = gl_LocalInvocationID.y * 2 + gl_LocalInvocationID.x * 16;
+	vec4 color_0 = vec4(0.);
+	// Map the 8-wide linear index onto the 16-wide local_cache: shift the row bits (bit 3 and up) left by one and keep the column bits
+	const uint start_0 = ((linear_start_0 & 0xf8u) << 1u) + (linear_start_0 & 0x7u) + kernel_offset;
 
-	temp_cache[write_index] = color_top;
-	temp_cache[write_index + 1] = color_bottom;
+#pragma unroll KERNEL_LENGTH
+	for (uint k = 0u; k < KERNEL_LENGTH; k++) {
+		const uint linear_index = start_0 + k;
+		// Shuffle linear index to get stored location
+		const uint read_index = linear_index ^ ((linear_index & shuffle_mask) >> 1u);
+		// Accumulate horizontal pass
+		color_0 += local_cache[read_index] * kernel[k];
+	}
+
+	// Stride by subgroup size
+	const uint linear_start_1 = linear_start_0 + subgroup_size;
+	vec4 color_1 = vec4(0.);
+	// Same 8-wide to 16-wide row remapping for the second output element
+	const uint start_1 = ((linear_start_1 & 0xf8u) << 1u) + (linear_start_1 & 0x7u) + kernel_offset;
+
+#pragma unroll KERNEL_LENGTH
+	for (uint k = 0u; k < KERNEL_LENGTH; k++) {
+		const uint linear_index = start_1 + k;
+		// Shuffle linear index to get stored location
+		const uint read_index = linear_index ^ ((linear_index & shuffle_mask) >> 1u);
+		// Accumulate second horizontal pass
+		color_1 += local_cache[read_index] * kernel[k];
+	}
 
-	memoryBarrierShared();
-	barrier();
+	// Store values at linear 16x8 position
+	// Memory is stored and fetched contiguously within subgroups, no risk of bank conflicts
+	temp_cache[linear_start_0] = color_0;
+	temp_cache[linear_start_1] = color_1;
+
+	// Only need to wait on vertical pass if more than 1 subgroup is present
+	if (num_subgroups > 1u) {
+		barrier();
+	} else {
+		subgroupBarrier();
+	}
 
 	// If destination outside of texture, can stop doing work now
 	if (any(greaterThanEqual(pos, params.section.zw))) {
 		return;
 	}
 
-	// Vertical pass
-	uint index = gl_LocalInvocationID.y + gl_LocalInvocationID.x * 16 + 4;
+	// Vertical pass memory is already contiguous
+	const uint result_start_index = gl_LocalInvocationID.x + (gl_LocalInvocationID.y + kernel_offset) * 8u;
 	vec4 color = vec4(0.0);
 
-	color += temp_cache[index] * kernel[0];
-	color += temp_cache[index + 1] * kernel[1];
-	color += temp_cache[index + 2] * kernel[2];
-	color += temp_cache[index + 3] * kernel[3];
-	color += temp_cache[index - 1] * kernel[1];
-	color += temp_cache[index - 2] * kernel[2];
-	color += temp_cache[index - 3] * kernel[3];
-#ifdef MODE_GLOW
-	color += temp_cache[index + 4] * kernel[4];
-	color += temp_cache[index - 4] * kernel[4];
-#endif // MODE_GLOW
+// Vertical pass: each invocation accumulates KERNEL_LENGTH consecutive rows of the 16x8 temp_cache (row stride 8)
+#pragma unroll KERNEL_LENGTH
+	for (uint k = 0u; k < KERNEL_LENGTH; k++) {
+		color += temp_cache[result_start_index + 8u * k] * kernel[k];
+	}
 
 #ifdef MODE_GLOW
 	if (bool(params.flags & FLAG_GLOW_FIRST_PASS)) {
 		// Undo tonemap to restore range: https://graphicrants.blogspot.com/2013/12/tone-mapping.html
-		color /= 1.0 - dot(color.rgb, vec3(0.299, 0.587, 0.114) / max(params.glow_luminance_cap, 6.0));
+		color /= 1.0 - dot(color.rgb, tonemap_col);
 	}
 
 	color *= params.glow_strength;