Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
167 changes: 103 additions & 64 deletions servers/rendering/renderer_rd/shaders/effects/copy.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

#VERSION_DEFINES

#extension GL_KHR_shader_subgroup_ballot : enable

#include "../oct_inc.glsl"

layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
Expand Down Expand Up @@ -95,94 +97,131 @@ void main() {

#ifdef MODE_GAUSSIAN_BLUR

#ifdef MODE_GLOW
const vec3 tonemap_col = vec3(0.299, 0.587, 0.114) / max(params.glow_luminance_cap, 6.0);
#endif

const uint num_subgroups = gl_NumSubgroups;
const uint subgroup_size = (gl_WorkGroupSize.x * gl_WorkGroupSize.y) / num_subgroups;

// First pass copy texture into 16x16 local memory for every 8x8 thread block
vec2 quad_center_uv = clamp(vec2(params.section.xy + gl_GlobalInvocationID.xy + gl_LocalInvocationID.xy - 3.5) / params.section.zw, vec2(0.5 / params.section.zw), vec2(1.0 - 1.5 / params.section.zw));
uint dest_index = gl_LocalInvocationID.x * 2 + gl_LocalInvocationID.y * 2 * 16;

local_cache[dest_index] = textureLod(source_color, quad_center_uv, 0);
local_cache[dest_index + 1] = textureLod(source_color, quad_center_uv + vec2(1.0 / params.section.z, 0.0), 0);
local_cache[dest_index + 16] = textureLod(source_color, quad_center_uv + vec2(0.0, 1.0 / params.section.w), 0);
local_cache[dest_index + 16 + 1] = textureLod(source_color, quad_center_uv + vec2(1.0 / params.section.zw), 0);
// To avoid bank conflicts, linear index "i" in the 16x16 grid will be placed at
// i_write according to the equation:
// i_write = i ^ ((i & shuffle_mask) >> 1)

// Compute optimal shuffle mask for the number of subgroups
const uint shuffle_mask = (0x70u / num_subgroups) & 0x70u;

const uvec2 group_top_left = gl_WorkGroupID.xy * gl_WorkGroupSize.xy;
const uint linear_write_offset = gl_SubgroupInvocationID + gl_SubgroupID * ((16u * 16u) / num_subgroups);

// Each subgroup fetches contiguous memory in the 16x16 block
#pragma unroll 4u
for (uint b = 0u; b < 4u; b++) {
// Compute the linear offset of the work item
const uint linear_index = linear_write_offset + (b * subgroup_size);
// Extract (x,y) coordinate of sub block
const uint xi = linear_index & 0xfu;
const uint yi = linear_index >> 4u;
// Fetch pixel value
const vec2 fetch_uv = clamp(
vec2(params.section.xy + group_top_left + vec2(xi, yi) - 3.5) / params.section.zw,
vec2(0.5 / params.section.zw), vec2(1.0 - 0.5 / params.section.zw));

// Shuffle write index to avoid bank conflicts during horizontal blur pass
const uint store_index = linear_index ^ ((linear_index & shuffle_mask) >> 1u);
vec4 color = textureLod(source_color, fetch_uv, 0.);

#ifdef MODE_GLOW
if (bool(params.flags & FLAG_GLOW_FIRST_PASS)) {
// Tonemap initial samples to reduce weight of fireflies: https://graphicrants.blogspot.com/2013/12/tone-mapping.html
vec3 tonemap_col = vec3(0.299, 0.587, 0.114) / max(params.glow_luminance_cap, 6.0);
local_cache[dest_index] /= 1.0 + dot(local_cache[dest_index].rgb, tonemap_col);
local_cache[dest_index + 1] /= 1.0 + dot(local_cache[dest_index + 1].rgb, tonemap_col);
local_cache[dest_index + 16] /= 1.0 + dot(local_cache[dest_index + 16].rgb, tonemap_col);
local_cache[dest_index + 16 + 1] /= 1.0 + dot(local_cache[dest_index + 16 + 1].rgb, tonemap_col);
color = bool(params.flags & FLAG_GLOW_FIRST_PASS) ? color / (1.0 + dot(color.rgb, tonemap_col)) : color;
#endif // MODE_GLOW

// Store in shuffled index
local_cache[store_index] = color;
}
const float kernel[5] = { 0.2024, 0.1790, 0.1240, 0.0672, 0.0285 };

#ifdef MODE_GLOW
#define KERNEL_LENGTH 9u
const uint kernel_offset = 0u;
const float kernel[9] = { 0.0285, 0.0672, 0.1240, 0.1790, 0.2024, 0.1790, 0.1240, 0.0672, 0.0285 };
#else
// Simpler blur uses SIGMA2 for the gaussian kernel for a stronger effect.
const float kernel[4] = { 0.214607, 0.189879, 0.131514, 0.071303 };
// Simpler blur uses SIGMA2 for the gaussian kernel for a stronger effect.
#define KERNEL_LENGTH 7u
const uint kernel_offset = 1u;
const float kernel[7] = { 0.071303, 0.131514, 0.189879, 0.214607, 0.189879, 0.131514, 0.071303 };
#endif
memoryBarrierShared();
barrier();

// Horizontal pass. Needs to copy into 8x16 chunk of local memory so vertical pass has full resolution
uint read_index = gl_LocalInvocationID.x + gl_LocalInvocationID.y * 32 + 4;
vec4 color_top = vec4(0.0);
color_top += local_cache[read_index] * kernel[0];
color_top += local_cache[read_index + 1] * kernel[1];
color_top += local_cache[read_index + 2] * kernel[2];
color_top += local_cache[read_index + 3] * kernel[3];
color_top += local_cache[read_index - 1] * kernel[1];
color_top += local_cache[read_index - 2] * kernel[2];
color_top += local_cache[read_index - 3] * kernel[3];
#ifdef MODE_GLOW
color_top += local_cache[read_index + 4] * kernel[4];
color_top += local_cache[read_index - 4] * kernel[4];
#endif // MODE_GLOW

vec4 color_bottom = vec4(0.0);
color_bottom += local_cache[read_index + 16] * kernel[0];
color_bottom += local_cache[read_index + 1 + 16] * kernel[1];
color_bottom += local_cache[read_index + 2 + 16] * kernel[2];
color_bottom += local_cache[read_index + 3 + 16] * kernel[3];
color_bottom += local_cache[read_index - 1 + 16] * kernel[1];
color_bottom += local_cache[read_index - 2 + 16] * kernel[2];
color_bottom += local_cache[read_index - 3 + 16] * kernel[3];
#ifdef MODE_GLOW
color_bottom += local_cache[read_index + 4 + 16] * kernel[4];
color_bottom += local_cache[read_index - 4 + 16] * kernel[4];
#endif // MODE_GLOW
// A workgroup-wide barrier is only needed before the horizontal pass when a subgroup
// fetches fewer than 2 rows (subgroup_size < 8); otherwise a subgroup barrier is used.
if (subgroup_size < 8u) {
barrier();
} else {
subgroupBarrier();
}

// Linear index of first computed element in output 16x8 temp_cache (all kernels start on "left")
const uint linear_start_0 = gl_SubgroupInvocationID + gl_SubgroupID * (2u * subgroup_size);

// rotate samples to take advantage of cache coherency
uint write_index = gl_LocalInvocationID.y * 2 + gl_LocalInvocationID.x * 16;
vec4 color_0 = vec4(0.);
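// Map the linear index in the 8-wide layout to the 16-wide local_cache: bits 3 and up (the row)
// are shifted left by one so the row stride becomes 16, the low 3 bits (the column) are kept,
// and kernel_offset is added.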
const uint start_0 = ((linear_start_0 & 0xf8u) << 1u) + (linear_start_0 & 0x7u) + kernel_offset;

temp_cache[write_index] = color_top;
temp_cache[write_index + 1] = color_bottom;
#pragma unroll KERNEL_LENGTH
for (uint k = 0u; k < KERNEL_LENGTH; k++) {
const uint linear_index = start_0 + k;
// Shuffle linear index to get stored location
const uint read_index = linear_index ^ ((linear_index & shuffle_mask) >> 1u);
// Accumulate horizontal pass
color_0 += local_cache[read_index] * kernel[k];
}

// Stride by subgroup size
const uint linear_start_1 = linear_start_0 + subgroup_size;
vec4 color_1 = vec4(0.);
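// Same mapping for the second element: shift bits 3 and up left by one (row stride 8 -> 16),
// keep the low 3 bits, and add kernel_offset.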
const uint start_1 = ((linear_start_1 & 0xf8u) << 1u) + (linear_start_1 & 0x7u) + kernel_offset;

#pragma unroll KERNEL_LENGTH
for (uint k = 0u; k < KERNEL_LENGTH; k++) {
const uint linear_index = start_1 + k;
// Shuffle linear index to get stored location
const uint read_index = linear_index ^ ((linear_index & shuffle_mask) >> 1u);
// Accumulate second horizontal pass
color_1 += local_cache[read_index] * kernel[k];
}

memoryBarrierShared();
barrier();
// Store values at linear 16x8 position
// Memory is stored and fetched contiguously within subgroups, no risk of bank conflicts
temp_cache[linear_start_0] = color_0;
temp_cache[linear_start_1] = color_1;

// Only need to wait on vertical pass if more than 1 subgroup is present
if (num_subgroups > 1u) {
barrier();
} else {
subgroupBarrier();
}

// If destination outside of texture, can stop doing work now
if (any(greaterThanEqual(pos, params.section.zw))) {
return;
}

// Vertical pass
uint index = gl_LocalInvocationID.y + gl_LocalInvocationID.x * 16 + 4;
// Vertical pass memory is already contiguous
const uint result_start_index = gl_LocalInvocationID.x + (gl_LocalInvocationID.y + kernel_offset) * 8u;
vec4 color = vec4(0.0);

color += temp_cache[index] * kernel[0];
color += temp_cache[index + 1] * kernel[1];
color += temp_cache[index + 2] * kernel[2];
color += temp_cache[index + 3] * kernel[3];
color += temp_cache[index - 1] * kernel[1];
color += temp_cache[index - 2] * kernel[2];
color += temp_cache[index - 3] * kernel[3];
#ifdef MODE_GLOW
color += temp_cache[index + 4] * kernel[4];
color += temp_cache[index - 4] * kernel[4];
#endif // MODE_GLOW
// Compute the vertical pass for the 16x8 elements
#pragma unroll KERNEL_LENGTH
for (uint k = 0; k < KERNEL_LENGTH; k++) {
color += temp_cache[result_start_index + 8u * k] * kernel[k];
}

#ifdef MODE_GLOW
if (bool(params.flags & FLAG_GLOW_FIRST_PASS)) {
// Undo tonemap to restore range: https://graphicrants.blogspot.com/2013/12/tone-mapping.html
color /= 1.0 - dot(color.rgb, vec3(0.299, 0.587, 0.114) / max(params.glow_luminance_cap, 6.0));
color /= 1.0 - dot(color.rgb, tonemap_col);
}

color *= params.glow_strength;
Expand Down
Loading