From 24dbaa7db36527b84891dd629cdf00ac9cb6d62d Mon Sep 17 00:00:00 2001 From: Thomas Smith <68340554+b0nes164@users.noreply.github.com> Date: Mon, 9 Sep 2024 22:01:02 -0700 Subject: [PATCH 01/15] Add single pass scan. Remove tree reductions. --- vello/src/render.rs | 64 +-- vello/src/shaders.rs | 42 +- vello/src/wgpu_engine.rs | 9 - vello_encoding/src/config.rs | 9 +- vello_shaders/shader/pathtag_reduce.wgsl | 42 -- vello_shaders/shader/pathtag_reduce2.wgsl | 41 -- vello_shaders/shader/pathtag_scan.wgsl | 76 ---- vello_shaders/shader/pathtag_scan1.wgsl | 67 --- vello_shaders/shader/pathtag_scan_csdldf.wgsl | 403 ++++++++++++++++++ vello_shaders/shader/shared/pathtag.wgsl | 9 + vello_shaders/src/cpu.rs | 6 +- vello_shaders/src/cpu/pathtag_reduce.rs | 32 -- vello_shaders/src/cpu/pathtag_scan.rs | 37 -- vello_shaders/src/cpu/pathtag_scan_single.rs | 30 ++ 14 files changed, 471 insertions(+), 396 deletions(-) delete mode 100644 vello_shaders/shader/pathtag_reduce.wgsl delete mode 100644 vello_shaders/shader/pathtag_reduce2.wgsl delete mode 100644 vello_shaders/shader/pathtag_scan.wgsl delete mode 100644 vello_shaders/shader/pathtag_scan1.wgsl create mode 100644 vello_shaders/shader/pathtag_scan_csdldf.wgsl delete mode 100644 vello_shaders/src/cpu/pathtag_reduce.rs delete mode 100644 vello_shaders/src/cpu/pathtag_scan.rs create mode 100644 vello_shaders/src/cpu/pathtag_scan_single.rs diff --git a/vello/src/render.rs b/vello/src/render.rs index 2cca8e6b2..6bc0541dc 100644 --- a/vello/src/render.rs +++ b/vello/src/render.rs @@ -179,6 +179,7 @@ impl Render { packed.resize(size_of::(), u8::MAX); } let scene_buf = ResourceProxy::Buffer(recording.upload("scene", packed)); + let config_buf = ResourceProxy::Buffer( recording.upload_uniform("config", bytemuck::bytes_of(&cpu_config.gpu)), ); @@ -191,61 +192,30 @@ impl Render { let segments_buf = ResourceProxy::new_buf(buffer_sizes.segments.size_in_bytes().into(), "segments_buf"); let ptcl_buf = ResourceProxy::new_buf(buffer_sizes.ptcl.size_in_bytes().into(), "ptcl_buf"); - let reduced_buf = ResourceProxy::new_buf( - buffer_sizes.path_reduced.size_in_bytes().into(), - "reduced_buf", - ); - // TODO: really only need pathtag_wgs - 1 - recording.dispatch( - shaders.pathtag_reduce, - wg_counts.path_reduce, - [config_buf, scene_buf, reduced_buf], - ); - let mut pathtag_parent = reduced_buf; - let mut large_pathtag_bufs = None; - let use_large_path_scan = wg_counts.use_large_path_scan && !shaders.pathtag_is_cpu; - if use_large_path_scan { - let reduced2_buf = ResourceProxy::new_buf( - buffer_sizes.path_reduced2.size_in_bytes().into(), - "reduced2_buf", - ); - recording.dispatch( - shaders.pathtag_reduce2, - wg_counts.path_reduce2, - [reduced_buf, reduced2_buf], - ); - let reduced_scan_buf = ResourceProxy::new_buf( - buffer_sizes.path_reduced_scan.size_in_bytes().into(), - "reduced_scan_buf", - ); - recording.dispatch( - shaders.pathtag_scan1, - wg_counts.path_scan1, - [reduced_buf, reduced2_buf, reduced_scan_buf], - ); - pathtag_parent = reduced_scan_buf; - large_pathtag_bufs = Some((reduced2_buf, reduced_scan_buf)); - } - + let tagmonoid_buf = ResourceProxy::new_buf( buffer_sizes.path_monoids.size_in_bytes().into(), "tagmonoid_buf", ); - let pathtag_scan = if use_large_path_scan { - shaders.pathtag_scan_large - } else { - shaders.pathtag_scan - }; + let reduced_buf = BufferProxy::new( + buffer_sizes.path_reduced.size_in_bytes().into(), + "reduced_buf", + ); + let path_scan_bump_buf = BufferProxy::new( + buffer_sizes.path_scan_bump.size_in_bytes().into(), + 
"bump_buf"); + recording.clear_all(path_scan_bump_buf); + recording.clear_all(reduced_buf); + let path_scan_bump_buf = ResourceProxy::Buffer(path_scan_bump_buf); + let reduced_buf = ResourceProxy::Buffer(reduced_buf); recording.dispatch( - pathtag_scan, + shaders.pathtag_scan_csdldf, wg_counts.path_scan, - [config_buf, scene_buf, pathtag_parent, tagmonoid_buf], + [config_buf, scene_buf, reduced_buf, tagmonoid_buf, path_scan_bump_buf], ); recording.free_resource(reduced_buf); - if let Some((reduced2, reduced_scan)) = large_pathtag_bufs { - recording.free_resource(reduced2); - recording.free_resource(reduced_scan); - } + recording.free_resource(path_scan_bump_buf); + let path_bbox_buf = ResourceProxy::new_buf( buffer_sizes.path_bboxes.size_in_bytes().into(), "path_bbox_buf", diff --git a/vello/src/shaders.rs b/vello/src/shaders.rs index a58e0edfe..f4facb727 100644 --- a/vello/src/shaders.rs +++ b/vello/src/shaders.rs @@ -17,11 +17,7 @@ use crate::{ // Shaders for the full pipeline pub struct FullShaders { - pub pathtag_reduce: ShaderId, - pub pathtag_reduce2: ShaderId, - pub pathtag_scan1: ShaderId, - pub pathtag_scan: ShaderId, - pub pathtag_scan_large: ShaderId, + pub pathtag_scan_csdldf: ShaderId, pub bbox_clear: ShaderId, pub flatten: ShaderId, pub draw_reduce: ShaderId, @@ -39,9 +35,6 @@ pub struct FullShaders { pub fine_area: Option, pub fine_msaa8: Option, pub fine_msaa16: Option, - // 2-level dispatch works for CPU pathtag scan even for large - // inputs, 3-level is not yet implemented. - pub pathtag_is_cpu: bool, } #[cfg(feature = "wgpu")] @@ -101,27 +94,11 @@ pub(crate) fn full_shaders( }; } - let pathtag_reduce = add_shader!(pathtag_reduce, [Uniform, BufReadOnly, Buffer]); - let pathtag_reduce2 = add_shader!( - pathtag_reduce2, - [BufReadOnly, Buffer], - CpuShaderType::Skipped - ); - let pathtag_scan1 = add_shader!( - pathtag_scan1, - [BufReadOnly, BufReadOnly, Buffer], - CpuShaderType::Skipped - ); - let pathtag_scan = add_shader!( - pathtag_scan_small, - [Uniform, BufReadOnly, BufReadOnly, Buffer], - CpuShaderType::Present(vello_shaders::cpu::pathtag_scan) - ); - let pathtag_scan_large = add_shader!( - pathtag_scan_large, - [Uniform, BufReadOnly, BufReadOnly, Buffer], - CpuShaderType::Skipped - ); + let pathtag_scan_csdldf = add_shader!( + pathtag_scan_csdldf, + [Uniform, BufReadOnly, Buffer, Buffer, Buffer], + CpuShaderType::Present(vello_shaders::cpu::pathtag_scan_single)); + let bbox_clear = add_shader!(bbox_clear, [Uniform, Buffer]); let flatten = add_shader!( flatten, @@ -249,11 +226,7 @@ pub(crate) fn full_shaders( }; Ok(FullShaders { - pathtag_reduce, - pathtag_reduce2, - pathtag_scan, - pathtag_scan1, - pathtag_scan_large, + pathtag_scan_csdldf, bbox_clear, flatten, draw_reduce, @@ -271,6 +244,5 @@ pub(crate) fn full_shaders( fine_area, fine_msaa8, fine_msaa16, - pathtag_is_cpu: options.use_cpu, }) } diff --git a/vello/src/wgpu_engine.rs b/vello/src/wgpu_engine.rs index b10cbac5c..509088ebe 100644 --- a/vello/src/wgpu_engine.rs +++ b/vello/src/wgpu_engine.rs @@ -56,7 +56,6 @@ struct WgpuShader { pub(crate) enum CpuShaderType { Present(fn(u32, &[CpuBinding])), Missing, - Skipped, } struct CpuShader { @@ -263,14 +262,6 @@ impl WgpuEngine { label, }); } - // This shader is unused in CPU mode, create a dummy shader - CpuShaderType::Skipped => { - return add(Shader { - wgpu: None, - cpu: None, - label, - }); - } // Create a GPU shader as we don't have a CPU shader CpuShaderType::Missing => {} } diff --git a/vello_encoding/src/config.rs b/vello_encoding/src/config.rs index 
88da7fd46..6851f27d5 100644 --- a/vello_encoding/src/config.rs +++ b/vello_encoding/src/config.rs @@ -334,8 +334,7 @@ impl PartialOrd for BufferSize { pub struct BufferSizes { // Known size buffers pub path_reduced: BufferSize, - pub path_reduced2: BufferSize, - pub path_reduced_scan: BufferSize, + pub path_scan_bump: BufferSize, pub path_monoids: BufferSize, pub path_bboxes: BufferSize, pub draw_reduced: BufferSize, @@ -372,8 +371,7 @@ impl BufferSizes { path_tag_wgs }; let path_reduced = BufferSize::new(reduced_size); - let path_reduced2 = BufferSize::new(PATH_REDUCE_WG); - let path_reduced_scan = BufferSize::new(reduced_size); + let path_scan_bump = BufferSize::new(1); let path_monoids = BufferSize::new(path_tag_wgs * PATH_REDUCE_WG); let path_bboxes = BufferSize::new(n_paths); let binning_wgs = workgroups.binning.0; @@ -405,8 +403,7 @@ impl BufferSizes { let ptcl = BufferSize::new(1 << 23); Self { path_reduced, - path_reduced2, - path_reduced_scan, + path_scan_bump, path_monoids, path_bboxes, draw_reduced, diff --git a/vello_shaders/shader/pathtag_reduce.wgsl b/vello_shaders/shader/pathtag_reduce.wgsl deleted file mode 100644 index ab6e10e98..000000000 --- a/vello_shaders/shader/pathtag_reduce.wgsl +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2022 the Vello Authors -// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense - -#import config -#import pathtag - -@group(0) @binding(0) -var config: Config; - -@group(0) @binding(1) -var scene: array; - -@group(0) @binding(2) -var reduced: array; - -let LG_WG_SIZE = 8u; -let WG_SIZE = 256u; - -var sh_scratch: array; - -@compute @workgroup_size(256) -fn main( - @builtin(global_invocation_id) global_id: vec3, - @builtin(local_invocation_id) local_id: vec3, -) { - let ix = global_id.x; - let tag_word = scene[config.pathtag_base + ix]; - var agg = reduce_tag(tag_word); - sh_scratch[local_id.x] = agg; - for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) { - workgroupBarrier(); - if local_id.x + (1u << i) < WG_SIZE { - let other = sh_scratch[local_id.x + (1u << i)]; - agg = combine_tag_monoid(agg, other); - } - workgroupBarrier(); - sh_scratch[local_id.x] = agg; - } - if local_id.x == 0u { - reduced[ix >> LG_WG_SIZE] = agg; - } -} diff --git a/vello_shaders/shader/pathtag_reduce2.wgsl b/vello_shaders/shader/pathtag_reduce2.wgsl deleted file mode 100644 index eb8621f0f..000000000 --- a/vello_shaders/shader/pathtag_reduce2.wgsl +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2023 the Vello Authors -// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense - -// This shader is the second stage of reduction for the pathtag -// monoid scan, needed when the number of tags is large. 
- -#import config -#import pathtag - -@group(0) @binding(0) -var reduced_in: array; - -@group(0) @binding(1) -var reduced: array; - -let LG_WG_SIZE = 8u; -let WG_SIZE = 256u; - -var sh_scratch: array; - -@compute @workgroup_size(256) -fn main( - @builtin(global_invocation_id) global_id: vec3, - @builtin(local_invocation_id) local_id: vec3, -) { - let ix = global_id.x; - var agg = reduced_in[ix]; - sh_scratch[local_id.x] = agg; - for (var i = 0u; i < firstTrailingBit(WG_SIZE); i += 1u) { - workgroupBarrier(); - if local_id.x + (1u << i) < WG_SIZE { - let other = sh_scratch[local_id.x + (1u << i)]; - agg = combine_tag_monoid(agg, other); - } - workgroupBarrier(); - sh_scratch[local_id.x] = agg; - } - if local_id.x == 0u { - reduced[ix >> LG_WG_SIZE] = agg; - } -} diff --git a/vello_shaders/shader/pathtag_scan.wgsl b/vello_shaders/shader/pathtag_scan.wgsl deleted file mode 100644 index 27a34bdf7..000000000 --- a/vello_shaders/shader/pathtag_scan.wgsl +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright 2022 the Vello Authors -// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense - -#import config -#import pathtag - -@group(0) @binding(0) -var config: Config; - -@group(0) @binding(1) -var scene: array; - -@group(0) @binding(2) -var reduced: array; - -@group(0) @binding(3) -var tag_monoids: array; - -let LG_WG_SIZE = 8u; -let WG_SIZE = 256u; - -#ifdef small -var sh_parent: array; -#endif -// These could be combined? -var sh_monoid: array; - -@compute @workgroup_size(256) -fn main( - @builtin(global_invocation_id) global_id: vec3, - @builtin(local_invocation_id) local_id: vec3, - @builtin(workgroup_id) wg_id: vec3, -) { -#ifdef small - var agg = tag_monoid_identity(); - if local_id.x < wg_id.x { - agg = reduced[local_id.x]; - } - sh_parent[local_id.x] = agg; - for (var i = 0u; i < LG_WG_SIZE; i += 1u) { - workgroupBarrier(); - if local_id.x + (1u << i) < WG_SIZE { - let other = sh_parent[local_id.x + (1u << i)]; - agg = combine_tag_monoid(agg, other); - } - workgroupBarrier(); - sh_parent[local_id.x] = agg; - } -#endif - - let ix = global_id.x; - let tag_word = scene[config.pathtag_base + ix]; - var agg_part = reduce_tag(tag_word); - sh_monoid[local_id.x] = agg_part; - for (var i = 0u; i < LG_WG_SIZE; i += 1u) { - workgroupBarrier(); - if local_id.x >= 1u << i { - let other = sh_monoid[local_id.x - (1u << i)]; - agg_part = combine_tag_monoid(other, agg_part); - } - workgroupBarrier(); - sh_monoid[local_id.x] = agg_part; - } - workgroupBarrier(); - // prefix up to this workgroup -#ifdef small - var tm = sh_parent[0]; -#else - var tm = reduced[wg_id.x]; -#endif - if local_id.x > 0u { - tm = combine_tag_monoid(tm, sh_monoid[local_id.x - 1u]); - } - // exclusive prefix sum, granularity of 4 tag bytes - tag_monoids[ix] = tm; -} diff --git a/vello_shaders/shader/pathtag_scan1.wgsl b/vello_shaders/shader/pathtag_scan1.wgsl deleted file mode 100644 index 7f3b47659..000000000 --- a/vello_shaders/shader/pathtag_scan1.wgsl +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2023 the Vello Authors -// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense - -// This shader computes the scan of reduced tag monoids given -// two levels of reduction. - -#import config -#import pathtag - -@group(0) @binding(0) -var reduced: array; - -@group(0) @binding(1) -var reduced2: array; - -@group(0) @binding(2) -var tag_monoids: array; - -let LG_WG_SIZE = 8u; -let WG_SIZE = 256u; - -var sh_parent: array; -// These could be combined? 
-var sh_monoid: array; - -@compute @workgroup_size(256) -fn main( - @builtin(global_invocation_id) global_id: vec3, - @builtin(local_invocation_id) local_id: vec3, - @builtin(workgroup_id) wg_id: vec3, -) { - var agg = tag_monoid_identity(); - if local_id.x < wg_id.x { - agg = reduced2[local_id.x]; - } - sh_parent[local_id.x] = agg; - for (var i = 0u; i < LG_WG_SIZE; i += 1u) { - workgroupBarrier(); - if local_id.x + (1u << i) < WG_SIZE { - let other = sh_parent[local_id.x + (1u << i)]; - agg = combine_tag_monoid(agg, other); - } - workgroupBarrier(); - sh_parent[local_id.x] = agg; - } - - let ix = global_id.x; - agg = reduced[ix]; - sh_monoid[local_id.x] = agg; - for (var i = 0u; i < LG_WG_SIZE; i += 1u) { - workgroupBarrier(); - if local_id.x >= 1u << i { - let other = sh_monoid[local_id.x - (1u << i)]; - agg = combine_tag_monoid(other, agg); - } - workgroupBarrier(); - sh_monoid[local_id.x] = agg; - } - workgroupBarrier(); - // prefix up to this workgroup - var tm = sh_parent[0]; - if local_id.x > 0u { - tm = combine_tag_monoid(tm, sh_monoid[local_id.x - 1u]); - } - // exclusive prefix sum, granularity of 4 tag bytes * workgroup size - tag_monoids[ix] = tm; -} diff --git a/vello_shaders/shader/pathtag_scan_csdldf.wgsl b/vello_shaders/shader/pathtag_scan_csdldf.wgsl new file mode 100644 index 000000000..26c2b36d0 --- /dev/null +++ b/vello_shaders/shader/pathtag_scan_csdldf.wgsl @@ -0,0 +1,403 @@ +// Copyright 2022 the Vello Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +#import config +#import pathtag + +@group(0) @binding(0) +var config: Config; + +@group(0) @binding(1) +var scene: array; + +@group(0) @binding(2) +var reduced: array; + +@group(0) @binding(3) +var tag_monoids: array; + +@group(0) @binding(4) +var scan_bump: array>; + +//Workgroup info +let LG_WG_SIZE = 8u; +let WG_SIZE = 256u; + +//For the decoupled lookback +let FLAG_NOT_READY: u32 = 0; +let FLAG_REDUCTION: u32 = 1; +let FLAG_INCLUSIVE: u32 = 2; +let FLAG_MASK: u32 = 3; + +//For the decoupled fallback +let MAX_SPIN_COUNT: u32 = 4; +let LOCKED: u32 = 1; +let UNLOCKED: u32 = 0; + +var sh_broadcast: u32; +var sh_lock: u32; +var sh_scratch: array; +var sh_fallback: array; +var sh_tag_broadcast: TagMonoid; +var sh_fallback_state: array; + +@compute @workgroup_size(256) +fn main( + @builtin(local_invocation_id) local_id: vec3, +) { + //acquire the partition index, set the lock + if(local_id.x == 0u){ + sh_broadcast = atomicAdd(&scan_bump[0u], 1u); + sh_lock = LOCKED; + } + workgroupBarrier(); + let part_ix = sh_broadcast; + + //Local Scan, Hillis-Steel/Kogge-Stone + let tag_word = scene[config.pathtag_base + local_id.x + part_ix * WG_SIZE]; + var agg = reduce_tag(tag_word); + sh_scratch[local_id.x] = agg; + for (var i = 0u; i < LG_WG_SIZE; i += 1u) { + workgroupBarrier(); + if local_id.x >= 1u << i { + let other = sh_scratch[local_id.x - (1u << i)]; + agg = combine_tag_monoid(other, agg); + } + workgroupBarrier(); + sh_scratch[local_id.x] = agg; + } + + //Broadcast the results and results into device memory + if local_id.x == WG_SIZE - 1u { + if(part_ix != 0u){ + atomicStore(&reduced[part_ix][0], (agg[0] << 2u) | FLAG_REDUCTION); + atomicStore(&reduced[part_ix][1], (agg[1] << 2u) | FLAG_REDUCTION); + atomicStore(&reduced[part_ix][2], (agg[2] << 2u) | FLAG_REDUCTION); + atomicStore(&reduced[part_ix][3], (agg[3] << 2u) | FLAG_REDUCTION); + atomicStore(&reduced[part_ix][4], (agg[4] << 2u) | FLAG_REDUCTION); + } else { + atomicStore(&reduced[part_ix][0], (agg[0] << 2u) | FLAG_INCLUSIVE); + 
atomicStore(&reduced[part_ix][1], (agg[1] << 2u) | FLAG_INCLUSIVE); + atomicStore(&reduced[part_ix][2], (agg[2] << 2u) | FLAG_INCLUSIVE); + atomicStore(&reduced[part_ix][3], (agg[3] << 2u) | FLAG_INCLUSIVE); + atomicStore(&reduced[part_ix][4], (agg[4] << 2u) | FLAG_INCLUSIVE); + } + } + + //Lookback and potentially fallback + if(part_ix != 0u){ + var lookback_id = part_ix - 1u; + + var inc: array; + inc[0] = false; + inc[1] = false; + inc[2] = false; + inc[3] = false; + inc[4] = false; + + var prev: TagMonoid; + prev[0] = 0u; + prev[1] = 0u; + prev[2] = 0u; + prev[3] = 0u; + prev[4] = 0u; + + while(sh_lock == LOCKED){ + workgroupBarrier(); + + var red: array; + red[0] = false; + red[1] = false; + red[2] = false; + red[3] = false; + red[4] = false; + + //Lookback, with a single thread + if(local_id.x == WG_SIZE - 1u){ + for(var spin_count: u32 = 0u; spin_count < MAX_SPIN_COUNT; ){ + //TRANS_IX + if(!inc[0] && !red[0]){ + let payload = atomicLoad(&reduced[lookback_id][0]); + let flag_value = payload & FLAG_MASK; + if(flag_value == FLAG_REDUCTION){ + spin_count = 0u; + prev[0] += payload >> 2u; + red[0] = true; + } else if (flag_value == FLAG_INCLUSIVE){ + spin_count = 0u; + prev[0] += payload >> 2u; + atomicStore(&reduced[part_ix][0], ((agg[0] + prev[0]) << 2u) | FLAG_INCLUSIVE); + sh_tag_broadcast[0] = prev[0]; + inc[0] = true; + } + } + + //PATHSEG_IX + if(!inc[1] && !red[1]){ + let payload = atomicLoad(&reduced[lookback_id][1]); + let flag_value = payload & FLAG_MASK; + if(flag_value == FLAG_REDUCTION){ + spin_count = 0u; + prev[1] += payload >> 2u; + red[1] = true; + } else if (flag_value == FLAG_INCLUSIVE){ + spin_count = 0u; + prev[1] += payload >> 2u; + atomicStore(&reduced[part_ix][1], ((agg[1] + prev[1]) << 2u) | FLAG_INCLUSIVE); + sh_tag_broadcast[1] = prev[1]; + inc[1] = true; + } + } + + //PATHSEG_OFFSET + if(!inc[2] && !red[2]){ + let payload = atomicLoad(&reduced[lookback_id][2]); + let flag_value = payload & FLAG_MASK; + if(flag_value == FLAG_REDUCTION){ + spin_count = 0u; + prev[2] += payload >> 2u; + red[2] = true; + } else if (flag_value == FLAG_INCLUSIVE){ + spin_count = 0u; + prev[2] += payload >> 2u; + atomicStore(&reduced[part_ix][2], ((agg[2] + prev[2]) << 2u) | FLAG_INCLUSIVE); + sh_tag_broadcast[2] = prev[2]; + inc[2] = true; + } + } + + //STYLE_IX + if(!inc[3] && !red[3]){ + let payload = atomicLoad(&reduced[lookback_id][3]); + let flag_value = payload & FLAG_MASK; + if(flag_value == FLAG_REDUCTION){ + spin_count = 0u; + prev[3] += payload >> 2u; + red[3] = true; + } else if (flag_value == FLAG_INCLUSIVE){ + spin_count = 0u; + prev[3] += payload >> 2u; + atomicStore(&reduced[part_ix][3], ((agg[3] + prev[3]) << 2u) | FLAG_INCLUSIVE); + sh_tag_broadcast[3] = prev[3]; + inc[3] = true; + } + } + + //PATH_IX + if(!inc[4] && !red[4]){ + let payload = atomicLoad(&reduced[lookback_id][4]); + let flag_value = payload & FLAG_MASK; + if(flag_value == FLAG_REDUCTION){ + spin_count = 0u; + prev[4] += payload >> 2u; + red[4] = true; + } else if (flag_value == FLAG_INCLUSIVE){ + spin_count = 0u; + prev[4] += payload >> 2u; + atomicStore(&reduced[part_ix][4], ((agg[4] + prev[4]) << 2u) | FLAG_INCLUSIVE); + sh_tag_broadcast[4] = prev[4]; + inc[4] = true; + } + } + + if((inc[0] || red[0]) && (inc[1] || red[1]) && (inc[2] || red[2]) && (inc[3] || red[3]) && (inc[4] || red[4])){ + if(inc[0] && inc[1] && inc[2] && inc[3] && inc[4]){ + sh_lock = UNLOCKED; + break; + } else { + lookback_id--; + red[0] = false; + red[1] = false; + red[2] = false; + red[3] = false; + red[4] = false; + } + } 
else { + spin_count++; + } + } + + //If we didn't complete the lookback within the allotted spins, + //prepare for the fallback by broadcasting the lookback tile id + //and states of the tagmonoid struct members + if(sh_lock == LOCKED){ + sh_broadcast = lookback_id; + sh_fallback_state[0] = !inc[0] && !red[0]; + sh_fallback_state[1] = !inc[1] && !red[1]; + sh_fallback_state[2] = !inc[2] && !red[2]; + sh_fallback_state[3] = !inc[3] && !red[3]; + sh_fallback_state[4] = !inc[4] && !red[4]; + } + } + workgroupBarrier(); + + //Fallback + if(sh_lock == LOCKED){ + let fallback_id = sh_broadcast; + + red[0] = sh_fallback_state[0]; + red[1] = sh_fallback_state[1]; + red[2] = sh_fallback_state[2]; + red[3] = sh_fallback_state[3]; + red[4] = sh_fallback_state[4]; + + //Fallback Reduce + //Is there an alternative to this besides a giant switch statement? + let f_word = scene[config.pathtag_base + local_id.x + fallback_id * WG_SIZE]; + var f_agg = reduce_tag(f_word); + sh_fallback[local_id.x] = f_agg; + for (var i = 0u; i < LG_WG_SIZE; i += 1u) { + workgroupBarrier(); + if local_id.x + (1u << i) < WG_SIZE { + let index = local_id.x + (1u << i); + if(red[0]){ + f_agg[0] += sh_fallback[index][0]; + } + if(red[1]){ + f_agg[1] += sh_fallback[index][1]; + } + if(red[2]){ + f_agg[2] += sh_fallback[index][2]; + } + if(red[3]){ + f_agg[3] += sh_fallback[index][3]; + } + if(red[4]){ + f_agg[4] += sh_fallback[index][4]; + } + } + workgroupBarrier(); + if(red[0]){ + sh_fallback[local_id.x][0] = f_agg[0]; + } + + if(red[1]){ + sh_fallback[local_id.x][1] = f_agg[1]; + } + + if(red[2]){ + sh_fallback[local_id.x][2] = f_agg[2]; + } + + if(red[3]){ + sh_fallback[local_id.x][3] = f_agg[3]; + } + + if(red[4]){ + sh_fallback[local_id.x][4] = f_agg[4]; + } + } + + //Fallback attempt insertion + if(local_id.x == WG_SIZE - 1u){ + //TRANS_IX FALLBACK + if(red[0]){ + let fallback_payload = (f_agg[0] << 2u) | select(FLAG_INCLUSIVE, FLAG_REDUCTION, fallback_id != 0u); + let prev_payload = atomicMax(&reduced[fallback_id][0], fallback_payload); + if(prev_payload == 0u){ + prev[0] += f_agg[0]; + } else { + prev[0] += prev_payload >> 2u; + } + if(fallback_id == 0u || (prev_payload & FLAG_MASK) == FLAG_INCLUSIVE){ + atomicStore(&reduced[part_ix][0], ((agg[0] + prev[0]) << 2u) | FLAG_INCLUSIVE); + sh_tag_broadcast[0] = prev[0]; + inc[0] = true; + } + } + + //PATHSEG_IX FALLBACK + if(red[1]){ + let fallback_payload = (f_agg[1] << 2u) | select(FLAG_INCLUSIVE, FLAG_REDUCTION, fallback_id != 0u); + let prev_payload = atomicMax(&reduced[fallback_id][1], fallback_payload); + if(prev_payload == 0u){ + prev[1] += f_agg[1]; + } else { + prev[1] += prev_payload >> 2u; + } + if(fallback_id == 0u || (prev_payload & FLAG_MASK) == FLAG_INCLUSIVE){ + atomicStore(&reduced[part_ix][1], ((agg[1] + prev[1]) << 2u) | FLAG_INCLUSIVE); + sh_tag_broadcast[1] = prev[1]; + inc[1] = true; + } + } + + //PATHSEG_OFFSET FALLBACK + if(red[2]){ + let fallback_payload = (f_agg[2] << 2u) | select(FLAG_INCLUSIVE, FLAG_REDUCTION, fallback_id != 0u); + let prev_payload = atomicMax(&reduced[fallback_id][2], fallback_payload); + if(prev_payload == 0u){ + prev[2] += f_agg[2]; + } else { + prev[2] += prev_payload >> 2u; + } + if(fallback_id == 0u || (prev_payload & FLAG_MASK) == FLAG_INCLUSIVE){ + atomicStore(&reduced[part_ix][2], ((agg[2] + prev[2]) << 2u) | FLAG_INCLUSIVE); + sh_tag_broadcast[2] = prev[2]; + inc[2] = true; + } + } + + //STYLE_IX FALLBACK + if(red[3]){ + let fallback_payload = (f_agg[3] << 2u) | select(FLAG_INCLUSIVE, FLAG_REDUCTION, fallback_id != 0u); + 
let prev_payload = atomicMax(&reduced[fallback_id][3], fallback_payload); + if(prev_payload == 0u){ + prev[3] += f_agg[3]; + } else { + prev[3] += prev_payload >> 2u; + } + if(fallback_id == 0u || (prev_payload & FLAG_MASK) == FLAG_INCLUSIVE){ + atomicStore(&reduced[part_ix][3], ((agg[3] + prev[3]) << 2u) | FLAG_INCLUSIVE); + sh_tag_broadcast[3] = prev[3]; + inc[3] = true; + } + } + + //PATH_IX FALLBACK + if(red[4]){ + let fallback_payload = (f_agg[4] << 2u) | select(FLAG_INCLUSIVE, FLAG_REDUCTION, fallback_id != 0u); + let prev_payload = atomicMax(&reduced[fallback_id][4], fallback_payload); + if(prev_payload == 0u){ + prev[4] += f_agg[4]; + } else { + prev[4] += prev_payload >> 2u; + } + if(fallback_id == 0u || (prev_payload & FLAG_MASK) == FLAG_INCLUSIVE){ + atomicStore(&reduced[part_ix][4], ((agg[4] + prev[4]) << 2u) | FLAG_INCLUSIVE); + sh_tag_broadcast[4] = prev[4]; + inc[4] = true; + } + } + + //At this point, the reductions are guaranteed to be complete, + //so try unlocking, else, keep looking back + if(inc[0] && inc[1] && inc[2] && inc[3] && inc[4]){ + sh_lock = UNLOCKED; + } else { + lookback_id--; + } + } + workgroupBarrier(); + } + } + } + workgroupBarrier(); + + var tm: TagMonoid; + if(part_ix != 0u){ + tm = sh_tag_broadcast; + } else { + tm[0] = 0u; + tm[1] = 0u; + tm[2] = 0u; + tm[3] = 0u; + tm[4] = 0u; + } + + if(local_id.x != 0u){ + tm = combine_tag_monoid(tm, sh_scratch[local_id.x - 1u]); + } + + tag_monoids[local_id.x + part_ix * WG_SIZE] = tm; +} diff --git a/vello_shaders/shader/shared/pathtag.wgsl b/vello_shaders/shader/shared/pathtag.wgsl index b58995fc9..0ae3f1363 100644 --- a/vello_shaders/shader/shared/pathtag.wgsl +++ b/vello_shaders/shader/shared/pathtag.wgsl @@ -10,6 +10,15 @@ struct TagMonoid { path_ix: u32, } +struct TagMonoidAtomic { + trans_ix: atomic, + // TODO: I don't think pathseg_ix is used. 
+ pathseg_ix: atomic, + pathseg_offset: atomic, + style_ix: atomic, + path_ix: atomic, +} + let PATH_TAG_SEG_TYPE = 3u; let PATH_TAG_LINETO = 1u; let PATH_TAG_QUADTO = 2u; diff --git a/vello_shaders/src/cpu.rs b/vello_shaders/src/cpu.rs index a129fce59..98f375051 100644 --- a/vello_shaders/src/cpu.rs +++ b/vello_shaders/src/cpu.rs @@ -25,8 +25,7 @@ mod path_count; mod path_count_setup; mod path_tiling; mod path_tiling_setup; -mod pathtag_reduce; -mod pathtag_scan; +mod pathtag_scan_single; mod tile_alloc; mod util; @@ -43,8 +42,7 @@ pub use path_count::path_count; pub use path_count_setup::path_count_setup; pub use path_tiling::path_tiling; pub use path_tiling_setup::path_tiling_setup; -pub use pathtag_reduce::pathtag_reduce; -pub use pathtag_scan::pathtag_scan; +pub use pathtag_scan_single::pathtag_scan_single; pub use tile_alloc::tile_alloc; use std::cell::{Ref, RefCell, RefMut}; diff --git a/vello_shaders/src/cpu/pathtag_reduce.rs b/vello_shaders/src/cpu/pathtag_reduce.rs deleted file mode 100644 index b7cc6b650..000000000 --- a/vello_shaders/src/cpu/pathtag_reduce.rs +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright 2023 the Vello Authors -// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense - -use vello_encoding::{ConfigUniform, Monoid, PathMonoid}; - -use super::CpuBinding; - -const WG_SIZE: usize = 256; - -fn pathtag_reduce_main( - n_wg: u32, - config: &ConfigUniform, - scene: &[u32], - reduced: &mut [PathMonoid], -) { - let pathtag_base = config.layout.path_tag_base; - for i in 0..n_wg { - let mut m = PathMonoid::default(); - for j in 0..WG_SIZE { - let tag = scene[(pathtag_base + i * WG_SIZE as u32) as usize + j]; - m = m.combine(&PathMonoid::new(tag)); - } - reduced[i as usize] = m; - } -} - -pub fn pathtag_reduce(n_wg: u32, resources: &[CpuBinding]) { - let config = resources[0].as_typed(); - let scene = resources[1].as_slice(); - let mut reduced = resources[2].as_slice_mut(); - pathtag_reduce_main(n_wg, &config, &scene, &mut reduced); -} diff --git a/vello_shaders/src/cpu/pathtag_scan.rs b/vello_shaders/src/cpu/pathtag_scan.rs deleted file mode 100644 index 9627b0e7d..000000000 --- a/vello_shaders/src/cpu/pathtag_scan.rs +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2023 the Vello Authors -// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense - -use vello_encoding::{ConfigUniform, Monoid, PathMonoid}; - -use super::CpuBinding; - -const WG_SIZE: usize = 256; - -fn pathtag_scan_main( - n_wg: u32, - config: &ConfigUniform, - scene: &[u32], - reduced: &[PathMonoid], - tag_monoids: &mut [PathMonoid], -) { - let pathtag_base = config.layout.path_tag_base; - let mut prefix = PathMonoid::default(); - for i in 0..n_wg { - let mut m = prefix; - for j in 0..WG_SIZE { - let ix = (i * WG_SIZE as u32) as usize + j; - tag_monoids[ix] = m; - let tag = scene[pathtag_base as usize + ix]; - m = m.combine(&PathMonoid::new(tag)); - } - prefix = prefix.combine(&reduced[i as usize]); - } -} - -pub fn pathtag_scan(n_wg: u32, resources: &[CpuBinding]) { - let config = resources[0].as_typed(); - let scene = resources[1].as_slice(); - let reduced = resources[2].as_slice(); - let mut tag_monoids = resources[3].as_slice_mut(); - pathtag_scan_main(n_wg, &config, &scene, &reduced, &mut tag_monoids); -} diff --git a/vello_shaders/src/cpu/pathtag_scan_single.rs b/vello_shaders/src/cpu/pathtag_scan_single.rs new file mode 100644 index 000000000..a2926b132 --- /dev/null +++ b/vello_shaders/src/cpu/pathtag_scan_single.rs @@ -0,0 +1,30 @@ +// Copyright 2023 the Vello Authors +// SPDX-License-Identifier: 
Apache-2.0 OR MIT OR Unlicense + +use vello_encoding::{ConfigUniform, Monoid, PathMonoid}; + +use super::CpuBinding; + +const WG_SIZE: usize = 256; + +fn pathtag_scan_single_main( + n_wg: u32, + config: &ConfigUniform, + scene: &[u32], + tag_monoids: &mut [PathMonoid], +) { + let size = n_wg * (WG_SIZE as u32); + let pathtag_base = config.layout.path_tag_base; + let mut prefix = PathMonoid::default(); + for i in 0..size { + tag_monoids[i as usize] = prefix; + prefix = prefix.combine(&PathMonoid::new(scene[(pathtag_base + i) as usize])); + } +} + +pub fn pathtag_scan_single(n_wg: u32, resources: &[CpuBinding]) { + let config = resources[0].as_typed(); + let scene = resources[1].as_slice(); + let mut tag_monoids = resources[3].as_slice_mut(); + pathtag_scan_single_main(n_wg, &config, &scene, &mut tag_monoids); +} \ No newline at end of file From d16c3b2467f191fe047e9de49003b68951376d3a Mon Sep 17 00:00:00 2001 From: Thomas Smith <68340554+b0nes164@users.noreply.github.com> Date: Mon, 9 Sep 2024 22:27:47 -0700 Subject: [PATCH 02/15] Update buffer name for clarity. --- vello/src/render.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vello/src/render.rs b/vello/src/render.rs index 6bc0541dc..8afd0d539 100644 --- a/vello/src/render.rs +++ b/vello/src/render.rs @@ -203,7 +203,7 @@ impl Render { ); let path_scan_bump_buf = BufferProxy::new( buffer_sizes.path_scan_bump.size_in_bytes().into(), - "bump_buf"); + "path_scan_bump_buf"); recording.clear_all(path_scan_bump_buf); recording.clear_all(reduced_buf); let path_scan_bump_buf = ResourceProxy::Buffer(path_scan_bump_buf); From 7c3e604a03dd4d51b71048c608833ef871db5a10 Mon Sep 17 00:00:00 2001 From: Thomas Smith <68340554+b0nes164@users.noreply.github.com> Date: Wed, 11 Sep 2024 14:39:17 -0700 Subject: [PATCH 03/15] Change scan from struct to fixed size array to allow dynamic indexing. 
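A minimal sketch of the motivation, with hypothetical names that are not taken
from the diff below: WGSL has no way to select a struct member with a runtime
index, so while TagMonoid was a struct the per-member lookback/fallback bodies
could not be factored into a helper that takes the member index as a parameter.
Storing the five counters as array<u32, 5> makes that possible:

    // Hypothetical illustration only; sh_example and bump_member are not part
    // of this patch.
    var<workgroup> sh_example: array<u32, 5>;

    fn bump_member(member_ix: u32, value: u32) {
        // A struct would need one branch per named member here; the array
        // form accepts the runtime index directly.
        sh_example[member_ix] += value;
    }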
--- vello_shaders/shader/pathtag_scan_csdldf.wgsl | 430 +++++++++--------- vello_shaders/shader/shared/pathtag.wgsl | 25 +- 2 files changed, 239 insertions(+), 216 deletions(-) diff --git a/vello_shaders/shader/pathtag_scan_csdldf.wgsl b/vello_shaders/shader/pathtag_scan_csdldf.wgsl index 26c2b36d0..812944e16 100644 --- a/vello_shaders/shader/pathtag_scan_csdldf.wgsl +++ b/vello_shaders/shader/pathtag_scan_csdldf.wgsl @@ -11,10 +11,10 @@ var config: Config; var scene: array; @group(0) @binding(2) -var reduced: array; +var reduced: array, 5>>; @group(0) @binding(3) -var tag_monoids: array; +var tag_monoids: array>; @group(0) @binding(4) var scan_bump: array>; @@ -36,11 +36,58 @@ let UNLOCKED: u32 = 0; var sh_broadcast: u32; var sh_lock: u32; -var sh_scratch: array; -var sh_fallback: array; -var sh_tag_broadcast: TagMonoid; +var sh_scratch: array, WG_SIZE>; +var sh_fallback: array, WG_SIZE>; +var sh_tag_broadcast: array; var sh_fallback_state: array; +fn attempt_lookback( + part_ix: u32, + lookback_ix: u32, + member_ix: u32, + aggregate: u32, + spin_count: ptr, + prev: ptr, + reduction_complete: ptr, + inclusive_complete: ptr +){ + let payload: u32 = atomicLoad(&reduced[lookback_ix][member_ix]); + let flag_value: u32 = payload & FLAG_MASK; + if(flag_value == FLAG_REDUCTION){ + *spin_count = 0u; + *prev += payload >> 2u; + *reduction_complete = true; + } else if (flag_value == FLAG_INCLUSIVE){ + *spin_count = 0u; + *prev += payload >> 2u; + atomicStore(&reduced[part_ix][member_ix], ((aggregate + *prev) << 2u) | FLAG_INCLUSIVE); + sh_tag_broadcast[member_ix] = *prev; + *inclusive_complete = true; + } +} + +fn fallback( + part_ix: u32, + fallback_ix: u32, + member_ix: u32, + fallback_aggregate: u32, + prev: ptr, + inclusive_complete: ptr +){ + let fallback_payload = (fallback_aggregate << 2u) | select(FLAG_INCLUSIVE, FLAG_REDUCTION, fallback_ix != 0u); + let prev_payload = atomicMax(&reduced[fallback_ix][member_ix], fallback_payload); + if(prev_payload == 0u){ + *prev += fallback_aggregate; + } else { + *prev += prev_payload >> 2u; + } + if(fallback_ix == 0u || (prev_payload & FLAG_MASK) == FLAG_INCLUSIVE){ + atomicStore(&reduced[part_ix][member_ix], ((fallback_aggregate + *prev) << 2u) | FLAG_INCLUSIVE); + sh_tag_broadcast[member_ix] = *prev; + *inclusive_complete = true; + } +} + @compute @workgroup_size(256) fn main( @builtin(local_invocation_id) local_id: vec3, @@ -51,23 +98,27 @@ fn main( sh_lock = LOCKED; } workgroupBarrier(); - let part_ix = sh_broadcast; + let part_ix: u32 = sh_broadcast; //Local Scan, Hillis-Steel/Kogge-Stone - let tag_word = scene[config.pathtag_base + local_id.x + part_ix * WG_SIZE]; - var agg = reduce_tag(tag_word); + let tag_word: u32 = scene[config.pathtag_base + local_id.x + part_ix * WG_SIZE]; + var agg: array = reduce_tag_arr(tag_word); sh_scratch[local_id.x] = agg; - for (var i = 0u; i < LG_WG_SIZE; i += 1u) { + for (var i: u32 = 0u; i < LG_WG_SIZE; i += 1u) { workgroupBarrier(); if local_id.x >= 1u << i { - let other = sh_scratch[local_id.x - (1u << i)]; - agg = combine_tag_monoid(other, agg); + let other: array = sh_scratch[local_id.x - (1u << i)]; + agg[0] += other[0]; + agg[1] += other[1]; + agg[2] += other[2]; + agg[3] += other[3]; + agg[4] += other[4]; } workgroupBarrier(); sh_scratch[local_id.x] = agg; } - //Broadcast the results and results into device memory + //Broadcast the results and flag into device memory if local_id.x == WG_SIZE - 1u { if(part_ix != 0u){ atomicStore(&reduced[part_ix][0], (agg[0] << 2u) | FLAG_REDUCTION); @@ -86,131 +137,110 @@ 
fn main( //Lookback and potentially fallback if(part_ix != 0u){ - var lookback_id = part_ix - 1u; + var lookback_ix = part_ix - 1u; - var inc: array; - inc[0] = false; - inc[1] = false; - inc[2] = false; - inc[3] = false; - inc[4] = false; - - var prev: TagMonoid; - prev[0] = 0u; - prev[1] = 0u; - prev[2] = 0u; - prev[3] = 0u; - prev[4] = 0u; + var inc0: bool = false; + var inc1: bool = false; + var inc2: bool = false; + var inc3: bool = false; + var inc4: bool = false; + + var prev0: u32 = 0u; + var prev1: u32 = 0u; + var prev2: u32 = 0u; + var prev3: u32 = 0u; + var prev4: u32 = 0u; while(sh_lock == LOCKED){ workgroupBarrier(); - var red: array; - red[0] = false; - red[1] = false; - red[2] = false; - red[3] = false; - red[4] = false; - + var red0: bool = false; + var red1: bool = false; + var red2: bool = false; + var red3: bool = false; + var red4: bool = false; + //Lookback, with a single thread + //Last thread in the workgroup has the complete aggregate if(local_id.x == WG_SIZE - 1u){ for(var spin_count: u32 = 0u; spin_count < MAX_SPIN_COUNT; ){ //TRANS_IX - if(!inc[0] && !red[0]){ - let payload = atomicLoad(&reduced[lookback_id][0]); - let flag_value = payload & FLAG_MASK; - if(flag_value == FLAG_REDUCTION){ - spin_count = 0u; - prev[0] += payload >> 2u; - red[0] = true; - } else if (flag_value == FLAG_INCLUSIVE){ - spin_count = 0u; - prev[0] += payload >> 2u; - atomicStore(&reduced[part_ix][0], ((agg[0] + prev[0]) << 2u) | FLAG_INCLUSIVE); - sh_tag_broadcast[0] = prev[0]; - inc[0] = true; - } + if(!inc0 && !red0){ + attempt_lookback( + part_ix, + lookback_ix, + 0u, + agg[0u], + &spin_count, + &prev0, + &red0, + &inc0); } //PATHSEG_IX - if(!inc[1] && !red[1]){ - let payload = atomicLoad(&reduced[lookback_id][1]); - let flag_value = payload & FLAG_MASK; - if(flag_value == FLAG_REDUCTION){ - spin_count = 0u; - prev[1] += payload >> 2u; - red[1] = true; - } else if (flag_value == FLAG_INCLUSIVE){ - spin_count = 0u; - prev[1] += payload >> 2u; - atomicStore(&reduced[part_ix][1], ((agg[1] + prev[1]) << 2u) | FLAG_INCLUSIVE); - sh_tag_broadcast[1] = prev[1]; - inc[1] = true; - } + if(!inc1 && !red1){ + attempt_lookback( + part_ix, + lookback_ix, + 1u, + agg[1u], + &spin_count, + &prev1, + &red1, + &inc1); } //PATHSEG_OFFSET - if(!inc[2] && !red[2]){ - let payload = atomicLoad(&reduced[lookback_id][2]); - let flag_value = payload & FLAG_MASK; - if(flag_value == FLAG_REDUCTION){ - spin_count = 0u; - prev[2] += payload >> 2u; - red[2] = true; - } else if (flag_value == FLAG_INCLUSIVE){ - spin_count = 0u; - prev[2] += payload >> 2u; - atomicStore(&reduced[part_ix][2], ((agg[2] + prev[2]) << 2u) | FLAG_INCLUSIVE); - sh_tag_broadcast[2] = prev[2]; - inc[2] = true; - } + if(!inc2 && !red2){ + attempt_lookback( + part_ix, + lookback_ix, + 2u, + agg[2u], + &spin_count, + &prev2, + &red2, + &inc2); } //STYLE_IX - if(!inc[3] && !red[3]){ - let payload = atomicLoad(&reduced[lookback_id][3]); - let flag_value = payload & FLAG_MASK; - if(flag_value == FLAG_REDUCTION){ - spin_count = 0u; - prev[3] += payload >> 2u; - red[3] = true; - } else if (flag_value == FLAG_INCLUSIVE){ - spin_count = 0u; - prev[3] += payload >> 2u; - atomicStore(&reduced[part_ix][3], ((agg[3] + prev[3]) << 2u) | FLAG_INCLUSIVE); - sh_tag_broadcast[3] = prev[3]; - inc[3] = true; - } + if(!inc3 && !red3){ + attempt_lookback( + part_ix, + lookback_ix, + 3u, + agg[3u], + &spin_count, + &prev3, + &red3, + &inc3); } //PATH_IX - if(!inc[4] && !red[4]){ - let payload = atomicLoad(&reduced[lookback_id][4]); - let flag_value = payload & 
FLAG_MASK; - if(flag_value == FLAG_REDUCTION){ - spin_count = 0u; - prev[4] += payload >> 2u; - red[4] = true; - } else if (flag_value == FLAG_INCLUSIVE){ - spin_count = 0u; - prev[4] += payload >> 2u; - atomicStore(&reduced[part_ix][4], ((agg[4] + prev[4]) << 2u) | FLAG_INCLUSIVE); - sh_tag_broadcast[4] = prev[4]; - inc[4] = true; - } + if(!inc4 && !red4){ + attempt_lookback( + part_ix, + lookback_ix, + 4u, + agg[4u], + &spin_count, + &prev4, + &red4, + &inc4); } - if((inc[0] || red[0]) && (inc[1] || red[1]) && (inc[2] || red[2]) && (inc[3] || red[3]) && (inc[4] || red[4])){ - if(inc[0] && inc[1] && inc[2] && inc[3] && inc[4]){ + //Have we completed the current reduction or inclusive sum for all PathTag members? + if((inc0 || red0) && (inc1 || red1) && (inc2 || red2) && (inc3 || red3) && (inc4 || red4)){ + if(inc0 && inc1 && inc2 && inc3 && inc4){ sh_lock = UNLOCKED; break; } else { - lookback_id--; - red[0] = false; - red[1] = false; - red[2] = false; - red[3] = false; - red[4] = false; + lookback_ix--; + red0 = false; + red1 = false; + red2 = false; + red3 = false; + red4 = false; } } else { spin_count++; @@ -221,161 +251,142 @@ fn main( //prepare for the fallback by broadcasting the lookback tile id //and states of the tagmonoid struct members if(sh_lock == LOCKED){ - sh_broadcast = lookback_id; - sh_fallback_state[0] = !inc[0] && !red[0]; - sh_fallback_state[1] = !inc[1] && !red[1]; - sh_fallback_state[2] = !inc[2] && !red[2]; - sh_fallback_state[3] = !inc[3] && !red[3]; - sh_fallback_state[4] = !inc[4] && !red[4]; + sh_broadcast = lookback_ix; + sh_fallback_state[0] = !inc0 && !red0; + sh_fallback_state[1] = !inc1 && !red1; + sh_fallback_state[2] = !inc2 && !red2; + sh_fallback_state[3] = !inc3 && !red3; + sh_fallback_state[4] = !inc4 && !red4; } } workgroupBarrier(); //Fallback if(sh_lock == LOCKED){ - let fallback_id = sh_broadcast; + let fallback_ix = sh_broadcast; - red[0] = sh_fallback_state[0]; - red[1] = sh_fallback_state[1]; - red[2] = sh_fallback_state[2]; - red[3] = sh_fallback_state[3]; - red[4] = sh_fallback_state[4]; + red0 = sh_fallback_state[0]; + red1 = sh_fallback_state[1]; + red2 = sh_fallback_state[2]; + red3 = sh_fallback_state[3]; + red4 = sh_fallback_state[4]; //Fallback Reduce - //Is there an alternative to this besides a giant switch statement? - let f_word = scene[config.pathtag_base + local_id.x + fallback_id * WG_SIZE]; - var f_agg = reduce_tag(f_word); + //Is there an alternative to this besides a giant switch statement or + //5 individual reductions? 
+ let f_word: u32 = scene[config.pathtag_base + local_id.x + fallback_ix * WG_SIZE]; + var f_agg: array = reduce_tag_arr(f_word); sh_fallback[local_id.x] = f_agg; for (var i = 0u; i < LG_WG_SIZE; i += 1u) { workgroupBarrier(); if local_id.x + (1u << i) < WG_SIZE { let index = local_id.x + (1u << i); - if(red[0]){ + if(red0){ f_agg[0] += sh_fallback[index][0]; } - if(red[1]){ + if(red1){ f_agg[1] += sh_fallback[index][1]; } - if(red[2]){ + if(red2){ f_agg[2] += sh_fallback[index][2]; } - if(red[3]){ + if(red3){ f_agg[3] += sh_fallback[index][3]; } - if(red[4]){ + if(red4){ f_agg[4] += sh_fallback[index][4]; } } workgroupBarrier(); - if(red[0]){ + if(red0){ sh_fallback[local_id.x][0] = f_agg[0]; } - if(red[1]){ + if(red1){ sh_fallback[local_id.x][1] = f_agg[1]; } - if(red[2]){ + if(red2){ sh_fallback[local_id.x][2] = f_agg[2]; } - if(red[3]){ + if(red3){ sh_fallback[local_id.x][3] = f_agg[3]; } - if(red[4]){ + if(red4){ sh_fallback[local_id.x][4] = f_agg[4]; } } - //Fallback attempt insertion + //Fallback and attempt insertion of status flag if(local_id.x == WG_SIZE - 1u){ //TRANS_IX FALLBACK - if(red[0]){ - let fallback_payload = (f_agg[0] << 2u) | select(FLAG_INCLUSIVE, FLAG_REDUCTION, fallback_id != 0u); - let prev_payload = atomicMax(&reduced[fallback_id][0], fallback_payload); - if(prev_payload == 0u){ - prev[0] += f_agg[0]; - } else { - prev[0] += prev_payload >> 2u; - } - if(fallback_id == 0u || (prev_payload & FLAG_MASK) == FLAG_INCLUSIVE){ - atomicStore(&reduced[part_ix][0], ((agg[0] + prev[0]) << 2u) | FLAG_INCLUSIVE); - sh_tag_broadcast[0] = prev[0]; - inc[0] = true; - } + if(red0){ + fallback( + part_ix, + fallback_ix, + 0u, + f_agg[0u], + &prev0, + &inc0, + ); } //PATHSEG_IX FALLBACK - if(red[1]){ - let fallback_payload = (f_agg[1] << 2u) | select(FLAG_INCLUSIVE, FLAG_REDUCTION, fallback_id != 0u); - let prev_payload = atomicMax(&reduced[fallback_id][1], fallback_payload); - if(prev_payload == 0u){ - prev[1] += f_agg[1]; - } else { - prev[1] += prev_payload >> 2u; - } - if(fallback_id == 0u || (prev_payload & FLAG_MASK) == FLAG_INCLUSIVE){ - atomicStore(&reduced[part_ix][1], ((agg[1] + prev[1]) << 2u) | FLAG_INCLUSIVE); - sh_tag_broadcast[1] = prev[1]; - inc[1] = true; - } + if(red1){ + fallback( + part_ix, + fallback_ix, + 1u, + f_agg[1u], + &prev1, + &inc1, + ); } //PATHSEG_OFFSET FALLBACK - if(red[2]){ - let fallback_payload = (f_agg[2] << 2u) | select(FLAG_INCLUSIVE, FLAG_REDUCTION, fallback_id != 0u); - let prev_payload = atomicMax(&reduced[fallback_id][2], fallback_payload); - if(prev_payload == 0u){ - prev[2] += f_agg[2]; - } else { - prev[2] += prev_payload >> 2u; - } - if(fallback_id == 0u || (prev_payload & FLAG_MASK) == FLAG_INCLUSIVE){ - atomicStore(&reduced[part_ix][2], ((agg[2] + prev[2]) << 2u) | FLAG_INCLUSIVE); - sh_tag_broadcast[2] = prev[2]; - inc[2] = true; - } + if(red2){ + fallback( + part_ix, + fallback_ix, + 2u, + f_agg[2u], + &prev2, + &inc2, + ); } //STYLE_IX FALLBACK - if(red[3]){ - let fallback_payload = (f_agg[3] << 2u) | select(FLAG_INCLUSIVE, FLAG_REDUCTION, fallback_id != 0u); - let prev_payload = atomicMax(&reduced[fallback_id][3], fallback_payload); - if(prev_payload == 0u){ - prev[3] += f_agg[3]; - } else { - prev[3] += prev_payload >> 2u; - } - if(fallback_id == 0u || (prev_payload & FLAG_MASK) == FLAG_INCLUSIVE){ - atomicStore(&reduced[part_ix][3], ((agg[3] + prev[3]) << 2u) | FLAG_INCLUSIVE); - sh_tag_broadcast[3] = prev[3]; - inc[3] = true; - } + if(red3){ + fallback( + part_ix, + fallback_ix, + 3u, + f_agg[3u], + &prev3, + &inc3, + ); } 
//PATH_IX FALLBACK - if(red[4]){ - let fallback_payload = (f_agg[4] << 2u) | select(FLAG_INCLUSIVE, FLAG_REDUCTION, fallback_id != 0u); - let prev_payload = atomicMax(&reduced[fallback_id][4], fallback_payload); - if(prev_payload == 0u){ - prev[4] += f_agg[4]; - } else { - prev[4] += prev_payload >> 2u; - } - if(fallback_id == 0u || (prev_payload & FLAG_MASK) == FLAG_INCLUSIVE){ - atomicStore(&reduced[part_ix][4], ((agg[4] + prev[4]) << 2u) | FLAG_INCLUSIVE); - sh_tag_broadcast[4] = prev[4]; - inc[4] = true; - } + if(red4){ + fallback( + part_ix, + fallback_ix, + 4u, + f_agg[4u], + &prev4, + &inc4, + ); } //At this point, the reductions are guaranteed to be complete, //so try unlocking, else, keep looking back - if(inc[0] && inc[1] && inc[2] && inc[3] && inc[4]){ + if(inc0 && inc1 && inc2 && inc3 && inc4){ sh_lock = UNLOCKED; } else { - lookback_id--; + lookback_ix--; } } workgroupBarrier(); @@ -384,7 +395,7 @@ fn main( } workgroupBarrier(); - var tm: TagMonoid; + var tm: array; if(part_ix != 0u){ tm = sh_tag_broadcast; } else { @@ -396,8 +407,13 @@ fn main( } if(local_id.x != 0u){ - tm = combine_tag_monoid(tm, sh_scratch[local_id.x - 1u]); - } + let other: array = sh_scratch[local_id.x - 1u]; + tm[0] += other[0]; + tm[1] += other[1]; + tm[2] += other[2]; + tm[3] += other[3]; + tm[4] += other[4]; + } tag_monoids[local_id.x + part_ix * WG_SIZE] = tm; } diff --git a/vello_shaders/shader/shared/pathtag.wgsl b/vello_shaders/shader/shared/pathtag.wgsl index 0ae3f1363..b62e7cef6 100644 --- a/vello_shaders/shader/shared/pathtag.wgsl +++ b/vello_shaders/shader/shared/pathtag.wgsl @@ -10,15 +10,6 @@ struct TagMonoid { path_ix: u32, } -struct TagMonoidAtomic { - trans_ix: atomic, - // TODO: I don't think pathseg_ix is used. - pathseg_ix: atomic, - pathseg_offset: atomic, - style_ix: atomic, - path_ix: atomic, -} - let PATH_TAG_SEG_TYPE = 3u; let PATH_TAG_LINETO = 1u; let PATH_TAG_QUADTO = 2u; @@ -78,3 +69,19 @@ fn reduce_tag(tag_word: u32) -> TagMonoid { c.style_ix = countOneBits(tag_word & (PATH_TAG_STYLE * 0x1010101u)) * STYLE_SIZE_IN_WORDS; return c; } + +//An alternate version for the scan, using array instead of TagMonoid +fn reduce_tag_arr(tag_word: u32) -> array { + var c: array; + let point_count = tag_word & 0x3030303u; + c[1] = countOneBits((point_count * 7u) & 0x4040404u); + c[0] = countOneBits(tag_word & (PATH_TAG_TRANSFORM * 0x1010101u)); + let n_points = point_count + ((tag_word >> 2u) & 0x1010101u); + var a = n_points + (n_points & (((tag_word >> 3u) & 0x1010101u) * 15u)); + a += a >> 8u; + a += a >> 16u; + c[2] = a & 0xffu; + c[4] = countOneBits(tag_word & (PATH_TAG_PATH * 0x1010101u)); + c[3] = countOneBits(tag_word & (PATH_TAG_STYLE * 0x1010101u)) * STYLE_SIZE_IN_WORDS; + return c; +} \ No newline at end of file From d26945bc65a19f539528c55333710c77dd407a2b Mon Sep 17 00:00:00 2001 From: Thomas Smith <68340554+b0nes164@users.noreply.github.com> Date: Wed, 11 Sep 2024 14:45:44 -0700 Subject: [PATCH 04/15] Remove unused workgroup and buffer sizes. --- vello_encoding/src/config.rs | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/vello_encoding/src/config.rs b/vello_encoding/src/config.rs index 6851f27d5..4ad59cc20 100644 --- a/vello_encoding/src/config.rs +++ b/vello_encoding/src/config.rs @@ -203,10 +203,6 @@ pub type WorkgroupSize = (u32, u32, u32); /// Computed sizes for all dispatches. 
#[derive(Copy, Clone, Debug, Default)] pub struct WorkgroupCounts { - pub use_large_path_scan: bool, - pub path_reduce: WorkgroupSize, - pub path_reduce2: WorkgroupSize, - pub path_scan1: WorkgroupSize, pub path_scan: WorkgroupSize, pub bbox_clear: WorkgroupSize, pub flatten: WorkgroupSize, @@ -237,12 +233,6 @@ impl WorkgroupCounts { let n_clips = layout.n_clips; let path_tag_padded = align_up(n_path_tags, 4 * PATH_REDUCE_WG); let path_tag_wgs = path_tag_padded / (4 * PATH_REDUCE_WG); - let use_large_path_scan = path_tag_wgs > PATH_REDUCE_WG; - let reduced_size = if use_large_path_scan { - align_up(path_tag_wgs, PATH_REDUCE_WG) - } else { - path_tag_wgs - }; let draw_object_wgs = (n_draw_objects + PATH_BBOX_WG - 1) / PATH_BBOX_WG; let draw_monoid_wgs = draw_object_wgs.min(PATH_BBOX_WG); let flatten_wgs = (n_path_tags + FLATTEN_WG - 1) / FLATTEN_WG; @@ -252,10 +242,6 @@ impl WorkgroupCounts { let width_in_bins = (width_in_tiles + 15) / 16; let height_in_bins = (height_in_tiles + 15) / 16; Self { - use_large_path_scan, - path_reduce: (path_tag_wgs, 1, 1), - path_reduce2: (PATH_REDUCE_WG, 1, 1), - path_scan1: (reduced_size / PATH_REDUCE_WG, 1, 1), path_scan: (path_tag_wgs, 1, 1), bbox_clear: (draw_object_wgs, 1, 1), flatten: (flatten_wgs, 1, 1), @@ -364,13 +350,8 @@ impl BufferSizes { let n_paths = layout.n_paths; let n_draw_objects = layout.n_draw_objects; let n_clips = layout.n_clips; - let path_tag_wgs = workgroups.path_reduce.0; - let reduced_size = if workgroups.use_large_path_scan { - align_up(path_tag_wgs, PATH_REDUCE_WG) - } else { - path_tag_wgs - }; - let path_reduced = BufferSize::new(reduced_size); + let path_tag_wgs = workgroups.path_scan.0; + let path_reduced = BufferSize::new(path_tag_wgs); let path_scan_bump = BufferSize::new(1); let path_monoids = BufferSize::new(path_tag_wgs * PATH_REDUCE_WG); let path_bboxes = BufferSize::new(n_paths); From 383d37ec624a2185004c2f5e93b9f382dae139df Mon Sep 17 00:00:00 2001 From: Thomas Smith <68340554+b0nes164@users.noreply.github.com> Date: Wed, 18 Sep 2024 20:46:16 -0700 Subject: [PATCH 05/15] Cargo fmt fix --- vello/src/render.rs | 21 ++++++++++++++------ vello/src/shaders.rs | 3 ++- vello_shaders/src/cpu/pathtag_scan_single.rs | 2 +- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/vello/src/render.rs b/vello/src/render.rs index 81aae53b7..6e8749440 100644 --- a/vello/src/render.rs +++ b/vello/src/render.rs @@ -178,7 +178,6 @@ impl Render { // is zero. 
packed.resize(size_of::(), u8::MAX); } - let scene_buf = ResourceProxy::Buffer(recording.upload("vello.scene", packed)); let config_buf = ResourceProxy::Buffer( recording.upload_uniform("vello.config", bytemuck::bytes_of(&cpu_config.gpu)), @@ -189,9 +188,12 @@ impl Render { ); let tile_buf = ResourceProxy::new_buf(buffer_sizes.tiles.size_in_bytes().into(), "vello.tile_buf"); - let segments_buf = - ResourceProxy::new_buf(buffer_sizes.segments.size_in_bytes().into(), "vello.segments_buf"); - let ptcl_buf = ResourceProxy::new_buf(buffer_sizes.ptcl.size_in_bytes().into(), "vello.ptcl_buf"); + let segments_buf = ResourceProxy::new_buf( + buffer_sizes.segments.size_in_bytes().into(), + "vello.segments_buf", + ); + let ptcl_buf = + ResourceProxy::new_buf(buffer_sizes.ptcl.size_in_bytes().into(), "vello.ptcl_buf"); let tagmonoid_buf = ResourceProxy::new_buf( buffer_sizes.path_monoids.size_in_bytes().into(), "vello.tagmonoid_buf", @@ -202,7 +204,8 @@ impl Render { ); let path_scan_bump_buf = BufferProxy::new( buffer_sizes.path_scan_bump.size_in_bytes().into(), - "path_scan_bump_buf"); + "path_scan_bump_buf", + ); recording.clear_all(path_scan_bump_buf); recording.clear_all(reduced_buf); let path_scan_bump_buf = ResourceProxy::Buffer(path_scan_bump_buf); @@ -210,7 +213,13 @@ impl Render { recording.dispatch( shaders.pathtag_scan_csdldf, wg_counts.path_scan, - [config_buf, scene_buf, reduced_buf, tagmonoid_buf, path_scan_bump_buf], + [ + config_buf, + scene_buf, + reduced_buf, + tagmonoid_buf, + path_scan_bump_buf, + ], ); recording.free_resource(reduced_buf); recording.free_resource(path_scan_bump_buf); diff --git a/vello/src/shaders.rs b/vello/src/shaders.rs index 11152337c..241928949 100644 --- a/vello/src/shaders.rs +++ b/vello/src/shaders.rs @@ -97,7 +97,8 @@ pub(crate) fn full_shaders( let pathtag_scan_csdldf = add_shader!( pathtag_scan_csdldf, [Uniform, BufReadOnly, Buffer, Buffer, Buffer], - CpuShaderType::Present(vello_shaders::cpu::pathtag_scan_single)); + CpuShaderType::Present(vello_shaders::cpu::pathtag_scan_single) + ); let bbox_clear = add_shader!(bbox_clear, [Uniform, Buffer]); let flatten = add_shader!( diff --git a/vello_shaders/src/cpu/pathtag_scan_single.rs b/vello_shaders/src/cpu/pathtag_scan_single.rs index a2926b132..53858509a 100644 --- a/vello_shaders/src/cpu/pathtag_scan_single.rs +++ b/vello_shaders/src/cpu/pathtag_scan_single.rs @@ -27,4 +27,4 @@ pub fn pathtag_scan_single(n_wg: u32, resources: &[CpuBinding]) { let scene = resources[1].as_slice(); let mut tag_monoids = resources[3].as_slice_mut(); pathtag_scan_single_main(n_wg, &config, &scene, &mut tag_monoids); -} \ No newline at end of file +} From 28193eb952c0cf5f6236551d16ffc32adf14d759 Mon Sep 17 00:00:00 2001 From: Thomas Smith <68340554+b0nes164@users.noreply.github.com> Date: Wed, 18 Sep 2024 21:08:01 -0700 Subject: [PATCH 06/15] Fix incorrect transcription of function. 
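When the fallback path publishes this partition's inclusive value, it must add
the lookback prefix to this partition's own aggregate, not to the fallback
tile's aggregate. The fallback() helper therefore gains an aggregate parameter,
and its inclusive store changes from

    ((fallback_aggregate + *prev) << 2u) | FLAG_INCLUSIVE

to

    ((aggregate + *prev) << 2u) | FLAG_INCLUSIVE

matching what the pre-refactor inlined code computed with agg[i] + prev[i].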
--- vello_shaders/shader/pathtag_scan_csdldf.wgsl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vello_shaders/shader/pathtag_scan_csdldf.wgsl b/vello_shaders/shader/pathtag_scan_csdldf.wgsl index 812944e16..f50ea5038 100644 --- a/vello_shaders/shader/pathtag_scan_csdldf.wgsl +++ b/vello_shaders/shader/pathtag_scan_csdldf.wgsl @@ -70,6 +70,7 @@ fn fallback( part_ix: u32, fallback_ix: u32, member_ix: u32, + aggregate: u32, fallback_aggregate: u32, prev: ptr, inclusive_complete: ptr @@ -82,7 +83,7 @@ fn fallback( *prev += prev_payload >> 2u; } if(fallback_ix == 0u || (prev_payload & FLAG_MASK) == FLAG_INCLUSIVE){ - atomicStore(&reduced[part_ix][member_ix], ((fallback_aggregate + *prev) << 2u) | FLAG_INCLUSIVE); + atomicStore(&reduced[part_ix][member_ix], ((aggregate + *prev) << 2u) | FLAG_INCLUSIVE); sh_tag_broadcast[member_ix] = *prev; *inclusive_complete = true; } @@ -327,6 +328,7 @@ fn main( part_ix, fallback_ix, 0u, + agg[0u], f_agg[0u], &prev0, &inc0, @@ -339,6 +341,7 @@ fn main( part_ix, fallback_ix, 1u, + agg[1u], f_agg[1u], &prev1, &inc1, @@ -351,6 +354,7 @@ fn main( part_ix, fallback_ix, 2u, + agg[2u], f_agg[2u], &prev2, &inc2, @@ -363,6 +367,7 @@ fn main( part_ix, fallback_ix, 3u, + agg[3u], f_agg[3u], &prev3, &inc3, @@ -375,6 +380,7 @@ fn main( part_ix, fallback_ix, 4u, + agg[4u], f_agg[4u], &prev4, &inc4, From 76235ca6445c376a3e60d1e36dd711b5647cc827 Mon Sep 17 00:00:00 2001 From: Thomas Smith <68340554+b0nes164@users.noreply.github.com> Date: Thu, 19 Sep 2024 09:01:42 -0700 Subject: [PATCH 07/15] Style fixes --- vello/src/render.rs | 4 +- vello/src/shaders.rs | 2 +- vello_shaders/shader/pathtag_scan_csdldf.wgsl | 108 +++++++++--------- vello_shaders/shader/shared/pathtag.wgsl | 2 +- vello_shaders/src/cpu.rs | 2 +- vello_shaders/src/cpu/pathtag_scan_single.rs | 6 +- 6 files changed, 62 insertions(+), 62 deletions(-) diff --git a/vello/src/render.rs b/vello/src/render.rs index 6e8749440..68e0270b1 100644 --- a/vello/src/render.rs +++ b/vello/src/render.rs @@ -200,11 +200,11 @@ impl Render { ); let reduced_buf = BufferProxy::new( buffer_sizes.path_reduced.size_in_bytes().into(), - "reduced_buf", + "vello.reduced_buf", ); let path_scan_bump_buf = BufferProxy::new( buffer_sizes.path_scan_bump.size_in_bytes().into(), - "path_scan_bump_buf", + "vello.path_scan_bump_buf", ); recording.clear_all(path_scan_bump_buf); recording.clear_all(reduced_buf); diff --git a/vello/src/shaders.rs b/vello/src/shaders.rs index 241928949..7b0ff1ee2 100644 --- a/vello/src/shaders.rs +++ b/vello/src/shaders.rs @@ -97,7 +97,7 @@ pub(crate) fn full_shaders( let pathtag_scan_csdldf = add_shader!( pathtag_scan_csdldf, [Uniform, BufReadOnly, Buffer, Buffer, Buffer], - CpuShaderType::Present(vello_shaders::cpu::pathtag_scan_single) + CpuShaderType::Present(vello_shaders::cpu::pathtag_scan) ); let bbox_clear = add_shader!(bbox_clear, [Uniform, Buffer]); diff --git a/vello_shaders/shader/pathtag_scan_csdldf.wgsl b/vello_shaders/shader/pathtag_scan_csdldf.wgsl index f50ea5038..0ca8ba7b9 100644 --- a/vello_shaders/shader/pathtag_scan_csdldf.wgsl +++ b/vello_shaders/shader/pathtag_scan_csdldf.wgsl @@ -17,7 +17,7 @@ var reduced: array, 5>>; var tag_monoids: array>; @group(0) @binding(4) -var scan_bump: array>; +var scan_bump: atomic; //Workgroup info let LG_WG_SIZE = 8u; @@ -53,11 +53,11 @@ fn attempt_lookback( ){ let payload: u32 = atomicLoad(&reduced[lookback_ix][member_ix]); let flag_value: u32 = payload & FLAG_MASK; - if(flag_value == FLAG_REDUCTION){ + if flag_value == FLAG_REDUCTION { 
*spin_count = 0u; *prev += payload >> 2u; *reduction_complete = true; - } else if (flag_value == FLAG_INCLUSIVE){ + } else if flag_value == FLAG_INCLUSIVE { *spin_count = 0u; *prev += payload >> 2u; atomicStore(&reduced[part_ix][member_ix], ((aggregate + *prev) << 2u) | FLAG_INCLUSIVE); @@ -77,12 +77,12 @@ fn fallback( ){ let fallback_payload = (fallback_aggregate << 2u) | select(FLAG_INCLUSIVE, FLAG_REDUCTION, fallback_ix != 0u); let prev_payload = atomicMax(&reduced[fallback_ix][member_ix], fallback_payload); - if(prev_payload == 0u){ + if prev_payload == 0u { *prev += fallback_aggregate; } else { *prev += prev_payload >> 2u; } - if(fallback_ix == 0u || (prev_payload & FLAG_MASK) == FLAG_INCLUSIVE){ + if fallback_ix == 0u || (prev_payload & FLAG_MASK) == FLAG_INCLUSIVE { atomicStore(&reduced[part_ix][member_ix], ((aggregate + *prev) << 2u) | FLAG_INCLUSIVE); sh_tag_broadcast[member_ix] = *prev; *inclusive_complete = true; @@ -94,8 +94,8 @@ fn main( @builtin(local_invocation_id) local_id: vec3, ) { //acquire the partition index, set the lock - if(local_id.x == 0u){ - sh_broadcast = atomicAdd(&scan_bump[0u], 1u); + if local_id.x == 0u { + sh_broadcast = atomicAdd(&scan_bump, 1u); sh_lock = LOCKED; } workgroupBarrier(); @@ -121,7 +121,7 @@ fn main( //Broadcast the results and flag into device memory if local_id.x == WG_SIZE - 1u { - if(part_ix != 0u){ + if part_ix != 0u { atomicStore(&reduced[part_ix][0], (agg[0] << 2u) | FLAG_REDUCTION); atomicStore(&reduced[part_ix][1], (agg[1] << 2u) | FLAG_REDUCTION); atomicStore(&reduced[part_ix][2], (agg[2] << 2u) | FLAG_REDUCTION); @@ -137,7 +137,7 @@ fn main( } //Lookback and potentially fallback - if(part_ix != 0u){ + if part_ix != 0u { var lookback_ix = part_ix - 1u; var inc0: bool = false; @@ -163,15 +163,15 @@ fn main( //Lookback, with a single thread //Last thread in the workgroup has the complete aggregate - if(local_id.x == WG_SIZE - 1u){ - for(var spin_count: u32 = 0u; spin_count < MAX_SPIN_COUNT; ){ + if local_id.x == WG_SIZE - 1u { + for (var spin_count: u32 = 0u; spin_count < MAX_SPIN_COUNT; ) { //TRANS_IX - if(!inc0 && !red0){ + if !inc0 && !red0 { attempt_lookback( part_ix, lookback_ix, 0u, - agg[0u], + agg[0], &spin_count, &prev0, &red0, @@ -179,12 +179,12 @@ fn main( } //PATHSEG_IX - if(!inc1 && !red1){ + if !inc1 && !red1 { attempt_lookback( part_ix, lookback_ix, 1u, - agg[1u], + agg[1], &spin_count, &prev1, &red1, @@ -192,12 +192,12 @@ fn main( } //PATHSEG_OFFSET - if(!inc2 && !red2){ + if !inc2 && !red2 { attempt_lookback( part_ix, lookback_ix, 2u, - agg[2u], + agg[2], &spin_count, &prev2, &red2, @@ -205,12 +205,12 @@ fn main( } //STYLE_IX - if(!inc3 && !red3){ + if !inc3 && !red3 { attempt_lookback( part_ix, lookback_ix, 3u, - agg[3u], + agg[3], &spin_count, &prev3, &red3, @@ -218,12 +218,12 @@ fn main( } //PATH_IX - if(!inc4 && !red4){ + if !inc4 && !red4 { attempt_lookback( part_ix, lookback_ix, 4u, - agg[4u], + agg[4], &spin_count, &prev4, &red4, @@ -231,8 +231,8 @@ fn main( } //Have we completed the current reduction or inclusive sum for all PathTag members? 
- if((inc0 || red0) && (inc1 || red1) && (inc2 || red2) && (inc3 || red3) && (inc4 || red4)){ - if(inc0 && inc1 && inc2 && inc3 && inc4){ + if (inc0 || red0) && (inc1 || red1) && (inc2 || red2) && (inc3 || red3) && (inc4 || red4) { + if inc0 && inc1 && inc2 && inc3 && inc4 { sh_lock = UNLOCKED; break; } else { @@ -251,7 +251,7 @@ fn main( //If we didn't complete the lookback within the allotted spins, //prepare for the fallback by broadcasting the lookback tile id //and states of the tagmonoid struct members - if(sh_lock == LOCKED){ + if sh_lock == LOCKED { sh_broadcast = lookback_ix; sh_fallback_state[0] = !inc0 && !red0; sh_fallback_state[1] = !inc1 && !red1; @@ -263,7 +263,7 @@ fn main( workgroupBarrier(); //Fallback - if(sh_lock == LOCKED){ + if sh_lock == LOCKED { let fallback_ix = sh_broadcast; red0 = sh_fallback_state[0]; @@ -282,106 +282,106 @@ fn main( workgroupBarrier(); if local_id.x + (1u << i) < WG_SIZE { let index = local_id.x + (1u << i); - if(red0){ + if red0 { f_agg[0] += sh_fallback[index][0]; } - if(red1){ + if red1 { f_agg[1] += sh_fallback[index][1]; } - if(red2){ + if red2 { f_agg[2] += sh_fallback[index][2]; } - if(red3){ + if red3 { f_agg[3] += sh_fallback[index][3]; } - if(red4){ + if red4 { f_agg[4] += sh_fallback[index][4]; } } workgroupBarrier(); - if(red0){ + if red0 { sh_fallback[local_id.x][0] = f_agg[0]; } - if(red1){ + if red1 { sh_fallback[local_id.x][1] = f_agg[1]; } - if(red2){ + if red2 { sh_fallback[local_id.x][2] = f_agg[2]; } - if(red3){ + if red3 { sh_fallback[local_id.x][3] = f_agg[3]; } - if(red4){ + if red4 { sh_fallback[local_id.x][4] = f_agg[4]; } } //Fallback and attempt insertion of status flag - if(local_id.x == WG_SIZE - 1u){ + if local_id.x == WG_SIZE - 1u { //TRANS_IX FALLBACK - if(red0){ + if red0 { fallback( part_ix, fallback_ix, 0u, - agg[0u], - f_agg[0u], + agg[0], + f_agg[0], &prev0, &inc0, ); } //PATHSEG_IX FALLBACK - if(red1){ + if red1 { fallback( part_ix, fallback_ix, 1u, - agg[1u], - f_agg[1u], + agg[1], + f_agg[1], &prev1, &inc1, ); } //PATHSEG_OFFSET FALLBACK - if(red2){ + if red2 { fallback( part_ix, fallback_ix, 2u, - agg[2u], - f_agg[2u], + agg[2], + f_agg[2], &prev2, &inc2, ); } //STYLE_IX FALLBACK - if(red3){ + if red3 { fallback( part_ix, fallback_ix, 3u, - agg[3u], - f_agg[3u], + agg[3], + f_agg[3], &prev3, &inc3, ); } //PATH_IX FALLBACK - if(red4){ + if red4 { fallback( part_ix, fallback_ix, 4u, - agg[4u], - f_agg[4u], + agg[4], + f_agg[4], &prev4, &inc4, ); @@ -389,7 +389,7 @@ fn main( //At this point, the reductions are guaranteed to be complete, //so try unlocking, else, keep looking back - if(inc0 && inc1 && inc2 && inc3 && inc4){ + if inc0 && inc1 && inc2 && inc3 && inc4 { sh_lock = UNLOCKED; } else { lookback_ix--; @@ -402,7 +402,7 @@ fn main( workgroupBarrier(); var tm: array; - if(part_ix != 0u){ + if part_ix != 0u { tm = sh_tag_broadcast; } else { tm[0] = 0u; @@ -412,7 +412,7 @@ fn main( tm[4] = 0u; } - if(local_id.x != 0u){ + if local_id.x != 0u { let other: array = sh_scratch[local_id.x - 1u]; tm[0] += other[0]; tm[1] += other[1]; diff --git a/vello_shaders/shader/shared/pathtag.wgsl b/vello_shaders/shader/shared/pathtag.wgsl index b62e7cef6..aa4dc93ad 100644 --- a/vello_shaders/shader/shared/pathtag.wgsl +++ b/vello_shaders/shader/shared/pathtag.wgsl @@ -84,4 +84,4 @@ fn reduce_tag_arr(tag_word: u32) -> array { c[4] = countOneBits(tag_word & (PATH_TAG_PATH * 0x1010101u)); c[3] = countOneBits(tag_word & (PATH_TAG_STYLE * 0x1010101u)) * STYLE_SIZE_IN_WORDS; return c; -} \ No newline at end of file +} diff 
--git a/vello_shaders/src/cpu.rs b/vello_shaders/src/cpu.rs index 2aa8d677c..5df1a52cb 100644 --- a/vello_shaders/src/cpu.rs +++ b/vello_shaders/src/cpu.rs @@ -42,7 +42,7 @@ pub use path_count::path_count; pub use path_count_setup::path_count_setup; pub use path_tiling::path_tiling; pub use path_tiling_setup::path_tiling_setup; -pub use pathtag_scan_single::pathtag_scan_single; +pub use pathtag_scan_single::pathtag_scan; pub use tile_alloc::tile_alloc; use std::cell::{Ref, RefCell, RefMut}; diff --git a/vello_shaders/src/cpu/pathtag_scan_single.rs b/vello_shaders/src/cpu/pathtag_scan_single.rs index 53858509a..79c1b3a22 100644 --- a/vello_shaders/src/cpu/pathtag_scan_single.rs +++ b/vello_shaders/src/cpu/pathtag_scan_single.rs @@ -7,7 +7,7 @@ use super::CpuBinding; const WG_SIZE: usize = 256; -fn pathtag_scan_single_main( +fn pathtag_scan_main( n_wg: u32, config: &ConfigUniform, scene: &[u32], @@ -22,9 +22,9 @@ fn pathtag_scan_single_main( } } -pub fn pathtag_scan_single(n_wg: u32, resources: &[CpuBinding]) { +pub fn pathtag_scan(n_wg: u32, resources: &[CpuBinding]) { let config = resources[0].as_typed(); let scene = resources[1].as_slice(); let mut tag_monoids = resources[3].as_slice_mut(); - pathtag_scan_single_main(n_wg, &config, &scene, &mut tag_monoids); + pathtag_scan_main(n_wg, &config, &scene, &mut tag_monoids); } From cf24cdefea6d4dbd8542d117701acf061b699d89 Mon Sep 17 00:00:00 2001 From: Thomas Smith <68340554+b0nes164@users.noreply.github.com> Date: Thu, 19 Sep 2024 10:23:44 -0700 Subject: [PATCH 08/15] Fix bug in fallback, remove unused permutations Change fallback reduce pattern to match scan pattern so last thread in workgroup has the correct aggregate in registers without need for additional barrier. --- vello_shaders/shader/pathtag_scan_csdldf.wgsl | 4 ++-- vello_shaders/shader/permutations | 3 --- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/vello_shaders/shader/pathtag_scan_csdldf.wgsl b/vello_shaders/shader/pathtag_scan_csdldf.wgsl index 0ca8ba7b9..38373e98c 100644 --- a/vello_shaders/shader/pathtag_scan_csdldf.wgsl +++ b/vello_shaders/shader/pathtag_scan_csdldf.wgsl @@ -280,8 +280,8 @@ fn main( sh_fallback[local_id.x] = f_agg; for (var i = 0u; i < LG_WG_SIZE; i += 1u) { workgroupBarrier(); - if local_id.x + (1u << i) < WG_SIZE { - let index = local_id.x + (1u << i); + let index = local_id.x - (1u << i); + if index >= 0u { if red0 { f_agg[0] += sh_fallback[index][0]; } diff --git a/vello_shaders/shader/permutations b/vello_shaders/shader/permutations index 011541eb2..cde92c360 100644 --- a/vello_shaders/shader/permutations +++ b/vello_shaders/shader/permutations @@ -1,6 +1,3 @@ -pathtag_scan -+ pathtag_scan_large -+ pathtag_scan_small: small fine + fine_area + fine_msaa8: msaa msaa8 From a9786e1fed8b780bae10a00b730828154b5f4d3f Mon Sep 17 00:00:00 2001 From: Thomas Smith <68340554+b0nes164@users.noreply.github.com> Date: Thu, 19 Sep 2024 11:52:38 -0700 Subject: [PATCH 09/15] Further shader style fixes. 
--- vello_shaders/shader/pathtag_scan_csdldf.wgsl | 54 +++++++++---------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/vello_shaders/shader/pathtag_scan_csdldf.wgsl b/vello_shaders/shader/pathtag_scan_csdldf.wgsl index 38373e98c..3322b52bd 100644 --- a/vello_shaders/shader/pathtag_scan_csdldf.wgsl +++ b/vello_shaders/shader/pathtag_scan_csdldf.wgsl @@ -51,8 +51,8 @@ fn attempt_lookback( reduction_complete: ptr, inclusive_complete: ptr ){ - let payload: u32 = atomicLoad(&reduced[lookback_ix][member_ix]); - let flag_value: u32 = payload & FLAG_MASK; + let payload = atomicLoad(&reduced[lookback_ix][member_ix]); + let flag_value = payload & FLAG_MASK; if flag_value == FLAG_REDUCTION { *spin_count = 0u; *prev += payload >> 2u; @@ -99,16 +99,16 @@ fn main( sh_lock = LOCKED; } workgroupBarrier(); - let part_ix: u32 = sh_broadcast; + let part_ix = sh_broadcast; //Local Scan, Hillis-Steel/Kogge-Stone - let tag_word: u32 = scene[config.pathtag_base + local_id.x + part_ix * WG_SIZE]; - var agg: array = reduce_tag_arr(tag_word); + let tag_word = scene[config.pathtag_base + local_id.x + part_ix * WG_SIZE]; + var agg = reduce_tag_arr(tag_word); sh_scratch[local_id.x] = agg; - for (var i: u32 = 0u; i < LG_WG_SIZE; i += 1u) { + for (var i = 0u; i < LG_WG_SIZE; i += 1u) { workgroupBarrier(); if local_id.x >= 1u << i { - let other: array = sh_scratch[local_id.x - (1u << i)]; + let other = sh_scratch[local_id.x - (1u << i)]; agg[0] += other[0]; agg[1] += other[1]; agg[2] += other[2]; @@ -140,31 +140,31 @@ fn main( if part_ix != 0u { var lookback_ix = part_ix - 1u; - var inc0: bool = false; - var inc1: bool = false; - var inc2: bool = false; - var inc3: bool = false; - var inc4: bool = false; - - var prev0: u32 = 0u; - var prev1: u32 = 0u; - var prev2: u32 = 0u; - var prev3: u32 = 0u; - var prev4: u32 = 0u; + var inc0 = false; + var inc1 = false; + var inc2 = false; + var inc3 = false; + var inc4 = false; + + var prev0 = 0u; + var prev1 = 0u; + var prev2 = 0u; + var prev3 = 0u; + var prev4 = 0u; while(sh_lock == LOCKED){ workgroupBarrier(); - var red0: bool = false; - var red1: bool = false; - var red2: bool = false; - var red3: bool = false; - var red4: bool = false; + var red0 = false; + var red1 = false; + var red2 = false; + var red3 = false; + var red4 = false; //Lookback, with a single thread //Last thread in the workgroup has the complete aggregate if local_id.x == WG_SIZE - 1u { - for (var spin_count: u32 = 0u; spin_count < MAX_SPIN_COUNT; ) { + for (var spin_count = 0u; spin_count < MAX_SPIN_COUNT; ) { //TRANS_IX if !inc0 && !red0 { attempt_lookback( @@ -275,8 +275,8 @@ fn main( //Fallback Reduce //Is there an alternative to this besides a giant switch statement or //5 individual reductions? - let f_word: u32 = scene[config.pathtag_base + local_id.x + fallback_ix * WG_SIZE]; - var f_agg: array = reduce_tag_arr(f_word); + let f_word = scene[config.pathtag_base + local_id.x + fallback_ix * WG_SIZE]; + var f_agg = reduce_tag_arr(f_word); sh_fallback[local_id.x] = f_agg; for (var i = 0u; i < LG_WG_SIZE; i += 1u) { workgroupBarrier(); @@ -413,7 +413,7 @@ fn main( } if local_id.x != 0u { - let other: array = sh_scratch[local_id.x - 1u]; + let other = sh_scratch[local_id.x - 1u]; tm[0] += other[0]; tm[1] += other[1]; tm[2] += other[2]; From b7652e49a034c6aedab1ba1f3732aaa1dd48fe1c Mon Sep 17 00:00:00 2001 From: Thomas Smith <68340554+b0nes164@users.noreply.github.com> Date: Thu, 19 Sep 2024 19:54:39 -0700 Subject: [PATCH 10/15] Refactor shader to use loops. 
Change reduce_tag to marshaling function. --- vello_shaders/shader/pathtag_scan_csdldf.wgsl | 424 ++++++------------ vello_shaders/shader/shared/pathtag.wgsl | 32 +- 2 files changed, 151 insertions(+), 305 deletions(-) diff --git a/vello_shaders/shader/pathtag_scan_csdldf.wgsl b/vello_shaders/shader/pathtag_scan_csdldf.wgsl index 3322b52bd..4028a3d94 100644 --- a/vello_shaders/shader/pathtag_scan_csdldf.wgsl +++ b/vello_shaders/shader/pathtag_scan_csdldf.wgsl @@ -11,10 +11,10 @@ var config: Config; var scene: array; @group(0) @binding(2) -var reduced: array, 5>>; +var reduced: array, PATH_MEMBERS>>; @group(0) @binding(3) -var tag_monoids: array>; +var tag_monoids: array>; @group(0) @binding(4) var scan_bump: atomic; @@ -24,69 +24,49 @@ let LG_WG_SIZE = 8u; let WG_SIZE = 256u; //For the decoupled lookback -let FLAG_NOT_READY: u32 = 0; -let FLAG_REDUCTION: u32 = 1; -let FLAG_INCLUSIVE: u32 = 2; -let FLAG_MASK: u32 = 3; +let FLAG_NOT_READY = 0u; +let FLAG_REDUCTION = 1u; +let FLAG_INCLUSIVE = 2u; +let FLAG_MASK = 3u; //For the decoupled fallback -let MAX_SPIN_COUNT: u32 = 4; -let LOCKED: u32 = 1; -let UNLOCKED: u32 = 0; +let MAX_SPIN_COUNT = 4u; +let LOCKED = 1u; +let UNLOCKED = 0u; var sh_broadcast: u32; var sh_lock: u32; -var sh_scratch: array, WG_SIZE>; -var sh_fallback: array, WG_SIZE>; -var sh_tag_broadcast: array; -var sh_fallback_state: array; +var sh_scratch: array, WG_SIZE>; +var sh_tag_broadcast: array; +var sh_fallback_state: array; -fn attempt_lookback( - part_ix: u32, - lookback_ix: u32, - member_ix: u32, - aggregate: u32, - spin_count: ptr, - prev: ptr, - reduction_complete: ptr, - inclusive_complete: ptr -){ - let payload = atomicLoad(&reduced[lookback_ix][member_ix]); - let flag_value = payload & FLAG_MASK; - if flag_value == FLAG_REDUCTION { - *spin_count = 0u; - *prev += payload >> 2u; - *reduction_complete = true; - } else if flag_value == FLAG_INCLUSIVE { - *spin_count = 0u; - *prev += payload >> 2u; - atomicStore(&reduced[part_ix][member_ix], ((aggregate + *prev) << 2u) | FLAG_INCLUSIVE); - sh_tag_broadcast[member_ix] = *prev; - *inclusive_complete = true; - } +struct pathtag_wrapper{ + p: array } -fn fallback( - part_ix: u32, - fallback_ix: u32, - member_ix: u32, - aggregate: u32, - fallback_aggregate: u32, - prev: ptr, - inclusive_complete: ptr -){ - let fallback_payload = (fallback_aggregate << 2u) | select(FLAG_INCLUSIVE, FLAG_REDUCTION, fallback_ix != 0u); - let prev_payload = atomicMax(&reduced[fallback_ix][member_ix], fallback_payload); - if prev_payload == 0u { - *prev += fallback_aggregate; - } else { - *prev += prev_payload >> 2u; - } - if fallback_ix == 0u || (prev_payload & FLAG_MASK) == FLAG_INCLUSIVE { - atomicStore(&reduced[part_ix][member_ix], ((aggregate + *prev) << 2u) | FLAG_INCLUSIVE); - sh_tag_broadcast[member_ix] = *prev; - *inclusive_complete = true; - } +struct state_wrapper{ + s: array +} + +//TODO: There has to be a better way to initialize an array? 
+fn clear_pathtag()->array{ + var a: array; + a[0] = 0u; + a[1] = 0u; + a[2] = 0u; + a[3] = 0u; + a[4] = 0u; + return a; +} + +fn clear_state()->array{ + var a: array; + a[0] = false; + a[1] = false; + a[2] = false; + a[3] = false; + a[4] = false; + return a; } @compute @workgroup_size(256) @@ -103,145 +83,88 @@ fn main( //Local Scan, Hillis-Steel/Kogge-Stone let tag_word = scene[config.pathtag_base + local_id.x + part_ix * WG_SIZE]; - var agg = reduce_tag_arr(tag_word); - sh_scratch[local_id.x] = agg; + var agg: pathtag_wrapper; + agg.p = reduce_tag_arr(tag_word); + sh_scratch[local_id.x] = agg.p; for (var i = 0u; i < LG_WG_SIZE; i += 1u) { workgroupBarrier(); if local_id.x >= 1u << i { - let other = sh_scratch[local_id.x - (1u << i)]; - agg[0] += other[0]; - agg[1] += other[1]; - agg[2] += other[2]; - agg[3] += other[3]; - agg[4] += other[4]; + var other: pathtag_wrapper; + other.p = sh_scratch[local_id.x - (1u << i)]; + for (var k = 0u; k < 5u; k += 1u){ + agg.p[k] += other.p[k]; + } } workgroupBarrier(); - sh_scratch[local_id.x] = agg; + if i < LG_WG_SIZE - 1u { + sh_scratch[local_id.x] = agg.p; + } } //Broadcast the results and flag into device memory if local_id.x == WG_SIZE - 1u { - if part_ix != 0u { - atomicStore(&reduced[part_ix][0], (agg[0] << 2u) | FLAG_REDUCTION); - atomicStore(&reduced[part_ix][1], (agg[1] << 2u) | FLAG_REDUCTION); - atomicStore(&reduced[part_ix][2], (agg[2] << 2u) | FLAG_REDUCTION); - atomicStore(&reduced[part_ix][3], (agg[3] << 2u) | FLAG_REDUCTION); - atomicStore(&reduced[part_ix][4], (agg[4] << 2u) | FLAG_REDUCTION); - } else { - atomicStore(&reduced[part_ix][0], (agg[0] << 2u) | FLAG_INCLUSIVE); - atomicStore(&reduced[part_ix][1], (agg[1] << 2u) | FLAG_INCLUSIVE); - atomicStore(&reduced[part_ix][2], (agg[2] << 2u) | FLAG_INCLUSIVE); - atomicStore(&reduced[part_ix][3], (agg[3] << 2u) | FLAG_INCLUSIVE); - atomicStore(&reduced[part_ix][4], (agg[4] << 2u) | FLAG_INCLUSIVE); + for (var i = 0u; i < PATH_MEMBERS; i += 1u) { + atomicStore(&reduced[part_ix][i], (agg.p[i] << 2u) | select(FLAG_INCLUSIVE, FLAG_REDUCTION, part_ix != 0u)); } } //Lookback and potentially fallback if part_ix != 0u { var lookback_ix = part_ix - 1u; - - var inc0 = false; - var inc1 = false; - var inc2 = false; - var inc3 = false; - var inc4 = false; - - var prev0 = 0u; - var prev1 = 0u; - var prev2 = 0u; - var prev3 = 0u; - var prev4 = 0u; + var inc_complete: state_wrapper; + inc_complete.s = clear_state(); + var prev_reduction: pathtag_wrapper; + prev_reduction.p = clear_pathtag(); while(sh_lock == LOCKED){ workgroupBarrier(); - - var red0 = false; - var red1 = false; - var red2 = false; - var red3 = false; - var red4 = false; + + var red_complete: state_wrapper; + for (var i = 0u; i < PATH_MEMBERS; i += 1u) { + red_complete.s[i] = false; + } //Lookback, with a single thread //Last thread in the workgroup has the complete aggregate if local_id.x == WG_SIZE - 1u { for (var spin_count = 0u; spin_count < MAX_SPIN_COUNT; ) { - //TRANS_IX - if !inc0 && !red0 { - attempt_lookback( - part_ix, - lookback_ix, - 0u, - agg[0], - &spin_count, - &prev0, - &red0, - &inc0); - } - - //PATHSEG_IX - if !inc1 && !red1 { - attempt_lookback( - part_ix, - lookback_ix, - 1u, - agg[1], - &spin_count, - &prev1, - &red1, - &inc1); - } - - //PATHSEG_OFFSET - if !inc2 && !red2 { - attempt_lookback( - part_ix, - lookback_ix, - 2u, - agg[2], - &spin_count, - &prev2, - &red2, - &inc2); + //Attempt Lookback + for (var i = 0u; i < PATH_MEMBERS; i += 1u) { + if !inc_complete.s[i] && !red_complete.s[i] { + let payload = 
atomicLoad(&reduced[lookback_ix][i]); + let flag_value = payload & FLAG_MASK; + if flag_value == FLAG_REDUCTION { + spin_count = 0u; + prev_reduction.p[i] += payload >> 2u; + red_complete.s[i] = true; + } else if flag_value == FLAG_INCLUSIVE { + spin_count = 0u; + prev_reduction.p[i] += payload >> 2u; + atomicStore(&reduced[part_ix][i], ((agg.p[i] + prev_reduction.p[i]) << 2u) | FLAG_INCLUSIVE); + sh_tag_broadcast[i] = prev_reduction.p[i]; + inc_complete.s[i] = true; + } + } } - //STYLE_IX - if !inc3 && !red3 { - attempt_lookback( - part_ix, - lookback_ix, - 3u, - agg[3], - &spin_count, - &prev3, - &red3, - &inc3); - } - - //PATH_IX - if !inc4 && !red4 { - attempt_lookback( - part_ix, - lookback_ix, - 4u, - agg[4], - &spin_count, - &prev4, - &red4, - &inc4); + //Have we completed the current reduction or inclusive sum for all PathTag members? + var can_advance = inc_complete.s[0] || red_complete.s[0]; + for (var i = 1u; i < PATH_MEMBERS; i += 1u) { + can_advance = can_advance && (inc_complete.s[i] || red_complete.s[i]); } - //Have we completed the current reduction or inclusive sum for all PathTag members? - if (inc0 || red0) && (inc1 || red1) && (inc2 || red2) && (inc3 || red3) && (inc4 || red4) { - if inc0 && inc1 && inc2 && inc3 && inc4 { + if can_advance { + //Are all lookbacks complete? + var all_complete = inc_complete.s[0]; + for (var i = 1u; i < PATH_MEMBERS; i += 1u) { + all_complete = all_complete && inc_complete.s[i]; + } + if all_complete { sh_lock = UNLOCKED; break; } else { lookback_ix--; - red0 = false; - red1 = false; - red2 = false; - red3 = false; - red4 = false; + red_complete.s = clear_state(); } } else { spin_count++; @@ -253,11 +176,9 @@ fn main( //and states of the tagmonoid struct members if sh_lock == LOCKED { sh_broadcast = lookback_ix; - sh_fallback_state[0] = !inc0 && !red0; - sh_fallback_state[1] = !inc1 && !red1; - sh_fallback_state[2] = !inc2 && !red2; - sh_fallback_state[3] = !inc3 && !red3; - sh_fallback_state[4] = !inc4 && !red4; + for (var i = 0u; i < PATH_MEMBERS; i += 1u) { + sh_fallback_state[i] = !inc_complete.s[i] && !red_complete.s[i]; + } } } workgroupBarrier(); @@ -265,131 +186,62 @@ fn main( //Fallback if sh_lock == LOCKED { let fallback_ix = sh_broadcast; - - red0 = sh_fallback_state[0]; - red1 = sh_fallback_state[1]; - red2 = sh_fallback_state[2]; - red3 = sh_fallback_state[3]; - red4 = sh_fallback_state[4]; + for (var i = 0u; i < PATH_MEMBERS; i += 1u) { + red_complete.s[i] = sh_fallback_state[i]; + } //Fallback Reduce //Is there an alternative to this besides a giant switch statement or //5 individual reductions? 
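            //The whole workgroup re-reduces the stuck partition's tag words; the last thread then
            //publishes that aggregate with atomicMax, racing the stuck partition's own reduction,
            //and adds either the value already present or, if the slot was empty, its own
            //fallback aggregate to the running prefix.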
let f_word = scene[config.pathtag_base + local_id.x + fallback_ix * WG_SIZE]; - var f_agg = reduce_tag_arr(f_word); - sh_fallback[local_id.x] = f_agg; + var f_agg: pathtag_wrapper; + f_agg.p = reduce_tag_arr(f_word); + sh_scratch[local_id.x] = f_agg.p; for (var i = 0u; i < LG_WG_SIZE; i += 1u) { workgroupBarrier(); let index = local_id.x - (1u << i); if index >= 0u { - if red0 { - f_agg[0] += sh_fallback[index][0]; - } - if red1 { - f_agg[1] += sh_fallback[index][1]; - } - if red2 { - f_agg[2] += sh_fallback[index][2]; - } - if red3 { - f_agg[3] += sh_fallback[index][3]; - } - if red4 { - f_agg[4] += sh_fallback[index][4]; + for (var k = 0u; k < PATH_MEMBERS; k += 1u) { + if red_complete.s[k] { + f_agg.p[k] += sh_scratch[index][k]; + } } } workgroupBarrier(); - if red0 { - sh_fallback[local_id.x][0] = f_agg[0]; - } - - if red1 { - sh_fallback[local_id.x][1] = f_agg[1]; - } - - if red2 { - sh_fallback[local_id.x][2] = f_agg[2]; - } - - if red3 { - sh_fallback[local_id.x][3] = f_agg[3]; - } - - if red4 { - sh_fallback[local_id.x][4] = f_agg[4]; + if i < LG_WG_SIZE - 1u { + for (var k = 0u; k < PATH_MEMBERS; k += 1u) { + if red_complete.s[k] { + sh_scratch[local_id.x][k] = f_agg.p[k]; + } + } } } //Fallback and attempt insertion of status flag if local_id.x == WG_SIZE - 1u { - //TRANS_IX FALLBACK - if red0 { - fallback( - part_ix, - fallback_ix, - 0u, - agg[0], - f_agg[0], - &prev0, - &inc0, - ); - } - - //PATHSEG_IX FALLBACK - if red1 { - fallback( - part_ix, - fallback_ix, - 1u, - agg[1], - f_agg[1], - &prev1, - &inc1, - ); - } - - //PATHSEG_OFFSET FALLBACK - if red2 { - fallback( - part_ix, - fallback_ix, - 2u, - agg[2], - f_agg[2], - &prev2, - &inc2, - ); - } - - //STYLE_IX FALLBACK - if red3 { - fallback( - part_ix, - fallback_ix, - 3u, - agg[3], - f_agg[3], - &prev3, - &inc3, - ); - } - - //PATH_IX FALLBACK - if red4 { - fallback( - part_ix, - fallback_ix, - 4u, - agg[4], - f_agg[4], - &prev4, - &inc4, - ); + //Fallback + for (var i = 0u; i < PATH_MEMBERS; i += 1u) { + let fallback_payload = (f_agg.p[i] << 2u) | select(FLAG_INCLUSIVE, FLAG_REDUCTION, fallback_ix != 0u); + let prev_payload = atomicMax(&reduced[fallback_ix][i], fallback_payload); + if prev_payload == 0u { + prev_reduction.p[i] += f_agg.p[i]; + } else { + prev_reduction.p[i] += prev_payload >> 2u; + } + if fallback_ix == 0u || (prev_payload & FLAG_MASK) == FLAG_INCLUSIVE { + atomicStore(&reduced[part_ix][i], ((agg.p[i] + prev_reduction.p[i]) << 2u) | FLAG_INCLUSIVE); + sh_tag_broadcast[i] = prev_reduction.p[i]; + inc_complete.s[i] = true; + } } //At this point, the reductions are guaranteed to be complete, //so try unlocking, else, keep looking back - if inc0 && inc1 && inc2 && inc3 && inc4 { + var all_complete = inc_complete.s[0]; + for (var i = 1u; i < PATH_MEMBERS; i += 1u) { + all_complete = all_complete && inc_complete.s[i]; + } + if all_complete { sh_lock = UNLOCKED; } else { lookback_ix--; @@ -399,27 +251,23 @@ fn main( } } } + sh_scratch[local_id.x] = agg.p; workgroupBarrier(); - var tm: array; + var tm: pathtag_wrapper; if part_ix != 0u { - tm = sh_tag_broadcast; + tm.p = sh_tag_broadcast; } else { - tm[0] = 0u; - tm[1] = 0u; - tm[2] = 0u; - tm[3] = 0u; - tm[4] = 0u; + tm.p = clear_pathtag(); } if local_id.x != 0u { - let other = sh_scratch[local_id.x - 1u]; - tm[0] += other[0]; - tm[1] += other[1]; - tm[2] += other[2]; - tm[3] += other[3]; - tm[4] += other[4]; + var other: pathtag_wrapper; + other.p = sh_scratch[local_id.x - 1u]; + for (var i = 0u; i < PATH_MEMBERS; i += 1u) { + tm.p[i] += other.p[i]; + } } - 
tag_monoids[local_id.x + part_ix * WG_SIZE] = tm; + tag_monoids[local_id.x + part_ix * WG_SIZE] = tm.p; } diff --git a/vello_shaders/shader/shared/pathtag.wgsl b/vello_shaders/shader/shared/pathtag.wgsl index aa4dc93ad..1855e5335 100644 --- a/vello_shaders/shader/shared/pathtag.wgsl +++ b/vello_shaders/shader/shared/pathtag.wgsl @@ -10,6 +10,9 @@ struct TagMonoid { path_ix: u32, } +//The number of members in the TagMonoid struct +let PATH_MEMBERS = 5u; + let PATH_TAG_SEG_TYPE = 3u; let PATH_TAG_LINETO = 1u; let PATH_TAG_QUADTO = 2u; @@ -55,24 +58,8 @@ fn combine_tag_monoid(a: TagMonoid, b: TagMonoid) -> TagMonoid { return c; } -fn reduce_tag(tag_word: u32) -> TagMonoid { - var c: TagMonoid; - let point_count = tag_word & 0x3030303u; - c.pathseg_ix = countOneBits((point_count * 7u) & 0x4040404u); - c.trans_ix = countOneBits(tag_word & (PATH_TAG_TRANSFORM * 0x1010101u)); - let n_points = point_count + ((tag_word >> 2u) & 0x1010101u); - var a = n_points + (n_points & (((tag_word >> 3u) & 0x1010101u) * 15u)); - a += a >> 8u; - a += a >> 16u; - c.pathseg_offset = a & 0xffu; - c.path_ix = countOneBits(tag_word & (PATH_TAG_PATH * 0x1010101u)); - c.style_ix = countOneBits(tag_word & (PATH_TAG_STYLE * 0x1010101u)) * STYLE_SIZE_IN_WORDS; - return c; -} - -//An alternate version for the scan, using array instead of TagMonoid fn reduce_tag_arr(tag_word: u32) -> array { - var c: array; + var c: array; let point_count = tag_word & 0x3030303u; c[1] = countOneBits((point_count * 7u) & 0x4040404u); c[0] = countOneBits(tag_word & (PATH_TAG_TRANSFORM * 0x1010101u)); @@ -85,3 +72,14 @@ fn reduce_tag_arr(tag_word: u32) -> array { c[3] = countOneBits(tag_word & (PATH_TAG_STYLE * 0x1010101u)) * STYLE_SIZE_IN_WORDS; return c; } + +fn reduce_tag(tag_word: u32) -> TagMonoid { + let r = reduce_tag_arr(tag_word); + var c: TagMonoid; + c.trans_ix = r[0]; + c.pathseg_ix = r[1]; + c.pathseg_offset = r[2]; + c.style_ix = r[3]; + c.path_ix = r[4]; + return c; +} From 99ac32b28a551dd7ffa2bc8db2472761cfc1db08 Mon Sep 17 00:00:00 2001 From: Thomas Smith <68340554+b0nes164@users.noreply.github.com> Date: Thu, 19 Sep 2024 20:16:55 -0700 Subject: [PATCH 11/15] Improve lookback logic, update pathtag_scan_single.rs to pathtag_scan.rs --- vello_shaders/shader/pathtag_scan_csdldf.wgsl | 8 +++----- vello_shaders/src/cpu.rs | 4 ++-- .../src/cpu/{pathtag_scan_single.rs => pathtag_scan.rs} | 0 3 files changed, 5 insertions(+), 7 deletions(-) rename vello_shaders/src/cpu/{pathtag_scan_single.rs => pathtag_scan.rs} (100%) diff --git a/vello_shaders/shader/pathtag_scan_csdldf.wgsl b/vello_shaders/shader/pathtag_scan_csdldf.wgsl index 4028a3d94..f6536e182 100644 --- a/vello_shaders/shader/pathtag_scan_csdldf.wgsl +++ b/vello_shaders/shader/pathtag_scan_csdldf.wgsl @@ -129,6 +129,7 @@ fn main( if local_id.x == WG_SIZE - 1u { for (var spin_count = 0u; spin_count < MAX_SPIN_COUNT; ) { //Attempt Lookback + var can_advance = true; for (var i = 0u; i < PATH_MEMBERS; i += 1u) { if !inc_complete.s[i] && !red_complete.s[i] { let payload = atomicLoad(&reduced[lookback_ix][i]); @@ -143,16 +144,13 @@ fn main( atomicStore(&reduced[part_ix][i], ((agg.p[i] + prev_reduction.p[i]) << 2u) | FLAG_INCLUSIVE); sh_tag_broadcast[i] = prev_reduction.p[i]; inc_complete.s[i] = true; + } else { + can_advance = false; } } } //Have we completed the current reduction or inclusive sum for all PathTag members? 
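                    //can_advance stays true only if no pending member read FLAG_NOT_READY on this pass;
                    //otherwise the spin counter below is incremented and the lookback is retried.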
- var can_advance = inc_complete.s[0] || red_complete.s[0]; - for (var i = 1u; i < PATH_MEMBERS; i += 1u) { - can_advance = can_advance && (inc_complete.s[i] || red_complete.s[i]); - } - if can_advance { //Are all lookbacks complete? var all_complete = inc_complete.s[0]; diff --git a/vello_shaders/src/cpu.rs b/vello_shaders/src/cpu.rs index 5df1a52cb..4309b505d 100644 --- a/vello_shaders/src/cpu.rs +++ b/vello_shaders/src/cpu.rs @@ -25,7 +25,7 @@ mod path_count; mod path_count_setup; mod path_tiling; mod path_tiling_setup; -mod pathtag_scan_single; +mod pathtag_scan; mod tile_alloc; mod util; @@ -42,7 +42,7 @@ pub use path_count::path_count; pub use path_count_setup::path_count_setup; pub use path_tiling::path_tiling; pub use path_tiling_setup::path_tiling_setup; -pub use pathtag_scan_single::pathtag_scan; +pub use pathtag_scan::pathtag_scan; pub use tile_alloc::tile_alloc; use std::cell::{Ref, RefCell, RefMut}; diff --git a/vello_shaders/src/cpu/pathtag_scan_single.rs b/vello_shaders/src/cpu/pathtag_scan.rs similarity index 100% rename from vello_shaders/src/cpu/pathtag_scan_single.rs rename to vello_shaders/src/cpu/pathtag_scan.rs From dc1e4b4f9e5214b9c08ef93f45e67bb23af53819 Mon Sep 17 00:00:00 2001 From: Thomas Smith <68340554+b0nes164@users.noreply.github.com> Date: Fri, 20 Sep 2024 08:29:32 -0700 Subject: [PATCH 12/15] Style fixes, fix u32 bug, force fallback to check correctness. --- vello_shaders/shader/pathtag_scan_csdldf.wgsl | 25 +++++-------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/vello_shaders/shader/pathtag_scan_csdldf.wgsl b/vello_shaders/shader/pathtag_scan_csdldf.wgsl index f6536e182..3f8680904 100644 --- a/vello_shaders/shader/pathtag_scan_csdldf.wgsl +++ b/vello_shaders/shader/pathtag_scan_csdldf.wgsl @@ -48,25 +48,12 @@ struct state_wrapper{ s: array } -//TODO: There has to be a better way to initialize an array? fn clear_pathtag()->array{ - var a: array; - a[0] = 0u; - a[1] = 0u; - a[2] = 0u; - a[3] = 0u; - a[4] = 0u; - return a; + return array(0u, 0u, 0u, 0u, 0u); } fn clear_state()->array{ - var a: array; - a[0] = false; - a[1] = false; - a[2] = false; - a[3] = false; - a[4] = false; - return a; + return array(false, false, false, false, false); } @compute @workgroup_size(256) @@ -104,7 +91,7 @@ fn main( //Broadcast the results and flag into device memory if local_id.x == WG_SIZE - 1u { for (var i = 0u; i < PATH_MEMBERS; i += 1u) { - atomicStore(&reduced[part_ix][i], (agg.p[i] << 2u) | select(FLAG_INCLUSIVE, FLAG_REDUCTION, part_ix != 0u)); + //atomicStore(&reduced[part_ix][i], (agg.p[i] << 2u) | select(FLAG_INCLUSIVE, FLAG_REDUCTION, part_ix != 0u)); } } @@ -112,7 +99,7 @@ fn main( if part_ix != 0u { var lookback_ix = part_ix - 1u; var inc_complete: state_wrapper; - inc_complete.s = clear_state(); + inc_complete.s = array(false, false, false, false, false); var prev_reduction: pathtag_wrapper; prev_reduction.p = clear_pathtag(); @@ -197,8 +184,8 @@ fn main( sh_scratch[local_id.x] = f_agg.p; for (var i = 0u; i < LG_WG_SIZE; i += 1u) { workgroupBarrier(); - let index = local_id.x - (1u << i); - if index >= 0u { + let index = i32(local_id.x) - i32(1u << i); + if index >= 0 { for (var k = 0u; k < PATH_MEMBERS; k += 1u) { if red_complete.s[k] { f_agg.p[k] += sh_scratch[index][k]; From 5ca145be24f2eeb4dcc6ca039f2668caf112adeb Mon Sep 17 00:00:00 2001 From: Thomas Smith <68340554+b0nes164@users.noreply.github.com> Date: Fri, 20 Sep 2024 08:37:21 -0700 Subject: [PATCH 13/15] Fix incorrect transcription of fallback function. 
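The loop refactor had dropped the per-member guard from the original per-member "if redN" fallback
calls, so every member performed the fallback atomicMax and inclusive broadcast; restore the guard
so that only the members flagged in sh_fallback_state do.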
--- vello_shaders/shader/pathtag_scan_csdldf.wgsl | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/vello_shaders/shader/pathtag_scan_csdldf.wgsl b/vello_shaders/shader/pathtag_scan_csdldf.wgsl index 3f8680904..be610b8d4 100644 --- a/vello_shaders/shader/pathtag_scan_csdldf.wgsl +++ b/vello_shaders/shader/pathtag_scan_csdldf.wgsl @@ -91,7 +91,7 @@ fn main( //Broadcast the results and flag into device memory if local_id.x == WG_SIZE - 1u { for (var i = 0u; i < PATH_MEMBERS; i += 1u) { - //atomicStore(&reduced[part_ix][i], (agg.p[i] << 2u) | select(FLAG_INCLUSIVE, FLAG_REDUCTION, part_ix != 0u)); + atomicStore(&reduced[part_ix][i], (agg.p[i] << 2u) | select(FLAG_INCLUSIVE, FLAG_REDUCTION, part_ix != 0u)); } } @@ -206,17 +206,19 @@ fn main( if local_id.x == WG_SIZE - 1u { //Fallback for (var i = 0u; i < PATH_MEMBERS; i += 1u) { - let fallback_payload = (f_agg.p[i] << 2u) | select(FLAG_INCLUSIVE, FLAG_REDUCTION, fallback_ix != 0u); - let prev_payload = atomicMax(&reduced[fallback_ix][i], fallback_payload); - if prev_payload == 0u { - prev_reduction.p[i] += f_agg.p[i]; - } else { - prev_reduction.p[i] += prev_payload >> 2u; - } - if fallback_ix == 0u || (prev_payload & FLAG_MASK) == FLAG_INCLUSIVE { - atomicStore(&reduced[part_ix][i], ((agg.p[i] + prev_reduction.p[i]) << 2u) | FLAG_INCLUSIVE); - sh_tag_broadcast[i] = prev_reduction.p[i]; - inc_complete.s[i] = true; + if red_complete.s[i] { + let fallback_payload = (f_agg.p[i] << 2u) | select(FLAG_INCLUSIVE, FLAG_REDUCTION, fallback_ix != 0u); + let prev_payload = atomicMax(&reduced[fallback_ix][i], fallback_payload); + if prev_payload == 0u { + prev_reduction.p[i] += f_agg.p[i]; + } else { + prev_reduction.p[i] += prev_payload >> 2u; + } + if fallback_ix == 0u || (prev_payload & FLAG_MASK) == FLAG_INCLUSIVE { + atomicStore(&reduced[part_ix][i], ((agg.p[i] + prev_reduction.p[i]) << 2u) | FLAG_INCLUSIVE); + sh_tag_broadcast[i] = prev_reduction.p[i]; + inc_complete.s[i] = true; + } } } From 54f1472f60dc6212bfc28c9ec5411be4e6e72367 Mon Sep 17 00:00:00 2001 From: Thomas Smith <68340554+b0nes164@users.noreply.github.com> Date: Fri, 20 Sep 2024 08:50:18 -0700 Subject: [PATCH 14/15] Rename state_wrapper during fallback for clarity --- vello_shaders/shader/pathtag_scan_csdldf.wgsl | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/vello_shaders/shader/pathtag_scan_csdldf.wgsl b/vello_shaders/shader/pathtag_scan_csdldf.wgsl index be610b8d4..369282eca 100644 --- a/vello_shaders/shader/pathtag_scan_csdldf.wgsl +++ b/vello_shaders/shader/pathtag_scan_csdldf.wgsl @@ -99,21 +99,20 @@ fn main( if part_ix != 0u { var lookback_ix = part_ix - 1u; var inc_complete: state_wrapper; - inc_complete.s = array(false, false, false, false, false); + inc_complete.s = clear_state(); var prev_reduction: pathtag_wrapper; prev_reduction.p = clear_pathtag(); while(sh_lock == LOCKED){ workgroupBarrier(); - var red_complete: state_wrapper; - for (var i = 0u; i < PATH_MEMBERS; i += 1u) { - red_complete.s[i] = false; - } - //Lookback, with a single thread //Last thread in the workgroup has the complete aggregate if local_id.x == WG_SIZE - 1u { + var red_complete: state_wrapper; + for (var i = 0u; i < PATH_MEMBERS; i += 1u) { + red_complete.s[i] = false; + } for (var spin_count = 0u; spin_count < MAX_SPIN_COUNT; ) { //Attempt Lookback var can_advance = true; @@ -171,8 +170,9 @@ fn main( //Fallback if sh_lock == LOCKED { let fallback_ix = sh_broadcast; + var should_fallback: state_wrapper; for (var i = 0u; i < 
PATH_MEMBERS; i += 1u) { - red_complete.s[i] = sh_fallback_state[i]; + should_fallback.s[i] = sh_fallback_state[i]; } //Fallback Reduce @@ -187,7 +187,7 @@ fn main( let index = i32(local_id.x) - i32(1u << i); if index >= 0 { for (var k = 0u; k < PATH_MEMBERS; k += 1u) { - if red_complete.s[k] { + if should_fallback.s[k] { f_agg.p[k] += sh_scratch[index][k]; } } @@ -195,7 +195,7 @@ fn main( workgroupBarrier(); if i < LG_WG_SIZE - 1u { for (var k = 0u; k < PATH_MEMBERS; k += 1u) { - if red_complete.s[k] { + if should_fallback.s[k] { sh_scratch[local_id.x][k] = f_agg.p[k]; } } @@ -206,7 +206,7 @@ fn main( if local_id.x == WG_SIZE - 1u { //Fallback for (var i = 0u; i < PATH_MEMBERS; i += 1u) { - if red_complete.s[i] { + if should_fallback.s[i] { let fallback_payload = (f_agg.p[i] << 2u) | select(FLAG_INCLUSIVE, FLAG_REDUCTION, fallback_ix != 0u); let prev_payload = atomicMax(&reduced[fallback_ix][i], fallback_payload); if prev_payload == 0u { From 296bbbf2cd60b45b386da93a6a3c28361bf03fb0 Mon Sep 17 00:00:00 2001 From: Thomas Smith <68340554+b0nes164@users.noreply.github.com> Date: Sun, 22 Sep 2024 21:22:32 -0700 Subject: [PATCH 15/15] Change fallback broadcast to register based. --- vello_shaders/shader/pathtag_scan_csdldf.wgsl | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/vello_shaders/shader/pathtag_scan_csdldf.wgsl b/vello_shaders/shader/pathtag_scan_csdldf.wgsl index 369282eca..3381346e7 100644 --- a/vello_shaders/shader/pathtag_scan_csdldf.wgsl +++ b/vello_shaders/shader/pathtag_scan_csdldf.wgsl @@ -110,12 +110,11 @@ fn main( //Last thread in the workgroup has the complete aggregate if local_id.x == WG_SIZE - 1u { var red_complete: state_wrapper; - for (var i = 0u; i < PATH_MEMBERS; i += 1u) { - red_complete.s[i] = false; - } + red_complete.s = clear_state(); + var can_advance: bool; for (var spin_count = 0u; spin_count < MAX_SPIN_COUNT; ) { //Attempt Lookback - var can_advance = true; + can_advance = true; for (var i = 0u; i < PATH_MEMBERS; i += 1u) { if !inc_complete.s[i] && !red_complete.s[i] { let payload = atomicLoad(&reduced[lookback_ix][i]); @@ -158,7 +157,7 @@ fn main( //If we didn't complete the lookback within the allotted spins, //prepare for the fallback by broadcasting the lookback tile id //and states of the tagmonoid struct members - if sh_lock == LOCKED { + if !can_advance { sh_broadcast = lookback_ix; for (var i = 0u; i < PATH_MEMBERS; i += 1u) { sh_fallback_state[i] = !inc_complete.s[i] && !red_complete.s[i];
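
Throughout the series, each value written to the reduced buffer packs a tag monoid member together
with a 2-bit status flag in a single u32; that is what the << 2u / >> 2u shifts and FLAG_MASK in the
shader implement, and why clearing the buffer makes every slot read as FLAG_NOT_READY. A minimal
host-side sketch of the encoding (the helper names here are illustrative, not part of the patch):

// Status flags, mirroring the constants in pathtag_scan_csdldf.wgsl.
const FLAG_NOT_READY: u32 = 0;
const FLAG_REDUCTION: u32 = 1;
const FLAG_INCLUSIVE: u32 = 2;
const FLAG_MASK: u32 = 3;

// Pack a partition aggregate (or inclusive prefix) together with its status flag.
fn pack(value: u32, flag: u32) -> u32 {
    (value << 2) | flag
}

// Split a payload read during lookback back into (value, flag).
fn unpack(payload: u32) -> (u32, u32) {
    (payload >> 2, payload & FLAG_MASK)
}

fn main() {
    let payload = pack(42, FLAG_REDUCTION);
    assert_eq!(unpack(payload), (42, FLAG_REDUCTION));
    // A zeroed slot decodes as (0, FLAG_NOT_READY), which is what clearing the buffer relies on.
    assert_eq!(unpack(0), (0, FLAG_NOT_READY));
    assert_eq!(unpack(pack(7, FLAG_INCLUSIVE)).1, FLAG_INCLUSIVE);
}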