Skip to content

Commit

Permalink
[re_renderer] Uniform buffer utility using CpuWriteGpuReadBelt (#1400)
Browse files Browse the repository at this point in the history
* utility function for uniform buffer from struct
* Uniform buffers are now compile time forced to be 256 bytes aligned
* CpuWriteGpuReadBelt no longer takes mutable buffer pool
* Renderers and frame_global_command_encoder are now behind locks
---------

Co-authored-by: Clement Rey <[email protected]>
  • Loading branch information
2 people authored and emilk committed Mar 2, 2023
1 parent 7f81b4f commit 77b7eea
Show file tree
Hide file tree
Showing 19 changed files with 322 additions and 253 deletions.
11 changes: 6 additions & 5 deletions crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ where
/// Do *not* make this public as we need to guarantee that the memory is *never* read from!
#[inline(always)]
fn as_slice(&mut self) -> &mut [T] {
// TODO(andreas): Is this access slow given that it internally goes through a trait interface? Should we keep the pointer around?
&mut bytemuck::cast_slice_mut(&mut self.write_view)[self.unwritten_element_range.clone()]
}

Expand All @@ -47,13 +48,13 @@ where
/// Panics if the data no longer fits into the buffer.
#[inline]
pub fn extend_from_slice(&mut self, elements: &[T]) {
self.as_slice().copy_from_slice(elements);
self.as_slice()[..elements.len()].copy_from_slice(elements);
self.unwritten_element_range.start += elements.len();
}

/// Pushes several elements into the buffer.
///
/// Panics if the data no longer fits into the buffer.
/// Extends until either running out of space or elements.
#[inline]
pub fn extend(&mut self, elements: impl Iterator<Item = T>) {
let mut num_elements = 0;
Expand All @@ -68,8 +69,8 @@ where
///
/// Panics if the data no longer fits into the buffer.
#[inline]
pub fn push(&mut self, element: &T) {
self.as_slice()[0] = *element;
pub fn push(&mut self, element: T) {
self.as_slice()[0] = element;
self.unwritten_element_range.start += 1;
}

Expand Down Expand Up @@ -285,7 +286,7 @@ impl CpuWriteGpuReadBelt {
pub fn allocate<T: bytemuck::Pod>(
&mut self,
device: &wgpu::Device,
buffer_pool: &mut GpuBufferPool,
buffer_pool: &GpuBufferPool,
num_elements: usize,
) -> CpuWriteGpuReadBuffer<T> {
// Potentially overestimate alignment with Self::MIN_ALIGNMENT, see Self::MIN_ALIGNMENT doc string.
Expand Down
4 changes: 4 additions & 0 deletions crates/re_renderer/src/allocator/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,9 @@
//! follows some more complex strategy for efficient re-use and sub-allocation of wgpu resources.
mod cpu_write_gpu_read_belt;
mod uniform_buffer_fill;

pub use cpu_write_gpu_read_belt::{CpuWriteGpuReadBelt, CpuWriteGpuReadBuffer};
pub use uniform_buffer_fill::{
create_and_fill_uniform_buffer, create_and_fill_uniform_buffer_batch,
};
102 changes: 102 additions & 0 deletions crates/re_renderer/src/allocator/uniform_buffer_fill.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
pub use super::cpu_write_gpu_read_belt::{CpuWriteGpuReadBelt, CpuWriteGpuReadBuffer};

use crate::{wgpu_resources::BindGroupEntry, DebugLabel, RenderContext};

struct UniformBufferAlignmentCheck<T> {
pub _marker: std::marker::PhantomData<T>,
}
impl<T> UniformBufferAlignmentCheck<T> {
/// wgpu requires uniform buffers to be aligned to up to 256 bytes.
///
/// This is a property of device limits, see [`WebGPU` specification](https://www.w3.org/TR/webgpu/#limits).
/// Implementations are allowed to advertise a lower alignment requirement, however
/// 256 bytes is fairly common even in modern hardware and is even hardcoded for DX12.
///
/// Technically this is only relevant when sub-allocating a buffer, as the wgpu backend
/// is internally forced to make sure that the start of any [`wgpu::Buffer`] with [`wgpu::BufferUsages::UNIFORM`] usage
/// has this alignment. Practically, ensuring this alignment everywhere
///
/// Alternatively to enforcing this alignment on the type we could:
/// * only align on the gpu buffer
/// -> causes more fine grained `copy_buffer_to_buffer` calls on the gpu encoder
/// * only align on the [`CpuWriteGpuReadBuffer`] & gpu buffer
/// -> causes more complicated offset computation on [`CpuWriteGpuReadBuffer`] as well as either
/// holes at padding (-> undefined values & slow for write combined!) or complicated nulling of padding
///
/// About the [`bytemuck::Pod`] requirement (dragged in by [`CpuWriteGpuReadBuffer`]):
/// [`bytemuck::Pod`] forces us to be explicit about padding as it doesn't allow invisible padding bytes!
/// We could drop this and thus make it easier to define uniform buffer types.
/// But this leads to more unsafe code, harder to avoid holes in write combined memory access
/// and potentially undefined values in the padding bytes on GPU.
const CHECK: () = assert!(
std::mem::align_of::<T>() >= 256,
"Uniform buffers need to be aligned to 256 bytes. Use `#[repr(C, align(256))]`"
);
}
/// Utility for fast & efficient creation of uniform buffers from a series of structs.
///
/// For subsequent frames, this will automatically not allocate any resources (thanks to our buffer pooling mechanism).
///
/// TODO(#1383): We could do this on a more complex stack allocator.
pub fn create_and_fill_uniform_buffer_batch<T: bytemuck::Pod>(
ctx: &RenderContext,
label: DebugLabel,
content: impl ExactSizeIterator<Item = T>,
) -> Vec<BindGroupEntry> {
#[allow(clippy::let_unit_value)]
let _ = UniformBufferAlignmentCheck::<T>::CHECK;

let num_buffers = content.len() as u64;
let element_size = std::mem::size_of::<T>() as u64;

assert!(
element_size > 0,
"Uniform buffer need to have a non-zero size"
);

let buffer = ctx.gpu_resources.buffers.alloc(
&ctx.device,
&crate::wgpu_resources::BufferDesc {
label,
size: num_buffers * element_size,
usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
mapped_at_creation: false,
},
);

let mut staging_buffer = ctx.cpu_write_gpu_read_belt.lock().allocate::<T>(
&ctx.device,
&ctx.gpu_resources.buffers,
num_buffers as _,
);
staging_buffer.extend(content);
staging_buffer.copy_to_buffer(
ctx.active_frame
.frame_global_command_encoder
.lock()
.get_or_create(&ctx.device),
&buffer,
0,
);

(0..num_buffers)
.into_iter()
.map(|i| BindGroupEntry::Buffer {
handle: buffer.handle,
offset: i * element_size,
size: Some(std::num::NonZeroU64::new(element_size).unwrap()),
})
.collect()
}

/// See [`create_and_fill_uniform_buffer`].
pub fn create_and_fill_uniform_buffer<T: bytemuck::Pod>(
ctx: &mut RenderContext,
label: DebugLabel,
content: T,
) -> BindGroupEntry {
create_and_fill_uniform_buffer_batch(ctx, label, std::iter::once(content))
.into_iter()
.next()
.unwrap()
}
66 changes: 40 additions & 26 deletions crates/re_renderer/src/context.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use std::sync::Arc;

use parking_lot::Mutex;
use parking_lot::{Mutex, RwLock};
use type_map::concurrent::{self, TypeMap};

use crate::{
Expand All @@ -20,7 +20,7 @@ pub struct RenderContext {
pub queue: Arc<wgpu::Queue>,

pub(crate) shared_renderer_data: SharedRendererData,
pub(crate) renderers: Renderers,
pub(crate) renderers: RwLock<Renderers>,
pub(crate) resolver: RecommendedFileResolver,
#[cfg(all(not(target_arch = "wasm32"), debug_assertions))] // native debug build
pub(crate) err_tracker: std::sync::Arc<crate::error_tracker::ErrorTracker>,
Expand Down Expand Up @@ -155,14 +155,14 @@ impl RenderContext {
};

let mut resolver = crate::new_recommended_file_resolver();
let mut renderers = Renderers {
let mut renderers = RwLock::new(Renderers {
renderers: TypeMap::new(),
};
});

let mesh_manager = MeshManager::new(
device.clone(),
queue.clone(),
renderers.get_or_create(
renderers.get_mut().get_or_create(
&shared_renderer_data,
&mut gpu_resources,
&device,
Expand Down Expand Up @@ -193,8 +193,11 @@ impl RenderContext {

inflight_queue_submissions: Vec::new(),

active_frame: ActiveFrameContext { frame_global_command_encoder: None, per_frame_data_helper: TypeMap::new(),
frame_index: 0 }
active_frame: ActiveFrameContext {
frame_global_command_encoder: Mutex::new(FrameGlobalCommandEncoder(None)),
per_frame_data_helper: TypeMap::new(),
frame_index: 0
}
}
}

Expand Down Expand Up @@ -229,7 +232,7 @@ impl RenderContext {
self.cpu_write_gpu_read_belt.lock().after_queue_submit();

self.active_frame = ActiveFrameContext {
frame_global_command_encoder: None,
frame_global_command_encoder: Mutex::new(FrameGlobalCommandEncoder(None)),
frame_index: self.active_frame.frame_index + 1,
per_frame_data_helper: TypeMap::new(),
};
Expand Down Expand Up @@ -303,7 +306,13 @@ impl RenderContext {
// Unmap all staging buffers.
self.cpu_write_gpu_read_belt.lock().before_queue_submit();

if let Some(command_encoder) = self.active_frame.frame_global_command_encoder.take() {
if let Some(command_encoder) = self
.active_frame
.frame_global_command_encoder
.lock()
.0
.take()
{
let command_buffer = command_encoder.finish();

// TODO(andreas): For better performance, we should try to bundle this with the single submit call that is currently happening in eframe.
Expand All @@ -318,18 +327,38 @@ impl Drop for RenderContext {
fn drop(&mut self) {
// Close global command encoder if there is any pending.
// Not doing so before shutdown causes errors.
if let Some(encoder) = self.active_frame.frame_global_command_encoder.take() {
if let Some(encoder) = self
.active_frame
.frame_global_command_encoder
.lock()
.0
.take()
{
encoder.finish();
}
}
}

pub struct FrameGlobalCommandEncoder(Option<wgpu::CommandEncoder>);

impl FrameGlobalCommandEncoder {
/// Gets or creates a command encoder that runs before all view builder encoder.
pub fn get_or_create(&mut self, device: &wgpu::Device) -> &mut wgpu::CommandEncoder {
self.0.get_or_insert_with(|| {
device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
label: crate::DebugLabel::from("global \"before viewbuilder\" command encoder")
.get(),
})
})
}
}

pub struct ActiveFrameContext {
/// Command encoder for all commands that should go in before view builder are submitted.
///
/// This should be used for any gpu copy operation outside of a renderer or view builder.
/// (i.e. typically in [`crate::renderer::DrawData`] creation!)
frame_global_command_encoder: Option<wgpu::CommandEncoder>,
pub frame_global_command_encoder: Mutex<FrameGlobalCommandEncoder>,

/// Utility type map that will be cleared every frame.
pub per_frame_data_helper: TypeMap,
Expand All @@ -338,21 +367,6 @@ pub struct ActiveFrameContext {
frame_index: u64,
}

impl ActiveFrameContext {
/// Gets or creates a command encoder that runs before all view builder encoder.
pub fn frame_global_command_encoder(
&mut self,
device: &wgpu::Device,
) -> &mut wgpu::CommandEncoder {
self.frame_global_command_encoder.get_or_insert_with(|| {
device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
label: crate::DebugLabel::from("global \"before viewbuilder\" command encoder")
.get(),
})
})
}
}

/// Gets allocation size for a uniform buffer padded in a way that multiple can be put in a single wgpu buffer.
///
/// TODO(andreas): Once we have higher level buffer allocators this should be handled there.
Expand Down
15 changes: 5 additions & 10 deletions crates/re_renderer/src/global_bindings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use crate::{
wgpu_buffer_types,
wgpu_resources::{
BindGroupDesc, BindGroupEntry, BindGroupLayoutDesc, GpuBindGroup, GpuBindGroupLayoutHandle,
GpuBuffer, GpuSamplerHandle, SamplerDesc, WgpuResourcePools,
GpuSamplerHandle, SamplerDesc, WgpuResourcePools,
},
};

Expand All @@ -13,7 +13,7 @@ use smallvec::smallvec;
///
/// Contains information that is constant for a single frame like camera.
/// (does not contain information that is special to a particular renderer)
#[repr(C)]
#[repr(C, align(256))]
#[derive(Clone, Copy, Zeroable, Pod)]
pub(crate) struct FrameUniformBuffer {
pub view_from_world: wgpu_buffer_types::Mat4x3,
Expand Down Expand Up @@ -46,8 +46,7 @@ pub(crate) struct FrameUniformBuffer {
pub auto_size_lines: f32,

/// Factor used to compute depth offsets, see `depth_offset.wgsl`.
pub depth_offset_factor: f32,
pub _padding: glam::Vec3,
pub depth_offset_factor: wgpu_buffer_types::F32RowPadded,
}

pub(crate) struct GlobalBindings {
Expand Down Expand Up @@ -128,19 +127,15 @@ impl GlobalBindings {
&self,
pools: &mut WgpuResourcePools,
device: &wgpu::Device,
frame_uniform_buffer: &GpuBuffer,
frame_uniform_buffer_binding: BindGroupEntry,
) -> GpuBindGroup {
pools.bind_groups.alloc(
device,
// Needs to be kept in sync with `global_bindings.wgsl` / `self.layout`
&BindGroupDesc {
label: "global bind group".into(),
entries: smallvec![
BindGroupEntry::Buffer {
handle: frame_uniform_buffer.handle,
offset: 0,
size: None,
},
frame_uniform_buffer_binding,
BindGroupEntry::Sampler(self.nearest_neighbor_sampler),
BindGroupEntry::Sampler(self.trilinear_sampler),
],
Expand Down
3 changes: 2 additions & 1 deletion crates/re_renderer/src/mesh.rs
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,9 @@ pub(crate) mod gpu_data {
}

impl GpuMesh {
// TODO(andreas): Take read-only context here and make uploads happen on staging belt.
pub fn new(
pools: &mut WgpuResourcePools,
pools: &WgpuResourcePools,
texture_manager: &TextureManager2D,
mesh_bound_group_layout: GpuBindGroupLayoutHandle,
device: &wgpu::Device,
Expand Down
4 changes: 2 additions & 2 deletions crates/re_renderer/src/point_cloud_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ where
// TODO(andreas): Be more resourceful about the size allocated here. Typically we know in advance!
let color_buffer = ctx.cpu_write_gpu_read_belt.lock().allocate::<Color32>(
&ctx.device,
&mut ctx.gpu_resources.buffers,
&ctx.gpu_resources.buffers,
PointCloudDrawData::MAX_NUM_POINTS,
);

Expand Down Expand Up @@ -286,7 +286,7 @@ where
/// This mustn't call this more than once.
#[inline]
pub fn color(self, color: Color32) -> Self {
self.color.push(&color);
self.color.push(color);
self
}

Expand Down
3 changes: 2 additions & 1 deletion crates/re_renderer/src/renderer/compositor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ impl DrawData for CompositorDrawData {
impl CompositorDrawData {
pub fn new(ctx: &mut RenderContext, target: &GpuTexture) -> Self {
let pools = &mut ctx.gpu_resources;
let compositor = ctx.renderers.get_or_create::<_, Compositor>(
let mut renderers = ctx.renderers.write();
let compositor = renderers.get_or_create::<_, Compositor>(
&ctx.shared_renderer_data,
pools,
&ctx.device,
Expand Down
2 changes: 1 addition & 1 deletion crates/re_renderer/src/renderer/generic_skybox.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ impl DrawData for GenericSkyboxDrawData {

impl GenericSkyboxDrawData {
pub fn new(ctx: &mut RenderContext) -> Self {
ctx.renderers.get_or_create::<_, GenericSkybox>(
ctx.renderers.write().get_or_create::<_, GenericSkybox>(
&ctx.shared_renderer_data,
&mut ctx.gpu_resources,
&ctx.device,
Expand Down
Loading

0 comments on commit 77b7eea

Please sign in to comment.