Skip to content

Commit

Permalink
Use write belt for mesh data gpu upload (#1416)
Browse files Browse the repository at this point in the history
* Mesh renderer now takes a constant ctx
* mesh data upload works now entirely via CpuWriteGpuReadBelt
* simplify frame global uniform buffer handling
* fix calculating wrong size for mesh vertex buffer
  • Loading branch information
Wumpf authored Feb 28, 2023
1 parent a6ae6cd commit a358b9a
Show file tree
Hide file tree
Showing 9 changed files with 123 additions and 192 deletions.
9 changes: 1 addition & 8 deletions crates/re_renderer/src/allocator/uniform_buffer_fill.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,14 +70,7 @@ pub fn create_and_fill_uniform_buffer_batch<T: bytemuck::Pod>(
num_buffers as _,
);
staging_buffer.extend(content);
staging_buffer.copy_to_buffer(
ctx.active_frame
.frame_global_command_encoder
.lock()
.get_or_create(&ctx.device),
&buffer,
0,
);
staging_buffer.copy_to_buffer(ctx.active_frame.encoder.lock().get(), &buffer, 0);

(0..num_buffers)
.into_iter()
Expand Down
108 changes: 48 additions & 60 deletions crates/re_renderer/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ pub struct RenderContext {
#[cfg(all(not(target_arch = "wasm32"), debug_assertions))] // native debug build
pub(crate) err_tracker: std::sync::Arc<crate::error_tracker::ErrorTracker>,

pub mesh_manager: MeshManager,
pub mesh_manager: RwLock<MeshManager>,
pub texture_manager_2d: TextureManager2D,
pub cpu_write_gpu_read_belt: Mutex<CpuWriteGpuReadBelt>,

Expand Down Expand Up @@ -159,19 +159,21 @@ impl RenderContext {
renderers: TypeMap::new(),
});

let mesh_manager = MeshManager::new(
device.clone(),
queue.clone(),
renderers.get_mut().get_or_create(
&shared_renderer_data,
&mut gpu_resources,
&device,
&mut resolver,
),
);
let mesh_manager = RwLock::new(MeshManager::new(renderers.get_mut().get_or_create(
&shared_renderer_data,
&mut gpu_resources,
&device,
&mut resolver,
)));
let texture_manager_2d =
TextureManager2D::new(device.clone(), queue.clone(), &mut gpu_resources.textures);

let active_frame = ActiveFrameContext {
encoder: Mutex::new(FrameGlobalCommandEncoder::new(&device)),
per_frame_data_helper: TypeMap::new(),
frame_index: 0,
};

RenderContext {
device,
queue,
Expand All @@ -193,11 +195,7 @@ impl RenderContext {

inflight_queue_submissions: Vec::new(),

active_frame: ActiveFrameContext {
frame_global_command_encoder: Mutex::new(FrameGlobalCommandEncoder(None)),
per_frame_data_helper: TypeMap::new(),
frame_index: 0
}
active_frame,
}
}

Expand Down Expand Up @@ -227,12 +225,19 @@ impl RenderContext {
pub fn begin_frame(&mut self) {
crate::profile_function!();

// If the currently active frame still has an encoder, we need to finish it and queue it.
// This should only ever happen for the first frame where we created an encoder for preparatory work. Every other frame we take the encoder at submit!
if self.active_frame.encoder.lock().0.is_some() {
assert!(self.active_frame.frame_index == 0, "There was still a command encoder from the previous frame at the beginning of the current. Did you forget to call RenderContext::before_submit?");
self.before_submit();
}

// Request used staging buffer back.
// TODO(andreas): If we'd control all submissions, we could move this directly after the submission which would be a bit better.
self.cpu_write_gpu_read_belt.lock().after_queue_submit();

self.active_frame = ActiveFrameContext {
frame_global_command_encoder: Mutex::new(FrameGlobalCommandEncoder(None)),
encoder: Mutex::new(FrameGlobalCommandEncoder::new(&self.device)),
frame_index: self.active_frame.frame_index + 1,
per_frame_data_helper: TypeMap::new(),
};
Expand All @@ -252,7 +257,7 @@ impl RenderContext {
re_log::debug!(?modified_paths, "got some filesystem events");
}

self.mesh_manager.begin_frame(frame_index);
self.mesh_manager.get_mut().begin_frame(frame_index);
self.texture_manager_2d.begin_frame(frame_index);

{
Expand Down Expand Up @@ -304,13 +309,7 @@ impl RenderContext {
// Unmap all staging buffers.
self.cpu_write_gpu_read_belt.lock().before_queue_submit();

if let Some(command_encoder) = self
.active_frame
.frame_global_command_encoder
.lock()
.0
.take()
{
if let Some(command_encoder) = self.active_frame.encoder.lock().0.take() {
let command_buffer = command_encoder.finish();

// TODO(andreas): For better performance, we should try to bundle this with the single submit call that is currently happening in eframe.
Expand All @@ -321,57 +320,46 @@ impl RenderContext {
}
}

impl Drop for RenderContext {
pub struct FrameGlobalCommandEncoder(Option<wgpu::CommandEncoder>);

impl FrameGlobalCommandEncoder {
fn new(device: &wgpu::Device) -> Self {
Self(Some(device.create_command_encoder(
&wgpu::CommandEncoderDescriptor {
label:
crate::DebugLabel::from("global \"before viewbuilder\" command encoder").get(),
},
)))
}

/// Gets the global encoder for a frame. Only valid within a frame.
pub fn get(&mut self) -> &mut wgpu::CommandEncoder {
self.0
.as_mut()
.expect("Frame global encoder can't be accessed outside of a frame!")
}
}

impl Drop for FrameGlobalCommandEncoder {
fn drop(&mut self) {
// Close global command encoder if there is any pending.
// Not doing so before shutdown causes errors.
if let Some(encoder) = self
.active_frame
.frame_global_command_encoder
.lock()
.0
.take()
{
// Not doing so before shutdown causes errors!
if let Some(encoder) = self.0.take() {
encoder.finish();
}
}
}

pub struct FrameGlobalCommandEncoder(Option<wgpu::CommandEncoder>);

impl FrameGlobalCommandEncoder {
/// Gets or creates a command encoder that runs before all view builder encoder.
pub fn get_or_create(&mut self, device: &wgpu::Device) -> &mut wgpu::CommandEncoder {
self.0.get_or_insert_with(|| {
device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
label: crate::DebugLabel::from("global \"before viewbuilder\" command encoder")
.get(),
})
})
}
}

pub struct ActiveFrameContext {
    /// Command encoder for all commands that should go in before view builder are submitted.
    ///
    /// This should be used for any gpu copy operation outside of a renderer or view builder.
    /// (i.e. typically in [`crate::renderer::DrawData`] creation!)
    pub encoder: Mutex<FrameGlobalCommandEncoder>,

    /// Utility type map that will be cleared every frame.
    pub per_frame_data_helper: TypeMap,

    /// Index of this frame. Is incremented for every render frame.
    frame_index: u64,
}

/// Gets allocation size for a uniform buffer padded in a way that multiple can be put in a single wgpu buffer.
///
/// TODO(andreas): Once we have higher level buffer allocators this should be handled there.
pub(crate) fn uniform_buffer_allocation_size<Data>(device: &wgpu::Device) -> u64 {
    // Round the element size up to the device's uniform-buffer offset alignment
    // so consecutive elements within one buffer all start at valid offsets.
    let element_size = std::mem::size_of::<Data>() as u32;
    let alignment = device.limits().min_uniform_buffer_offset_alignment;
    u64::from(wgpu::util::align_to(element_size, alignment))
}
7 changes: 1 addition & 6 deletions crates/re_renderer/src/importer/gltf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,12 +86,7 @@ pub fn load_gltf_from_buffer(
meshes.insert(
mesh.index(),
(
ctx.mesh_manager.create(
&mut ctx.gpu_resources,
&ctx.texture_manager_2d,
&re_mesh,
lifetime,
)?,
ctx.mesh_manager.write().create(ctx, &re_mesh, lifetime)?,
Arc::new(re_mesh),
),
);
Expand Down
8 changes: 2 additions & 6 deletions crates/re_renderer/src/importer/obj.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,12 +67,8 @@ pub fn load_obj_from_buffer(
};
let gpu_mesh = ctx
.mesh_manager
.create(
&mut ctx.gpu_resources,
&ctx.texture_manager_2d,
&mesh,
lifetime,
)
.write()
.create(ctx, &mesh, lifetime)
.unwrap(); // TODO(andreas): Handle error
MeshInstance {
gpu_mesh,
Expand Down
Loading

1 comment on commit a358b9a

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rust Benchmark

Benchmark suite Current: a358b9a Previous: a6ae6cd Ratio
datastore/insert/batch/rects/insert 547080 ns/iter (± 2001) 565239 ns/iter (± 10240) 0.97
datastore/latest_at/batch/rects/query 1822 ns/iter (± 4) 1828 ns/iter (± 7) 1.00
datastore/latest_at/missing_components/primary 356 ns/iter (± 0) 355 ns/iter (± 0) 1.00
datastore/latest_at/missing_components/secondaries 426 ns/iter (± 2) 424 ns/iter (± 1) 1.00
datastore/range/batch/rects/query 153367 ns/iter (± 366) 152332 ns/iter (± 250) 1.01
mono_points_arrow/generate_message_bundles 48110311 ns/iter (± 1001289) 51191698 ns/iter (± 955801) 0.94
mono_points_arrow/generate_messages 125932680 ns/iter (± 1151754) 136997970 ns/iter (± 2255304) 0.92
mono_points_arrow/encode_log_msg 159313386 ns/iter (± 2492490) 167925911 ns/iter (± 1184384) 0.95
mono_points_arrow/encode_total 331772897 ns/iter (± 2889071) 357179621 ns/iter (± 2266465) 0.93
mono_points_arrow/decode_log_msg 178941308 ns/iter (± 944585) 188321496 ns/iter (± 1188677) 0.95
mono_points_arrow/decode_message_bundles 65459455 ns/iter (± 696461) 73647285 ns/iter (± 1285935) 0.89
mono_points_arrow/decode_total 240397482 ns/iter (± 1449334) 259513976 ns/iter (± 2540226) 0.93
batch_points_arrow/generate_message_bundles 333180 ns/iter (± 1799) 332860 ns/iter (± 762) 1.00
batch_points_arrow/generate_messages 6183 ns/iter (± 24) 6271 ns/iter (± 14) 0.99
batch_points_arrow/encode_log_msg 355328 ns/iter (± 2045) 357590 ns/iter (± 3199) 0.99
batch_points_arrow/encode_total 712886 ns/iter (± 4005) 711172 ns/iter (± 3328) 1.00
batch_points_arrow/decode_log_msg 348785 ns/iter (± 1396) 350382 ns/iter (± 1271) 1.00
batch_points_arrow/decode_message_bundles 2097 ns/iter (± 18) 2127 ns/iter (± 14) 0.99
batch_points_arrow/decode_total 358892 ns/iter (± 1227) 356806 ns/iter (± 1498) 1.01
arrow_mono_points/insert 6053330950 ns/iter (± 33460204) 6836053421 ns/iter (± 16533220) 0.89
arrow_mono_points/query 1760977 ns/iter (± 17658) 1738813 ns/iter (± 14276) 1.01
arrow_batch_points/insert 2651772 ns/iter (± 12338) 2667543 ns/iter (± 10026) 0.99
arrow_batch_points/query 17636 ns/iter (± 83) 17655 ns/iter (± 29) 1.00
tuid/Tuid::random 34 ns/iter (± 0) 34 ns/iter (± 0) 1

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.