Skip to content

Commit

Permalink
Make CpuWriteGpuReadBelt texture copies easier/less error prone (#1689)
Browse files Browse the repository at this point in the history
* CpuWriteGpuReadBelt now makes texture copies easier
And has a much stronger debug check
* Use copy-extent instead of separate width/height parameters
  • Loading branch information
Wumpf authored Mar 27, 2023
1 parent 3590184 commit 3390626
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 56 deletions.
38 changes: 29 additions & 9 deletions crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use std::{num::NonZeroU32, ops::DerefMut, sync::mpsc};

use crate::wgpu_resources::{BufferDesc, GpuBuffer, GpuBufferPool};
use crate::wgpu_resources::{texture_row_data_info, BufferDesc, GpuBuffer, GpuBufferPool};

/// A sub-allocated staging buffer that can be written to.
///
Expand Down Expand Up @@ -85,26 +85,46 @@ where
self.unwritten_element_range.start
}

/// Copies the entire buffer to a texture and drops it.
pub fn copy_to_texture(
/// Copies all so far written data to a rectangle on a single 2d texture layer.
///
/// Assumes that the buffer consists of as-tightly-packed-as-possible rows of data.
/// (taking into account required padding as specified by [`wgpu::COPY_BYTES_PER_ROW_ALIGNMENT`])
///
/// Implementation note:
/// Supports 2D only, entirely for convenience, as it greatly simplifies the input parameters.
pub fn copy_to_texture2d(
self,
encoder: &mut wgpu::CommandEncoder,
destination: wgpu::ImageCopyTexture<'_>,
bytes_per_row: Option<NonZeroU32>,
rows_per_image: Option<NonZeroU32>,
copy_size: wgpu::Extent3d,
copy_extent: glam::UVec2,
) {
let bytes_per_row =
texture_row_data_info(destination.texture.format(), copy_extent.x).bytes_per_row_padded;

// Validate that we stay within the written part of the slice (wgpu can't fully know our intention here, so we have to check).
// We go one step further and require the size to be exactly equal - it's too unlikely that you wrote more than is needed!
// (and if you did you probably have regrets anyways!)
let required_buffer_size = bytes_per_row * copy_extent.y;
debug_assert_eq!(
required_buffer_size as usize,
self.num_written() * std::mem::size_of::<T>()
);

encoder.copy_buffer_to_texture(
wgpu::ImageCopyBuffer {
buffer: &self.chunk_buffer,
layout: wgpu::ImageDataLayout {
offset: self.byte_offset_in_chunk_buffer,
bytes_per_row,
rows_per_image,
bytes_per_row: NonZeroU32::new(bytes_per_row),
rows_per_image: None,
},
},
destination,
copy_size,
wgpu::Extent3d {
width: copy_extent.x,
height: copy_extent.y,
depth_or_array_layers: 1,
},
);
}

Expand Down
7 changes: 2 additions & 5 deletions crates/re_renderer/src/renderer/depth_cloud.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
//! behaves pretty much exactly like our point cloud renderer (see [`point_cloud.rs`]).
use smallvec::smallvec;
use std::num::NonZeroU32;

use crate::{
allocator::create_and_fill_uniform_buffer_batch,
Expand Down Expand Up @@ -330,17 +329,15 @@ fn create_and_upload_texture<T: bytemuck::Pod>(
}
}

depth_texture_staging.copy_to_texture(
depth_texture_staging.copy_to_texture2d(
ctx.active_frame.before_view_builder_encoder.lock().get(),
wgpu::ImageCopyTexture {
texture: &depth_texture.inner.texture,
mip_level: 0,
origin: wgpu::Origin3d::ZERO,
aspect: wgpu::TextureAspect::All,
},
Some(NonZeroU32::new(bytes_per_row_padded).expect("invalid bytes per row")),
None,
depth_texture_size,
depth_cloud.depth_dimensions,
);

depth_texture
Expand Down
71 changes: 29 additions & 42 deletions crates/re_renderer/src/renderer/point_cloud.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,7 @@
//! that srgb->linear conversion happens on texture load.
//!
use std::{
num::{NonZeroU32, NonZeroU64},
ops::Range,
};
use std::{num::NonZeroU64, ops::Range};

use crate::{
allocator::create_and_fill_uniform_buffer_batch, renderer::OutlineMaskProcessor, DebugLabel,
Expand All @@ -25,7 +22,6 @@ use crate::{
use bitflags::bitflags;
use bytemuck::Zeroable;
use enumset::{enum_set, EnumSet};
use itertools::Itertools;
use smallvec::smallvec;

use crate::{
Expand Down Expand Up @@ -175,7 +171,7 @@ impl PointCloudDrawData {
/// If no batches are passed, all points are assumed to be in a single batch with identity transform.
pub fn new<T>(
ctx: &mut RenderContext,
builder: PointCloudBuilder<T>,
mut builder: PointCloudBuilder<T>,
) -> Result<Self, PointCloudDrawDataError> {
crate::profile_function!();

Expand Down Expand Up @@ -266,63 +262,54 @@ impl PointCloudDrawData {
},
);

// TODO(andreas): We want a staging-belt(-like) mechanism to upload data instead of the queue.
// These staging buffers would be provided by the belt.
// To make the data upload simpler (and have it be done in one go), we always update full rows of each of our textures
// To make the data upload simpler (and have it be done in one copy operation), we always update full rows of each of our textures
let num_points_written =
wgpu::util::align_to(vertices.len() as u32, DATA_TEXTURE_SIZE) as usize;
let num_points_zeroed = num_points_written - vertices.len();
let position_and_size_staging = {
crate::profile_scope!("collect_pos_size");
vertices
.iter()
.map(|point| gpu_data::PositionData {
pos: point.position,
radius: point.radius,
})
.chain(std::iter::repeat(gpu_data::PositionData::zeroed()).take(num_points_zeroed))
.collect_vec()
};

// Upload data from staging buffers to gpu.
let size = wgpu::Extent3d {
width: DATA_TEXTURE_SIZE,
height: num_points_written as u32 / DATA_TEXTURE_SIZE,
depth_or_array_layers: 1,
};
let num_elements_padding = num_points_written - vertices.len();
let texture_copy_extent = glam::uvec2(
DATA_TEXTURE_SIZE,
num_points_written as u32 / DATA_TEXTURE_SIZE,
);

{
crate::profile_scope!("write_pos_size_texture");
ctx.queue.write_texture(

let mut staging_buffer = ctx.cpu_write_gpu_read_belt.lock().allocate(
&ctx.device,
&ctx.gpu_resources.buffers,
num_points_written,
);
staging_buffer.extend(vertices.iter().map(|point| gpu_data::PositionData {
pos: point.position,
radius: point.radius,
}));
staging_buffer.extend(
std::iter::repeat(gpu_data::PositionData::zeroed()).take(num_elements_padding),
);
staging_buffer.copy_to_texture2d(
ctx.active_frame.before_view_builder_encoder.lock().get(),
wgpu::ImageCopyTexture {
texture: &position_data_texture.texture,
mip_level: 0,
origin: wgpu::Origin3d::ZERO,
aspect: wgpu::TextureAspect::All,
},
bytemuck::cast_slice(&position_and_size_staging),
wgpu::ImageDataLayout {
offset: 0,
bytes_per_row: NonZeroU32::new(
DATA_TEXTURE_SIZE * std::mem::size_of::<gpu_data::PositionData>() as u32,
),
rows_per_image: None,
},
size,
texture_copy_extent,
);
}

builder.color_buffer.copy_to_texture(
builder
.color_buffer
.extend(std::iter::repeat(ecolor::Color32::TRANSPARENT).take(num_elements_padding));
builder.color_buffer.copy_to_texture2d(
ctx.active_frame.before_view_builder_encoder.lock().get(),
wgpu::ImageCopyTexture {
texture: &color_texture.texture,
mip_level: 0,
origin: wgpu::Origin3d::ZERO,
aspect: wgpu::TextureAspect::All,
},
NonZeroU32::new(DATA_TEXTURE_SIZE * std::mem::size_of::<[u8; 4]>() as u32),
None,
size,
texture_copy_extent,
);

let draw_data_uniform_buffer_bindings = create_and_fill_uniform_buffer_batch(
Expand Down

1 comment on commit 3390626

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rust Benchmark

Benchmark suite Current: 3390626 Previous: 3590184 Ratio
datastore/insert/batch/rects/insert 600453 ns/iter (± 3693) 596304 ns/iter (± 1701) 1.01
datastore/latest_at/batch/rects/query 1858 ns/iter (± 34) 1839 ns/iter (± 1) 1.01
datastore/latest_at/missing_components/primary 295 ns/iter (± 2) 290 ns/iter (± 0) 1.02
datastore/latest_at/missing_components/secondaries 435 ns/iter (± 6) 442 ns/iter (± 1) 0.98
datastore/range/batch/rects/query 152209 ns/iter (± 982) 151218 ns/iter (± 750) 1.01
mono_points_arrow/generate_message_bundles 30510877 ns/iter (± 1104506) 26921137 ns/iter (± 666418) 1.13
mono_points_arrow/generate_messages 129180999 ns/iter (± 1742998) 118083579 ns/iter (± 1027172) 1.09
mono_points_arrow/encode_log_msg 164849587 ns/iter (± 1191442) 154723587 ns/iter (± 1193846) 1.07
mono_points_arrow/encode_total 327434630 ns/iter (± 2848671) 304142327 ns/iter (± 1532975) 1.08
mono_points_arrow/decode_log_msg 184085765 ns/iter (± 1993974) 175723066 ns/iter (± 1008143) 1.05
mono_points_arrow/decode_message_bundles 59548850 ns/iter (± 1224050) 52533199 ns/iter (± 976735) 1.13
mono_points_arrow/decode_total 240984242 ns/iter (± 2473122) 227693607 ns/iter (± 1636019) 1.06
batch_points_arrow/generate_message_bundles 312256 ns/iter (± 3745) 318864 ns/iter (± 1280) 0.98
batch_points_arrow/generate_messages 5773 ns/iter (± 72) 5973 ns/iter (± 24) 0.97
batch_points_arrow/encode_log_msg 351675 ns/iter (± 3046) 347562 ns/iter (± 985) 1.01
batch_points_arrow/encode_total 685795 ns/iter (± 6090) 700172 ns/iter (± 2134) 0.98
batch_points_arrow/decode_log_msg 345143 ns/iter (± 2810) 348645 ns/iter (± 1931) 0.99
batch_points_arrow/decode_message_bundles 1533 ns/iter (± 20) 1589 ns/iter (± 7) 0.96
batch_points_arrow/decode_total 351177 ns/iter (± 2941) 349681 ns/iter (± 807) 1.00
arrow_mono_points/insert 7068719477 ns/iter (± 209325033) 6211198075 ns/iter (± 9796586) 1.14
arrow_mono_points/query 1754215 ns/iter (± 19640) 1788365 ns/iter (± 8355) 0.98
arrow_batch_points/insert 3102805 ns/iter (± 29435) 3085065 ns/iter (± 9184) 1.01
arrow_batch_points/query 16338 ns/iter (± 229) 16463 ns/iter (± 54) 0.99
arrow_batch_vecs/insert 43602 ns/iter (± 488) 44240 ns/iter (± 113) 0.99
arrow_batch_vecs/query 390186 ns/iter (± 4412) 388409 ns/iter (± 599) 1.00
tuid/Tuid::random 34 ns/iter (± 0) 34 ns/iter (± 0) 1

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.