From 1fba25d783245320dbb3c09eae1263c184031d78 Mon Sep 17 00:00:00 2001
From: Andreas Reich <r_andreas2@web.de>
Date: Fri, 31 Mar 2023 16:53:13 +0200
Subject: [PATCH 1/2] Replace TextureRowDataInfo with the more versatile
 Texture2DBufferInfo

---
 .../src/allocator/cpu_write_gpu_read_belt.rs  | 10 +-
 .../src/allocator/gpu_readback_belt.rs        | 10 +-
 .../src/draw_phases/picking_layer.rs          | 44 ++++-----
 .../re_renderer/src/draw_phases/screenshot.rs | 13 ++-
 .../re_renderer/src/renderer/depth_cloud.rs   | 21 ++---
 crates/re_renderer/src/wgpu_resources/mod.rs  | 91 +++++++++++++++----
 6 files changed, 117 insertions(+), 72 deletions(-)
diff --git a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs
index db9f14b0a8fa..a3900e18a143 100644
--- a/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs
+++ b/crates/re_renderer/src/allocator/cpu_write_gpu_read_belt.rs
@@ -1,6 +1,6 @@
 use std::{num::NonZeroU32, sync::mpsc};
 
-use crate::wgpu_resources::{BufferDesc, GpuBuffer, GpuBufferPool, TextureRowDataInfo};
+use crate::wgpu_resources::{BufferDesc, GpuBuffer, GpuBufferPool, Texture2DBufferInfo};
 
 /// A sub-allocated staging buffer that can be written to.
 ///
@@ -100,15 +100,13 @@ where
         destination: wgpu::ImageCopyTexture<'_>,
         copy_extent: glam::UVec2,
     ) {
-        let bytes_per_row = TextureRowDataInfo::new(destination.texture.format(), copy_extent.x)
-            .bytes_per_row_padded;
+        let buffer_info = Texture2DBufferInfo::new(destination.texture.format(), copy_extent);
 
         // Validate that we stay within the written part of the slice (wgpu can't fully know our intention here, so we have to check).
         // We go one step further and require the size to be exactly equal - it's too unlikely that you wrote more than is needed!
         // (and if you did you probably have regrets anyways!)
-        let required_buffer_size = bytes_per_row * copy_extent.y;
         debug_assert_eq!(
-            required_buffer_size as usize,
+            buffer_info.buffer_size_padded as usize,
             self.num_written() * std::mem::size_of::<T>()
         );
 
@@ -117,7 +115,7 @@ where
                 buffer: &self.chunk_buffer,
                 layout: wgpu::ImageDataLayout {
                     offset: self.byte_offset_in_chunk_buffer,
-                    bytes_per_row: NonZeroU32::new(bytes_per_row),
+                    bytes_per_row: NonZeroU32::new(buffer_info.bytes_per_row_padded),
                     rows_per_image: None,
                 },
             },
diff --git a/crates/re_renderer/src/allocator/gpu_readback_belt.rs b/crates/re_renderer/src/allocator/gpu_readback_belt.rs
index 09fcdd981054..6facafc3e761 100644
--- a/crates/re_renderer/src/allocator/gpu_readback_belt.rs
+++ b/crates/re_renderer/src/allocator/gpu_readback_belt.rs
@@ -1,6 +1,6 @@
 use std::{num::NonZeroU32, ops::Range, sync::mpsc};
 
-use crate::wgpu_resources::{BufferDesc, GpuBuffer, GpuBufferPool, TextureRowDataInfo};
+use crate::wgpu_resources::{BufferDesc, GpuBuffer, GpuBufferPool, Texture2DBufferInfo};
 
 /// Identifier used to identify a buffer upon retrieval of the data.
 ///
@@ -61,13 +61,11 @@ impl GpuReadbackBuffer {
                 source.texture.format().describe().block_size as u64,
             );
 
-            let bytes_per_row = TextureRowDataInfo::new(source.texture.format(), copy_extents.x)
-                .bytes_per_row_padded;
-            let num_bytes = bytes_per_row * copy_extents.y;
+            let buffer_info = Texture2DBufferInfo::new(source.texture.format(), *copy_extents);
 
             // Validate that stay within the slice (wgpu can't fully know our intention here, so we have to check).
             debug_assert!(
-                (num_bytes as u64) <= self.range_in_chunk.end - start_offset,
+                buffer_info.buffer_size_padded <= self.range_in_chunk.end - start_offset,
                 "Texture data is too large to fit into the readback buffer!"
             );
 
@@ -77,7 +75,7 @@ impl GpuReadbackBuffer {
                     buffer: &self.chunk_buffer,
                     layout: wgpu::ImageDataLayout {
                         offset: start_offset,
-                        bytes_per_row: NonZeroU32::new(bytes_per_row),
+                        bytes_per_row: NonZeroU32::new(buffer_info.bytes_per_row_padded),
                         rows_per_image: None,
                     },
                 },
diff --git a/crates/re_renderer/src/draw_phases/picking_layer.rs b/crates/re_renderer/src/draw_phases/picking_layer.rs
index 8a52147077ba..5270f4bf6093 100644
--- a/crates/re_renderer/src/draw_phases/picking_layer.rs
+++ b/crates/re_renderer/src/draw_phases/picking_layer.rs
@@ -13,7 +13,7 @@ use crate::{
     allocator::create_and_fill_uniform_buffer,
     global_bindings::FrameUniformBuffer,
     view_builder::ViewBuilder,
-    wgpu_resources::{GpuBindGroup, GpuTexture, TextureDesc, TextureRowDataInfo},
+    wgpu_resources::{GpuBindGroup, GpuTexture, Texture2DBufferInfo, TextureDesc},
     DebugLabel, GpuReadbackBuffer, GpuReadbackIdentifier, IntRect, RenderContext,
 };
 
@@ -122,12 +122,23 @@ impl PickingLayerProcessor {
         readback_identifier: GpuReadbackIdentifier,
         readback_user_data: T,
     ) -> Self {
-        let row_info = TextureRowDataInfo::new(Self::PICKING_LAYER_FORMAT, picking_rect.width());
-        let buffer_size = row_info.bytes_per_row_padded * picking_rect.height();
+        let row_info_id = Texture2DBufferInfo::new(Self::PICKING_LAYER_FORMAT, picking_rect.extent);
+        //let row_info_depth =
+        //Texture2DBufferInfo::new(Self::PICKING_LAYER_FORMAT, picking_rect.extent);
+
+        // Offset of the depth buffer in the readback buffer needs to be aligned to size of a depth pixel.
+        // This is "trivially true" if the size of the depth format is a multiple of the size of the id format.
+        debug_assert!(
+            Self::PICKING_LAYER_FORMAT.describe().block_size
+                % Self::PICKING_LAYER_DEPTH_FORMAT.describe().block_size
+                == 0
+        );
+        let buffer_size = row_info_id.buffer_size_padded; // + row_info_depth.buffer_size_padded;
+
         let readback_buffer = ctx.gpu_readback_belt.lock().allocate(
             &ctx.device,
             &ctx.gpu_resources.buffers,
-            buffer_size as u64,
+            buffer_size,
             readback_identifier,
             Box::new(ReadbackBeltMetadata {
                 picking_rect,
@@ -279,31 +290,12 @@ impl PickingLayerProcessor {
         ctx.gpu_readback_belt
             .lock()
             .readback_data::<ReadbackBeltMetadata<T>>(identifier, |data, metadata| {
-                // Due to https://github.com/gfx-rs/wgpu/issues/3508 the data might be completely unaligned,
-                // so much, that we can't interpret it just as `PickingLayerId`.
-                // Therefore, we have to do a copy of the data regardless.
-                let row_info = TextureRowDataInfo::new(
+                let buffer_info_id = Texture2DBufferInfo::new(
                     Self::PICKING_LAYER_FORMAT,
-                    metadata.picking_rect.extent.x,
+                    metadata.picking_rect.extent,
                 );
 
-                // Copies need to use [u8] because of aforementioned alignment issues.
-                let mut picking_data = vec![
-                    PickingLayerId::default();
-                    (metadata.picking_rect.extent.x * metadata.picking_rect.extent.y)
-                        as usize
-                ];
-                let picking_data_as_u8 = bytemuck::cast_slice_mut(&mut picking_data);
-                for row in 0..metadata.picking_rect.extent.y {
-                    let offset_padded = (row_info.bytes_per_row_padded * row) as usize;
-                    let offset_unpadded = (row_info.bytes_per_row_unpadded * row) as usize;
-                    picking_data_as_u8[offset_unpadded
-                        ..(offset_unpadded + row_info.bytes_per_row_unpadded as usize)]
-                        .copy_from_slice(
-                            &data[offset_padded
-                                ..(offset_padded + row_info.bytes_per_row_unpadded as usize)],
-                        );
-                }
+                let picking_data = buffer_info_id.remove_padding_and_convert(data);
 
                 result = Some(PickingResult {
                     picking_data,
diff --git a/crates/re_renderer/src/draw_phases/screenshot.rs b/crates/re_renderer/src/draw_phases/screenshot.rs
index 875284986c90..68c05b3b545c 100644
--- a/crates/re_renderer/src/draw_phases/screenshot.rs
+++ b/crates/re_renderer/src/draw_phases/screenshot.rs
@@ -11,7 +11,7 @@
 //! Or alternatively try to render the images in several tiles 🤔. In any case this would greatly improve quality!
 
 use crate::{
-    wgpu_resources::{GpuTexture, TextureDesc, TextureRowDataInfo},
+    wgpu_resources::{GpuTexture, Texture2DBufferInfo, TextureDesc},
     DebugLabel, GpuReadbackBuffer, GpuReadbackIdentifier, RenderContext,
 };
 
@@ -37,12 +37,11 @@ impl ScreenshotProcessor {
         readback_identifier: GpuReadbackIdentifier,
         readback_user_data: T,
     ) -> Self {
-        let row_info = TextureRowDataInfo::new(Self::SCREENSHOT_COLOR_FORMAT, resolution.x);
-        let buffer_size = row_info.bytes_per_row_padded * resolution.y;
+        let buffer_info = Texture2DBufferInfo::new(Self::SCREENSHOT_COLOR_FORMAT, resolution);
         let screenshot_readback_buffer = ctx.gpu_readback_belt.lock().allocate(
             &ctx.device,
             &ctx.gpu_resources.buffers,
-            buffer_size as u64,
+            buffer_info.buffer_size_padded,
             readback_identifier,
             Box::new(ReadbackBeltMetadata {
                 extent: resolution,
@@ -130,9 +129,9 @@ impl ScreenshotProcessor {
             .lock()
             .readback_data::<ReadbackBeltMetadata<T>>(identifier, |data: &[u8], metadata| {
                 screenshot_was_available = Some(());
-                let texture_row_info =
-                    TextureRowDataInfo::new(Self::SCREENSHOT_COLOR_FORMAT, metadata.extent.x);
-                let texture_data = texture_row_info.remove_padding(data);
+                let buffer_info =
+                    Texture2DBufferInfo::new(Self::SCREENSHOT_COLOR_FORMAT, metadata.extent);
+                let texture_data = buffer_info.remove_padding(data);
                 on_screenshot(&texture_data, metadata.extent, metadata.user_data);
             });
         screenshot_was_available
diff --git a/crates/re_renderer/src/renderer/depth_cloud.rs b/crates/re_renderer/src/renderer/depth_cloud.rs
index 358132110442..c69def6eca10 100644
--- a/crates/re_renderer/src/renderer/depth_cloud.rs
+++ b/crates/re_renderer/src/renderer/depth_cloud.rs
@@ -21,8 +21,8 @@ use crate::{
     view_builder::ViewBuilder,
     wgpu_resources::{
         BindGroupDesc, BindGroupEntry, BindGroupLayoutDesc, GpuBindGroup, GpuBindGroupLayoutHandle,
-        GpuRenderPipelineHandle, GpuTexture, PipelineLayoutDesc, RenderPipelineDesc, TextureDesc,
-        TextureRowDataInfo,
+        GpuRenderPipelineHandle, GpuTexture, PipelineLayoutDesc, RenderPipelineDesc,
+        Texture2DBufferInfo, TextureDesc,
     },
     ColorMap, OutlineMaskPreference, PickingLayerProcessor,
 };
@@ -336,34 +336,33 @@ fn create_and_upload_texture<T: bytemuck::Pod>(
         .textures
         .alloc(&ctx.device, &depth_texture_desc);
 
-    let TextureRowDataInfo {
-        bytes_per_row_unpadded: bytes_per_row_unaligned,
-        bytes_per_row_padded,
-    } = TextureRowDataInfo::new(depth_texture_desc.format, depth_texture_desc.size.width);
-
     // Not supporting compressed formats here.
     debug_assert!(depth_texture_desc.format.describe().block_dimensions == (1, 1));
 
+    let buffer_info =
+        Texture2DBufferInfo::new(depth_texture_desc.format, depth_cloud.depth_dimensions);
+
     // TODO(andreas): CpuGpuWriteBelt should make it easier to do this.
-    let bytes_padding_per_row = (bytes_per_row_padded - bytes_per_row_unaligned) as usize;
+    let bytes_padding_per_row =
+        (buffer_info.bytes_per_row_padded - buffer_info.bytes_per_row_unpadded) as usize;
     // Sanity check the padding size. If this happens something is seriously wrong, as it would imply
     // that we can't express the required alignment with the block size.
     debug_assert!(
         bytes_padding_per_row % std::mem::size_of::<T>() == 0,
         "Padding is not a multiple of pixel size. Can't correctly pad the texture data"
     );
-    let num_pixel_padding_per_row = bytes_padding_per_row / std::mem::size_of::<T>();
 
     let mut depth_texture_staging = ctx.cpu_write_gpu_read_belt.lock().allocate::<T>(
         &ctx.device,
         &ctx.gpu_resources.buffers,
-        data.len() + num_pixel_padding_per_row * depth_texture_desc.size.height as usize,
+        buffer_info.buffer_size_padded as usize / std::mem::size_of::<T>(),
     );
 
     // Fill with a single copy if possible, otherwise do multiple, filling in padding.
-    if num_pixel_padding_per_row == 0 {
+    if bytes_padding_per_row == 0 {
         depth_texture_staging.extend_from_slice(data);
     } else {
+        let num_pixel_padding_per_row = bytes_padding_per_row / std::mem::size_of::<T>();
         for row in data.chunks(depth_texture_desc.size.width as usize) {
             depth_texture_staging.extend_from_slice(row);
             depth_texture_staging
diff --git a/crates/re_renderer/src/wgpu_resources/mod.rs b/crates/re_renderer/src/wgpu_resources/mod.rs
index ef272248e3dd..a61f4eef26bc 100644
--- a/crates/re_renderer/src/wgpu_resources/mod.rs
+++ b/crates/re_renderer/src/wgpu_resources/mod.rs
@@ -117,9 +117,9 @@ impl WgpuResourcePools {
     }
 }
 
-/// Utility for dealing with rows of raw texture data.
-#[derive(Clone, Copy)]
-pub struct TextureRowDataInfo {
+/// Utility for dealing with buffers containing raw 2D texture data.
+#[derive(Clone)]
+pub struct Texture2DBufferInfo {
     /// How many bytes per row contain actual data.
     pub bytes_per_row_unpadded: u32,
 
@@ -127,34 +127,57 @@ pub struct TextureRowDataInfo {
     ///
     /// Padding bytes are always at the end of a row.
     pub bytes_per_row_padded: u32,
+
+    /// Size required for an unpadded buffer.
+    pub buffer_size_unpadded: wgpu::BufferAddress,
+
+    /// Size required for a padded buffer as it is read/written from/to the GPU.
+    pub buffer_size_padded: wgpu::BufferAddress,
 }
 
-impl TextureRowDataInfo {
-    pub fn new(format: wgpu::TextureFormat, width: u32) -> Self {
+impl Texture2DBufferInfo {
+    #[inline]
+    pub fn new(format: wgpu::TextureFormat, extent: glam::UVec2) -> Self {
         let format_info = format.describe();
-        let width_blocks = width / format_info.block_dimensions.0 as u32;
-        let bytes_per_row_unaligned = width_blocks * format_info.block_size as u32;
+
+        let width_blocks = extent.x / format_info.block_dimensions.0 as u32;
+        let height_blocks = extent.y / format_info.block_dimensions.1 as u32;
+
+        let bytes_per_row_unpadded = width_blocks * format_info.block_size as u32;
+        let bytes_per_row_padded =
+            wgpu::util::align_to(bytes_per_row_unpadded, wgpu::COPY_BYTES_PER_ROW_ALIGNMENT);
 
         Self {
-            bytes_per_row_unpadded: bytes_per_row_unaligned,
-            bytes_per_row_padded: wgpu::util::align_to(
-                bytes_per_row_unaligned,
-                wgpu::COPY_BYTES_PER_ROW_ALIGNMENT,
-            ),
+            bytes_per_row_unpadded,
+            bytes_per_row_padded,
+            buffer_size_unpadded: (bytes_per_row_unpadded * height_blocks) as wgpu::BufferAddress,
+            buffer_size_padded: (bytes_per_row_padded * height_blocks) as wgpu::BufferAddress,
         }
     }
 
+    #[inline]
+    pub fn num_rows(&self) -> u32 {
+        self.buffer_size_padded as u32 / self.bytes_per_row_padded
+    }
+
     /// Removes the padding from a buffer containing gpu texture data.
+    ///
+    /// The buffer have the expected size for a padded buffer to hold the texture data.
+    ///
+    /// Note that if you're passing in gpu data, there no alignment guarantees on the returned slice,
+    /// do NOT convert it using [`bytemuck`]. Use [`Texture2DBufferInfo::remove_padding_and_convert`] instead.
     pub fn remove_padding<'a>(&self, buffer: &'a [u8]) -> Cow<'a, [u8]> {
+        crate::profile_function!();
+
+        assert!(buffer.len() as wgpu::BufferAddress == self.buffer_size_padded);
+
         if self.bytes_per_row_padded == self.bytes_per_row_unpadded {
             return Cow::Borrowed(buffer);
         }
 
-        let height = (buffer.len() as u32) / self.bytes_per_row_padded;
-        let mut unpadded_buffer =
-            Vec::with_capacity((self.bytes_per_row_unpadded * height) as usize);
+        let mut unpadded_buffer = Vec::with_capacity(self.buffer_size_unpadded as _);
 
-        for row in 0..height {
+        for row in 0..self.num_rows() {
             let offset = (self.bytes_per_row_padded * row) as usize;
             unpadded_buffer.extend_from_slice(
                 &buffer[offset..(offset + self.bytes_per_row_unpadded as usize)],
@@ -163,4 +186,40 @@ impl TextureRowDataInfo {
 
         unpadded_buffer.into()
     }
+
+    /// Removes the padding from a buffer containing gpu texture data and remove convert to a given type.
+    ///
+    /// The buffer have the expected size for a padded buffer to hold the texture data.
+    ///
+    /// The unpadded row size is expected to be a multiple of the size of the target type.
+    /// (Which means that, while uncommon, it technically doesn't need to be as big as a block in the pixel - this can be useful for e.g. packing wide bitfields)
+    pub fn remove_padding_and_convert<T: bytemuck::Pod>(&self, buffer: &[u8]) -> Vec<T> {
+        crate::profile_function!();
+
+        assert!(buffer.len() as wgpu::BufferAddress == self.buffer_size_padded);
+        assert!(self.bytes_per_row_unpadded % std::mem::size_of::<T>() as u32 == 0);
+
+        // Due to https://github.com/gfx-rs/wgpu/issues/3508 the data might be completely unaligned,
+        // so much, that we can't even interpret it as e.g. a u32 slice.
+        // Therefore, we have to do a copy of the data regardless of whether it's padded or not.
+
+        let mut unpadded_buffer: Vec<T> = vec![
+            T::zeroed();
+            (self.num_rows() * self.bytes_per_row_unpadded / std::mem::size_of::<T>() as u32)
+                as usize
+        ]; // Consider using unsafe set_len() instead of vec![] to avoid zeroing the memory.
+        let unpaadded_buffer_u8 = bytemuck::cast_slice_mut(&mut unpadded_buffer);
+
+        for row in 0..self.num_rows() {
+            let offset_padded = (self.bytes_per_row_padded * row) as usize;
+            let offset_unpadded = (self.bytes_per_row_unpadded * row) as usize;
+            unpaadded_buffer_u8
+                [offset_unpadded..(offset_unpadded + self.bytes_per_row_unpadded as usize)]
+                .copy_from_slice(
+                    &buffer[offset_padded..(offset_padded + self.bytes_per_row_unpadded as usize)],
+                );
+        }
+
+        unpadded_buffer
+    }
 }

From ec8adf84b4e1b0f70f7c54b53144e8a011e8f175 Mon Sep 17 00:00:00 2001
From: Andreas Reich <r_andreas2@web.de>
Date: Mon, 3 Apr 2023 10:32:04 +0200
Subject: [PATCH 2/2] comment & naming fixes

---
 crates/re_renderer/src/wgpu_resources/mod.rs | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/crates/re_renderer/src/wgpu_resources/mod.rs b/crates/re_renderer/src/wgpu_resources/mod.rs
index a61f4eef26bc..403e46350b47 100644
--- a/crates/re_renderer/src/wgpu_resources/mod.rs
+++ b/crates/re_renderer/src/wgpu_resources/mod.rs
@@ -162,7 +162,7 @@ impl Texture2DBufferInfo {
 
     /// Removes the padding from a buffer containing gpu texture data.
     ///
-    /// The buffer have the expected size for a padded buffer to hold the texture data.
+    /// The passed in buffer is to be expected to be exactly of size [`Texture2DBufferInfo::buffer_size_padded`].
     ///
     /// Note that if you're passing in gpu data, there no alignment guarantees on the returned slice,
     /// do NOT convert it using [`bytemuck`]. Use [`Texture2DBufferInfo::remove_padding_and_convert`] instead.
@@ -189,7 +189,7 @@ impl Texture2DBufferInfo {
 
     /// Removes the padding from a buffer containing gpu texture data and remove convert to a given type.
     ///
-    /// The buffer have the expected size for a padded buffer to hold the texture data.
+    /// The passed in buffer is to be expected to be exactly of size [`Texture2DBufferInfo::buffer_size_padded`].
     ///
     /// The unpadded row size is expected to be a multiple of the size of the target type.
     /// (Which means that, while uncommon, it technically doesn't need to be as big as a block in the pixel - this can be useful for e.g. packing wide bitfields)
@@ -207,13 +207,15 @@ impl Texture2DBufferInfo {
             T::zeroed();
             (self.num_rows() * self.bytes_per_row_unpadded / std::mem::size_of::<T>() as u32)
                 as usize
-        ]; // Consider using unsafe set_len() instead of vec![] to avoid zeroing the memory.
-        let unpaadded_buffer_u8 = bytemuck::cast_slice_mut(&mut unpadded_buffer);
+        ]; // TODO(andreas): Consider using unsafe set_len() instead of vec![] to avoid zeroing the memory.
+
+        // The copy has to happen on a u8 slice, because any other type would assume some alignment that we can't guarantee because of the above.
+        let unpadded_buffer_u8_view = bytemuck::cast_slice_mut(&mut unpadded_buffer);
 
         for row in 0..self.num_rows() {
             let offset_padded = (self.bytes_per_row_padded * row) as usize;
             let offset_unpadded = (self.bytes_per_row_unpadded * row) as usize;
-            unpaadded_buffer_u8
+            unpadded_buffer_u8_view
                 [offset_unpadded..(offset_unpadded + self.bytes_per_row_unpadded as usize)]
                 .copy_from_slice(
                     &buffer[offset_padded..(offset_padded + self.bytes_per_row_unpadded as usize)],