Reduce memory used by staging belts on Web (#1836)

In particular, this prevents out-of-memory crashes from runaway staging belt memory usage caused by failure to unmap buffers. Somewhat concerningly, the fix relies on our knowledge of how `wgpu::Device::poll` is broken in the current wgpu version. I took the opportunity to sharpen the definition of `HardwareTier` a bit.
rerun-io · Apr 13, 2023 · 4ef2c07 · 4ef2c07 · github-actions · Apr 13, 2023
1 parent 8ce2c89
commit 4ef2c07
Show file tree

Hide file tree

Showing 2 changed files with 37 additions and 25 deletions.
diff --git a/crates/re_renderer/src/config.rs b/crates/re_renderer/src/config.rs
@@ -2,13 +2,16 @@
 ///
 /// To reduce complexity, we don't do fine-grained feature checks,
 /// but instead support set of features, each a superset of the next.
-#[derive(Clone, Copy, Debug)]
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub enum HardwareTier {
-    /// For WebGL and native OpenGL. Maintains strict WebGL capability.
-    Web,
+    /// Limited feature support as provided by WebGL and native GLES2/OpenGL3(ish).
+    Gles,
 
-    /// Run natively with Vulkan/Metal but don't demand anything that isn't widely available.
-    Native,
+    /// Full support of WebGPU spec without additional feature requirements.
+    ///
+    /// Expecting to run in a stable WebGPU implementation.
+    /// I.e. either natively with Vulkan/Metal or in a browser with WebGPU support.
+    FullWebGpuSupport,
     // Run natively with Vulkan/Metal and require additional features.
     //HighEnd
 }
@@ -17,16 +20,16 @@ impl HardwareTier {
     /// Whether the current hardware tier supports sampling from textures with a sample count higher than 1.
     pub fn support_sampling_msaa_texture(&self) -> bool {
         match self {
-            HardwareTier::Web => false,
-            HardwareTier::Native => true,
+            HardwareTier::Gles => false,
+            HardwareTier::FullWebGpuSupport => true,
         }
     }
 
     /// Whether the current hardware tier supports sampling from textures with a sample count higher than 1.
     pub fn support_depth_readback(&self) -> bool {
         match self {
-            HardwareTier::Web => false,
-            HardwareTier::Native => true,
+            HardwareTier::Gles => false,
+            HardwareTier::FullWebGpuSupport => true,
         }
     }
 }
@@ -35,9 +38,9 @@ impl Default for HardwareTier {
     fn default() -> Self {
         // Use "Basic" tier for actual web but also if someone forces the GL backend!
         if supported_backends() == wgpu::Backends::GL {
-            HardwareTier::Web
+            HardwareTier::Gles
         } else {
-            HardwareTier::Native
+            HardwareTier::FullWebGpuSupport
         }
     }
 }
@@ -72,9 +75,9 @@ impl HardwareTier {
     pub fn required_downlevel_capabilities(self) -> wgpu::DownlevelCapabilities {
         wgpu::DownlevelCapabilities {
             flags: match self {
-                HardwareTier::Web => wgpu::DownlevelFlags::empty(),
+                HardwareTier::Gles => wgpu::DownlevelFlags::empty(),
                 // Require fully WebGPU compliance for the native tier.
-                HardwareTier::Native => wgpu::DownlevelFlags::all(),
+                HardwareTier::FullWebGpuSupport => wgpu::DownlevelFlags::all(),
             },
             limits: Default::default(), // unused so far both here and in wgpu
             shader_model: wgpu::ShaderModel::Sm4,

diff --git a/crates/re_renderer/src/context.rs b/crates/re_renderer/src/context.rs
@@ -5,7 +5,7 @@ use type_map::concurrent::{self, TypeMap};
 
 use crate::{
     allocator::{CpuWriteGpuReadBelt, GpuReadbackBelt},
-    config::RenderContextConfig,
+    config::{HardwareTier, RenderContextConfig},
     global_bindings::GlobalBindings,
     renderer::Renderer,
     resource_managers::{MeshManager, TextureManager2D},
@@ -78,14 +78,11 @@ impl Renderers {
 impl RenderContext {
     /// Chunk size for our cpu->gpu buffer manager.
     ///
-    /// For native: 32MiB chunk size (as big as a for instance a 2048x1024 float4 texture)
-    /// For web (memory constraint!): 8MiB
-    #[cfg(not(target_arch = "wasm32"))]
+    /// 32MiB chunk size (as big as, for instance, a 2048x1024 float4 texture)
+    /// (it's tempting to use something smaller on Web, but this may just cause more
+    /// buffers to be allocated the moment we want to upload a bigger chunk)
     const CPU_WRITE_GPU_READ_BELT_DEFAULT_CHUNK_SIZE: Option<wgpu::BufferSize> =
         wgpu::BufferSize::new(1024 * 1024 * 32);
-    #[cfg(target_arch = "wasm32")]
-    const CPU_WRITE_GPU_READ_BELT_DEFAULT_CHUNK_SIZE: Option<wgpu::BufferSize> =
-        wgpu::BufferSize::new(1024 * 1024 * 8);
 
     /// Chunk size for our gpu->cpu buffer manager.
     ///
@@ -210,14 +207,26 @@ impl RenderContext {
     fn poll_device(&mut self) {
         crate::profile_function!();
 
-        // Browsers don't let us wait for GPU work via `poll`.
-        // * WebGPU: `poll` is a no-op as the spec doesn't specify it at all.
+        // Browsers don't let us wait for GPU work via `poll`:
+        //
+        // * WebGPU: `poll` is a no-op as the spec doesn't specify it at all. Calling it doesn't hurt though.
+        //
         // * WebGL: Internal timeout can't go above a browser specific value.
         //          Since wgpu ran into issues in the past with some browsers returning errors,
         //          it uses a timeout of zero and ignores errors there.
-        //          TODO(andreas): That's not the only thing that's weird with `maintain` in general.
-        //                          See https://github.com/gfx-rs/wgpu/issues/3601
-        if cfg!(target_arch = "wasm32") {
+        //
+        //          This causes unused buffers to be freed immediately, which is wrong but also doesn't hurt
+        //          since WebGL doesn't care about freeing buffers/textures that are still in use.
+        //          Meaning, that from our POV we're actually freeing cpu memory that we wanted to free anyways.
+        //          *More importantly this means that we get buffers from the staging belts back earlier!*
+        //          Therefore, we just always "block" instead on WebGL to free as early as possible,
+        //          knowing that we're not _actually_ blocking.
+        //
+        //          For more details check https://github.com/gfx-rs/wgpu/issues/3601
+        if cfg!(target_arch = "wasm32")
+            && self.shared_renderer_data.config.hardware_tier == HardwareTier::Gles
+        {
+            self.device.poll(wgpu::Maintain::Wait);
             return;
         }
Benchmark suite	Current: `4ef2c07`	Previous: `8ce2c89`	Ratio
`datastore/num_rows=1000/num_instances=1000/packed=false/insert/default`	`3000273` ns/iter (`± 135246`)	`2805157` ns/iter (`± 21952`)	`1.07`
`datastore/num_rows=1000/num_instances=1000/packed=false/latest_at/default`	`371` ns/iter (`± 1`)	`371` ns/iter (`± 2`)	`1`
`datastore/num_rows=1000/num_instances=1000/packed=false/latest_at_missing/primary/default`	`267` ns/iter (`± 0`)	`268` ns/iter (`± 7`)	`1.00`
`datastore/num_rows=1000/num_instances=1000/packed=false/latest_at_missing/secondaries/default`	`424` ns/iter (`± 3`)	`423` ns/iter (`± 1`)	`1.00`
`datastore/num_rows=1000/num_instances=1000/packed=false/range/default`	`3178345` ns/iter (`± 181338`)	`2957379` ns/iter (`± 33811`)	`1.07`
`datastore/num_rows=1000/num_instances=1000/gc/default`	`2375881` ns/iter (`± 28687`)	`2373834` ns/iter (`± 13792`)	`1.00`
`mono_points_arrow/generate_message_bundles`	`29850491` ns/iter (`± 1206110`)	`27714050` ns/iter (`± 1421175`)	`1.08`
`mono_points_arrow/generate_messages`	`126393503` ns/iter (`± 1153225`)	`111333705` ns/iter (`± 1096683`)	`1.14`
`mono_points_arrow/encode_log_msg`	`155936174` ns/iter (`± 2432731`)	`142486652` ns/iter (`± 1140263`)	`1.09`
`mono_points_arrow/encode_total`	`314229721` ns/iter (`± 2619665`)	`284107636` ns/iter (`± 3211586`)	`1.11`
`mono_points_arrow/decode_log_msg`	`191152086` ns/iter (`± 1376183`)	`178211507` ns/iter (`± 707486`)	`1.07`
`mono_points_arrow/decode_message_bundles`	`70193258` ns/iter (`± 942602`)	`58202707` ns/iter (`± 1385753`)	`1.21`
`mono_points_arrow/decode_total`	`259465238` ns/iter (`± 1434917`)	`236544483` ns/iter (`± 1347321`)	`1.10`
`mono_points_arrow_batched/generate_message_bundles`	`20435269` ns/iter (`± 1821856`)	`21687589` ns/iter (`± 1000022`)	`0.94`
`mono_points_arrow_batched/generate_messages`	`4287229` ns/iter (`± 312108`)	`4044695` ns/iter (`± 82484`)	`1.06`
`mono_points_arrow_batched/encode_log_msg`	`1383807` ns/iter (`± 8675`)	`1385815` ns/iter (`± 4211`)	`1.00`
`mono_points_arrow_batched/encode_total`	`30139038` ns/iter (`± 1931856`)	`27995745` ns/iter (`± 869388`)	`1.08`
`mono_points_arrow_batched/decode_log_msg`	`780267` ns/iter (`± 4217`)	`778589` ns/iter (`± 2042`)	`1.00`
`mono_points_arrow_batched/decode_message_bundles`	`7925993` ns/iter (`± 269049`)	`7674435` ns/iter (`± 142817`)	`1.03`
`mono_points_arrow_batched/decode_total`	`9145441` ns/iter (`± 678645`)	`8622349` ns/iter (`± 187213`)	`1.06`
`batch_points_arrow/generate_message_bundles`	`239434` ns/iter (`± 1454`)	`238655` ns/iter (`± 1157`)	`1.00`
`batch_points_arrow/generate_messages`	`5056` ns/iter (`± 25`)	`5130` ns/iter (`± 52`)	`0.99`
`batch_points_arrow/encode_log_msg`	`262205` ns/iter (`± 1948`)	`258393` ns/iter (`± 1307`)	`1.01`
`batch_points_arrow/encode_total`	`533987` ns/iter (`± 3055`)	`536381` ns/iter (`± 2567`)	`1.00`
`batch_points_arrow/decode_log_msg`	`210456` ns/iter (`± 944`)	`212006` ns/iter (`± 10710`)	`0.99`
`batch_points_arrow/decode_message_bundles`	`1854` ns/iter (`± 15`)	`1817` ns/iter (`± 20`)	`1.02`
`batch_points_arrow/decode_total`	`223379` ns/iter (`± 1344`)	`219673` ns/iter (`± 2167`)	`1.02`
`arrow_mono_points/insert`	`2555769741` ns/iter (`± 4516343`)	`2301485408` ns/iter (`± 5989184`)	`1.11`
`arrow_mono_points/query`	`1224433` ns/iter (`± 15040`)	`1222200` ns/iter (`± 14817`)	`1.00`
`arrow_batch_points/insert`	`1159167` ns/iter (`± 5615`)	`1135050` ns/iter (`± 19020`)	`1.02`
`arrow_batch_points/query`	`14598` ns/iter (`± 388`)	`14564` ns/iter (`± 248`)	`1.00`
`arrow_batch_vecs/insert`	`26688` ns/iter (`± 144`)	`25893` ns/iter (`± 263`)	`1.03`
`arrow_batch_vecs/query`	`325766` ns/iter (`± 1648`)	`321555` ns/iter (`± 3782`)	`1.01`
`tuid/Tuid::random`	`34` ns/iter (`± 0`)	`34` ns/iter (`± 1`)	`1`