Skip to content

Commit

Permalink
Reduce memory used by staging belts on Web (#1836)
Browse files Browse the repository at this point in the history
In particular, this prevents out-of-memory crashes from runaway staging belt memory usage caused by a failure to unmap buffers. Somewhat concerningly, the fix relies on our knowledge of how `wgpu::Device::poll` is broken in the current wgpu version.

I took the opportunity to sharpen the definition of `HardwareTier` a bit.
  • Loading branch information
Wumpf authored Apr 13, 2023
1 parent 8ce2c89 commit 4ef2c07
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 25 deletions.
29 changes: 16 additions & 13 deletions crates/re_renderer/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,16 @@
///
/// To reduce complexity, we don't do fine-grained feature checks,
/// but instead support a set of features, each a superset of the next.
#[derive(Clone, Copy, Debug)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum HardwareTier {
/// For WebGL and native OpenGL. Maintains strict WebGL capability.
Web,
/// Limited feature support as provided by WebGL and native GLES2/OpenGL3(ish).
Gles,

/// Run natively with Vulkan/Metal but don't demand anything that isn't widely available.
Native,
/// Full support of WebGPU spec without additional feature requirements.
///
/// Expecting to run in a stable WebGPU implementation.
/// I.e. either natively with Vulkan/Metal or in a browser with WebGPU support.
FullWebGpuSupport,
// Run natively with Vulkan/Metal and require additional features.
//HighEnd
}
Expand All @@ -17,16 +20,16 @@ impl HardwareTier {
/// Whether the current hardware tier supports sampling from textures with a sample count higher than 1.
pub fn support_sampling_msaa_texture(&self) -> bool {
match self {
HardwareTier::Web => false,
HardwareTier::Native => true,
HardwareTier::Gles => false,
HardwareTier::FullWebGpuSupport => true,
}
}

/// Whether the current hardware tier supports depth readback.
pub fn support_depth_readback(&self) -> bool {
match self {
HardwareTier::Web => false,
HardwareTier::Native => true,
HardwareTier::Gles => false,
HardwareTier::FullWebGpuSupport => true,
}
}
}
Expand All @@ -35,9 +38,9 @@ impl Default for HardwareTier {
fn default() -> Self {
// Use the lowest tier for actual web but also if someone forces the GL backend!
if supported_backends() == wgpu::Backends::GL {
HardwareTier::Web
HardwareTier::Gles
} else {
HardwareTier::Native
HardwareTier::FullWebGpuSupport
}
}
}
Expand Down Expand Up @@ -72,9 +75,9 @@ impl HardwareTier {
pub fn required_downlevel_capabilities(self) -> wgpu::DownlevelCapabilities {
wgpu::DownlevelCapabilities {
flags: match self {
HardwareTier::Web => wgpu::DownlevelFlags::empty(),
HardwareTier::Gles => wgpu::DownlevelFlags::empty(),
// Require full WebGPU compliance for this tier.
HardwareTier::Native => wgpu::DownlevelFlags::all(),
HardwareTier::FullWebGpuSupport => wgpu::DownlevelFlags::all(),
},
limits: Default::default(), // unused so far both here and in wgpu
shader_model: wgpu::ShaderModel::Sm4,
Expand Down
33 changes: 21 additions & 12 deletions crates/re_renderer/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use type_map::concurrent::{self, TypeMap};

use crate::{
allocator::{CpuWriteGpuReadBelt, GpuReadbackBelt},
config::RenderContextConfig,
config::{HardwareTier, RenderContextConfig},
global_bindings::GlobalBindings,
renderer::Renderer,
resource_managers::{MeshManager, TextureManager2D},
Expand Down Expand Up @@ -78,14 +78,11 @@ impl Renderers {
impl RenderContext {
/// Chunk size for our cpu->gpu buffer manager.
///
/// For native: 32MiB chunk size (as big as, for instance, a 2048x1024 float4 texture)
/// For web (memory constraint!): 8MiB
#[cfg(not(target_arch = "wasm32"))]
/// 32MiB chunk size (as big as, for instance, a 2048x1024 float4 texture)
/// (it's tempting to use something smaller on Web, but this may just cause more
/// buffers to be allocated the moment we want to upload a bigger chunk)
const CPU_WRITE_GPU_READ_BELT_DEFAULT_CHUNK_SIZE: Option<wgpu::BufferSize> =
wgpu::BufferSize::new(1024 * 1024 * 32);
#[cfg(target_arch = "wasm32")]
const CPU_WRITE_GPU_READ_BELT_DEFAULT_CHUNK_SIZE: Option<wgpu::BufferSize> =
wgpu::BufferSize::new(1024 * 1024 * 8);

/// Chunk size for our gpu->cpu buffer manager.
///
Expand Down Expand Up @@ -210,14 +207,26 @@ impl RenderContext {
fn poll_device(&mut self) {
crate::profile_function!();

// Browsers don't let us wait for GPU work via `poll`.
// * WebGPU: `poll` is a no-op as the spec doesn't specify it at all.
// Browsers don't let us wait for GPU work via `poll`:
//
// * WebGPU: `poll` is a no-op as the spec doesn't specify it at all. Calling it doesn't hurt though.
//
// * WebGL: Internal timeout can't go above a browser specific value.
// Since wgpu ran into issues in the past with some browsers returning errors,
// it uses a timeout of zero and ignores errors there.
// TODO(andreas): That's not the only thing that's weird with `maintain` in general.
// See https://github.com/gfx-rs/wgpu/issues/3601
if cfg!(target_arch = "wasm32") {
//
// This causes unused buffers to be freed immediately, which is wrong but also doesn't hurt
// since WebGL doesn't care about freeing buffers/textures that are still in use.
// Meaning that, from our POV, we're actually freeing CPU memory that we wanted to free anyway.
// *More importantly this means that we get buffers from the staging belts back earlier!*
// Therefore, we just always "block" instead on WebGL to free as early as possible,
// knowing that we're not _actually_ blocking.
//
// For more details check https://github.com/gfx-rs/wgpu/issues/3601
if cfg!(target_arch = "wasm32")
&& self.shared_renderer_data.config.hardware_tier == HardwareTier::Gles
{
self.device.poll(wgpu::Maintain::Wait);
return;
}

Expand Down

1 comment on commit 4ef2c07

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rust Benchmark

Benchmark suite Current: 4ef2c07 Previous: 8ce2c89 Ratio
datastore/num_rows=1000/num_instances=1000/packed=false/insert/default 3000273 ns/iter (± 135246) 2805157 ns/iter (± 21952) 1.07
datastore/num_rows=1000/num_instances=1000/packed=false/latest_at/default 371 ns/iter (± 1) 371 ns/iter (± 2) 1
datastore/num_rows=1000/num_instances=1000/packed=false/latest_at_missing/primary/default 267 ns/iter (± 0) 268 ns/iter (± 7) 1.00
datastore/num_rows=1000/num_instances=1000/packed=false/latest_at_missing/secondaries/default 424 ns/iter (± 3) 423 ns/iter (± 1) 1.00
datastore/num_rows=1000/num_instances=1000/packed=false/range/default 3178345 ns/iter (± 181338) 2957379 ns/iter (± 33811) 1.07
datastore/num_rows=1000/num_instances=1000/gc/default 2375881 ns/iter (± 28687) 2373834 ns/iter (± 13792) 1.00
mono_points_arrow/generate_message_bundles 29850491 ns/iter (± 1206110) 27714050 ns/iter (± 1421175) 1.08
mono_points_arrow/generate_messages 126393503 ns/iter (± 1153225) 111333705 ns/iter (± 1096683) 1.14
mono_points_arrow/encode_log_msg 155936174 ns/iter (± 2432731) 142486652 ns/iter (± 1140263) 1.09
mono_points_arrow/encode_total 314229721 ns/iter (± 2619665) 284107636 ns/iter (± 3211586) 1.11
mono_points_arrow/decode_log_msg 191152086 ns/iter (± 1376183) 178211507 ns/iter (± 707486) 1.07
mono_points_arrow/decode_message_bundles 70193258 ns/iter (± 942602) 58202707 ns/iter (± 1385753) 1.21
mono_points_arrow/decode_total 259465238 ns/iter (± 1434917) 236544483 ns/iter (± 1347321) 1.10
mono_points_arrow_batched/generate_message_bundles 20435269 ns/iter (± 1821856) 21687589 ns/iter (± 1000022) 0.94
mono_points_arrow_batched/generate_messages 4287229 ns/iter (± 312108) 4044695 ns/iter (± 82484) 1.06
mono_points_arrow_batched/encode_log_msg 1383807 ns/iter (± 8675) 1385815 ns/iter (± 4211) 1.00
mono_points_arrow_batched/encode_total 30139038 ns/iter (± 1931856) 27995745 ns/iter (± 869388) 1.08
mono_points_arrow_batched/decode_log_msg 780267 ns/iter (± 4217) 778589 ns/iter (± 2042) 1.00
mono_points_arrow_batched/decode_message_bundles 7925993 ns/iter (± 269049) 7674435 ns/iter (± 142817) 1.03
mono_points_arrow_batched/decode_total 9145441 ns/iter (± 678645) 8622349 ns/iter (± 187213) 1.06
batch_points_arrow/generate_message_bundles 239434 ns/iter (± 1454) 238655 ns/iter (± 1157) 1.00
batch_points_arrow/generate_messages 5056 ns/iter (± 25) 5130 ns/iter (± 52) 0.99
batch_points_arrow/encode_log_msg 262205 ns/iter (± 1948) 258393 ns/iter (± 1307) 1.01
batch_points_arrow/encode_total 533987 ns/iter (± 3055) 536381 ns/iter (± 2567) 1.00
batch_points_arrow/decode_log_msg 210456 ns/iter (± 944) 212006 ns/iter (± 10710) 0.99
batch_points_arrow/decode_message_bundles 1854 ns/iter (± 15) 1817 ns/iter (± 20) 1.02
batch_points_arrow/decode_total 223379 ns/iter (± 1344) 219673 ns/iter (± 2167) 1.02
arrow_mono_points/insert 2555769741 ns/iter (± 4516343) 2301485408 ns/iter (± 5989184) 1.11
arrow_mono_points/query 1224433 ns/iter (± 15040) 1222200 ns/iter (± 14817) 1.00
arrow_batch_points/insert 1159167 ns/iter (± 5615) 1135050 ns/iter (± 19020) 1.02
arrow_batch_points/query 14598 ns/iter (± 388) 14564 ns/iter (± 248) 1.00
arrow_batch_vecs/insert 26688 ns/iter (± 144) 25893 ns/iter (± 263) 1.03
arrow_batch_vecs/query 325766 ns/iter (± 1648) 321555 ns/iter (± 3782) 1.01
tuid/Tuid::random 34 ns/iter (± 0) 34 ns/iter (± 1) 1

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.