Skip to content

Commit

Permalink
Optimize the depth-cloud shader when depth=0 (#1729)
Browse files Browse the repository at this point in the history
* Optimize the depth-cloud shader when depth=0

depth=0 is a degenerate case that causes a 1440x1920 depth-map
from #1538 to take 75ms (!) on my
M1 MacBook Pro.

With this fix, this goes down to 15 ms.

However, if we zoom out a lot, so that the depth point cloud covers
a very small part of the screen, the slowness returns.

I suspect this is a GPU binning problem of some sort, where too many
points in too small of a screen area cause performance issues.

* Put happy-path first, and also handle NaNs

* var -> let

* Remove unnecessary parentheses in wgsl if-statements
  • Loading branch information
emilk authored Mar 30, 2023
1 parent b2298e9 commit 82f5d69
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 34 deletions.
79 changes: 46 additions & 33 deletions crates/re_renderer/shader/depth_cloud.wgsl
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,6 @@

// ---

struct PointData {
pos_in_world: Vec3,
unresolved_radius: f32,
color: Vec4
}

// ---

/// Keep in sync with `DepthCloudInfoUBO` in `depth_cloud.rs`.
///
/// Same for all draw-phases.
Expand Down Expand Up @@ -66,6 +58,14 @@ struct VertexOut {
@location(3) point_radius: f32,
};

// ---

struct PointData {
pos_in_world: Vec3,
unresolved_radius: f32,
color: Vec4
}

// Backprojects the depth texture using the intrinsics passed in the uniform buffer.
fn compute_point_data(quad_idx: i32) -> PointData {
let wh = textureDimensions(depth_texture);
Expand All @@ -74,26 +74,32 @@ fn compute_point_data(quad_idx: i32) -> PointData {
// TODO(cmc): expose knobs to linearize/normalize/flip/cam-to-plane depth.
let world_space_depth = depth_cloud_info.world_depth_from_texture_value * textureLoad(depth_texture, texcoords, 0).x;

// TODO(cmc): albedo textures
let color = Vec4(colormap_linear(depth_cloud_info.colormap, world_space_depth / depth_cloud_info.max_depth_in_world), 1.0);

// TODO(cmc): This assumes a pinhole camera; need to support other kinds at some point.
let intrinsics = depth_cloud_info.depth_camera_intrinsics;
let focal_length = Vec2(intrinsics[0][0], intrinsics[1][1]);
let offset = Vec2(intrinsics[2][0], intrinsics[2][1]);

let pos_in_obj = Vec3(
(Vec2(texcoords) - offset) * world_space_depth / focal_length,
world_space_depth,
);

let pos_in_world = depth_cloud_info.world_from_obj * Vec4(pos_in_obj, 1.0);

var data: PointData;
data.pos_in_world = pos_in_world.xyz;
data.unresolved_radius = depth_cloud_info.point_radius_from_world_depth * world_space_depth;
data.color = color;

if 0.0 < world_space_depth && world_space_depth < f32max {
// TODO(cmc): albedo textures
let color = Vec4(colormap_linear(depth_cloud_info.colormap, world_space_depth / depth_cloud_info.max_depth_in_world), 1.0);

// TODO(cmc): This assumes a pinhole camera; need to support other kinds at some point.
let intrinsics = depth_cloud_info.depth_camera_intrinsics;
let focal_length = Vec2(intrinsics[0][0], intrinsics[1][1]);
let offset = Vec2(intrinsics[2][0], intrinsics[2][1]);

let pos_in_obj = Vec3(
(Vec2(texcoords) - offset) * world_space_depth / focal_length,
world_space_depth,
);

let pos_in_world = depth_cloud_info.world_from_obj * Vec4(pos_in_obj, 1.0);

data.pos_in_world = pos_in_world.xyz;
data.unresolved_radius = depth_cloud_info.point_radius_from_world_depth * world_space_depth;
data.color = color;
} else {
// Degenerate case
data.pos_in_world = Vec3(0.0);
data.unresolved_radius = 0.0;
data.color = Vec4(0.0);
}
return data;
}

Expand All @@ -104,15 +110,22 @@ fn vs_main(@builtin(vertex_index) vertex_idx: u32) -> VertexOut {
// Compute point data (valid for the entire quad).
let point_data = compute_point_data(quad_idx);

// Span quad
let quad = sphere_quad_span(vertex_idx, point_data.pos_in_world, point_data.unresolved_radius, depth_cloud_info.radius_boost_in_ui_points);

var out: VertexOut;
out.pos_in_clip = frame.projection_from_world * Vec4(quad.pos_in_world, 1.0);
out.pos_in_world = quad.pos_in_world;
out.point_pos_in_world = point_data.pos_in_world;
out.point_color = point_data.color;
out.point_radius = quad.point_resolved_radius;

if 0.0 < point_data.unresolved_radius {
// Span quad
let quad = sphere_quad_span(vertex_idx, point_data.pos_in_world, point_data.unresolved_radius, depth_cloud_info.radius_boost_in_ui_points);
out.pos_in_clip = frame.projection_from_world * Vec4(quad.pos_in_world, 1.0);
out.pos_in_world = quad.pos_in_world;
out.point_radius = quad.point_resolved_radius;
} else {
// Degenerate case - early-out!
out.pos_in_clip = Vec4(0.0);
out.pos_in_world = Vec3(0.0);
out.point_radius = 0.0;
}

return out;
}
Expand Down
2 changes: 1 addition & 1 deletion crates/re_renderer/shader/instanced_mesh.wgsl
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ fn fs_main_shaded(in: VertexOut) -> @location(0) Vec4 {
* material.albedo_factor.rgb
+ in.additive_tint_rgb;

if (all(in.normal_world_space == Vec3(0.0, 0.0, 0.0))) {
if all(in.normal_world_space == Vec3(0.0, 0.0, 0.0)) {
// no normal, no shading
return Vec4(albedo, 1.0);
} else {
Expand Down

1 comment on commit 82f5d69

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rust Benchmark

Benchmark suite Current: 82f5d69 Previous: 6d219e6 Ratio
datastore/num_rows=1000/num_instances=1000/packed=false/insert/default 10706570 ns/iter (± 389113)
datastore/num_rows=1000/num_instances=1000/packed=false/insert/bucketsz=0 12452965 ns/iter (± 359625)
datastore/num_rows=1000/num_instances=1000/packed=false/insert/bucketsz=2 11718709 ns/iter (± 366919)
datastore/num_rows=1000/num_instances=1000/packed=false/insert/bucketsz=32 10344431 ns/iter (± 349602)
datastore/num_rows=1000/num_instances=1000/packed=false/insert/bucketsz=2048 10205868 ns/iter (± 334515)
datastore/num_rows=1000/num_instances=1000/packed=true/insert/default 9931804 ns/iter (± 351812)
datastore/num_rows=1000/num_instances=1000/packed=true/insert/bucketsz=0 11597255 ns/iter (± 597620)
datastore/num_rows=1000/num_instances=1000/packed=true/insert/bucketsz=2 11200708 ns/iter (± 457977)
datastore/num_rows=1000/num_instances=1000/packed=true/insert/bucketsz=32 10043978 ns/iter (± 350760)
datastore/num_rows=1000/num_instances=1000/packed=true/insert/bucketsz=2048 9999497 ns/iter (± 315662)
datastore/num_rows=1000/num_instances=1000/packed=false/latest_at/default 1806 ns/iter (± 37)
datastore/num_rows=1000/num_instances=1000/packed=false/latest_at/bucketsz=0 1837 ns/iter (± 26)
datastore/num_rows=1000/num_instances=1000/packed=false/latest_at/bucketsz=2 1829 ns/iter (± 25)
datastore/num_rows=1000/num_instances=1000/packed=false/latest_at/bucketsz=32 1804 ns/iter (± 25)
datastore/num_rows=1000/num_instances=1000/packed=false/latest_at/bucketsz=2048 1797 ns/iter (± 26)
datastore/num_rows=1000/num_instances=1000/packed=true/latest_at/default 1827 ns/iter (± 29)
datastore/num_rows=1000/num_instances=1000/packed=true/latest_at/bucketsz=0 1838 ns/iter (± 24)
datastore/num_rows=1000/num_instances=1000/packed=true/latest_at/bucketsz=2 1815 ns/iter (± 24)
datastore/num_rows=1000/num_instances=1000/packed=true/latest_at/bucketsz=32 1807 ns/iter (± 22)
datastore/num_rows=1000/num_instances=1000/packed=true/latest_at/bucketsz=2048 1814 ns/iter (± 27)
datastore/num_rows=1000/num_instances=1000/packed=false/latest_at_missing/primary/default 273 ns/iter (± 4)
datastore/num_rows=1000/num_instances=1000/packed=false/latest_at_missing/secondaries/default 420 ns/iter (± 5)
datastore/num_rows=1000/num_instances=1000/packed=false/latest_at_missing/primary/bucketsz=0 272 ns/iter (± 3)
datastore/num_rows=1000/num_instances=1000/packed=false/latest_at_missing/secondaries/bucketsz=0 429 ns/iter (± 6)
datastore/num_rows=1000/num_instances=1000/packed=false/latest_at_missing/primary/bucketsz=2 274 ns/iter (± 3)
datastore/num_rows=1000/num_instances=1000/packed=false/latest_at_missing/secondaries/bucketsz=2 430 ns/iter (± 6)
datastore/num_rows=1000/num_instances=1000/packed=false/latest_at_missing/primary/bucketsz=32 272 ns/iter (± 4)
datastore/num_rows=1000/num_instances=1000/packed=false/latest_at_missing/secondaries/bucketsz=32 426 ns/iter (± 6)
datastore/num_rows=1000/num_instances=1000/packed=false/latest_at_missing/primary/bucketsz=2048 273 ns/iter (± 4)
datastore/num_rows=1000/num_instances=1000/packed=false/latest_at_missing/secondaries/bucketsz=2048 428 ns/iter (± 6)
datastore/num_rows=1000/num_instances=1000/packed=true/latest_at_missing/primary/default 272 ns/iter (± 4)
datastore/num_rows=1000/num_instances=1000/packed=true/latest_at_missing/secondaries/default 425 ns/iter (± 6)
datastore/num_rows=1000/num_instances=1000/packed=true/latest_at_missing/primary/bucketsz=0 275 ns/iter (± 4)
datastore/num_rows=1000/num_instances=1000/packed=true/latest_at_missing/secondaries/bucketsz=0 430 ns/iter (± 6)
datastore/num_rows=1000/num_instances=1000/packed=true/latest_at_missing/primary/bucketsz=2 271 ns/iter (± 3)
datastore/num_rows=1000/num_instances=1000/packed=true/latest_at_missing/secondaries/bucketsz=2 428 ns/iter (± 5)
datastore/num_rows=1000/num_instances=1000/packed=true/latest_at_missing/primary/bucketsz=32 272 ns/iter (± 3)
datastore/num_rows=1000/num_instances=1000/packed=true/latest_at_missing/secondaries/bucketsz=32 422 ns/iter (± 5)
datastore/num_rows=1000/num_instances=1000/packed=true/latest_at_missing/primary/bucketsz=2048 273 ns/iter (± 4)
datastore/num_rows=1000/num_instances=1000/packed=true/latest_at_missing/secondaries/bucketsz=2048 420 ns/iter (± 6)
datastore/num_rows=1000/num_instances=1000/packed=false/range/default 10731768 ns/iter (± 391536)
datastore/num_rows=1000/num_instances=1000/packed=false/range/bucketsz=0 2426758 ns/iter (± 24070)
datastore/num_rows=1000/num_instances=1000/packed=false/range/bucketsz=2 2356495 ns/iter (± 23638)
datastore/num_rows=1000/num_instances=1000/packed=false/range/bucketsz=32 2056590 ns/iter (± 21943)
datastore/num_rows=1000/num_instances=1000/packed=false/range/bucketsz=2048 1939902 ns/iter (± 24654)
datastore/num_rows=1000/num_instances=1000/packed=true/range/default 10241736 ns/iter (± 538329)
datastore/num_rows=1000/num_instances=1000/packed=true/range/bucketsz=0 2375451 ns/iter (± 29926)
datastore/num_rows=1000/num_instances=1000/packed=true/range/bucketsz=2 2389685 ns/iter (± 22242)
datastore/num_rows=1000/num_instances=1000/packed=true/range/bucketsz=32 2044606 ns/iter (± 23931)
datastore/num_rows=1000/num_instances=1000/packed=true/range/bucketsz=2048 1982228 ns/iter (± 22101)
mono_points_arrow/generate_message_bundles 40599288 ns/iter (± 697218) 48140756 ns/iter (± 553167) 0.84
mono_points_arrow/generate_messages 165205809 ns/iter (± 1451326) 186798797 ns/iter (± 1628346) 0.88
mono_points_arrow/encode_log_msg 205478067 ns/iter (± 1474392) 230999657 ns/iter (± 3655728) 0.89
mono_points_arrow/encode_total 412623098 ns/iter (± 2059405) 462511998 ns/iter (± 4022138) 0.89
mono_points_arrow/decode_log_msg 253241563 ns/iter (± 1595261) 276288133 ns/iter (± 1820309) 0.92
mono_points_arrow/decode_message_bundles 84861512 ns/iter (± 1103824) 101352669 ns/iter (± 1355560) 0.84
mono_points_arrow/decode_total 336380010 ns/iter (± 2357202) 372735857 ns/iter (± 2649819) 0.90
mono_points_arrow_batched/generate_message_bundles 30957516 ns/iter (± 1271417) 45633505 ns/iter (± 826076) 0.68
mono_points_arrow_batched/generate_messages 9263854 ns/iter (± 397210) 16223157 ns/iter (± 767037) 0.57
mono_points_arrow_batched/encode_log_msg 1931279 ns/iter (± 19131) 1855934 ns/iter (± 37424) 1.04
mono_points_arrow_batched/encode_total 44449171 ns/iter (± 1420466) 61330388 ns/iter (± 1387494) 0.72
mono_points_arrow_batched/decode_log_msg 1065718 ns/iter (± 9497) 1031892 ns/iter (± 38272) 1.03
mono_points_arrow_batched/decode_message_bundles 16651452 ns/iter (± 712473) 24058907 ns/iter (± 355251) 0.69
mono_points_arrow_batched/decode_total 18118866 ns/iter (± 686053) 25835228 ns/iter (± 488503) 0.70
batch_points_arrow/generate_message_bundles 280045 ns/iter (± 4128) 291854 ns/iter (± 2182) 0.96
batch_points_arrow/generate_messages 7381 ns/iter (± 99) 7839 ns/iter (± 24) 0.94
batch_points_arrow/encode_log_msg 425198 ns/iter (± 3168) 400285 ns/iter (± 3045) 1.06
batch_points_arrow/encode_total 717571 ns/iter (± 8166) 723520 ns/iter (± 10485) 0.99
batch_points_arrow/decode_log_msg 361435 ns/iter (± 3655) 352255 ns/iter (± 6589) 1.03
batch_points_arrow/decode_message_bundles 2790 ns/iter (± 38) 2941 ns/iter (± 10) 0.95
batch_points_arrow/decode_total 368416 ns/iter (± 3757) 355726 ns/iter (± 3360) 1.04
arrow_mono_points/insert 6182615719 ns/iter (± 22343798) 8198778068 ns/iter (± 96848496) 0.75
arrow_mono_points/query 1851832 ns/iter (± 20835) 2023940 ns/iter (± 250786) 0.91
arrow_batch_points/insert 3113566 ns/iter (± 30277) 3199163 ns/iter (± 275112) 0.97
arrow_batch_points/query 16716 ns/iter (± 252) 17113 ns/iter (± 26) 0.98
arrow_batch_vecs/insert 45410 ns/iter (± 646) 43284 ns/iter (± 461) 1.05
arrow_batch_vecs/query 506513 ns/iter (± 6390) 507484 ns/iter (± 1052) 1.00
tuid/Tuid::random 33 ns/iter (± 0) 34 ns/iter (± 0) 0.97

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.