Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow initialising shaders in parallel #455

Merged
merged 7 commits into from
Feb 22, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions crates/tests/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::{env, fs::File, path::Path, sync::Arc};
use std::{env, fs::File, num::NonZeroUsize, path::Path, sync::Arc};

use anyhow::{anyhow, bail, Result};
use vello::{
Expand Down Expand Up @@ -62,7 +62,7 @@ pub async fn render(scene: Scene, params: &TestParams) -> Result<Image> {
RendererOptions {
surface_format: None,
use_cpu: params.use_cpu,
initialise_in_parallel: false,
num_init_threads: NonZeroUsize::new(1),
antialiasing_support: vello::AaSupport::area_only(),
},
)
Expand Down
3 changes: 2 additions & 1 deletion examples/headless/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use std::{
fs::File,
num::NonZeroUsize,
path::{Path, PathBuf},
};

Expand Down Expand Up @@ -90,7 +91,7 @@ async fn render(mut scenes: SceneSet, index: usize, args: &Args) -> Result<()> {
RendererOptions {
surface_format: None,
use_cpu: args.use_cpu,
initialise_in_parallel: false,
num_init_threads: NonZeroUsize::new(1),
DJMcNab marked this conversation as resolved.
Show resolved Hide resolved
antialiasing_support: vello::AaSupport::area_only(),
},
)
Expand Down
3 changes: 2 additions & 1 deletion examples/scenes/src/svg.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,8 @@ fn example_scene_of(file: PathBuf) -> ExampleScene {
.unwrap_or_else(|| "unknown".to_string());
ExampleScene {
function: Box::new(svg_function_of(name.clone(), move || {
std::fs::read_to_string(file).expect("failed to read svg file")
std::fs::read_to_string(&file)
.unwrap_or_else(|e| panic!("failed to read svg file {file:?}: {e}"))
})),
config: crate::SceneConfig {
animated: false,
Expand Down
5 changes: 4 additions & 1 deletion examples/with_bevy/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
use std::num::NonZeroUsize;

use bevy::render::{Render, RenderSet};
use bevy::utils::synccell::SyncCell;
use vello::kurbo::{Affine, Point, Rect, Stroke};
Expand Down Expand Up @@ -29,7 +31,8 @@ impl FromWorld for VelloRenderer {
device.wgpu_device(),
RendererOptions {
surface_format: None,
initialise_in_parallel: false,
// TODO: We should ideally use the Bevy threadpool here
num_init_threads: NonZeroUsize::new(1),
antialiasing_support: vello::AaSupport::area_only(),
use_cpu: false,
},
Expand Down
24 changes: 16 additions & 8 deletions examples/with_winit/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
// Also licensed under MIT license, at your choice.

use instant::{Duration, Instant};
use std::num::NonZeroUsize;
use std::{collections::HashSet, sync::Arc};

use anyhow::Result;
Expand Down Expand Up @@ -52,9 +53,20 @@ struct Args {
/// Whether to use CPU shaders
use_cpu: bool,
/// Whether to force initialising the shaders serially (rather than spawning threads)
/// This has no effect on wasm, and on macOS for performance reasons
#[arg(long)]
serial_initialisation: bool,
/// This has no effect on wasm, and defaults to 1 on macOS for performance reasons
///
/// Use `0` for an automatic choice
#[arg(long, default_value_t=default_threads())]
num_init_threads: usize,
}

/// Default value for the `--num-init-threads` command line argument.
///
/// Returns `1` (single-threaded shader initialisation) on macOS and `0`
/// everywhere else. A value of `0` is later turned into `None` by
/// `NonZeroUsize::new`, which requests the automatic thread-count choice.
fn default_threads() -> usize {
    // The macOS target is spelled `target_os = "macos"`. The previous `"mac"`
    // spelling never matches any target, so the macOS-specific default was
    // silently never applied.
    if cfg!(target_os = "macos") {
        1
    } else {
        0
    }
}

struct RenderState<'s> {
Expand Down Expand Up @@ -542,11 +554,7 @@ fn run(
surface_format: Some(render_state.surface.format),
use_cpu,
antialiasing_support: vello::AaSupport::all(),
// We exclude macOS because it (supposedly) makes pipeline compilation slower
// see https://github.com/bevyengine/bevy/pull/10812#discussion_r1496138004
// In theory, we should only exclude metal adapters, but the difference is very minor
// wasm isn't supported
initialise_in_parallel: !args.serial_initialisation && cfg!(not(target_os="mac"))
num_init_threads: NonZeroUsize::new(args.num_init_threads)
},
)
.expect("Could create renderer")
Expand Down
16 changes: 11 additions & 5 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ mod shaders;
#[cfg(feature = "wgpu")]
mod wgpu_engine;

use std::time::Instant;
use std::{num::NonZeroUsize, time::Instant};

/// Styling and composition primitives.
pub use peniko;
Expand Down Expand Up @@ -143,25 +143,31 @@ pub struct RendererOptions {
/// pipeline permutations should be compiled at startup.
pub antialiasing_support: AaSupport,

/// Whether to initialise shaders in parallel
/// How many threads to use for initialisation of shaders.
///
/// Use `Some(1)` to use a single thread. This is recommended when on macOS
/// (see https://github.com/bevyengine/bevy/pull/10812#discussion_r1496138004)
///
/// Set to `None` to use a heuristic which will use many but not all threads
///
/// Has no effect on WebAssembly
pub initialise_in_parallel: bool,
pub num_init_threads: Option<NonZeroUsize>,
}

#[cfg(feature = "wgpu")]
impl Renderer {
/// Creates a new renderer for the specified device.
pub fn new(device: &Device, options: RendererOptions) -> Result<Self> {
let mut engine = WgpuEngine::new(options.use_cpu);
if options.initialise_in_parallel {
// If we are running in parallel (i.e. the number of threads is not 1)
if options.num_init_threads != NonZeroUsize::new(1) {
#[cfg(not(target_arch = "wasm32"))]
engine.use_parallel_initialisation();
}
let start = Instant::now();
let shaders = shaders::full_shaders(device, &mut engine, &options)?;
#[cfg(not(target_arch = "wasm32"))]
engine.build_shaders_if_needed(device);
engine.build_shaders_if_needed(device, options.num_init_threads);
eprintln!("Building shaders took {:?}", start.elapsed());
let blit = options
.surface_format
Expand Down
25 changes: 17 additions & 8 deletions src/wgpu_engine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -151,15 +151,24 @@ impl WgpuEngine {

#[cfg(not(target_arch = "wasm32"))]
/// Initialise (in parallel) any shaders which are yet to be created
pub fn build_shaders_if_needed(&mut self, device: &Device) {
pub fn build_shaders_if_needed(
&mut self,
device: &Device,
num_threads: Option<std::num::NonZeroUsize>,
) {
use std::num::NonZeroUsize;

if let Some(mut new_shaders) = self.shaders_to_initialise.take() {
// Try and not to use all threads
// (This choice is arbitrary, and could be tuned, although a 'proper' work stealing system should be used instead)
let threads_to_use =
std::thread::available_parallelism().map_or(2, |it| it.get().max(4) - 2);
eprintln!("Initialising in parallel using {threads_to_use} threads");
let remainder =
new_shaders.split_off(new_shaders.len().max(threads_to_use) - threads_to_use);
let num_threads = num_threads.map(NonZeroUsize::get).unwrap_or_else(|| {
// Fall back on a heuristic. This tries not to use all threads.
// We keep the main thread blocked and not doing much whilst this is running,
// so we broadly leave two cores unused at the point of maximum parallelism
// (This choice is arbitrary, and could be tuned, although a 'proper' threadpool
// should probably be used instead)
std::thread::available_parallelism().map_or(2, |it| it.get().max(4) - 2)
});
eprintln!("Initialising in parallel using {num_threads} threads");
let remainder = new_shaders.split_off(new_shaders.len().max(num_threads) - num_threads);
let (tx, rx) = std::sync::mpsc::channel::<(ShaderId, WgpuShader)>();

// We expect each initialisation to take much longer than acquiring a lock, so we just use a mutex for our work queue
Expand Down