linebender · DJMcNab · Feb 22, 2024 · Feb 21, 2024 · Feb 21, 2024 · Feb 21, 2024
diff --git a/crates/tests/src/lib.rs b/crates/tests/src/lib.rs
@@ -1,4 +1,4 @@
-use std::{env, fs::File, path::Path, sync::Arc};
+use std::{env, fs::File, num::NonZeroUsize, path::Path, sync::Arc};
 
 use anyhow::{anyhow, bail, Result};
 use vello::{
@@ -62,7 +62,7 @@ pub async fn render(scene: Scene, params: &TestParams) -> Result<Image> {
         RendererOptions {
             surface_format: None,
             use_cpu: params.use_cpu,
-            initialise_in_parallel: false,
+            num_init_threads: NonZeroUsize::new(1),
             antialiasing_support: vello::AaSupport::area_only(),
         },
     )

diff --git a/examples/headless/src/main.rs b/examples/headless/src/main.rs
@@ -1,5 +1,6 @@
 use std::{
     fs::File,
+    num::NonZeroUsize,
     path::{Path, PathBuf},
 };
 
@@ -90,7 +91,7 @@ async fn render(mut scenes: SceneSet, index: usize, args: &Args) -> Result<()> {
         RendererOptions {
             surface_format: None,
             use_cpu: args.use_cpu,
-            initialise_in_parallel: false,
+            num_init_threads: NonZeroUsize::new(1),
             antialiasing_support: vello::AaSupport::area_only(),
         },
     )

diff --git a/examples/scenes/src/svg.rs b/examples/scenes/src/svg.rs
@@ -76,7 +76,8 @@ fn example_scene_of(file: PathBuf) -> ExampleScene {
         .unwrap_or_else(|| "unknown".to_string());
     ExampleScene {
         function: Box::new(svg_function_of(name.clone(), move || {
-            std::fs::read_to_string(file).expect("failed to read svg file")
+            std::fs::read_to_string(&file)
+                .unwrap_or_else(|e| panic!("failed to read svg file {file:?}: {e}"))
         })),
         config: crate::SceneConfig {
             animated: false,

diff --git a/examples/with_bevy/src/main.rs b/examples/with_bevy/src/main.rs
@@ -1,3 +1,5 @@
+use std::num::NonZeroUsize;
+
 use bevy::render::{Render, RenderSet};
 use bevy::utils::synccell::SyncCell;
 use vello::kurbo::{Affine, Point, Rect, Stroke};
@@ -29,7 +31,8 @@ impl FromWorld for VelloRenderer {
                 device.wgpu_device(),
                 RendererOptions {
                     surface_format: None,
-                    initialise_in_parallel: false,
+                    // TODO: We should ideally use the Bevy threadpool here
+                    num_init_threads: NonZeroUsize::new(1),
                     antialiasing_support: vello::AaSupport::area_only(),
                     use_cpu: false,
                 },

diff --git a/examples/with_winit/src/lib.rs b/examples/with_winit/src/lib.rs
@@ -15,6 +15,7 @@
 // Also licensed under MIT license, at your choice.
 
 use instant::{Duration, Instant};
+use std::num::NonZeroUsize;
 use std::{collections::HashSet, sync::Arc};
 
 use anyhow::Result;
@@ -52,9 +53,20 @@ struct Args {
     /// Whether to use CPU shaders
     use_cpu: bool,
     /// Whether to force initialising the shaders serially (rather than spawning threads)
-    /// This has no effect on wasm, and on macOS for performance reasons
-    #[arg(long)]
-    serial_initialisation: bool,
+    /// This has no effect on wasm, and defaults to 1 on macOS for performance reasons
+    ///
+    /// Use `0` for an automatic choice
+    #[arg(long, default_value_t=default_threads())]
+    num_init_threads: usize,
+}
+
+/// Default thread count: `1` on macOS (parallel init is reportedly slower), else `0` (auto).
+fn default_threads() -> usize {
+    if cfg!(target_os = "macos") {
+        1
+    } else {
+        0
+    }
 }
 
 struct RenderState<'s> {
@@ -542,11 +554,7 @@ fn run(
                                     surface_format: Some(render_state.surface.format),
                                     use_cpu,
                                     antialiasing_support: vello::AaSupport::all(),
-                                    // We exclude macOS because it (supposedly) makes pipeline compilation slower
-                                    // see https://github.com/bevyengine/bevy/pull/10812#discussion_r1496138004
-                                    // In theory, we should only exclude metal adapters, but the difference is very minor
-                                    // wasm isn't supported
-                                    initialise_in_parallel: !args.serial_initialisation && cfg!(not(target_os="mac"))
+                                    num_init_threads: NonZeroUsize::new(args.num_init_threads)
                                 },
                             )
                             .expect("Could create renderer")

diff --git a/src/lib.rs b/src/lib.rs
@@ -25,7 +25,7 @@ mod shaders;
 #[cfg(feature = "wgpu")]
 mod wgpu_engine;
 
-use std::time::Instant;
+use std::{num::NonZeroUsize, time::Instant};
 
 /// Styling and composition primitives.
 pub use peniko;
@@ -143,25 +143,31 @@ pub struct RendererOptions {
     /// pipeline permutations should be compiled at startup.
     pub antialiasing_support: AaSupport,
 
-    /// Whether to initialise shaders in parallel
+    /// How many threads to use for initialisation of shaders.
+    ///
+    /// Use `Some(1)` to use a single thread. This is recommended when on macOS
+    /// (see <https://github.com/bevyengine/bevy/pull/10812#discussion_r1496138004>)
+    ///
+    /// Set to `None` to use a heuristic which will use many but not all threads
     ///
     /// Has no effect on WebAssembly
-    pub initialise_in_parallel: bool,
+    pub num_init_threads: Option<NonZeroUsize>,
 }
 
 #[cfg(feature = "wgpu")]
 impl Renderer {
     /// Creates a new renderer for the specified device.
     pub fn new(device: &Device, options: RendererOptions) -> Result<Self> {
         let mut engine = WgpuEngine::new(options.use_cpu);
-        if options.initialise_in_parallel {
+        // If we are running in parallel (i.e. the number of threads is not 1)
+        if options.num_init_threads != NonZeroUsize::new(1) {
             #[cfg(not(target_arch = "wasm32"))]
             engine.use_parallel_initialisation();
         }
         let start = Instant::now();
         let shaders = shaders::full_shaders(device, &mut engine, &options)?;
         #[cfg(not(target_arch = "wasm32"))]
-        engine.build_shaders_if_needed(device);
+        engine.build_shaders_if_needed(device, options.num_init_threads);
         eprintln!("Building shaders took {:?}", start.elapsed());
         let blit = options
             .surface_format

diff --git a/src/wgpu_engine.rs b/src/wgpu_engine.rs
@@ -151,15 +151,24 @@ impl WgpuEngine {
 
     #[cfg(not(target_arch = "wasm32"))]
     /// Initialise (in parallel) any shaders which are yet to be created
-    pub fn build_shaders_if_needed(&mut self, device: &Device) {
+    pub fn build_shaders_if_needed(
+        &mut self,
+        device: &Device,
+        num_threads: Option<std::num::NonZeroUsize>,
+    ) {
+        use std::num::NonZeroUsize;
+
         if let Some(mut new_shaders) = self.shaders_to_initialise.take() {
-            // Try and not to use all threads
-            // (This choice is arbitrary, and could be tuned, although a 'proper' work stealing system should be used instead)
-            let threads_to_use =
-                std::thread::available_parallelism().map_or(2, |it| it.get().max(4) - 2);
-            eprintln!("Initialising in parallel using {threads_to_use} threads");
-            let remainder =
-                new_shaders.split_off(new_shaders.len().max(threads_to_use) - threads_to_use);
+            let num_threads = num_threads.map(NonZeroUsize::get).unwrap_or_else(|| {
+                // Fall back onto a heuristic. This tries not to use all threads.
+                // We keep the main thread blocked and not doing much whilst this is running,
+                // so we broadly leave two cores unused at the point of maximum parallelism
+                // (This choice is arbitrary, and could be tuned, although a 'proper' threadpool
+                // should probably be used instead)
+                std::thread::available_parallelism().map_or(2, |it| it.get().max(4) - 2)
+            });
+            eprintln!("Initialising in parallel using {num_threads} threads");
+            let remainder = new_shaders.split_off(new_shaders.len().max(num_threads) - num_threads);
             let (tx, rx) = std::sync::mpsc::channel::<(ShaderId, WgpuShader)>();
 
             // We expect each initialisation to take much longer than acquiring a lock, so we just use a mutex for our work queue