From b1336e4ede4a7c1a7ed1dd18e2df1325cf5d7840 Mon Sep 17 00:00:00 2001
From: James Liu <contact@jamessliu.com>
Date: Wed, 2 Nov 2022 23:40:08 +0000
Subject: [PATCH] TaskPool Panic Handling (#6443)

# Objective
Right now, the `TaskPool` implementation allows panics to permanently kill worker threads upon panicking. This is currently non-recoverable without using a `std::panic::catch_unwind` in every scheduled task. This is poor ergonomics and even poorer developer experience. This is exacerbated by #2250 as these threads are global and cannot be replaced after initialization.

Removes the need for temporary fixes like #4998. Fixes #4996. Fixes #6081. Fixes #5285. Fixes #5054. Supersedes #2307.

## Solution

The current solution is to wrap `Executor::run` in `TaskPool` with a `catch_unwind`, and discarding the potential panic. This was taken straight from [smol](https://github.com/smol-rs/smol/blob/404c7bcc0aea59b82d7347058043b8de7133241c/src/spawn.rs#L44)'s current implementation. ~~However, this is not entirely ideal as:~~

 - ~~the signaled to the awaiting task. We would need to change `Task<T>` to use `async_task::FallibleTask` internally, and even then it doesn't signal *why* it panicked, just that it did.~~ (See below).
 - ~~no error is logged of any kind~~ (See below)
 - ~~it's unclear if it drops other tasks in the executor~~ (it does not)
 - ~~This allows the ECS parallel executor to keep chugging even though a system's task has been dropped. This inevitably leads to deadlock in the executor.~~ Assuming we don't catch the unwind in ParallelExecutor, this will naturally kill the main thread.

### Alternatives
A final solution likely will incorporate elements of any or all of the following.

#### ~~Log and Ignore~~
~~Log the panic, drop the task, keep chugging. This only addresses the discoverability of the panic. The process will continue to run, probably deadlocking the executor. tokio's detatched tasks operate in this fashion.~~

Panics already do this by default, even when caught by `catch_unwind`.

#### ~~`catch_unwind` in `ParallelExecutor`~~
~~Add another layer catching system-level panics into the `ParallelExecutor`. How the executor continues when a core dependency of many systems fails to run is up for debate.~~

`async_task::Task`  bubbles up panics already, this will transitively push panics all the way to the main thread.

#### ~~Emulate/Copy `tokio::JoinHandle` with `Task<T>`~~
~~`tokio::JoinHandle<T>` bubbles up the panic from the underlying task when awaited. This can be transitively applied across other APIs that also use `Task<T>` like `Query::par_for_each` and `TaskPool::scope`, bubbling up the panic until it's either caught or it reaches the main thread.~~

`async_task::Task`  bubbles up panics already, this will transitively push panics all the way to the main thread.

#### Abort on Panic
The nuclear option. Log the error, abort the entire process on any thread in the task pool panicking. Definitely avoids any additional infrastructure for passing the panic around, and might actually lead to more efficient code as any unwinding is optimized out. However gives the developer zero options for dealing with the issue, a seemingly poor choice for debuggability, and prevents graceful shutdown of the process. Potentially an option for handling very low-level task management (a la #4740). Roughly takes the shape of:

```rust
struct AbortOnPanic;

impl Drop for AbortOnPanic {
   fn drop(&mut self) {
     abort!();
   }
}

let guard = AbortOnPanic;
// Run task
std::mem::forget(AbortOnPanic);
```

---

## Changelog

Changed: `bevy_tasks::TaskPool`'s threads  will no longer terminate permanently when a task scheduled onto them panics.
Changed: `bevy_tasks::Task` and`bevy_tasks::Scope` will propagate panics in the spawned tasks/scopes to the parent thread.
---
 crates/bevy_ecs/src/system/system_piping.rs |  4 ++--
 crates/bevy_tasks/src/task_pool.rs          | 21 ++++++++++++++-------
 crates/bevy_time/src/time.rs                |  2 +-
 crates/bevy_transform/src/systems.rs        |  4 ++--
 4 files changed, 19 insertions(+), 12 deletions(-)
diff --git a/crates/bevy_ecs/src/system/system_piping.rs b/crates/bevy_ecs/src/system/system_piping.rs
index 4f4a192ff535dd..c9be4096c75b13 100644
--- a/crates/bevy_ecs/src/system/system_piping.rs
+++ b/crates/bevy_ecs/src/system/system_piping.rs
@@ -202,7 +202,7 @@ pub mod adapter {
     /// // Building a new schedule/app...
     /// # use bevy_ecs::schedule::SystemStage;
     /// # let mut sched = Schedule::default(); sched
-    /// #     .add_stage(CoreStage::Update, SystemStage::single_threaded())
+    /// #     .add_stage(CoreStage::Update, SystemStage::parallel())
     ///     .add_system_to_stage(
     ///         CoreStage::Update,
     ///         // Panic if the load system returns an error.
@@ -246,7 +246,7 @@ pub mod adapter {
     /// // Building a new schedule/app...
     /// # use bevy_ecs::schedule::SystemStage;
     /// # let mut sched = Schedule::default(); sched
-    /// #     .add_stage(CoreStage::Update, SystemStage::single_threaded())
+    /// #     .add_stage(CoreStage::Update, SystemStage::parallel())
     ///     .add_system_to_stage(
     ///         CoreStage::Update,
     ///         // If the system fails, just move on and try again next frame.
diff --git a/crates/bevy_tasks/src/task_pool.rs b/crates/bevy_tasks/src/task_pool.rs
index d2f5c05cbee957..39308c2bfd7bc4 100644
--- a/crates/bevy_tasks/src/task_pool.rs
+++ b/crates/bevy_tasks/src/task_pool.rs
@@ -118,14 +118,21 @@ impl TaskPool {
                 thread_builder
                     .spawn(move || {
                         TaskPool::LOCAL_EXECUTOR.with(|local_executor| {
-                            let tick_forever = async move {
-                                loop {
-                                    local_executor.tick().await;
+                            loop {
+                                let res = std::panic::catch_unwind(|| {
+                                    let tick_forever = async move {
+                                        loop {
+                                            local_executor.tick().await;
+                                        }
+                                    };
+                                    future::block_on(ex.run(tick_forever.or(shutdown_rx.recv())))
+                                });
+                                if let Ok(value) = res {
+                                    // Use unwrap_err because we expect a Closed error
+                                    value.unwrap_err();
+                                    break;
                                 }
-                            };
-                            let shutdown_future = ex.run(tick_forever.or(shutdown_rx.recv()));
-                            // Use unwrap_err because we expect a Closed error
-                            future::block_on(shutdown_future).unwrap_err();
+                            }
                         });
                     })
                     .expect("Failed to spawn thread.")
diff --git a/crates/bevy_time/src/time.rs b/crates/bevy_time/src/time.rs
index 89ef1c11a31fd2..111ba6f74574d0 100644
--- a/crates/bevy_time/src/time.rs
+++ b/crates/bevy_time/src/time.rs
@@ -120,7 +120,7 @@ impl Time {
     ///     world.insert_resource(time);
     ///     world.insert_resource(Health { health_value: 0.2 });
     ///
-    ///     let mut update_stage = SystemStage::single_threaded();
+    ///     let mut update_stage = SystemStage::parallel();
     ///     update_stage.add_system(health_system);
     ///
     ///     // Simulate that 30 ms have passed
diff --git a/crates/bevy_transform/src/systems.rs b/crates/bevy_transform/src/systems.rs
index 1d1c5cff45ced3..f96b56d676447e 100644
--- a/crates/bevy_transform/src/systems.rs
+++ b/crates/bevy_transform/src/systems.rs
@@ -318,8 +318,8 @@ mod test {
         let mut temp = World::new();
         let mut app = App::new();
 
-        // Adding the system in a single threaded stage. As the system will panic, this will
-        // only bring down the current test thread.
+        // FIXME: Parallel executors seem to have some odd interaction with the other
+        // tests in this crate. Using single_threaded until a root cause can be found.
         app.add_stage("single", SystemStage::single_threaded())
             .add_system_to_stage("single", transform_propagate_system);