
Commit 3a94eb0

rt: batch pop from injection queue when idle (#5705)
In the multi-threaded scheduler, when there are no tasks on the local queue, a worker attempts to pull tasks from the injection queue. Previously, the worker would poll only one task from the injection queue and then continue trying to find work from other sources. This can result in the injection queue backing up when many tasks are scheduled from outside the runtime. This patch updates the worker to poll more than one task from the injection queue when it has no more local work. Note that we also don't want a single worker to poll **all** tasks on the injection queue, as that would result in work becoming unbalanced across workers.
1 parent 93bde08 commit 3a94eb0
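
As a rough illustration of the batching described above, here is a minimal, self-contained Rust sketch. The VecDeque-based queues, the u64 "tasks", and the pop_batch helper are stand-ins invented for this sketch, not the scheduler's actual types; the real logic lives in Core::next_task in the worker.rs diff below.

// A simplified model of the new idle-worker behaviour: drain a bounded batch
// from the shared injection queue, run the first task now, and queue the rest
// locally. Everything here is a stand-in invented for this sketch.
use std::collections::VecDeque;

fn pop_batch(
    inject: &mut VecDeque<u64>, // stand-in for the shared injection queue
    local: &mut VecDeque<u64>,  // stand-in for this worker's local run queue
    num_workers: usize,
    half_local_capacity: usize,
) -> Option<u64> {
    // Take roughly this worker's share of the injection queue, but never so
    // much that other workers are starved or the local queue would overflow.
    let n = usize::min(inject.len() / num_workers + 1, half_local_capacity);
    let n = usize::min(n, inject.len());

    let mut batch = inject.drain(..n);

    // Run the first task immediately; push the remainder onto the local queue.
    let first = batch.next();
    local.extend(batch);
    first
}

fn main() {
    let mut inject: VecDeque<u64> = (0..100).collect();
    let mut local = VecDeque::new();

    let first = pop_batch(&mut inject, &mut local, 4, 128);
    assert_eq!(first, Some(0));
    assert_eq!(local.len(), 25); // 100 / 4 + 1 = 26 popped; one is returned
    assert_eq!(inject.len(), 74);
}

The "+ 1" keeps a worker making progress even when the injection queue holds fewer tasks than there are workers, while the cap keeps any single worker from draining the whole queue.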

File tree

10 files changed, +381 -53 lines changed

.github/workflows/loom.yml

+14-8
@@ -24,13 +24,19 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        scope:
-          - --skip loom_pool
-          - loom_pool::group_a
-          - loom_pool::group_b
-          - loom_pool::group_c
-          - loom_pool::group_d
-          - time::driver
+        include:
+          - scope: --skip loom_pool
+            max_preemptions: 2
+          - scope: loom_pool::group_a
+            max_preemptions: 1
+          - scope: loom_pool::group_b
+            max_preemptions: 2
+          - scope: loom_pool::group_c
+            max_preemptions: 1
+          - scope: loom_pool::group_d
+            max_preemptions: 1
+          - scope: time::driver
+            max_preemptions: 2
     steps:
       - uses: actions/checkout@v3
       - name: Install Rust ${{ env.rust_stable }}
@@ -43,6 +49,6 @@ jobs:
       working-directory: tokio
       env:
         RUSTFLAGS: --cfg loom --cfg tokio_unstable -Dwarnings
-        LOOM_MAX_PREEMPTIONS: 2
+        LOOM_MAX_PREEMPTIONS: ${{ matrix.max_preemptions }}
         LOOM_MAX_BRANCHES: 10000
         SCOPE: ${{ matrix.scope }}

benches/rt_multi_threaded.rs

+59-4
@@ -10,9 +10,10 @@ use std::sync::atomic::AtomicUsize;
 use std::sync::atomic::Ordering::Relaxed;
 use std::sync::{mpsc, Arc};
 
-fn spawn_many(b: &mut Bencher) {
-    const NUM_SPAWN: usize = 10_000;
+const NUM_WORKERS: usize = 4;
+const NUM_SPAWN: usize = 10_000;
 
+fn spawn_many_local(b: &mut Bencher) {
     let rt = rt();
 
     let (tx, rx) = mpsc::sync_channel(1000);
@@ -38,6 +39,52 @@ fn spawn_many(b: &mut Bencher) {
     });
 }
 
+fn spawn_many_remote_idle(b: &mut Bencher) {
+    let rt = rt();
+
+    let mut handles = Vec::with_capacity(NUM_SPAWN);
+
+    b.iter(|| {
+        for _ in 0..NUM_SPAWN {
+            handles.push(rt.spawn(async {}));
+        }
+
+        rt.block_on(async {
+            for handle in handles.drain(..) {
+                handle.await.unwrap();
+            }
+        });
+    });
+}
+
+fn spawn_many_remote_busy(b: &mut Bencher) {
+    let rt = rt();
+    let rt_handle = rt.handle();
+    let mut handles = Vec::with_capacity(NUM_SPAWN);
+
+    // Spawn some tasks to keep the runtimes busy
+    for _ in 0..(2 * NUM_WORKERS) {
+        rt.spawn(async {
+            loop {
+                tokio::task::yield_now().await;
+                std::thread::sleep(std::time::Duration::from_micros(10));
+            }
+        });
+    }
+
+    b.iter(|| {
+        for _ in 0..NUM_SPAWN {
+            handles.push(rt_handle.spawn(async {}));
+        }
+
+        rt.block_on(async {
+            for handle in handles.drain(..) {
+                handle.await.unwrap();
+            }
+        });
+    });
+}
+
 fn yield_many(b: &mut Bencher) {
     const NUM_YIELD: usize = 1_000;
     const TASKS: usize = 200;
@@ -140,12 +187,20 @@ fn chained_spawn(b: &mut Bencher) {
 
 fn rt() -> Runtime {
     runtime::Builder::new_multi_thread()
-        .worker_threads(4)
+        .worker_threads(NUM_WORKERS)
         .enable_all()
         .build()
        .unwrap()
 }
 
-benchmark_group!(scheduler, spawn_many, ping_pong, yield_many, chained_spawn,);
+benchmark_group!(
+    scheduler,
+    spawn_many_local,
+    spawn_many_remote_idle,
+    spawn_many_remote_busy,
+    ping_pong,
+    yield_many,
+    chained_spawn,
+);
 
 benchmark_main!(scheduler);
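
The two new spawn_many_remote_* benchmarks exercise exactly the path this commit changes: tasks spawned from outside the runtime go through the injection (global) queue rather than a worker's local queue. Below is a minimal example of such a "remote" spawn, assuming a tokio dependency with the rt-multi-thread feature enabled; it is only illustrative and not part of the benchmark file.

// A task submitted from a thread that is not a runtime worker goes through
// the injection (global) queue, which is what the spawn_many_remote_*
// benchmarks stress. Assumes tokio with the `rt-multi-thread` feature.
use tokio::runtime::Runtime;

fn main() {
    let rt = Runtime::new().unwrap();
    let handle = rt.handle().clone();

    // Spawn from a plain OS thread, i.e. from outside the runtime.
    let task = std::thread::spawn(move || handle.spawn(async { 1 + 1 }))
        .join()
        .unwrap();

    assert_eq!(rt.block_on(task).unwrap(), 2);
}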

tokio/src/runtime/scheduler/multi_thread/queue.rs

+77-2
@@ -110,6 +110,15 @@ impl<T> Local<T> {
         !self.inner.is_empty()
     }
 
+    /// How many tasks can be pushed into the queue
+    pub(crate) fn remaining_slots(&self) -> usize {
+        self.inner.remaining_slots()
+    }
+
+    pub(crate) fn max_capacity(&self) -> usize {
+        LOCAL_QUEUE_CAPACITY
+    }
+
     /// Returns false if there are any entries in the queue
     ///
     /// Separate to is_stealable so that refactors of is_stealable to "protect"
@@ -118,8 +127,62 @@
         !self.inner.is_empty()
     }
 
-    /// Pushes a task to the back of the local queue, skipping the LIFO slot.
-    pub(crate) fn push_back(
+    /// Pushes a batch of tasks to the back of the queue. All tasks must fit in
+    /// the local queue.
+    ///
+    /// # Panics
+    ///
+    /// The method panics if there is not enough capacity to fit in the queue.
+    pub(crate) fn push_back(&mut self, tasks: impl ExactSizeIterator<Item = task::Notified<T>>) {
+        let len = tasks.len();
+        assert!(len <= LOCAL_QUEUE_CAPACITY);
+
+        if len == 0 {
+            // Nothing to do
+            return;
+        }
+
+        let head = self.inner.head.load(Acquire);
+        let (steal, _) = unpack(head);
+
+        // safety: this is the **only** thread that updates this cell.
+        let mut tail = unsafe { self.inner.tail.unsync_load() };
+
+        if tail.wrapping_sub(steal) <= (LOCAL_QUEUE_CAPACITY - len) as UnsignedShort {
+            // Yes, this if condition is structured a bit weird (the first
+            // block does nothing while the second panics). It is this way to
+            // match `push_back_or_overflow`.
+        } else {
+            panic!()
+        }
+
+        for task in tasks {
+            let idx = tail as usize & MASK;
+
+            self.inner.buffer[idx].with_mut(|ptr| {
+                // Write the task to the slot
+                //
+                // Safety: There is only one producer and the above `if`
+                // condition ensures we don't touch a cell if there is a
+                // value, thus no consumer.
+                unsafe {
+                    ptr::write((*ptr).as_mut_ptr(), task);
+                }
+            });
+
+            tail = tail.wrapping_add(1);
+        }
+
+        self.inner.tail.store(tail, Release);
+    }
+
+    /// Pushes a task to the back of the local queue. If there is not enough
+    /// capacity in the queue, this triggers the overflow operation.
+    ///
+    /// When the queue overflows, half of the current contents of the queue is
+    /// moved to the given Injection queue. This frees up capacity for more
+    /// tasks to be pushed into the local queue.
+    pub(crate) fn push_back_or_overflow(
         &mut self,
         mut task: task::Notified<T>,
         inject: &Inject<T>,
@@ -153,6 +216,11 @@ impl<T> Local<T> {
             }
         };
 
+        self.push_back_finish(task, tail);
+    }
+
+    // Second half of `push_back_or_overflow`
+    fn push_back_finish(&self, task: task::Notified<T>, tail: UnsignedShort) {
         // Map the position to a slot index.
         let idx = tail as usize & MASK;
 
@@ -501,6 +569,13 @@ impl<T> Drop for Local<T> {
 }
 
 impl<T> Inner<T> {
+    fn remaining_slots(&self) -> usize {
+        let (steal, _) = unpack(self.head.load(Acquire));
+        let tail = self.tail.load(Acquire);
+
+        LOCAL_QUEUE_CAPACITY - (tail.wrapping_sub(steal) as usize)
+    }
+
     fn len(&self) -> UnsignedShort {
         let (_, head) = unpack(self.head.load(Acquire));
         let tail = self.tail.load(Acquire);
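
The capacity checks above (remaining_slots and the assertion in the new batch push_back) rely on head and tail being free-running counters: tail.wrapping_sub(steal) gives the number of occupied slots even after the counters wrap around. The standalone sketch below reproduces that arithmetic with u16 indices and an illustrative capacity; the types and constant are stand-ins, not the real UnsignedShort and LOCAL_QUEUE_CAPACITY.

// Illustration of the occupied/remaining-slot math used by `remaining_slots`
// and the batch `push_back` capacity check. All values here are invented.
fn main() {
    const CAPACITY: u16 = 256; // illustrative queue capacity

    // Free-running counters that are allowed to wrap around.
    let steal: u16 = u16::MAX - 1;         // oldest slot still visible to stealers
    let tail: u16 = steal.wrapping_add(4); // the producer has pushed 4 tasks

    // Occupied slots, correct even though `tail` has numerically wrapped to 2.
    let len = tail.wrapping_sub(steal);
    assert_eq!(len, 4);

    // What `remaining_slots` would report for this state.
    let remaining = CAPACITY - len;
    assert_eq!(remaining, 252);

    // A batch of `n` tasks fits iff the occupied count stays within capacity,
    // which is the shape of the assertion in the batch `push_back`.
    let n: u16 = 16;
    assert!(tail.wrapping_sub(steal) <= CAPACITY - n);
}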

tokio/src/runtime/scheduler/multi_thread/worker.rs

+39-5
@@ -509,8 +509,11 @@ impl Context {
             } else {
                 // Not enough budget left to run the LIFO task, push it to
                 // the back of the queue and return.
-                core.run_queue
-                    .push_back(task, self.worker.inject(), &mut core.metrics);
+                core.run_queue.push_back_or_overflow(
+                    task,
+                    self.worker.inject(),
+                    &mut core.metrics,
+                );
                 return Ok(core);
             }
         }
@@ -612,7 +615,38 @@
         if self.tick % worker.handle.shared.config.global_queue_interval == 0 {
             worker.inject().pop().or_else(|| self.next_local_task())
         } else {
-            self.next_local_task().or_else(|| worker.inject().pop())
+            let maybe_task = self.next_local_task();
+
+            if maybe_task.is_some() {
+                return maybe_task;
+            }
+
+            // Other threads can only **remove** tasks from the current worker's
+            // `run_queue`. So, we can be confident that by the time we call
+            // `run_queue.push_back` below, there will be *at least* `cap`
+            // available slots in the queue.
+            let cap = usize::min(
+                self.run_queue.remaining_slots(),
+                self.run_queue.max_capacity() / 2,
+            );
+
+            // The worker is currently idle; pull a batch of work from the
+            // injection queue. We don't want to pull *all* the work so other
+            // workers can also get some.
+            let n = usize::min(
+                worker.inject().len() / worker.handle.shared.remotes.len() + 1,
+                cap,
+            );
+
+            let mut tasks = worker.inject().pop_n(n);
+
+            // Pop the first task to return immediately.
+            let ret = tasks.next();
+
+            // Push the rest of the tasks onto the run queue.
+            self.run_queue.push_back(tasks);
+
+            ret
         }
     }
 
@@ -808,7 +842,7 @@ impl Handle {
             // flexibility and the task may go to the front of the queue.
             let should_notify = if is_yield || self.shared.config.disable_lifo_slot {
                 core.run_queue
-                    .push_back(task, &self.shared.inject, &mut core.metrics);
+                    .push_back_or_overflow(task, &self.shared.inject, &mut core.metrics);
                 true
            } else {
                 // Push to the LIFO slot
@@ -817,7 +851,7 @@
 
                 if let Some(prev) = prev {
                     core.run_queue
-                        .push_back(prev, &self.shared.inject, &mut core.metrics);
+                        .push_back_or_overflow(prev, &self.shared.inject, &mut core.metrics);
                 }
 
                 core.lifo_slot = Some(task);
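
To make the sizing in Core::next_task concrete, the snippet below plugs illustrative numbers into the same two min computations. Every value here is made up for the example; none of them are measurements or real constants.

// Illustrative inputs for the batch-size computation in `Core::next_task`.
fn main() {
    let remaining_slots = 200; // free slots in this worker's local queue
    let max_capacity = 256;    // illustrative local queue capacity
    let inject_len = 1_000;    // tasks currently in the injection queue
    let num_remotes = 4;       // number of workers

    // Never fill more than half the local queue, and never more than fits.
    let cap = usize::min(remaining_slots, max_capacity / 2);
    assert_eq!(cap, 128);

    // Take roughly an even share of the injection queue, plus one, capped so
    // other workers still find work.
    let n = usize::min(inject_len / num_remotes + 1, cap);
    assert_eq!(n, 128);

    // With a short injection queue the share term dominates instead.
    let n_small = usize::min(10 / num_remotes + 1, cap);
    assert_eq!(n_small, 3);
}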
