From 9f8500dc1bf0ff64aa9129bb1239911f20305b7c Mon Sep 17 00:00:00 2001
From: Koundinya Veluri
Date: Thu, 8 May 2025 08:48:12 -0700
Subject: [PATCH] Reduce spin-waiting in the thread pool on Arm processors

- Currently, the spin count is multiplied by 4 on Arm processors to avoid
  throughput regressions, but this appears to significantly increase CPU
  usage without much benefit
- This change removes the multiplier, restoring the spin count on Arm
  processors to the same value as on x64. With this, throughput appears
  to be mostly similar, and CPU usage is significantly reduced in many
  cases.
- There appear to be a few small throughput regressions in
  limited-connection high-throughput tests, but that seems to be mostly
  an artifact of limiting the connections and is not necessarily
  indicative of lower performance
  - In limited-connection high-throughput tests, a request is sent on a
    connection once the response to the previous request is received. In
    bursty scenarios, spin-waiting more can reduce the response time for
    work items queued to the thread pool, resulting in a slightly
    earlier response compared with spin-waiting less. The difference is
    typically very short, on the order of low microseconds or less. When
    spin-waiting less with a limited number of connections, the slight
    delay in response results in a slight delay in the next request
    being sent, and this compounds. Effectively, the client ends up
    sending fewer requests per unit of time due to this artifact, hence
    the lower throughput. Thanks to the lower CPU usage from less
    spin-waiting, if more connections were used, the server could
    actually handle the same higher RPS with roughly the same latencies.
  - The same kind of artifact is seen in limited-connection
    high-throughput benchmarks to a larger degree when spin-waiting in
    the thread pool is disabled.
- Despite this change, in some scenarios it may still be more beneficial
  to disable spin-waiting (which many scenarios currently do without any
  significant loss in performance)
---
 .../Threading/PortableThreadPool.WorkerThread.cs      | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Threading/PortableThreadPool.WorkerThread.cs b/src/libraries/System.Private.CoreLib/src/System/Threading/PortableThreadPool.WorkerThread.cs
index d7265b345cd56d..a34e0f8ff98c4e 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Threading/PortableThreadPool.WorkerThread.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Threading/PortableThreadPool.WorkerThread.cs
@@ -15,16 +15,7 @@ private static partial class WorkerThread
         {
             private static readonly short ThreadsToKeepAlive = DetermineThreadsToKeepAlive();
 
-            private const int SemaphoreSpinCountDefaultBaseline = 70;
-#if !TARGET_ARM64 && !TARGET_ARM && !TARGET_LOONGARCH64
-            private const int SemaphoreSpinCountDefault = SemaphoreSpinCountDefaultBaseline;
-#else
-            // On systems with ARM processors, more spin-waiting seems to be necessary to avoid perf regressions from incurring
-            // the full wait when work becomes available soon enough. This is more noticeable after reducing the number of
-            // thread requests made to the thread pool because otherwise the extra thread requests cause threads to do more
-            // busy-waiting instead and adding to contention in trying to look for work items, which is less preferable.
-            private const int SemaphoreSpinCountDefault = SemaphoreSpinCountDefaultBaseline * 4;
-#endif
+            private const int SemaphoreSpinCountDefault = 70;
 
             // This value represents an assumption of how much uncommitted stack space a worker thread may use in the future.
             // Used in calculations to estimate when to throttle the rate of thread injection to reduce the possibility of
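
The spin count being tuned here controls a spin-then-wait pattern: a worker thread polls the semaphore a bounded number of times before falling back to a blocking kernel wait, trading CPU cycles for lower wake-up latency when work arrives quickly. The following is a minimal Python sketch of that pattern, not the runtime's actual `LowLevelLifoSemaphore` implementation; the function name `spin_then_wait_acquire` is invented for illustration.

```python
import threading

# Mirrors the unified default from the patch (SemaphoreSpinCountDefault = 70).
SEMAPHORE_SPIN_COUNT_DEFAULT = 70

def spin_then_wait_acquire(sem: threading.Semaphore,
                           spin_count: int = SEMAPHORE_SPIN_COUNT_DEFAULT) -> None:
    """Acquire `sem`, spinning briefly before resorting to a blocking wait."""
    # Phase 1: spin, polling the semaphore without blocking. If a release
    # arrives soon enough, the thread avoids the cost of a full kernel wait.
    for _ in range(spin_count):
        if sem.acquire(blocking=False):
            return
    # Phase 2: nothing arrived while spinning; fall back to a blocking wait,
    # which may put the thread to sleep in the kernel until a release.
    sem.acquire()
```

A larger `spin_count` (the removed Arm multiplier made it 280) shortens wake-up latency in bursty scenarios but burns CPU in phase 1; this patch judges the default of 70 a better trade-off on Arm as well.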