From f9673ad8c83775430bd5e31f79554b56d034cfcf Mon Sep 17 00:00:00 2001
From: Milian Wolff
Date: Tue, 28 Jun 2022 10:50:37 +0200
Subject: [PATCH] WIP: Reducing cache coherency traffic under contention for
 spinlock

Use the code from [1] to implement the spinlock on top of
std::atomic<bool> instead of std::atomic_flag. While the former is not
guaranteed to be lock-free, it is on the majority of platforms. A
static assert is added to catch platforms where it is not lock-free -
there we could potentially fall back to the older implementation.

WIP because I don't have a good scientific benchmark for this yet. I
tested it in our real-world application, and it seems to have slightly
reduced the load on the spinlock, but not by a large margin...

See also: https://github.com/efficient/libcuckoo/issues/146

[1]: https://rigtorp.se/spinlock/
---
 libcuckoo/cuckoohash_map.hh | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/libcuckoo/cuckoohash_map.hh b/libcuckoo/cuckoohash_map.hh
index 4cfd2ac..e063704 100644
--- a/libcuckoo/cuckoohash_map.hh
+++ b/libcuckoo/cuckoohash_map.hh
@@ -803,13 +803,11 @@ private:
   LIBCUCKOO_SQUELCH_PADDING_WARNING
   class LIBCUCKOO_ALIGNAS(64) spinlock {
   public:
-    spinlock() : elem_counter_(0), is_migrated_(true) { lock_.clear(); }
+    spinlock() : elem_counter_(0), is_migrated_(true) {}
 
     spinlock(const spinlock &other) noexcept
         : elem_counter_(other.elem_counter()),
-          is_migrated_(other.is_migrated()) {
-      lock_.clear();
-    }
+          is_migrated_(other.is_migrated()) {}
 
     spinlock &operator=(const spinlock &other) noexcept {
       elem_counter() = other.elem_counter();
@@ -818,14 +816,27 @@ private:
     }
 
     void lock() noexcept {
-      while (lock_.test_and_set(std::memory_order_acq_rel))
-        ;
+      for (;;) {
+        // Optimistically assume the lock is free on the first try
+        if (!lock_.exchange(true, std::memory_order_acquire)) {
+          return;
+        }
+        // Wait for lock to be released without generating cache misses
+        while (lock_.load(std::memory_order_relaxed)) {
+          // Issue X86 PAUSE or ARM YIELD instruction to reduce contention
+          // between hyper-threads
+          __builtin_ia32_pause();
+        }
+      }
     }
 
-    void unlock() noexcept { lock_.clear(std::memory_order_release); }
+    void unlock() noexcept { lock_.store(false, std::memory_order_release); }
 
     bool try_lock() noexcept {
-      return !lock_.test_and_set(std::memory_order_acq_rel);
+      // First do a relaxed load to check if lock is free in order to prevent
+      // unnecessary cache misses if someone does while(!try_lock())
+      return !lock_.load(std::memory_order_relaxed) &&
+             !lock_.exchange(true, std::memory_order_acquire);
     }
 
     counter_type &elem_counter() noexcept { return elem_counter_; }
@@ -835,7 +846,7 @@ private:
     bool is_migrated() const noexcept { return is_migrated_; }
 
   private:
-    std::atomic_flag lock_;
+    std::atomic<bool> lock_ = {0};
     counter_type elem_counter_;
     bool is_migrated_;
   };
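
Note: the static assert mentioned above is not part of this diff yet.
A minimal sketch of what it could look like, next to the lock_ member,
assuming the C++11 ATOMIC_BOOL_LOCK_FREE macro from <atomic> (a value
of 2 means "always lock-free"); the exact wording is mine:

    // hypothetical, not in this patch: reject platforms where
    // std::atomic<bool> may not be lock-free, so that we could fall
    // back to the std::atomic_flag implementation there instead
    static_assert(ATOMIC_BOOL_LOCK_FREE == 2,
                  "spinlock requires a lock-free std::atomic<bool>");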
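
As for benchmarking: until there is something scientific, a throwaway
contention smoke test could look like the sketch below. It is
standalone, the file name and constants are arbitrary, and - like the
patch itself - it assumes an x86 GCC/Clang toolchain for
__builtin_ia32_pause:

    // spinlock_bench.cpp - a quick contention smoke test, not a
    // scientific benchmark.
    // Build with: g++ -O2 -std=c++11 -pthread spinlock_bench.cpp
    #include <atomic>
    #include <chrono>
    #include <cstdio>
    #include <thread>
    #include <vector>

    // Reduced copy of the patched spinlock (locking parts only).
    struct spinlock {
      std::atomic<bool> lock_{false};
      void lock() noexcept {
        for (;;) {
          if (!lock_.exchange(true, std::memory_order_acquire))
            return;
          while (lock_.load(std::memory_order_relaxed))
            __builtin_ia32_pause(); // x86 GCC/Clang only
        }
      }
      void unlock() noexcept { lock_.store(false, std::memory_order_release); }
    };

    int main() {
      spinlock lock;
      long counter = 0;
      const int num_threads = 4;       // arbitrary
      const long iterations = 1000000; // arbitrary
      std::vector<std::thread> threads;
      const auto start = std::chrono::steady_clock::now();
      for (int t = 0; t < num_threads; ++t) {
        threads.emplace_back([&] {
          for (long i = 0; i < iterations; ++i) {
            lock.lock();
            ++counter; // tiny critical section to maximize contention
            lock.unlock();
          }
        });
      }
      for (auto &th : threads)
        th.join();
      const auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
          std::chrono::steady_clock::now() - start);
      std::printf("counter=%ld elapsed=%lld ms\n", counter,
                  static_cast<long long>(elapsed.count()));
      return 0;
    }

Comparing the wall-clock time of this loop before and after the patch
would at least show whether the relaxed spin-wait helps under heavy
contention on a given machine.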