Don't do 64-bit atomics in 32-bit runtimes.
The 64-bit atomics were all on amounts of memory, so a uintptr_t should
be safe. One bit of dodginess is that memory_total could possibly exceed
32 bits even though there is never more than a 32-bit amount of memory
allocated at any one time.
abadams committed Jul 16, 2024

1 parent af4af91 commit 8002729
Showing 2 changed files with 28 additions and 12 deletions.
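
The pattern being made 32-bit-clean is a "compare and swap up to the maximum" loop on pointer-sized values. As a rough standalone sketch of the same idea, written against the GCC/Clang __sync builtin directly rather than the runtime's wrapper (the function name here is illustrative, not part of the runtime):

#include <stdint.h>

// Atomically keep *ptr at the maximum value observed so far, using only a
// pointer-sized compare-and-swap (32 bits on 32-bit targets).
static inline void compare_max_and_swap(uintptr_t *ptr, uintptr_t val) {
    uintptr_t old_val = *ptr;
    while (val > old_val) {
        // Returns the value that was in *ptr just before the attempt.
        uintptr_t seen = __sync_val_compare_and_swap(ptr, old_val, val);
        if (seen == old_val) {
            return;  // our value is now the stored maximum
        }
        old_val = seen;  // lost the race; re-check against the newer value
    }
}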
27 changes: 15 additions & 12 deletions src/runtime/profiler_common.cpp
@@ -180,11 +180,10 @@ WEAK void sampling_profiler_thread(void *) {

namespace {

-template<typename T>
-void sync_compare_max_and_swap(T *ptr, T val) {
+void sync_compare_max_and_swap(uintptr_t *ptr, uintptr_t val) {
using namespace Halide::Runtime::Internal::Synchronization;

-T old_val = *ptr;
+uintptr_t old_val = *ptr;
while (val > old_val) {
if (atomic_cas_strong_sequentially_consistent(ptr, &old_val, &val)) {
return;
@@ -350,7 +349,11 @@ WEAK void halide_profiler_stack_peak_update(void *user_context,
// Update per-func memory stats
for (int i = 0; i < instance->pipeline_stats->num_funcs; ++i) {
if (f_values[i] != 0) {
-sync_compare_max_and_swap(&(instance->funcs[i]).stack_peak, f_values[i]);
+// On 32-bit platforms we don't want to use 64-bit
+// atomics. Fortunately on these platforms memory usage fits into
+// 32-bit integers.
+sync_compare_max_and_swap((uintptr_t *)(&(instance->funcs[i]).stack_peak),
+                          (uintptr_t)(f_values[i]));
}
}
}
@@ -382,15 +385,15 @@ WEAK void halide_profiler_memory_allocate(void *user_context,

// Update per-instance memory stats
atomic_add_fetch_sequentially_consistent(&instance->num_allocs, 1);
-atomic_add_fetch_sequentially_consistent(&instance->memory_total, incr);
-uint64_t p_mem_current = atomic_add_fetch_sequentially_consistent(&instance->memory_current, incr);
-sync_compare_max_and_swap(&instance->memory_peak, p_mem_current);
+atomic_add_fetch_sequentially_consistent((uintptr_t *)(&instance->memory_total), (uintptr_t)incr);
+uint64_t p_mem_current = atomic_add_fetch_sequentially_consistent((uintptr_t *)(&instance->memory_current), (uintptr_t)incr);
+sync_compare_max_and_swap((uintptr_t *)(&instance->memory_peak), (uintptr_t)p_mem_current);

// Update per-func memory stats
atomic_add_fetch_sequentially_consistent(&func->num_allocs, 1);
-atomic_add_fetch_sequentially_consistent(&func->memory_total, incr);
-uint64_t f_mem_current = atomic_add_fetch_sequentially_consistent(&func->memory_current, incr);
-sync_compare_max_and_swap(&func->memory_peak, f_mem_current);
+atomic_add_fetch_sequentially_consistent((uintptr_t *)(&func->memory_total), (uintptr_t)incr);
+uint64_t f_mem_current = atomic_add_fetch_sequentially_consistent((uintptr_t *)(&func->memory_current), (uintptr_t)incr);
+sync_compare_max_and_swap((uintptr_t *)(&func->memory_peak), (uintptr_t)f_mem_current);
}

WEAK void halide_profiler_memory_free(void *user_context,
@@ -418,10 +421,10 @@ WEAK void halide_profiler_memory_free(void *user_context,
// unless user specifically calls halide_profiler_reset().

// Update per-pipeline memory stats
-atomic_sub_fetch_sequentially_consistent(&instance->memory_current, decr);
+atomic_sub_fetch_sequentially_consistent((uintptr_t *)(&instance->memory_current), (uintptr_t)decr);

// Update per-func memory stats
-atomic_sub_fetch_sequentially_consistent(&func->memory_current, decr);
+atomic_sub_fetch_sequentially_consistent((uintptr_t *)(&func->memory_current), (uintptr_t)decr);
}

WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_state *s) {
13 changes: 13 additions & 0 deletions src/runtime/runtime_atomics.h
@@ -35,36 +35,43 @@ ALWAYS_INLINE uintptr_t atomic_and_fetch_release(uintptr_t *addr, uintptr_t val)

template<typename T>
ALWAYS_INLINE T atomic_fetch_add_acquire_release(T *addr, T val) {
+static_assert(sizeof(T) == 4);
return __sync_fetch_and_add(addr, val);
}

template<typename T, typename TV = typename remove_volatile<T>::type>
ALWAYS_INLINE T atomic_fetch_add_sequentially_consistent(T *addr, TV val) {
+static_assert(sizeof(T) == 4);
return __sync_fetch_and_add(addr, val);
}

template<typename T, typename TV = typename remove_volatile<T>::type>
ALWAYS_INLINE T atomic_fetch_sub_sequentially_consistent(T *addr, TV val) {
+static_assert(sizeof(T) == 4);
return __sync_fetch_and_sub(addr, val);
}

template<typename T, typename TV = typename remove_volatile<T>::type>
ALWAYS_INLINE T atomic_fetch_or_sequentially_consistent(T *addr, TV val) {
+static_assert(sizeof(T) == 4);
return __sync_fetch_and_or(addr, val);
}

template<typename T>
ALWAYS_INLINE T atomic_add_fetch_sequentially_consistent(T *addr, T val) {
+static_assert(sizeof(T) == 4);
return __sync_add_and_fetch(addr, val);
}

template<typename T>
ALWAYS_INLINE T atomic_sub_fetch_sequentially_consistent(T *addr, T val) {
+static_assert(sizeof(T) == 4);
return __sync_sub_and_fetch(addr, val);
}

template<typename T, typename TV = typename remove_volatile<T>::type>
ALWAYS_INLINE bool cas_strong_sequentially_consistent_helper(T *addr, TV *expected, TV *desired) {
+static_assert(sizeof(T) == 4);
TV oldval = *expected;
TV gotval = __sync_val_compare_and_swap(addr, oldval, *desired);
*expected = gotval;
@@ -99,11 +106,13 @@ ALWAYS_INLINE bool atomic_cas_weak_acquire_relaxed(uintptr_t *addr, uintptr_t *e

template<typename T>
ALWAYS_INLINE T atomic_fetch_and_release(T *addr, T val) {
+static_assert(sizeof(T) == 4);
return __sync_fetch_and_and(addr, val);
}

template<typename T, typename TV = typename remove_volatile<T>::type>
ALWAYS_INLINE T atomic_fetch_and_sequentially_consistent(T *addr, TV val) {
+static_assert(sizeof(T) == 4);
return __sync_fetch_and_and(addr, val);
}

@@ -121,6 +130,7 @@ ALWAYS_INLINE void atomic_load_acquire(T *addr, T *val) {
template<typename T>
ALWAYS_INLINE T atomic_exchange_acquire(T *addr, T val) {
// Despite the name, this is really just an exchange operation with acquire ordering.
+static_assert(sizeof(T) == 4);
return __sync_lock_test_and_set(addr, val);
}

@@ -130,17 +140,20 @@ ALWAYS_INLINE uintptr_t atomic_or_fetch_relaxed(uintptr_t *addr, uintptr_t val)

template<typename T>
ALWAYS_INLINE void atomic_store_relaxed(T *addr, T *val) {
+static_assert(sizeof(T) == 4);
*addr = *val;
}

template<typename T>
ALWAYS_INLINE void atomic_store_release(T *addr, T *val) {
+static_assert(sizeof(T) == 4);
*addr = *val;
__sync_synchronize();
}

template<typename T, typename TV = typename remove_volatile<T>::type>
ALWAYS_INLINE void atomic_store_sequentially_consistent(T *addr, TV *val) {
+static_assert(sizeof(T) == 4);
*addr = *val;
__sync_synchronize();
}
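
With static_assert(sizeof(T) == 4) added to each of these __sync-based helpers, instantiating them with anything other than a 32-bit-sized type now fails at compile time instead of quietly generating a 64-bit atomic. A hypothetical example of what is rejected versus the pointer-sized form the profiler code now uses (names here are illustrative only):

uint64_t total64 = 0;
// atomic_add_fetch_sequentially_consistent(&total64, (uint64_t)16);  // now a compile error: sizeof(T) != 4

uintptr_t total = 0;
atomic_add_fetch_sequentially_consistent(&total, (uintptr_t)16);      // fine: pointer-sized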
