From e27422adca7285bdcace8a6052860122eaa1bff7 Mon Sep 17 00:00:00 2001 From: daan Date: Sat, 25 Jul 2020 20:55:45 -0700 Subject: [PATCH 01/11] switch to using C++ atomics in MSVC as well --- include/mimalloc-atomic.h | 10 +++++----- src/bitmap.inc.c | 8 ++++++++ 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/include/mimalloc-atomic.h b/include/mimalloc-atomic.h index 722b6ad6..c3d0ad23 100644 --- a/include/mimalloc-atomic.h +++ b/include/mimalloc-atomic.h @@ -13,12 +13,12 @@ terms of the MIT license. A copy of the license can be found in the file // We need to be portable between C, C++, and MSVC. // ------------------------------------------------------ -#if defined(_MSC_VER) -#define _Atomic(tp) tp -#define ATOMIC_VAR_INIT(x) x -#elif defined(__cplusplus) +#if defined(__cplusplus) #include #define _Atomic(tp) std::atomic +#elif defined(_MSC_VER) +#define _Atomic(tp) tp +#define ATOMIC_VAR_INIT(x) x #else #include #endif @@ -126,7 +126,7 @@ static inline intptr_t mi_atomic_subi(volatile _Atomic(intptr_t)* p, intptr_t su (T*)mi_atomic_exchange((volatile _Atomic(uintptr_t)*)(p), (uintptr_t)((T*)exchange)) -#ifdef _MSC_VER +#if !defined(__cplusplus) && defined(_MSC_VER) #define WIN32_LEAN_AND_MEAN #include #include diff --git a/src/bitmap.inc.c b/src/bitmap.inc.c index c3813a44..99e8fa6f 100644 --- a/src/bitmap.inc.c +++ b/src/bitmap.inc.c @@ -72,6 +72,14 @@ static inline uintptr_t mi_bitmap_mask_(size_t count, size_t bitidx) { #if defined(_MSC_VER) #define MI_HAVE_BITSCAN #include +#ifndef MI_64 +#if MI_INTPTR_SIZE==8 +#define MI_64(f) f##64 +#else +#define MI_64(f) f +#endif +#endif + static inline size_t mi_bsf(uintptr_t x) { if (x==0) return 8*MI_INTPTR_SIZE; DWORD idx; From 09ade024298000157729c877c2087cfe2d762454 Mon Sep 17 00:00:00 2001 From: daan Date: Sat, 25 Jul 2020 22:52:27 -0700 Subject: [PATCH 02/11] bring inline with C11 atomics; no volatile and cas order of expected/desired --- include/mimalloc-atomic.h | 166 +++++++++++++++++--------------------- include/mimalloc-types.h | 36 ++++----- src/alloc.c | 15 ++-- src/arena.c | 2 +- src/bitmap.inc.c | 11 ++- src/options.c | 6 +- src/os.c | 19 ++--- src/page.c | 26 +++--- src/random.c | 2 +- src/region.c | 14 ++-- src/segment.c | 37 +++++---- src/stats.c | 24 +++--- 12 files changed, 170 insertions(+), 188 deletions(-) diff --git a/include/mimalloc-atomic.h b/include/mimalloc-atomic.h index c3d0ad23..beb0f12c 100644 --- a/include/mimalloc-atomic.h +++ b/include/mimalloc-atomic.h @@ -27,103 +27,99 @@ terms of the MIT license. A copy of the license can be found in the file // Atomic operations specialized for mimalloc // ------------------------------------------------------ -// Atomically add a value; returns the previous value. Memory ordering is relaxed. -static inline uintptr_t mi_atomic_add(volatile _Atomic(uintptr_t)* p, uintptr_t add); +// Atomically add a value; returns the previous value. Memory ordering is acquire-release. +static inline uintptr_t mi_atomic_add(_Atomic(uintptr_t)* p, uintptr_t add); -// Atomically "and" a value; returns the previous value. Memory ordering is relaxed. -static inline uintptr_t mi_atomic_and(volatile _Atomic(uintptr_t)* p, uintptr_t x); +// Atomically "and" a value; returns the previous value. Memory ordering is acquire-release. +static inline uintptr_t mi_atomic_and(_Atomic(uintptr_t)* p, uintptr_t x); -// Atomically "or" a value; returns the previous value. Memory ordering is relaxed. 
-static inline uintptr_t mi_atomic_or(volatile _Atomic(uintptr_t)* p, uintptr_t x); +// Atomically "or" a value; returns the previous value. Memory ordering is acquire-release. +static inline uintptr_t mi_atomic_or(_Atomic(uintptr_t)* p, uintptr_t x); // Atomically compare and exchange a value; returns `true` if successful. -// May fail spuriously. Memory ordering as release on success, and relaxed on failure. -// (Note: expected and desired are in opposite order from atomic_compare_exchange) -static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected); +// May fail spuriously. Memory ordering is acquire-release; with acquire on failure. +static inline bool mi_atomic_cas_weak(_Atomic(uintptr_t)* p, uintptr_t* expected, uintptr_t desired); // Atomically compare and exchange a value; returns `true` if successful. -// Memory ordering is acquire-release -// (Note: expected and desired are in opposite order from atomic_compare_exchange) -static inline bool mi_atomic_cas_strong(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected); +// Memory ordering is acquire-release; with acquire on failure. +static inline bool mi_atomic_cas_strong(_Atomic(uintptr_t)* p, uintptr_t* expected, uintptr_t desired); // Atomically exchange a value. Memory ordering is acquire-release. -static inline uintptr_t mi_atomic_exchange(volatile _Atomic(uintptr_t)* p, uintptr_t exchange); +static inline uintptr_t mi_atomic_exchange(_Atomic(uintptr_t)* p, uintptr_t exchange); // Atomically read a value. Memory ordering is relaxed. -static inline uintptr_t mi_atomic_read_relaxed(const volatile _Atomic(uintptr_t)* p); +static inline uintptr_t mi_atomic_read_relaxed(const _Atomic(uintptr_t)* p); // Atomically read a value. Memory ordering is acquire. -static inline uintptr_t mi_atomic_read(const volatile _Atomic(uintptr_t)* p); +static inline uintptr_t mi_atomic_read(const _Atomic(uintptr_t)* p); // Atomically write a value. Memory ordering is release. -static inline void mi_atomic_write(volatile _Atomic(uintptr_t)* p, uintptr_t x); +static inline void mi_atomic_write(_Atomic(uintptr_t)* p, uintptr_t x); // Yield static inline void mi_atomic_yield(void); -// Atomically add a 64-bit value; returns the previous value. +// Atomically add a 64-bit value; returns the previous value. Memory ordering is relaxed. // Note: not using _Atomic(int64_t) as it is only used for statistics. -static inline void mi_atomic_addi64(volatile int64_t* p, int64_t add); +static inline int64_t mi_atomic_addi64_relaxed(volatile int64_t* p, int64_t add); // Atomically update `*p` with the maximum of `*p` and `x` as a 64-bit value. // Returns the previous value. Note: not using _Atomic(int64_t) as it is only used for statistics. -static inline void mi_atomic_maxi64(volatile int64_t* p, int64_t x); +static inline void mi_atomic_maxi64_relaxed(volatile int64_t* p, int64_t x); -// Atomically read a 64-bit value -// Note: not using _Atomic(int64_t) as it is only used for statistics. -static inline int64_t mi_atomic_readi64(volatile int64_t* p); // Atomically subtract a value; returns the previous value. -static inline uintptr_t mi_atomic_sub(volatile _Atomic(uintptr_t)* p, uintptr_t sub) { +static inline uintptr_t mi_atomic_sub(_Atomic(uintptr_t)* p, uintptr_t sub) { return mi_atomic_add(p, (uintptr_t)(-((intptr_t)sub))); } // Atomically increment a value; returns the incremented result. 
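/* Illustrative sketch (not part of the patch): the C11-style compare-and-swap
   retry loop that this change standardizes on. `expected` is passed by address
   and, on failure, is overwritten with the value actually observed, so the
   loop never needs to re-read the atomic by hand. Plain <stdatomic.h> calls
   are used here instead of the mi_atomic_* wrappers. */
#include <stdatomic.h>
#include <stdint.h>

static void atomic_set_bits(_Atomic(uintptr_t)* p, uintptr_t bits) {
  uintptr_t expected = atomic_load_explicit(p, memory_order_relaxed);
  uintptr_t desired;
  do {
    desired = expected | bits;          // recompute from the latest observed value
  } while (!atomic_compare_exchange_weak_explicit(
             p, &expected, desired,
             memory_order_acq_rel,      // ordering on success
             memory_order_acquire));    // ordering on failure; `expected` now holds *p
}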
-static inline uintptr_t mi_atomic_increment(volatile _Atomic(uintptr_t)* p) { +static inline uintptr_t mi_atomic_increment(_Atomic(uintptr_t)* p) { return mi_atomic_add(p, 1); } // Atomically decrement a value; returns the decremented result. -static inline uintptr_t mi_atomic_decrement(volatile _Atomic(uintptr_t)* p) { +static inline uintptr_t mi_atomic_decrement(_Atomic(uintptr_t)* p) { return mi_atomic_sub(p, 1); } // Atomically add a signed value; returns the previous value. -static inline intptr_t mi_atomic_addi(volatile _Atomic(intptr_t)* p, intptr_t add) { - return (intptr_t)mi_atomic_add((volatile _Atomic(uintptr_t)*)p, (uintptr_t)add); +static inline intptr_t mi_atomic_addi(_Atomic(intptr_t)* p, intptr_t add) { + return (intptr_t)mi_atomic_add((_Atomic(uintptr_t)*)p, (uintptr_t)add); } // Atomically subtract a signed value; returns the previous value. -static inline intptr_t mi_atomic_subi(volatile _Atomic(intptr_t)* p, intptr_t sub) { +static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)* p, intptr_t sub) { return (intptr_t)mi_atomic_addi(p,-sub); } // Atomically read a pointer; Memory order is relaxed (i.e. no fence, only atomic). #define mi_atomic_read_ptr_relaxed(T,p) \ - (T*)(mi_atomic_read_relaxed((const volatile _Atomic(uintptr_t)*)(p))) + (T*)(mi_atomic_read_relaxed((const _Atomic(uintptr_t)*)(p))) // Atomically read a pointer; Memory order is acquire. #define mi_atomic_read_ptr(T,p) \ - (T*)(mi_atomic_read((const volatile _Atomic(uintptr_t)*)(p))) + (T*)(mi_atomic_read((const _Atomic(uintptr_t)*)(p))) // Atomically write a pointer; Memory order is acquire. #define mi_atomic_write_ptr(T,p,x) \ - mi_atomic_write((volatile _Atomic(uintptr_t)*)(p), (uintptr_t)((T*)x)) + mi_atomic_write((_Atomic(uintptr_t)*)(p), (uintptr_t)((T*)x)) + + +static inline bool mi_atomic_cas_weak_voidp(_Atomic(void*)*p, void** expected, void* desired, void* unused) { + (void)(unused); + return mi_atomic_cas_weak((_Atomic(uintptr_t)*)p, (uintptr_t*)expected, (uintptr_t)desired); +} // Atomically compare and exchange a pointer; returns `true` if successful. May fail spuriously. // Memory order is release. (like a write) -// (Note: expected and desired are in opposite order from atomic_compare_exchange) -#define mi_atomic_cas_ptr_weak(T,p,desired,expected) \ - mi_atomic_cas_weak((volatile _Atomic(uintptr_t)*)(p), (uintptr_t)((T*)(desired)), (uintptr_t)((T*)(expected))) +#define mi_atomic_cas_ptr_weak(T,p,expected,desired) \ + mi_atomic_cas_weak_voidp((_Atomic(void*)*)(p), (void**)(expected), desired, *(expected)) -// Atomically compare and exchange a pointer; returns `true` if successful. Memory order is acquire_release. -// (Note: expected and desired are in opposite order from atomic_compare_exchange) -#define mi_atomic_cas_ptr_strong(T,p,desired,expected) \ - mi_atomic_cas_strong((volatile _Atomic(uintptr_t)*)(p),(uintptr_t)((T*)(desired)), (uintptr_t)((T*)(expected))) // Atomically exchange a pointer value. 
#define mi_atomic_exchange_ptr(T,p,exchange) \ - (T*)mi_atomic_exchange((volatile _Atomic(uintptr_t)*)(p), (uintptr_t)((T*)exchange)) + (T*)mi_atomic_exchange((_Atomic(uintptr_t)*)(p), (uintptr_t)((T*)exchange)) #if !defined(__cplusplus) && defined(_MSC_VER) @@ -137,31 +133,38 @@ typedef LONG64 msc_intptr_t; typedef LONG msc_intptr_t; #define MI_64(f) f #endif -static inline uintptr_t mi_atomic_add(volatile _Atomic(uintptr_t)* p, uintptr_t add) { +static inline uintptr_t mi_atomic_add(_Atomic(uintptr_t)* p, uintptr_t add) { return (uintptr_t)MI_64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, (msc_intptr_t)add); } -static inline uintptr_t mi_atomic_and(volatile _Atomic(uintptr_t)* p, uintptr_t x) { +static inline uintptr_t mi_atomic_and(_Atomic(uintptr_t)* p, uintptr_t x) { return (uintptr_t)MI_64(_InterlockedAnd)((volatile msc_intptr_t*)p, (msc_intptr_t)x); } -static inline uintptr_t mi_atomic_or(volatile _Atomic(uintptr_t)* p, uintptr_t x) { +static inline uintptr_t mi_atomic_or(_Atomic(uintptr_t)* p, uintptr_t x) { return (uintptr_t)MI_64(_InterlockedOr)((volatile msc_intptr_t*)p, (msc_intptr_t)x); } -static inline bool mi_atomic_cas_strong(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) { - return (expected == (uintptr_t)MI_64(_InterlockedCompareExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)desired, (msc_intptr_t)expected)); +static inline bool mi_atomic_cas_strong(_Atomic(uintptr_t)* p, uintptr_t* expected, uintptr_t desired) { + uintptr_t read = (uintptr_t)MI_64(_InterlockedCompareExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)desired, (msc_intptr_t)(*expected)); + if (read == *expected) { + return true; + } + else { + *expected = read; + return false; + } } -static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) { - return mi_atomic_cas_strong(p,desired,expected); +static inline bool mi_atomic_cas_weak(_Atomic(uintptr_t)* p, uintptr_t* expected, uintptr_t desired) { + return mi_atomic_cas_strong(p,expected,desired); } -static inline uintptr_t mi_atomic_exchange(volatile _Atomic(uintptr_t)* p, uintptr_t exchange) { +static inline uintptr_t mi_atomic_exchange(_Atomic(uintptr_t)* p, uintptr_t exchange) { return (uintptr_t)MI_64(_InterlockedExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)exchange); } -static inline uintptr_t mi_atomic_read(volatile _Atomic(uintptr_t) const* p) { +static inline uintptr_t mi_atomic_read(_Atomic(uintptr_t) const* p) { return *p; } -static inline uintptr_t mi_atomic_read_relaxed(volatile _Atomic(uintptr_t) const* p) { +static inline uintptr_t mi_atomic_read_relaxed(_Atomic(uintptr_t) const* p) { return *p; } -static inline void mi_atomic_write(volatile _Atomic(uintptr_t)* p, uintptr_t x) { +static inline void mi_atomic_write(_Atomic(uintptr_t)* p, uintptr_t x) { #if defined(_M_IX86) || defined(_M_X64) *p = x; #else @@ -171,9 +174,9 @@ static inline void mi_atomic_write(volatile _Atomic(uintptr_t)* p, uintptr_t x) static inline void mi_atomic_yield(void) { YieldProcessor(); } -static inline void mi_atomic_addi64(volatile _Atomic(int64_t)* p, int64_t add) { +static inline int64_t mi_atomic_addi64_relaxed(volatile _Atomic(int64_t)* p, int64_t add) { #ifdef _WIN64 - mi_atomic_addi(p,add); + return (int64_t)mi_atomic_addi((int64_t*)p,add); #else int64_t current; int64_t sum; @@ -181,84 +184,67 @@ static inline void mi_atomic_addi64(volatile _Atomic(int64_t)* p, int64_t add) { current = *p; sum = current + add; } while (_InterlockedCompareExchange64(p, sum, current) != 
current); + return current; #endif } -static inline void mi_atomic_maxi64(volatile _Atomic(int64_t)*p, int64_t x) { +static inline void mi_atomic_maxi64_relaxed(volatile _Atomic(int64_t)*p, int64_t x) { int64_t current; do { current = *p; } while (current < x && _InterlockedCompareExchange64(p, x, current) != current); } -static inline int64_t mi_atomic_readi64(volatile _Atomic(int64_t)*p) { - #ifdef _WIN64 - return *p; - #else - int64_t current; - do { - current = *p; - } while (_InterlockedCompareExchange64(p, current, current) != current); - return current; - #endif -} - #else #ifdef __cplusplus #define MI_USING_STD using namespace std; #else #define MI_USING_STD #endif -static inline uintptr_t mi_atomic_add(volatile _Atomic(uintptr_t)* p, uintptr_t add) { +static inline uintptr_t mi_atomic_add(_Atomic(uintptr_t)* p, uintptr_t add) { MI_USING_STD - return atomic_fetch_add_explicit(p, add, memory_order_relaxed); + return atomic_fetch_add_explicit(p, add, memory_order_acq_rel); } -static inline uintptr_t mi_atomic_and(volatile _Atomic(uintptr_t)* p, uintptr_t x) { +static inline uintptr_t mi_atomic_and(_Atomic(uintptr_t)* p, uintptr_t x) { MI_USING_STD return atomic_fetch_and_explicit(p, x, memory_order_acq_rel); } -static inline uintptr_t mi_atomic_or(volatile _Atomic(uintptr_t)* p, uintptr_t x) { +static inline uintptr_t mi_atomic_or(_Atomic(uintptr_t)* p, uintptr_t x) { MI_USING_STD return atomic_fetch_or_explicit(p, x, memory_order_acq_rel); } -static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) { +static inline bool mi_atomic_cas_weak(_Atomic(uintptr_t)* p, uintptr_t* expected, uintptr_t desired) { MI_USING_STD - return atomic_compare_exchange_weak_explicit(p, &expected, desired, memory_order_acq_rel, memory_order_acquire); + return atomic_compare_exchange_weak_explicit(p, expected, desired, memory_order_acq_rel, memory_order_acquire); } -static inline bool mi_atomic_cas_strong(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) { +static inline bool mi_atomic_cas_strong(_Atomic(uintptr_t)* p, uintptr_t* expected, uintptr_t desired) { MI_USING_STD - return atomic_compare_exchange_strong_explicit(p, &expected, desired, memory_order_acq_rel, memory_order_acquire); + return atomic_compare_exchange_strong_explicit(p, expected, desired, memory_order_acq_rel, memory_order_acquire); } -static inline uintptr_t mi_atomic_exchange(volatile _Atomic(uintptr_t)* p, uintptr_t exchange) { +static inline uintptr_t mi_atomic_exchange(_Atomic(uintptr_t)* p, uintptr_t exchange) { MI_USING_STD return atomic_exchange_explicit(p, exchange, memory_order_acq_rel); } -static inline uintptr_t mi_atomic_read_relaxed(const volatile _Atomic(uintptr_t)* p) { +static inline uintptr_t mi_atomic_read_relaxed(const _Atomic(uintptr_t)* p) { MI_USING_STD - return atomic_load_explicit((volatile _Atomic(uintptr_t)*) p, memory_order_relaxed); + return atomic_load_explicit((_Atomic(uintptr_t)*) p, memory_order_relaxed); } -static inline uintptr_t mi_atomic_read(const volatile _Atomic(uintptr_t)* p) { +static inline uintptr_t mi_atomic_read(const _Atomic(uintptr_t)* p) { MI_USING_STD - return atomic_load_explicit((volatile _Atomic(uintptr_t)*) p, memory_order_acquire); + return atomic_load_explicit((_Atomic(uintptr_t)*) p, memory_order_acquire); } -static inline void mi_atomic_write(volatile _Atomic(uintptr_t)* p, uintptr_t x) { +static inline void mi_atomic_write(_Atomic(uintptr_t)* p, uintptr_t x) { MI_USING_STD return atomic_store_explicit(p, x, 
memory_order_release); } -static inline void mi_atomic_addi64(volatile int64_t* p, int64_t add) { +static inline int64_t mi_atomic_addi64_relaxed(volatile int64_t* p, int64_t add) { MI_USING_STD - atomic_fetch_add_explicit((volatile _Atomic(int64_t)*)p, add, memory_order_relaxed); + return atomic_fetch_add_explicit((_Atomic(int64_t)*)p, add, memory_order_relaxed); } -static inline int64_t mi_atomic_readi64(volatile int64_t* p) { +static inline void mi_atomic_maxi64_relaxed(volatile int64_t* p, int64_t x) { MI_USING_STD - return atomic_load_explicit((volatile _Atomic(int64_t)*) p, memory_order_relaxed); -} -static inline void mi_atomic_maxi64(volatile int64_t* p, int64_t x) { - MI_USING_STD - int64_t current; - do { - current = mi_atomic_readi64(p); - } while (current < x && !atomic_compare_exchange_weak_explicit((volatile _Atomic(int64_t)*)p, ¤t, x, memory_order_acq_rel, memory_order_relaxed)); + int64_t current = atomic_load_explicit((_Atomic(int64_t)*)p, memory_order_relaxed); + while (current < x && !atomic_compare_exchange_weak_explicit((_Atomic(int64_t)*)p, ¤t, x, memory_order_acq_rel, memory_order_acquire)) { /* nothing */ }; } #if defined(__cplusplus) diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h index 449e2e41..5b31f6f3 100644 --- a/include/mimalloc-types.h +++ b/include/mimalloc-types.h @@ -222,8 +222,8 @@ typedef struct mi_page_s { uint32_t xblock_size; // size available in each block (always `>0`) mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) - volatile _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads - volatile _Atomic(uintptr_t) xheap; + _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads + _Atomic(uintptr_t) xheap; struct mi_page_s* next; // next page owned by this thread with the same `block_size` struct mi_page_s* prev; // previous page owned by this thread with the same `block_size` @@ -243,28 +243,28 @@ typedef enum mi_page_kind_e { // contain blocks. typedef struct mi_segment_s { // memory fields - size_t memid; // id for the os-level memory manager - bool mem_is_fixed; // `true` if we cannot decommit/reset/protect in this memory (i.e. when allocated using large OS pages) - bool mem_is_committed; // `true` if the whole segment is eagerly committed + size_t memid; // id for the os-level memory manager + bool mem_is_fixed; // `true` if we cannot decommit/reset/protect in this memory (i.e. when allocated using large OS pages) + bool mem_is_committed; // `true` if the whole segment is eagerly committed // segment fields - struct mi_segment_s* next; // must be the first segment field -- see `segment.c:segment_alloc` + struct mi_segment_s* next; // must be the first segment field -- see `segment.c:segment_alloc` struct mi_segment_s* prev; struct mi_segment_s* abandoned_next; - size_t abandoned; // abandoned pages (i.e. the original owning thread stopped) (`abandoned <= used`) - size_t abandoned_visits; // count how often this segment is visited in the abandoned list (to force reclaim it it is too long) + size_t abandoned; // abandoned pages (i.e. 
the original owning thread stopped) (`abandoned <= used`) + size_t abandoned_visits; // count how often this segment is visited in the abandoned list (to force reclaim it it is too long) - size_t used; // count of pages in use (`used <= capacity`) - size_t capacity; // count of available pages (`#free + used`) - size_t segment_size;// for huge pages this may be different from `MI_SEGMENT_SIZE` - size_t segment_info_size; // space we are using from the first page for segment meta-data and possible guard pages. - uintptr_t cookie; // verify addresses in secure mode: `_mi_ptr_cookie(segment) == segment->cookie` + size_t used; // count of pages in use (`used <= capacity`) + size_t capacity; // count of available pages (`#free + used`) + size_t segment_size; // for huge pages this may be different from `MI_SEGMENT_SIZE` + size_t segment_info_size;// space we are using from the first page for segment meta-data and possible guard pages. + uintptr_t cookie; // verify addresses in secure mode: `_mi_ptr_cookie(segment) == segment->cookie` // layout like this to optimize access in `mi_free` - size_t page_shift; // `1 << page_shift` == the page sizes == `page->block_size * page->reserved` (unless the first page, then `-segment_info_size`). - volatile _Atomic(uintptr_t) thread_id; // unique id of the thread owning this segment - mi_page_kind_t page_kind; // kind of pages: small, large, or huge - mi_page_t pages[1]; // up to `MI_SMALL_PAGES_PER_SEGMENT` pages + size_t page_shift; // `1 << page_shift` == the page sizes == `page->block_size * page->reserved` (unless the first page, then `-segment_info_size`). + _Atomic(uintptr_t) thread_id; // unique id of the thread owning this segment + mi_page_kind_t page_kind; // kind of pages: small, large, or huge + mi_page_t pages[1]; // up to `MI_SMALL_PAGES_PER_SEGMENT` pages } mi_segment_t; @@ -322,7 +322,7 @@ struct mi_heap_s { mi_tld_t* tld; mi_page_t* pages_free_direct[MI_PAGES_DIRECT]; // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size. mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin") - volatile _Atomic(mi_block_t*) thread_delayed_free; + _Atomic(mi_block_t*) thread_delayed_free; uintptr_t thread_id; // thread this heap belongs too uintptr_t cookie; // random cookie to verify pointers (see `_mi_ptr_cookie`) uintptr_t keys[2]; // two random keys used to encode the `thread_delayed_free` list diff --git a/src/alloc.c b/src/alloc.c index 57034522..62c3c018 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -305,11 +305,10 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc } // Try to put the block on either the page-local thread free list, or the heap delayed free list. 
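/* Illustrative sketch (not part of the patch): how a block pointer and a 2-bit
   delayed-free state can share one atomic word, in the spirit of the
   mi_thread_free_t value (`tfree`) manipulated below via mi_tf_block and
   mi_tf_delayed. The names here are invented for illustration; blocks are
   assumed to be at least 4-byte aligned so the low two bits are free. */
#include <stdint.h>

typedef uintptr_t thread_free_t;

static inline void*    tf_block(thread_free_t tf)   { return (void*)(tf & ~(uintptr_t)0x3); }
static inline unsigned tf_delayed(thread_free_t tf) { return (unsigned)(tf & 0x3); }
static inline thread_free_t tf_make(void* block, unsigned delayed) {
  return ((uintptr_t)block & ~(uintptr_t)0x3) | (delayed & 0x3);
}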
- mi_thread_free_t tfree; mi_thread_free_t tfreex; bool use_delayed; + mi_thread_free_t tfree = mi_atomic_read_relaxed(&page->xthread_free); do { - tfree = mi_atomic_read_relaxed(&page->xthread_free); use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE); if (mi_unlikely(use_delayed)) { // unlikely: this only happens on the first concurrent free in a page that is in the full list @@ -320,7 +319,7 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc mi_block_set_next(page, block, mi_tf_block(tfree)); tfreex = mi_tf_set_block(tfree,block); } - } while (!mi_atomic_cas_weak(&page->xthread_free, tfreex, tfree)); + } while (!mi_atomic_cas_weak(&page->xthread_free, &tfree, tfreex)); if (mi_unlikely(use_delayed)) { // racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`) @@ -328,19 +327,19 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc mi_assert_internal(heap != NULL); if (heap != NULL) { // add to the delayed free list of this heap. (do this atomically as the lock only protects heap memory validity) - mi_block_t* dfree; + mi_block_t* dfree = mi_atomic_read_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); do { - dfree = mi_atomic_read_ptr_relaxed(mi_block_t,&heap->thread_delayed_free); mi_block_set_nextx(heap,block,dfree, heap->keys); - } while (!mi_atomic_cas_ptr_weak(mi_block_t,&heap->thread_delayed_free, block, dfree)); + } while (!mi_atomic_cas_ptr_weak(mi_block_t,&heap->thread_delayed_free, &dfree, block)); } // and reset the MI_DELAYED_FREEING flag + tfree = mi_atomic_read_relaxed(&page->xthread_free); do { - tfreex = tfree = mi_atomic_read_relaxed(&page->xthread_free); + tfreex = tfree; mi_assert_internal(mi_tf_delayed(tfree) == MI_DELAYED_FREEING); tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE); - } while (!mi_atomic_cas_weak(&page->xthread_free, tfreex, tfree)); + } while (!mi_atomic_cas_weak(&page->xthread_free, &tfree, tfreex)); } } diff --git a/src/arena.c b/src/arena.c index bb9fc174..1c1fc1a0 100644 --- a/src/arena.c +++ b/src/arena.c @@ -63,7 +63,7 @@ typedef struct mi_arena_s { bool is_zero_init; // is the arena zero initialized? bool is_committed; // is the memory committed bool is_large; // large OS page allocated - volatile _Atomic(uintptr_t) search_idx; // optimization to start the search for free blocks + _Atomic(uintptr_t) search_idx; // optimization to start the search for free blocks mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero? mi_bitmap_field_t* blocks_committed; // if `!is_committed`, are the blocks committed? mi_bitmap_field_t blocks_inuse[1]; // in-place bitmap of in-use blocks (of size `field_count`) diff --git a/src/bitmap.inc.c b/src/bitmap.inc.c index 99e8fa6f..b9953a4f 100644 --- a/src/bitmap.inc.c +++ b/src/bitmap.inc.c @@ -30,7 +30,7 @@ and that the sequence must be smaller or equal to the bits in a field. #define MI_BITMAP_FIELD_FULL (~((uintptr_t)0)) // all bits set // An atomic bitmap of `uintptr_t` fields -typedef volatile _Atomic(uintptr_t) mi_bitmap_field_t; +typedef _Atomic(uintptr_t) mi_bitmap_field_t; typedef mi_bitmap_field_t* mi_bitmap_t; // A bitmap index is the index of the bit in a bitmap. @@ -123,7 +123,7 @@ static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t bitmap_f uintptr_t field = mi_atomic_read_relaxed(&bitmap[idx]); if ((field & mask) == 0) { // free? 
- if (mi_atomic_cas_strong(&bitmap[idx], (field|mask), field)) { + if (mi_atomic_cas_strong(&bitmap[idx], &field, (field|mask))) { // claimed! return true; } @@ -137,7 +137,7 @@ static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t bitmap_f static inline bool mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx) { mi_assert_internal(bitmap_idx != NULL); - volatile _Atomic(uintptr_t)* field = &bitmap[idx]; + _Atomic(uintptr_t)* field = &bitmap[idx]; uintptr_t map = mi_atomic_read(field); if (map==MI_BITMAP_FIELD_FULL) return false; // short cut @@ -158,9 +158,8 @@ static inline bool mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx mi_assert_internal((m >> bitidx) == mask); // no overflow? const uintptr_t newmap = map | m; mi_assert_internal((newmap^map) >> bitidx == mask); - if (!mi_atomic_cas_weak(field, newmap, map)) { // TODO: use strong cas here? - // no success, another thread claimed concurrently.. keep going - map = mi_atomic_read(field); + if (!mi_atomic_cas_weak(field, &map, newmap)) { // TODO: use strong cas here? + // no success, another thread claimed concurrently.. keep going (with updated `map`) continue; } else { diff --git a/src/options.c b/src/options.c index f29b387c..78c01456 100644 --- a/src/options.c +++ b/src/options.c @@ -217,7 +217,7 @@ static void mi_out_buf_stderr(const char* msg, void* arg) { // For now, don't register output from multiple threads. #pragma warning(suppress:4180) static mi_output_fun* volatile mi_out_default; // = NULL -static volatile _Atomic(void*) mi_out_arg; // = NULL +static _Atomic(void*) mi_out_arg; // = NULL static mi_output_fun* mi_out_get_default(void** parg) { if (parg != NULL) { *parg = mi_atomic_read_ptr(void,&mi_out_arg); } @@ -241,7 +241,7 @@ static void mi_add_stderr_output() { // -------------------------------------------------------- // Messages, all end up calling `_mi_fputs`. // -------------------------------------------------------- -static volatile _Atomic(uintptr_t) error_count; // = 0; // when MAX_ERROR_COUNT stop emitting errors and warnings +static _Atomic(uintptr_t) error_count; // = 0; // when MAX_ERROR_COUNT stop emitting errors and warnings // When overriding malloc, we may recurse into mi_vfprintf if an allocation // inside the C runtime causes another message. 
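/* Illustrative sketch (not part of the patch): how the atomic `error_count`
   above can cap the number of emitted messages. The threshold constant and the
   printing code are assumptions for illustration only. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_ERROR_COUNT 16

static _Atomic(uintptr_t) error_count;  // = 0

static void emit_error(const char* msg) {
  // fetch_add returns the previous count, so only the first
  // MAX_ERROR_COUNT callers actually print; later errors are dropped.
  if (atomic_fetch_add_explicit(&error_count, 1, memory_order_relaxed) < MAX_ERROR_COUNT) {
    fprintf(stderr, "error: %s\n", msg);
  }
}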
@@ -339,7 +339,7 @@ void _mi_assert_fail(const char* assertion, const char* fname, unsigned line, co // -------------------------------------------------------- static mi_error_fun* volatile mi_error_handler; // = NULL -static volatile _Atomic(void*) mi_error_arg; // = NULL +static _Atomic(void*) mi_error_arg; // = NULL static void mi_error_default(int err) { UNUSED(err); diff --git a/src/os.c b/src/os.c index 8079e5a0..29a76a88 100644 --- a/src/os.c +++ b/src/os.c @@ -266,7 +266,7 @@ static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment, DWORD flags, bool large_only, bool allow_large, bool* is_large) { mi_assert_internal(!(large_only && !allow_large)); - static volatile _Atomic(uintptr_t) large_page_try_ok; // = 0; + static _Atomic(uintptr_t) large_page_try_ok; // = 0; void* p = NULL; if ((large_only || use_large_os_page(size, try_alignment)) && allow_large && (flags&MEM_COMMIT)!=0 && (flags&MEM_RESERVE)!=0) { @@ -274,7 +274,7 @@ static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment, if (!large_only && try_ok > 0) { // if a large page allocation fails, it seems the calls to VirtualAlloc get very expensive. // therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times. - mi_atomic_cas_weak(&large_page_try_ok, try_ok - 1, try_ok); + mi_atomic_cas_strong(&large_page_try_ok, &try_ok, try_ok - 1); } else { // large OS pages must always reserve and commit. @@ -360,14 +360,14 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro fd = VM_MAKE_TAG(os_tag); #endif if ((large_only || use_large_os_page(size, try_alignment)) && allow_large) { - static volatile _Atomic(uintptr_t) large_page_try_ok; // = 0; + static _Atomic(uintptr_t) large_page_try_ok; // = 0; uintptr_t try_ok = mi_atomic_read(&large_page_try_ok); if (!large_only && try_ok > 0) { // If the OS is not configured for large OS pages, or the user does not have // enough permission, the `mmap` will always fail (but it might also fail for other reasons). // Therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times // to avoid too many failing calls to mmap. - mi_atomic_cas_weak(&large_page_try_ok, try_ok - 1, try_ok); + mi_atomic_cas_strong(&large_page_try_ok, &try_ok, try_ok - 1); } else { int lflags = flags & ~MAP_NORESERVE; // using NORESERVE on huge pages seems to fail on Linux @@ -449,7 +449,7 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro // On 64-bit systems, we can do efficient aligned allocation by using // the 4TiB to 30TiB area to allocate them. 
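/* Illustrative sketch (not part of the patch): the "bump hint" pattern used by
   mi_os_get_aligned_hint below. The shared cursor is lazily initialized with a
   strong CAS whose `expected` value is exactly what the first caller left
   behind, so at most one initializer wins; afterwards every caller just
   fetch_adds past it. The 4 TiB start and 64-bit address space are
   illustrative assumptions (the real code is guarded by MI_INTPTR_SIZE >= 8). */
#include <stdatomic.h>
#include <stdint.h>
#include <stddef.h>

static _Atomic(uintptr_t) hint_base;   // = 0 until first use

static uintptr_t next_hint(size_t size) {
  uintptr_t hint = atomic_fetch_add_explicit(&hint_base, size, memory_order_acq_rel);
  if (hint == 0) {                         // first caller: pick a start address
    uintptr_t expected = hint + size;      // the value this thread just stored
    uintptr_t init = (uintptr_t)4 << 40;   // 4 TiB (illustrative)
    atomic_compare_exchange_strong_explicit(&hint_base, &expected, init,
                                            memory_order_acq_rel, memory_order_acquire);
    hint = atomic_fetch_add_explicit(&hint_base, size, memory_order_acq_rel);
  }
  return hint;  // racing callers may still get a low value, which is fine for a hint
}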
#if (MI_INTPTR_SIZE >= 8) && (defined(_WIN32) || (defined(MI_OS_USE_MMAP) && !defined(MAP_ALIGNED))) -static volatile mi_decl_cache_align _Atomic(uintptr_t) aligned_base; +static mi_decl_cache_align _Atomic(uintptr_t) aligned_base; // Return a 4MiB aligned address that is probably available static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size) { @@ -462,7 +462,8 @@ static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size) { uintptr_t r = _mi_heap_random_next(mi_get_default_heap()); init = init + (MI_SEGMENT_SIZE * ((r>>17) & 0xFFFFF)); // (randomly 20 bits)*4MiB == 0 to 4TiB #endif - mi_atomic_cas_strong(&aligned_base, init, hint + size); + uintptr_t expected = hint + size; + mi_atomic_cas_strong(&aligned_base, &expected, init); hint = mi_atomic_add(&aligned_base, size); // this may still give 0 or > 30TiB but that is ok, it is a hint after all } if (hint%try_alignment != 0) return NULL; @@ -969,9 +970,9 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) { uintptr_t start = 0; uintptr_t end = 0; - uintptr_t expected; + uintptr_t huge_start = mi_atomic_read_relaxed(&mi_huge_start); do { - start = expected = mi_atomic_read_relaxed(&mi_huge_start); + start = huge_start; if (start == 0) { // Initialize the start address after the 32TiB area start = ((uintptr_t)32 << 40); // 32TiB virtual start address @@ -982,7 +983,7 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) { } end = start + size; mi_assert_internal(end % MI_SEGMENT_SIZE == 0); - } while (!mi_atomic_cas_strong(&mi_huge_start, end, expected)); + } while (!mi_atomic_cas_strong(&mi_huge_start, &huge_start, end)); if (total_size != NULL) *total_size = size; return (uint8_t*)start; diff --git a/src/page.c b/src/page.c index c8a4e54b..6b92d4c9 100644 --- a/src/page.c +++ b/src/page.c @@ -122,11 +122,11 @@ bool _mi_page_is_valid(mi_page_t* page) { #endif void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never) { - mi_thread_free_t tfree; mi_thread_free_t tfreex; mi_delayed_t old_delay; + mi_thread_free_t tfree; do { - tfree = mi_atomic_read(&page->xthread_free); // note: must acquire as we can break this loop and not do a CAS + tfree = mi_atomic_read(&page->xthread_free); // note: must acquire as we can break/repeat this loop and not do a CAS; tfreex = mi_tf_set_delayed(tfree, delay); old_delay = mi_tf_delayed(tfree); if (mi_unlikely(old_delay == MI_DELAYED_FREEING)) { @@ -140,7 +140,7 @@ void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool overrid break; // leave never-delayed flag set } } while ((old_delay == MI_DELAYED_FREEING) || - !mi_atomic_cas_weak(&page->xthread_free, tfreex, tfree)); + !mi_atomic_cas_weak(&page->xthread_free, &tfree, tfreex)); } /* ----------------------------------------------------------- @@ -154,13 +154,12 @@ void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool overrid static void _mi_page_thread_free_collect(mi_page_t* page) { mi_block_t* head; - mi_thread_free_t tfree; mi_thread_free_t tfreex; + mi_thread_free_t tfree = mi_atomic_read_relaxed(&page->xthread_free); do { - tfree = mi_atomic_read_relaxed(&page->xthread_free); head = mi_tf_block(tfree); tfreex = mi_tf_set_block(tfree,NULL); - } while (!mi_atomic_cas_weak(&page->xthread_free, tfreex, tfree)); + } while (!mi_atomic_cas_weak(&page->xthread_free, &tfree, tfreex)); // return if the list is empty if (head == NULL) return; @@ -273,11 +272,9 @@ static mi_page_t* mi_page_fresh(mi_heap_t* heap, 
mi_page_queue_t* pq) { (put there by other threads if they deallocated in a full page) ----------------------------------------------------------- */ void _mi_heap_delayed_free(mi_heap_t* heap) { - // take over the list (note: no atomic exchange is it is often NULL) - mi_block_t* block; - do { - block = mi_atomic_read_ptr_relaxed(mi_block_t,&heap->thread_delayed_free); - } while (block != NULL && !mi_atomic_cas_ptr_weak(mi_block_t,&heap->thread_delayed_free, NULL, block)); + // take over the list (note: no atomic exchange since it is often NULL) + mi_block_t* block = mi_atomic_read_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); + while (block != NULL && !mi_atomic_cas_ptr_weak(mi_block_t, &heap->thread_delayed_free, &block, NULL)) { /* nothing */ }; // and free them all while(block != NULL) { @@ -286,11 +283,10 @@ void _mi_heap_delayed_free(mi_heap_t* heap) { if (!_mi_free_delayed_block(block)) { // we might already start delayed freeing while another thread has not yet // reset the delayed_freeing flag; in that case delay it further by reinserting. - mi_block_t* dfree; + mi_block_t* dfree = mi_atomic_read_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); do { - dfree = mi_atomic_read_ptr_relaxed(mi_block_t,&heap->thread_delayed_free); mi_block_set_nextx(heap, block, dfree, heap->keys); - } while (!mi_atomic_cas_ptr_weak(mi_block_t,&heap->thread_delayed_free, block, dfree)); + } while (!mi_atomic_cas_ptr_weak(mi_block_t,&heap->thread_delayed_free, &dfree, block)); } block = next; } @@ -734,7 +730,7 @@ static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, size_t size) { ----------------------------------------------------------- */ static mi_deferred_free_fun* volatile deferred_free = NULL; -static volatile _Atomic(void*) deferred_arg; // = NULL +static _Atomic(void*) deferred_arg; // = NULL void _mi_deferred_free(mi_heap_t* heap, bool force) { heap->tld->heartbeat++; diff --git a/src/random.c b/src/random.c index 2a96ccf6..5c093a91 100644 --- a/src/random.c +++ b/src/random.c @@ -200,7 +200,7 @@ static bool os_random_buf(void* buf, size_t buf_len) { #ifndef GRND_NONBLOCK #define GRND_NONBLOCK (1) #endif - static volatile _Atomic(uintptr_t) no_getrandom; // = 0 + static _Atomic(uintptr_t) no_getrandom; // = 0 if (mi_atomic_read(&no_getrandom)==0) { ssize_t ret = syscall(SYS_getrandom, buf, buf_len, GRND_NONBLOCK); if (ret >= 0) return (buf_len == (size_t)ret); diff --git a/src/region.c b/src/region.c index ae3a799a..d2904687 100644 --- a/src/region.c +++ b/src/region.c @@ -86,13 +86,13 @@ typedef union mi_region_info_u { // A region owns a chunk of REGION_SIZE (256MiB) (virtual) memory with // a bit map with one bit per MI_SEGMENT_SIZE (4MiB) block. 
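/* Illustrative sketch (not part of the patch): claiming an entire bitmap field
   only when it is currently empty, the pattern `_mi_mem_collect` uses further
   down in this diff. If any bit is already set, `m` ends up non-zero (from the
   initial load or from a failed CAS) and the claim is abandoned. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdbool.h>

#define FIELD_FULL (~((uintptr_t)0))

static bool try_claim_whole_field(_Atomic(uintptr_t)* field) {
  uintptr_t m = atomic_load_explicit(field, memory_order_relaxed);
  while (m == 0 && !atomic_compare_exchange_weak_explicit(
                      field, &m, FIELD_FULL,
                      memory_order_acq_rel, memory_order_acquire)) {
    // spurious failure with m still 0: retry; any other failure leaves m != 0
  }
  return (m == 0);  // FIELD_FULL was installed iff the observed value was 0
}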
typedef struct mem_region_s { - volatile _Atomic(uintptr_t) info; // mi_region_info_t.value - volatile _Atomic(void*) start; // start of the memory area + _Atomic(uintptr_t) info; // mi_region_info_t.value + _Atomic(void*) start; // start of the memory area mi_bitmap_field_t in_use; // bit per in-use block mi_bitmap_field_t dirty; // track if non-zero per block mi_bitmap_field_t commit; // track if committed per block mi_bitmap_field_t reset; // track if reset per block - volatile _Atomic(uintptr_t) arena_memid; // if allocated from a (huge page) arena + _Atomic(uintptr_t) arena_memid; // if allocated from a (huge page) arena uintptr_t padding; // round to 8 fields } mem_region_t; @@ -100,7 +100,7 @@ typedef struct mem_region_s { static mem_region_t regions[MI_REGION_MAX]; // Allocated regions -static volatile _Atomic(uintptr_t) regions_count; // = 0; +static _Atomic(uintptr_t) regions_count; // = 0; /* ---------------------------------------------------------------------------- @@ -447,10 +447,8 @@ void _mi_mem_collect(mi_os_tld_t* tld) { mem_region_t* region = ®ions[i]; if (mi_atomic_read_relaxed(®ion->info) != 0) { // if no segments used, try to claim the whole region - uintptr_t m; - do { - m = mi_atomic_read_relaxed(®ion->in_use); - } while(m == 0 && !mi_atomic_cas_weak(®ion->in_use, MI_BITMAP_FIELD_FULL, 0 )); + uintptr_t m = mi_atomic_read_relaxed(®ion->in_use); + while (m == 0 && !mi_atomic_cas_weak(®ion->in_use, &m, MI_BITMAP_FIELD_FULL)) { /* nothing */ }; if (m == 0) { // on success, free the whole region uint8_t* start = mi_atomic_read_ptr(uint8_t,®ions[i].start); diff --git a/src/segment.c b/src/segment.c index 8a5ba8c0..58c227bb 100644 --- a/src/segment.c +++ b/src/segment.c @@ -877,15 +877,15 @@ static mi_tagged_segment_t mi_tagged_segment(mi_segment_t* segment, mi_tagged_se // This is a list of visited abandoned pages that were full at the time. // this list migrates to `abandoned` when that becomes NULL. The use of // this list reduces contention and the rate at which segments are visited. -static mi_decl_cache_align volatile _Atomic(mi_segment_t*) abandoned_visited; // = NULL +static mi_decl_cache_align _Atomic(mi_segment_t*) abandoned_visited; // = NULL // The abandoned page list (tagged as it supports pop) -static mi_decl_cache_align volatile _Atomic(mi_tagged_segment_t) abandoned; // = NULL +static mi_decl_cache_align _Atomic(mi_tagged_segment_t) abandoned; // = NULL // We also maintain a count of current readers of the abandoned list // in order to prevent resetting/decommitting segment memory if it might // still be read. -static mi_decl_cache_align volatile _Atomic(uintptr_t) abandoned_readers; // = 0 +static mi_decl_cache_align _Atomic(uintptr_t) abandoned_readers; // = 0 // Push on the visited list static void mi_abandoned_visited_push(mi_segment_t* segment) { @@ -893,11 +893,10 @@ static void mi_abandoned_visited_push(mi_segment_t* segment) { mi_assert_internal(segment->abandoned_next == NULL); mi_assert_internal(segment->next == NULL && segment->prev == NULL); mi_assert_internal(segment->used > 0); - mi_segment_t* anext; + mi_segment_t* anext = mi_atomic_read_ptr_relaxed(mi_segment_t, &abandoned_visited); do { - anext = mi_atomic_read_ptr_relaxed(mi_segment_t, &abandoned_visited); segment->abandoned_next = anext; - } while (!mi_atomic_cas_ptr_weak(mi_segment_t, &abandoned_visited, segment, anext)); + } while (!mi_atomic_cas_ptr_weak(mi_segment_t, &abandoned_visited, &anext, segment)); } // Move the visited list to the abandoned list. 
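/* Illustrative sketch (not part of the patch): the lock-free list push used by
   mi_abandoned_visited_push above, written with plain C11 atomics on a
   simplified node type. The head observed in `expected` is linked into the new
   node before every CAS attempt; a failed CAS refreshes `expected`, so the
   link is always recomputed from the latest head. */
#include <stdatomic.h>
#include <stddef.h>

typedef struct node_s {
  _Atomic(struct node_s*) next;
  void* payload;
} node_t;

static _Atomic(node_t*) visited_head;  // = NULL

static void visited_push(node_t* n) {
  node_t* expected = atomic_load_explicit(&visited_head, memory_order_relaxed);
  do {
    atomic_store_explicit(&n->next, expected, memory_order_relaxed);
  } while (!atomic_compare_exchange_weak_explicit(&visited_head, &expected, n,
                                                  memory_order_acq_rel,
                                                  memory_order_acquire));
}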
@@ -911,11 +910,11 @@ static bool mi_abandoned_visited_revisit(void) if (first == NULL) return false; // first try to swap directly if the abandoned list happens to be NULL - const mi_tagged_segment_t ts = mi_atomic_read_relaxed(&abandoned); mi_tagged_segment_t afirst; + mi_tagged_segment_t ts = mi_atomic_read_relaxed(&abandoned); if (mi_tagged_segment_ptr(ts)==NULL) { afirst = mi_tagged_segment(first, ts); - if (mi_atomic_cas_strong(&abandoned, afirst, ts)) return true; + if (mi_atomic_cas_strong(&abandoned, &ts, afirst)) return true; } // find the last element of the visited list: O(n) @@ -926,12 +925,11 @@ static bool mi_abandoned_visited_revisit(void) // and atomically prepend to the abandoned list // (no need to increase the readers as we don't access the abandoned segments) - mi_tagged_segment_t anext; + mi_tagged_segment_t anext = mi_atomic_read_relaxed(&abandoned); do { - anext = mi_atomic_read_relaxed(&abandoned); last->abandoned_next = mi_tagged_segment_ptr(anext); afirst = mi_tagged_segment(first, anext); - } while (!mi_atomic_cas_weak(&abandoned, afirst, anext)); + } while (!mi_atomic_cas_weak(&abandoned, &anext, afirst)); return true; } @@ -941,13 +939,12 @@ static void mi_abandoned_push(mi_segment_t* segment) { mi_assert_internal(segment->abandoned_next == NULL); mi_assert_internal(segment->next == NULL && segment->prev == NULL); mi_assert_internal(segment->used > 0); - mi_tagged_segment_t ts; mi_tagged_segment_t next; + mi_tagged_segment_t ts = mi_atomic_read_relaxed(&abandoned); do { - ts = mi_atomic_read_relaxed(&abandoned); segment->abandoned_next = mi_tagged_segment_ptr(ts); next = mi_tagged_segment(segment, ts); - } while (!mi_atomic_cas_weak(&abandoned, next, ts)); + } while (!mi_atomic_cas_weak(&abandoned, &ts, next)); } // Wait until there are no more pending reads on segments that used to be in the abandoned list @@ -977,13 +974,13 @@ static mi_segment_t* mi_abandoned_pop(void) { // (this is called from `memory.c:_mi_mem_free` for example) mi_atomic_increment(&abandoned_readers); // ensure no segment gets decommitted mi_tagged_segment_t next = 0; + ts = mi_atomic_read(&abandoned); do { - ts = mi_atomic_read(&abandoned); segment = mi_tagged_segment_ptr(ts); if (segment != NULL) { next = mi_tagged_segment(segment->abandoned_next, ts); // note: reads the segment's `abandoned_next` field so should not be decommitted } - } while (segment != NULL && !mi_atomic_cas_weak(&abandoned, next, ts)); + } while (segment != NULL && !mi_atomic_cas_weak(&abandoned, &ts, next)); mi_atomic_decrement(&abandoned_readers); // release reader lock if (segment != NULL) { segment->abandoned_next = NULL; @@ -1298,7 +1295,8 @@ void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block // claim it and free mi_heap_t* heap = mi_heap_get_default(); // issue #221; don't use the internal get_default_heap as we need to ensure the thread is initialized. 
// paranoia: if this it the last reference, the cas should always succeed - if (mi_atomic_cas_strong(&segment->thread_id, heap->thread_id, 0)) { + uintptr_t expected_tid = 0; + if (mi_atomic_cas_strong(&segment->thread_id, &expected_tid, heap->thread_id)) { mi_block_set_next(page, block, page->free); page->free = block; page->used--; @@ -1315,6 +1313,11 @@ void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block mi_segments_track_size((long)segment->segment_size, &tld->segments); _mi_segment_page_free(page, true, &tld->segments); } +#if (MI_DEBUG!=0) + else { + mi_assert_internal(false); + } +#endif } /* ----------------------------------------------------------- diff --git a/src/stats.c b/src/stats.c index 172a3c0a..96f57a47 100644 --- a/src/stats.c +++ b/src/stats.c @@ -26,13 +26,13 @@ static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) { if (mi_is_in_main(stat)) { // add atomically (for abandoned pages) - mi_atomic_addi64(&stat->current,amount); - mi_atomic_maxi64(&stat->peak, mi_atomic_readi64(&stat->current)); + int64_t current = mi_atomic_addi64_relaxed(&stat->current, amount); + mi_atomic_maxi64_relaxed(&stat->peak, current + amount); if (amount > 0) { - mi_atomic_addi64(&stat->allocated,amount); + mi_atomic_addi64_relaxed(&stat->allocated,amount); } else { - mi_atomic_addi64(&stat->freed, -amount); + mi_atomic_addi64_relaxed(&stat->freed, -amount); } } else { @@ -50,8 +50,8 @@ static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) { void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) { if (mi_is_in_main(stat)) { - mi_atomic_addi64( &stat->count, 1 ); - mi_atomic_addi64( &stat->total, (int64_t)amount ); + mi_atomic_addi64_relaxed( &stat->count, 1 ); + mi_atomic_addi64_relaxed( &stat->total, (int64_t)amount ); } else { stat->count++; @@ -71,17 +71,17 @@ void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount) { static void mi_stat_add(mi_stat_count_t* stat, const mi_stat_count_t* src, int64_t unit) { if (stat==src) return; if (src->allocated==0 && src->freed==0) return; - mi_atomic_addi64( &stat->allocated, src->allocated * unit); - mi_atomic_addi64( &stat->current, src->current * unit); - mi_atomic_addi64( &stat->freed, src->freed * unit); + mi_atomic_addi64_relaxed( &stat->allocated, src->allocated * unit); + mi_atomic_addi64_relaxed( &stat->current, src->current * unit); + mi_atomic_addi64_relaxed( &stat->freed, src->freed * unit); // peak scores do not work across threads.. 
- mi_atomic_addi64( &stat->peak, src->peak * unit); + mi_atomic_addi64_relaxed( &stat->peak, src->peak * unit); } static void mi_stat_counter_add(mi_stat_counter_t* stat, const mi_stat_counter_t* src, int64_t unit) { if (stat==src) return; - mi_atomic_addi64( &stat->total, src->total * unit); - mi_atomic_addi64( &stat->count, src->count * unit); + mi_atomic_addi64_relaxed( &stat->total, src->total * unit); + mi_atomic_addi64_relaxed( &stat->count, src->count * unit); } // must be thread safe as it is called from stats_merge From 95afd0509face89d311830b4b13c6db1dec09685 Mon Sep 17 00:00:00 2001 From: daan Date: Sat, 25 Jul 2020 23:50:22 -0700 Subject: [PATCH 03/11] make segment abandoned_next atomic; tsan passes without warnings now (issue #130) --- CMakeLists.txt | 1 + include/mimalloc-types.h | 16 ++++++++++++++-- src/segment.c | 30 ++++++++++++++++-------------- 3 files changed, 31 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 37616eb4..5a228036 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -126,6 +126,7 @@ endif() if(MI_DEBUG_TSAN MATCHES "ON") if(CMAKE_C_COMPILER_ID MATCHES "Clang") message(STATUS "Build with thread sanitizer (MI_DEBUG_TSAN=ON)") + list(APPEND mi_defines MI_TSAN=1) list(APPEND mi_cflags -fsanitize=thread -g -O1) list(APPEND CMAKE_EXE_LINKER_FLAGS -fsanitize=thread) else() diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h index 5b31f6f3..17b33bc6 100644 --- a/include/mimalloc-types.h +++ b/include/mimalloc-types.h @@ -155,6 +155,7 @@ typedef enum mi_delayed_e { // The `in_full` and `has_aligned` page flags are put in a union to efficiently // test if both are false (`full_aligned == 0`) in the `mi_free` routine. +#if !MI_TSAN typedef union mi_page_flags_s { uint8_t full_aligned; struct { @@ -162,6 +163,16 @@ typedef union mi_page_flags_s { uint8_t has_aligned : 1; } x; } mi_page_flags_t; +#else +// under thread sanitizer, use a byte for each flag to suppress warning, issue #130 +typedef union mi_page_flags_s { + uint16_t full_aligned; + struct { + uint8_t in_full; + uint8_t has_aligned; + } x; +} mi_page_flags_t; +#endif // Thread free list. // We use the bottom 2 bits of the pointer for mi_delayed_t flags @@ -245,12 +256,13 @@ typedef struct mi_segment_s { // memory fields size_t memid; // id for the os-level memory manager bool mem_is_fixed; // `true` if we cannot decommit/reset/protect in this memory (i.e. when allocated using large OS pages) - bool mem_is_committed; // `true` if the whole segment is eagerly committed + bool mem_is_committed; // `true` if the whole segment is eagerly committed // segment fields struct mi_segment_s* next; // must be the first segment field -- see `segment.c:segment_alloc` struct mi_segment_s* prev; - struct mi_segment_s* abandoned_next; + _Atomic(struct mi_segment_s*) abandoned_next; + size_t abandoned; // abandoned pages (i.e. 
the original owning thread stopped) (`abandoned <= used`) size_t abandoned_visits; // count how often this segment is visited in the abandoned list (to force reclaim it it is too long) diff --git a/src/segment.c b/src/segment.c index 58c227bb..5af98b1e 100644 --- a/src/segment.c +++ b/src/segment.c @@ -890,12 +890,12 @@ static mi_decl_cache_align _Atomic(uintptr_t) abandoned_readers; // = // Push on the visited list static void mi_abandoned_visited_push(mi_segment_t* segment) { mi_assert_internal(segment->thread_id == 0); - mi_assert_internal(segment->abandoned_next == NULL); + mi_assert_internal(mi_atomic_read_ptr_relaxed(mi_segment_t,&segment->abandoned_next) == NULL); mi_assert_internal(segment->next == NULL && segment->prev == NULL); mi_assert_internal(segment->used > 0); mi_segment_t* anext = mi_atomic_read_ptr_relaxed(mi_segment_t, &abandoned_visited); do { - segment->abandoned_next = anext; + mi_atomic_write_ptr(mi_segment_t, &segment->abandoned_next, anext); } while (!mi_atomic_cas_ptr_weak(mi_segment_t, &abandoned_visited, &anext, segment)); } @@ -903,7 +903,7 @@ static void mi_abandoned_visited_push(mi_segment_t* segment) { static bool mi_abandoned_visited_revisit(void) { // quick check if the visited list is empty - if (mi_atomic_read_ptr_relaxed(mi_segment_t,&abandoned_visited)==NULL) return false; + if (mi_atomic_read_ptr_relaxed(mi_segment_t, &abandoned_visited) == NULL) return false; // grab the whole visited list mi_segment_t* first = mi_atomic_exchange_ptr(mi_segment_t, &abandoned_visited, NULL); @@ -919,15 +919,16 @@ static bool mi_abandoned_visited_revisit(void) // find the last element of the visited list: O(n) mi_segment_t* last = first; - while (last->abandoned_next != NULL) { - last = last->abandoned_next; + mi_segment_t* next; + while ((next = mi_atomic_read_ptr_relaxed(mi_segment_t, &last->abandoned_next)) != NULL) { + last = next; } // and atomically prepend to the abandoned list // (no need to increase the readers as we don't access the abandoned segments) mi_tagged_segment_t anext = mi_atomic_read_relaxed(&abandoned); do { - last->abandoned_next = mi_tagged_segment_ptr(anext); + mi_atomic_write_ptr(mi_segment_t, &last->abandoned_next, mi_tagged_segment_ptr(anext)); afirst = mi_tagged_segment(first, anext); } while (!mi_atomic_cas_weak(&abandoned, &anext, afirst)); return true; @@ -936,13 +937,13 @@ static bool mi_abandoned_visited_revisit(void) // Push on the abandoned list. static void mi_abandoned_push(mi_segment_t* segment) { mi_assert_internal(segment->thread_id == 0); - mi_assert_internal(segment->abandoned_next == NULL); + mi_assert_internal(mi_atomic_read_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL); mi_assert_internal(segment->next == NULL && segment->prev == NULL); mi_assert_internal(segment->used > 0); mi_tagged_segment_t next; mi_tagged_segment_t ts = mi_atomic_read_relaxed(&abandoned); do { - segment->abandoned_next = mi_tagged_segment_ptr(ts); + mi_atomic_write_ptr(mi_segment_t, &segment->abandoned_next, mi_tagged_segment_ptr(ts)); next = mi_tagged_segment(segment, ts); } while (!mi_atomic_cas_weak(&abandoned, &ts, next)); } @@ -971,19 +972,20 @@ static mi_segment_t* mi_abandoned_pop(void) { // Do a pop. We use a reader count to prevent // a segment to be decommitted while a read is still pending, // and a tagged pointer to prevent A-B-A link corruption. 
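/* Illustrative sketch (not part of the patch): a tagged pointer that packs a
   modification count into the alignment bits of a segment pointer, in the
   spirit of mi_tagged_segment_t used by mi_abandoned_pop. The 22 tag bits
   below assume 4 MiB (MI_SEGMENT_SIZE) alignment; the exact encoding in
   mimalloc may differ. Because the tag changes on every update, a CAS cannot
   succeed against a stale head whose pointer value happens to be reused. */
#include <stdint.h>

#define TAG_BITS 22u
#define TAG_MASK ((uintptr_t)((1u << TAG_BITS) - 1))

typedef uintptr_t tagged_ptr_t;

static inline void* tagged_ptr(tagged_ptr_t tp) {
  return (void*)(tp & ~TAG_MASK);
}
static inline tagged_ptr_t tagged_make(void* p, tagged_ptr_t prev) {
  uintptr_t tag = (prev + 1) & TAG_MASK;   // new tag derived from the previous value
  return ((uintptr_t)p & ~TAG_MASK) | tag;
}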
- // (this is called from `memory.c:_mi_mem_free` for example) + // (this is called from `region.c:_mi_mem_free` for example) mi_atomic_increment(&abandoned_readers); // ensure no segment gets decommitted mi_tagged_segment_t next = 0; ts = mi_atomic_read(&abandoned); do { segment = mi_tagged_segment_ptr(ts); if (segment != NULL) { - next = mi_tagged_segment(segment->abandoned_next, ts); // note: reads the segment's `abandoned_next` field so should not be decommitted + mi_segment_t* anext = mi_atomic_read_ptr_relaxed(mi_segment_t, &segment->abandoned_next); + next = mi_tagged_segment(anext, ts); // note: reads the segment's `abandoned_next` field so should not be decommitted } } while (segment != NULL && !mi_atomic_cas_weak(&abandoned, &ts, next)); mi_atomic_decrement(&abandoned_readers); // release reader lock if (segment != NULL) { - segment->abandoned_next = NULL; + mi_atomic_write_ptr(mi_segment_t, &segment->abandoned_next, NULL); } return segment; } @@ -995,7 +997,7 @@ static mi_segment_t* mi_abandoned_pop(void) { static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) { mi_assert_internal(segment->used == segment->abandoned); mi_assert_internal(segment->used > 0); - mi_assert_internal(segment->abandoned_next == NULL); + mi_assert_internal(mi_atomic_read_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL); mi_assert_expensive(mi_segment_is_valid(segment, tld)); // remove the segment from the free page queue if needed @@ -1008,8 +1010,8 @@ static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) { _mi_stat_increase(&tld->stats->segments_abandoned, 1); mi_segments_track_size(-((long)segment->segment_size), tld); segment->thread_id = 0; - segment->abandoned_next = NULL; segment->abandoned_visits = 0; + mi_atomic_write_ptr(mi_segment_t, &segment->abandoned_next, NULL); mi_abandoned_push(segment); } @@ -1073,7 +1075,7 @@ static bool mi_segment_check_free(mi_segment_t* segment, size_t block_size, bool // Reclaim a segment; returns NULL if the segment was freed // set `right_page_reclaimed` to `true` if it reclaimed a page of the right `block_size` that was not full. static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, size_t requested_block_size, bool* right_page_reclaimed, mi_segments_tld_t* tld) { - mi_assert_internal(segment->abandoned_next == NULL); + mi_assert_internal(mi_atomic_read_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL); if (right_page_reclaimed != NULL) { *right_page_reclaimed = false; } segment->thread_id = _mi_thread_id(); From ebf951e851de13ecae6e92a6ad1657b61adefac5 Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 26 Jul 2020 00:15:57 -0700 Subject: [PATCH 04/11] extra checks for atomic ptr exchange; extend mi_atomic_yield for win32 --- include/mimalloc-atomic.h | 108 ++++++++++++++++++++++++-------------- 1 file changed, 68 insertions(+), 40 deletions(-) diff --git a/include/mimalloc-atomic.h b/include/mimalloc-atomic.h index beb0f12c..30d1e4f8 100644 --- a/include/mimalloc-atomic.h +++ b/include/mimalloc-atomic.h @@ -93,33 +93,58 @@ static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)* p, intptr_t sub) { return (intptr_t)mi_atomic_addi(p,-sub); } -// Atomically read a pointer; Memory order is relaxed (i.e. no fence, only atomic). -#define mi_atomic_read_ptr_relaxed(T,p) \ - (T*)(mi_atomic_read_relaxed((const _Atomic(uintptr_t)*)(p))) - -// Atomically read a pointer; Memory order is acquire. 
-#define mi_atomic_read_ptr(T,p) \ - (T*)(mi_atomic_read((const _Atomic(uintptr_t)*)(p))) - -// Atomically write a pointer; Memory order is acquire. -#define mi_atomic_write_ptr(T,p,x) \ - mi_atomic_write((_Atomic(uintptr_t)*)(p), (uintptr_t)((T*)x)) - - -static inline bool mi_atomic_cas_weak_voidp(_Atomic(void*)*p, void** expected, void* desired, void* unused) { - (void)(unused); +// Atomically compare and exchange a void pointer; returns `true` if successful. May fail spuriously. +// Memory order is release. (like a write) +static inline bool mi_atomic_cas_weak_voidp(_Atomic(void*)* p, void** expected, void* desired, void* unused1, void* unused2) { + (void)unused1; (void)unused2; // for extra type check return mi_atomic_cas_weak((_Atomic(uintptr_t)*)p, (uintptr_t*)expected, (uintptr_t)desired); } +// Atomically read a void pointer; Memory order is relaxed (i.e. no fence, only atomic). +static inline void* mi_atomic_read_voidp(const _Atomic(void*)* p, void* unused) { + (void)unused; // for extra type check + return (void*)mi_atomic_read((const _Atomic(uintptr_t)*) p); +} + +// Atomically read a void pointer; Memory order is acquire. +static inline void* mi_atomic_read_voidp_relaxed(const _Atomic(void*)*p, void* unused) { + (void)unused; // for extra type check + return (void*)mi_atomic_read_relaxed((const _Atomic(uintptr_t)*) p); +} + +// Atomically write a void pointer; Memory order is acquire. +static inline void mi_atomic_write_voidp(_Atomic(void*)* p, void* exchange, void* unused) { + (void)unused; // for extra type check + mi_atomic_write((_Atomic(uintptr_t)*) p, (uintptr_t)exchange); +} + +// Atomically exchange a void pointer; Memory order is release-acquire. +static inline void* mi_atomic_exchange_voidp(_Atomic(void*)*p, void* exchange, void* unused) { + (void)unused; // for extra type check + return (void*)mi_atomic_exchange((_Atomic(uintptr_t)*) p, (uintptr_t)exchange); +} + // Atomically compare and exchange a pointer; returns `true` if successful. May fail spuriously. // Memory order is release. (like a write) #define mi_atomic_cas_ptr_weak(T,p,expected,desired) \ - mi_atomic_cas_weak_voidp((_Atomic(void*)*)(p), (void**)(expected), desired, *(expected)) - + mi_atomic_cas_weak_voidp((_Atomic(void*)*)(p), (void**)(expected), desired, *(p), *(expected)) +// Atomically read a pointer; Memory order is relaxed (i.e. no fence, only atomic). +#define mi_atomic_read_ptr_relaxed(T,p) \ + (T*)(mi_atomic_read_voidp_relaxed((const _Atomic(void*)*)(p), *(p))) + +// Atomically read a pointer; Memory order is acquire. +#define mi_atomic_read_ptr(T,p) \ + (T*)(mi_atomic_read_voidp((const _Atomic(void*)*)(p), *(p))) + +// Atomically write a pointer; Memory order is acquire. +#define mi_atomic_write_ptr(T,p,x) \ + mi_atomic_write_voidp((_Atomic(void*)*)(p), x, *(p)) + // Atomically exchange a pointer value. 
#define mi_atomic_exchange_ptr(T,p,exchange) \ - (T*)mi_atomic_exchange((_Atomic(uintptr_t)*)(p), (uintptr_t)((T*)exchange)) + (T*)(mi_atomic_exchange_voidp((_Atomic(void*)*)(p), exchange, *(p))) + #if !defined(__cplusplus) && defined(_MSC_VER) @@ -171,9 +196,6 @@ static inline void mi_atomic_write(_Atomic(uintptr_t)* p, uintptr_t x) { mi_atomic_exchange(p,x); #endif } -static inline void mi_atomic_yield(void) { - YieldProcessor(); -} static inline int64_t mi_atomic_addi64_relaxed(volatile _Atomic(int64_t)* p, int64_t add) { #ifdef _WIN64 return (int64_t)mi_atomic_addi((int64_t*)p,add); @@ -246,35 +268,41 @@ static inline void mi_atomic_maxi64_relaxed(volatile int64_t* p, int64_t x) { int64_t current = atomic_load_explicit((_Atomic(int64_t)*)p, memory_order_relaxed); while (current < x && !atomic_compare_exchange_weak_explicit((_Atomic(int64_t)*)p, ¤t, x, memory_order_acq_rel, memory_order_acquire)) { /* nothing */ }; } +#endif #if defined(__cplusplus) - #include - static inline void mi_atomic_yield(void) { - std::this_thread::yield(); - } +#include +static inline void mi_atomic_yield(void) { + std::this_thread::yield(); +} +#elif defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#include +static inline void mi_atomic_yield(void) { + YieldProcessor(); +} #elif (defined(__GNUC__) || defined(__clang__)) && \ (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__)) #if defined(__x86_64__) || defined(__i386__) - static inline void mi_atomic_yield(void) { - asm volatile ("pause" ::: "memory"); - } +static inline void mi_atomic_yield(void) { + asm volatile ("pause" ::: "memory"); +} #elif defined(__arm__) || defined(__aarch64__) - static inline void mi_atomic_yield(void) { - asm volatile("yield"); - } +static inline void mi_atomic_yield(void) { + asm volatile("yield"); +} #endif #elif defined(__wasi__) - #include - static inline void mi_atomic_yield(void) { - sched_yield(); - } +#include +static inline void mi_atomic_yield(void) { + sched_yield(); +} #else - #include - static inline void mi_atomic_yield(void) { - sleep(0); - } +#include +static inline void mi_atomic_yield(void) { + sleep(0); +} #endif -#endif #endif // __MIMALLOC_ATOMIC_H From 28014ee2bc049921effdd9a39fe983a43108cbdc Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 26 Jul 2020 00:16:17 -0700 Subject: [PATCH 05/11] fix atomic access for MADV_FREE in os_reset --- src/os.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/os.c b/src/os.c index 29a76a88..0b959a9c 100644 --- a/src/os.c +++ b/src/os.c @@ -759,12 +759,12 @@ static bool mi_os_resetx(void* addr, size_t size, bool reset, mi_stats_t* stats) if (p != start) return false; #else #if defined(MADV_FREE) - static int advice = MADV_FREE; - int err = madvise(start, csize, advice); + static _Atomic(uintptr_t) advice = ATOMIC_VAR_INIT(MADV_FREE); + int err = madvise(start, csize, (int)mi_atomic_read_relaxed(&advice)); if (err != 0 && errno == EINVAL && advice == MADV_FREE) { // if MADV_FREE is not supported, fall back to MADV_DONTNEED from now on - advice = MADV_DONTNEED; - err = madvise(start, csize, advice); + mi_atomic_write(&advice, MADV_DONTNEED); + err = madvise(start, csize, MADV_DONTNEED); } #elif defined(__wasi__) int err = 0; From 53cbc68de3c65f90787641d2ab2c564a5662244f Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 26 Jul 2020 00:21:10 -0700 Subject: [PATCH 06/11] display compiler in cmake summary --- CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 
5a228036..98b55ae0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -231,6 +231,11 @@ endif() message(STATUS "") message(STATUS "Library base name: ${mi_basename}") message(STATUS "Build type : ${CMAKE_BUILD_TYPE_LC}") +if(MI_USE_CXX MATCHES "ON") + message(STATUS "Compiler : ${CMAKE_CXX_COMPILER}") +else() + message(STATUS "Compiler : ${CMAKE_C_COMPILER}") +endif() message(STATUS "Install directory: ${mi_install_dir}") message(STATUS "Build targets : ${mi_build_targets}") message(STATUS "") From 116159cd40d64fa9e1e50a6c54dd322e2a482659 Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 26 Jul 2020 11:57:14 -0700 Subject: [PATCH 07/11] use RtlGenRandom on windows to enable compilation as C++ even with dynamic override --- src/random.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/src/random.c b/src/random.c index 5c093a91..be95fc46 100644 --- a/src/random.c +++ b/src/random.c @@ -162,20 +162,29 @@ If we cannot get good randomness, we fall back to weak randomness based on a tim -----------------------------------------------------------------------------*/ #if defined(_WIN32) +/* +// We prefer BCryptGenRandom over RtlGenRandom but it leads to a crash a when using dynamic override combined with the C++ runtime :-( #pragma comment (lib,"bcrypt.lib") #include static bool os_random_buf(void* buf, size_t buf_len) { return (BCryptGenRandom(NULL, (PUCHAR)buf, (ULONG)buf_len, BCRYPT_USE_SYSTEM_PREFERRED_RNG) >= 0); } -/* -#define SystemFunction036 NTAPI SystemFunction036 -#include -#undef SystemFunction036 -static bool os_random_buf(void* buf, size_t buf_len) { - RtlGenRandom(buf, (ULONG)buf_len); - return true; -} */ +#define RtlGenRandom SystemFunction036 +#ifdef __cplusplus +extern "C" { +#endif +BOOLEAN NTAPI RtlGenRandom(PVOID RandomBuffer, ULONG RandomBufferLength); +#ifdef __cplusplus +} +#endif +static bool os_random_buf(void* buf, size_t buf_len) { + mi_assert_internal(buf_len >= sizeof(uintptr_t)); + memset(buf, 0, buf_len); + RtlGenRandom(buf, (ULONG)buf_len); + return (((uintptr_t*)buf)[0] != 0); // sanity check (but RtlGenRandom should never fail) +} + #elif defined(ANDROID) || defined(XP_DARWIN) || defined(__APPLE__) || defined(__DragonFly__) || \ defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ defined(__sun) || defined(__wasi__) From a9f46dc86f94e5a91eb3315ce2e8b9be6beea55a Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 26 Jul 2020 11:58:02 -0700 Subject: [PATCH 08/11] reduce memory order constraints for better efficiency on ARM etc --- include/mimalloc-atomic.h | 27 +++++++++++++++++++-------- src/page.c | 2 +- src/segment.c | 2 +- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/include/mimalloc-atomic.h b/include/mimalloc-atomic.h index 30d1e4f8..b9935cb3 100644 --- a/include/mimalloc-atomic.h +++ b/include/mimalloc-atomic.h @@ -27,19 +27,23 @@ terms of the MIT license. A copy of the license can be found in the file // Atomic operations specialized for mimalloc // ------------------------------------------------------ -// Atomically add a value; returns the previous value. Memory ordering is acquire-release. +// Atomically add a value; returns the previous value. Memory ordering is relaxed. static inline uintptr_t mi_atomic_add(_Atomic(uintptr_t)* p, uintptr_t add); -// Atomically "and" a value; returns the previous value. Memory ordering is acquire-release. +// Atomically "and" a value; returns the previous value. Memory ordering is release. 
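Most of the weakened orderings in this patch follow one shape: an update loop only needs release semantics on the successful CAS (to publish the writes made before it), while the failure case can stay relaxed because the loop simply re-reads and retries. A small self-contained illustration of that shape in plain C11 (the function and names here are illustrative, not part of mimalloc):

#include <stdatomic.h>
#include <stdint.h>

static void set_flag_bits(_Atomic(uintptr_t)* p, uintptr_t bits) {
  uintptr_t expected = atomic_load_explicit(p, memory_order_relaxed);
  uintptr_t desired;
  do {
    desired = expected | bits;
  } while (!atomic_compare_exchange_weak_explicit(p, &expected, desired,
                                                  memory_order_release,    // publish on success
                                                  memory_order_relaxed));  // retry path needs no ordering
}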
static inline uintptr_t mi_atomic_and(_Atomic(uintptr_t)* p, uintptr_t x); -// Atomically "or" a value; returns the previous value. Memory ordering is acquire-release. +// Atomically "or" a value; returns the previous value. Memory ordering is release. static inline uintptr_t mi_atomic_or(_Atomic(uintptr_t)* p, uintptr_t x); // Atomically compare and exchange a value; returns `true` if successful. -// May fail spuriously. Memory ordering is acquire-release; with acquire on failure. +// May fail spuriously. Memory ordering is release; with relaxed on failure. static inline bool mi_atomic_cas_weak(_Atomic(uintptr_t)* p, uintptr_t* expected, uintptr_t desired); +// Atomically compare and exchange a value; returns `true` if successful. +// May fail spuriously. Memory ordering is acquire-release; with acquire on failure. +static inline bool mi_atomic_cas_weak_acq_rel(_Atomic(uintptr_t)*p, uintptr_t* expected, uintptr_t desired); + // Atomically compare and exchange a value; returns `true` if successful. // Memory ordering is acquire-release; with acquire on failure. static inline bool mi_atomic_cas_strong(_Atomic(uintptr_t)* p, uintptr_t* expected, uintptr_t desired); @@ -180,6 +184,9 @@ static inline bool mi_atomic_cas_strong(_Atomic(uintptr_t)* p, uintptr_t* expect static inline bool mi_atomic_cas_weak(_Atomic(uintptr_t)* p, uintptr_t* expected, uintptr_t desired) { return mi_atomic_cas_strong(p,expected,desired); } +static inline bool mi_atomic_cas_weak_acq_rel(_Atomic(uintptr_t)*p, uintptr_t* expected, uintptr_t desired) { + return mi_atomic_cas_strong(p, expected, desired); +} static inline uintptr_t mi_atomic_exchange(_Atomic(uintptr_t)* p, uintptr_t exchange) { return (uintptr_t)MI_64(_InterlockedExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)exchange); } @@ -225,17 +232,21 @@ static inline void mi_atomic_maxi64_relaxed(volatile _Atomic(int64_t)*p, int64_t #endif static inline uintptr_t mi_atomic_add(_Atomic(uintptr_t)* p, uintptr_t add) { MI_USING_STD - return atomic_fetch_add_explicit(p, add, memory_order_acq_rel); + return atomic_fetch_add_explicit(p, add, memory_order_relaxed); } static inline uintptr_t mi_atomic_and(_Atomic(uintptr_t)* p, uintptr_t x) { MI_USING_STD - return atomic_fetch_and_explicit(p, x, memory_order_acq_rel); + return atomic_fetch_and_explicit(p, x, memory_order_release); } static inline uintptr_t mi_atomic_or(_Atomic(uintptr_t)* p, uintptr_t x) { MI_USING_STD - return atomic_fetch_or_explicit(p, x, memory_order_acq_rel); + return atomic_fetch_or_explicit(p, x, memory_order_release); } static inline bool mi_atomic_cas_weak(_Atomic(uintptr_t)* p, uintptr_t* expected, uintptr_t desired) { + MI_USING_STD + return atomic_compare_exchange_weak_explicit(p, expected, desired, memory_order_release, memory_order_relaxed); +} +static inline bool mi_atomic_cas_weak_acq_rel(_Atomic(uintptr_t)*p, uintptr_t* expected, uintptr_t desired) { MI_USING_STD return atomic_compare_exchange_weak_explicit(p, expected, desired, memory_order_acq_rel, memory_order_acquire); } @@ -266,7 +277,7 @@ static inline int64_t mi_atomic_addi64_relaxed(volatile int64_t* p, int64_t add) static inline void mi_atomic_maxi64_relaxed(volatile int64_t* p, int64_t x) { MI_USING_STD int64_t current = atomic_load_explicit((_Atomic(int64_t)*)p, memory_order_relaxed); - while (current < x && !atomic_compare_exchange_weak_explicit((_Atomic(int64_t)*)p, ¤t, x, memory_order_acq_rel, memory_order_acquire)) { /* nothing */ }; + while (current < x && !atomic_compare_exchange_weak_explicit((_Atomic(int64_t)*)p, 
¤t, x, memory_order_release, memory_order_relaxed)) { /* nothing */ }; } #endif diff --git a/src/page.c b/src/page.c index 6b92d4c9..92faf9f2 100644 --- a/src/page.c +++ b/src/page.c @@ -159,7 +159,7 @@ static void _mi_page_thread_free_collect(mi_page_t* page) do { head = mi_tf_block(tfree); tfreex = mi_tf_set_block(tfree,NULL); - } while (!mi_atomic_cas_weak(&page->xthread_free, &tfree, tfreex)); + } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tfree, tfreex)); // return if the list is empty if (head == NULL) return; diff --git a/src/segment.c b/src/segment.c index 5af98b1e..55230553 100644 --- a/src/segment.c +++ b/src/segment.c @@ -982,7 +982,7 @@ static mi_segment_t* mi_abandoned_pop(void) { mi_segment_t* anext = mi_atomic_read_ptr_relaxed(mi_segment_t, &segment->abandoned_next); next = mi_tagged_segment(anext, ts); // note: reads the segment's `abandoned_next` field so should not be decommitted } - } while (segment != NULL && !mi_atomic_cas_weak(&abandoned, &ts, next)); + } while (segment != NULL && !mi_atomic_cas_weak_acq_rel(&abandoned, &ts, next)); mi_atomic_decrement(&abandoned_readers); // release reader lock if (segment != NULL) { mi_atomic_write_ptr(mi_segment_t, &segment->abandoned_next, NULL); From a468430772a687085054e8380a94f794bd740f5c Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 26 Jul 2020 14:19:30 -0700 Subject: [PATCH 09/11] strengthen memory order of bit operations; insert memory fences --- include/mimalloc-atomic.h | 6 +++--- src/alloc.c | 2 +- src/segment.c | 8 +++++--- test/test-stress.c | 2 +- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/include/mimalloc-atomic.h b/include/mimalloc-atomic.h index b9935cb3..cb247b09 100644 --- a/include/mimalloc-atomic.h +++ b/include/mimalloc-atomic.h @@ -232,15 +232,15 @@ static inline void mi_atomic_maxi64_relaxed(volatile _Atomic(int64_t)*p, int64_t #endif static inline uintptr_t mi_atomic_add(_Atomic(uintptr_t)* p, uintptr_t add) { MI_USING_STD - return atomic_fetch_add_explicit(p, add, memory_order_relaxed); + return atomic_fetch_add_explicit(p, add, memory_order_acq_rel); } static inline uintptr_t mi_atomic_and(_Atomic(uintptr_t)* p, uintptr_t x) { MI_USING_STD - return atomic_fetch_and_explicit(p, x, memory_order_release); + return atomic_fetch_and_explicit(p, x, memory_order_acq_rel); } static inline uintptr_t mi_atomic_or(_Atomic(uintptr_t)* p, uintptr_t x) { MI_USING_STD - return atomic_fetch_or_explicit(p, x, memory_order_release); + return atomic_fetch_or_explicit(p, x, memory_order_acq_rel); } static inline bool mi_atomic_cas_weak(_Atomic(uintptr_t)* p, uintptr_t* expected, uintptr_t desired) { MI_USING_STD diff --git a/src/alloc.c b/src/alloc.c index 62c3c018..e1c54bed 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -323,7 +323,7 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc if (mi_unlikely(use_delayed)) { // racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`) - mi_heap_t* const heap = mi_page_heap(page); + mi_heap_t* const heap = (mi_heap_t*)(mi_atomic_read(&page->xheap)); //mi_page_heap(page); mi_assert_internal(heap != NULL); if (heap != NULL) { // add to the delayed free list of this heap. 
(do this atomically as the lock only protects heap memory validity) diff --git a/src/segment.c b/src/segment.c index 55230553..b5fd13d3 100644 --- a/src/segment.c +++ b/src/segment.c @@ -472,7 +472,6 @@ static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_se if (any_reset && mi_option_is_enabled(mi_option_reset_decommits)) { fully_committed = false; } - _mi_mem_free(segment, segment_size, segment->memid, fully_committed, any_reset, tld->os); } @@ -629,6 +628,7 @@ static mi_segment_t* mi_segment_init(mi_segment_t* segment, size_t required, mi_ return NULL; } } + atomic_thread_fence(memory_order_acq_rel); segment->memid = memid; segment->mem_is_fixed = mem_large; segment->mem_is_committed = commit; @@ -638,6 +638,7 @@ static mi_segment_t* mi_segment_init(mi_segment_t* segment, size_t required, mi_ mi_assert_internal(segment->mem_is_fixed ? segment->mem_is_committed : true); if (!pages_still_good) { // zero the segment info (but not the `mem` fields) + atomic_thread_fence(memory_order_release); // with read of `abandoned_next` in `mi_abandoned_pop` ptrdiff_t ofs = offsetof(mi_segment_t, next); memset((uint8_t*)segment + ofs, 0, info_size - ofs); @@ -791,6 +792,7 @@ static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, bool a uint16_t reserved = page->reserved; ptrdiff_t ofs = offsetof(mi_page_t,capacity); memset((uint8_t*)page + ofs, 0, sizeof(*page) - ofs); + atomic_thread_fence(memory_order_release); page->capacity = capacity; page->reserved = reserved; page->xblock_size = block_size; @@ -801,7 +803,7 @@ static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, bool a mi_pages_reset_add(segment, page, tld); } - page->capacity = 0; // after reset there can be zero'd now + page->capacity = 0; // after reset these can be zero'd now page->reserved = 0; } @@ -979,7 +981,7 @@ static mi_segment_t* mi_abandoned_pop(void) { do { segment = mi_tagged_segment_ptr(ts); if (segment != NULL) { - mi_segment_t* anext = mi_atomic_read_ptr_relaxed(mi_segment_t, &segment->abandoned_next); + mi_segment_t* anext = mi_atomic_read_ptr(mi_segment_t, &segment->abandoned_next); next = mi_tagged_segment(anext, ts); // note: reads the segment's `abandoned_next` field so should not be decommitted } } while (segment != NULL && !mi_atomic_cas_weak_acq_rel(&abandoned, &ts, next)); diff --git a/test/test-stress.c b/test/test-stress.c index 7d8993a0..33ec674b 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -189,7 +189,7 @@ static void test_stress(void) { } } // mi_collect(false); -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(MI_TSAN) if ((n + 1) % 10 == 0) { printf("- iterations left: %3d\n", ITER - (n + 1)); } #endif } From ef8e5d18a65f653bbef9cf57694aff37d2e85b9d Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 26 Jul 2020 18:00:38 -0700 Subject: [PATCH 10/11] replace atomics with C11/C++ atomics with explicit memory order; passes tsan. 
Issue #130 --- include/mimalloc-atomic.h | 319 ++++++++++++++---------------------- include/mimalloc-internal.h | 8 +- include/mimalloc-types.h | 4 +- src/alloc.c | 14 +- src/arena.c | 18 +- src/bitmap.inc.c | 14 +- src/heap.c | 2 +- src/options.c | 18 +- src/os.c | 26 +-- src/page-queue.c | 6 +- src/page.c | 18 +- src/random.c | 4 +- src/region.c | 50 +++--- src/segment.c | 62 ++++--- 14 files changed, 248 insertions(+), 315 deletions(-) diff --git a/include/mimalloc-atomic.h b/include/mimalloc-atomic.h index cb247b09..e1fdda16 100644 --- a/include/mimalloc-atomic.h +++ b/include/mimalloc-atomic.h @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018, Microsoft Research, Daan Leijen +Copyright (c) 2018,2020 Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -8,150 +8,97 @@ terms of the MIT license. A copy of the license can be found in the file #ifndef MIMALLOC_ATOMIC_H #define MIMALLOC_ATOMIC_H -// ------------------------------------------------------ +// -------------------------------------------------------------------------------------------- // Atomics // We need to be portable between C, C++, and MSVC. -// ------------------------------------------------------ +// We base the primitives on the C/C++ atomics and create a mimimal wrapper for MSVC in C compilation mode. +// This is why we try to use only `uintptr_t` and `*` as atomic types. +// To gain better insight in the range of used atomics, we use explicitly named memory order operations +// instead of passing the memory order as a parameter. +// ----------------------------------------------------------------------------------------------- #if defined(__cplusplus) +// Use C++ atomics #include -#define _Atomic(tp) std::atomic +#define _Atomic(tp) std::atomic +#define mi_atomic(name) std::atomic_##name +#define mi_memory_order(name) std::memory_order_##name #elif defined(_MSC_VER) +// Use MSVC C wrapper for C11 atomics #define _Atomic(tp) tp #define ATOMIC_VAR_INIT(x) x +#define mi_atomic(name) mi_atomic_##name +#define mi_memory_order(name) mi_memory_order_##name #else +// Use C11 atomics #include +#define mi_atomic(name) atomic_##name +#define mi_memory_order(name) memory_order_##name #endif -// ------------------------------------------------------ -// Atomic operations specialized for mimalloc -// ------------------------------------------------------ +// Various defines for all used memory orders in mimalloc +#define mi_atomic_cas_weak(p,expected,desired,mem_success,mem_fail) \ + mi_atomic(compare_exchange_weak_explicit)(p,expected,desired,mem_success,mem_fail) -// Atomically add a value; returns the previous value. Memory ordering is relaxed. -static inline uintptr_t mi_atomic_add(_Atomic(uintptr_t)* p, uintptr_t add); +#define mi_atomic_cas_strong(p,expected,desired,mem_success,mem_fail) \ + mi_atomic(compare_exchange_strong_explicit)(p,expected,desired,mem_success,mem_fail) -// Atomically "and" a value; returns the previous value. Memory ordering is release. 
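The `mi_atomic(name)` and `mi_memory_order(name)` token-pasting layer above is what lets one set of macros resolve to `std::atomic_*` under C++, to C11 `atomic_*` otherwise, and to the hand-written MSVC wrappers further below. A stripped-down sketch of the same trick for the C11 case, using hypothetical `demo_` names:

#include <stdatomic.h>
#include <stdint.h>

#define demo_atomic(name)        atomic_##name
#define demo_memory_order(name)  memory_order_##name
#define demo_load_acquire(p)     demo_atomic(load_explicit)(p, demo_memory_order(acquire))
#define demo_store_release(p,x)  demo_atomic(store_explicit)(p, x, demo_memory_order(release))

static _Atomic(uintptr_t) demo_word;

static uintptr_t demo_read(void)       { return demo_load_acquire(&demo_word); }
static void      demo_set(uintptr_t x) { demo_store_release(&demo_word, x); }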
-static inline uintptr_t mi_atomic_and(_Atomic(uintptr_t)* p, uintptr_t x); +#define mi_atomic_load_acquire(p) mi_atomic(load_explicit)(p,mi_memory_order(acquire)) +#define mi_atomic_load_relaxed(p) mi_atomic(load_explicit)(p,mi_memory_order(relaxed)) +#define mi_atomic_store_release(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(release)) +#define mi_atomic_store_relaxed(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(relaxed)) +#define mi_atomic_exchange_release(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(release)) +#define mi_atomic_exchange_acq_rel(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(acq_rel)) +#define mi_atomic_cas_weak_release(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed)) +#define mi_atomic_cas_weak_acq_rel(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire)) +#define mi_atomic_cas_strong_release(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed)) +#define mi_atomic_cas_strong_acq_rel(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire)) -// Atomically "or" a value; returns the previous value. Memory ordering is release. -static inline uintptr_t mi_atomic_or(_Atomic(uintptr_t)* p, uintptr_t x); +#define mi_atomic_add_relaxed(p,x) mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(relaxed)) +#define mi_atomic_sub_relaxed(p,x) mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(relaxed)) +#define mi_atomic_add_acq_rel(p,x) mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(acq_rel)) +#define mi_atomic_sub_acq_rel(p,x) mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(acq_rel)) +#define mi_atomic_and_acq_rel(p,x) mi_atomic(fetch_and_explicit)(p,x,mi_memory_order(acq_rel)) +#define mi_atomic_or_acq_rel(p,x) mi_atomic(fetch_or_explicit)(p,x,mi_memory_order(acq_rel)) -// Atomically compare and exchange a value; returns `true` if successful. -// May fail spuriously. Memory ordering is release; with relaxed on failure. -static inline bool mi_atomic_cas_weak(_Atomic(uintptr_t)* p, uintptr_t* expected, uintptr_t desired); +#define mi_atomic_increment_relaxed(p) mi_atomic_add_relaxed(p,1) +#define mi_atomic_decrement_relaxed(p) mi_atomic_sub_relaxed(p,1) +#define mi_atomic_increment_acq_rel(p) mi_atomic_add_acq_rel(p,1) +#define mi_atomic_decrement_acq_rel(p) mi_atomic_sub_acq_rel(p,1) -// Atomically compare and exchange a value; returns `true` if successful. -// May fail spuriously. Memory ordering is acquire-release; with acquire on failure. -static inline bool mi_atomic_cas_weak_acq_rel(_Atomic(uintptr_t)*p, uintptr_t* expected, uintptr_t desired); - -// Atomically compare and exchange a value; returns `true` if successful. -// Memory ordering is acquire-release; with acquire on failure. -static inline bool mi_atomic_cas_strong(_Atomic(uintptr_t)* p, uintptr_t* expected, uintptr_t desired); - -// Atomically exchange a value. Memory ordering is acquire-release. -static inline uintptr_t mi_atomic_exchange(_Atomic(uintptr_t)* p, uintptr_t exchange); - -// Atomically read a value. Memory ordering is relaxed. -static inline uintptr_t mi_atomic_read_relaxed(const _Atomic(uintptr_t)* p); - -// Atomically read a value. Memory ordering is acquire. -static inline uintptr_t mi_atomic_read(const _Atomic(uintptr_t)* p); - -// Atomically write a value. Memory ordering is release. 
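The statistics maximum above is the usual CAS-max loop; a plain C11 rendering, kept separate from the mimalloc macros for clarity (illustrative name, same release-on-success and relaxed-on-failure ordering as the patch):

#include <stdatomic.h>
#include <stdint.h>

static void stat_max_i64(_Atomic(int64_t)* p, int64_t x) {
  int64_t cur = atomic_load_explicit(p, memory_order_relaxed);
  while (cur < x &&
         !atomic_compare_exchange_weak_explicit(p, &cur, x,
                                                memory_order_release,
                                                memory_order_relaxed)) {
    // a failed CAS refreshed `cur`; the loop exits as soon as *p >= x
  }
}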
-static inline void mi_atomic_write(_Atomic(uintptr_t)* p, uintptr_t x); - -// Yield static inline void mi_atomic_yield(void); - -// Atomically add a 64-bit value; returns the previous value. Memory ordering is relaxed. -// Note: not using _Atomic(int64_t) as it is only used for statistics. -static inline int64_t mi_atomic_addi64_relaxed(volatile int64_t* p, int64_t add); - -// Atomically update `*p` with the maximum of `*p` and `x` as a 64-bit value. -// Returns the previous value. Note: not using _Atomic(int64_t) as it is only used for statistics. -static inline void mi_atomic_maxi64_relaxed(volatile int64_t* p, int64_t x); +static inline intptr_t mi_atomic_addi(_Atomic(intptr_t)* p, intptr_t add); +static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)* p, intptr_t sub); -// Atomically subtract a value; returns the previous value. -static inline uintptr_t mi_atomic_sub(_Atomic(uintptr_t)* p, uintptr_t sub) { - return mi_atomic_add(p, (uintptr_t)(-((intptr_t)sub))); +#if defined(__cplusplus) || !defined(_MSC_VER) + +// In C++/C11 atomics we have polymorpic atomics so can use the typed `ptr` variants +// (where `tp` is the type of atomic value) +// We use these macros so we can provide a typed wrapper in MSVC in C compilation mode as well +#define mi_atomic_load_ptr_acquire(tp,p) mi_atomic_load_acquire(p) +#define mi_atomic_load_ptr_relaxed(tp,p) mi_atomic_load_relaxed(p) +#define mi_atomic_store_ptr_release(tp,p,x) mi_atomic_store_release(p,x) +#define mi_atomic_store_ptr_relaxed(tp,p,x) mi_atomic_store_relaxed(p,x) +#define mi_atomic_cas_ptr_weak_release(tp,p,exp,des) mi_atomic_cas_weak_release(p,exp,des) +#define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des) mi_atomic_cas_weak_acq_rel(p,exp,des) +#define mi_atomic_cas_ptr_strong_release(tp,p,exp,des) mi_atomic_cas_strong_release(p,exp,des) +#define mi_atomic_exchange_ptr_release(tp,p,x) mi_atomic_exchange_release(p,x) +#define mi_atomic_exchange_ptr_acq_rel(tp,p,x) mi_atomic_exchange_acq_rel(p,x) + +// These are used by the statistics +static inline int64_t mi_atomic_addi64_relaxed(volatile int64_t* p, int64_t add) { + return mi_atomic(fetch_add_explicit)((_Atomic(int64_t)*)p, add, mi_memory_order(relaxed)); +} +static inline void mi_atomic_maxi64_relaxed(volatile int64_t* p, int64_t x) { + int64_t current = mi_atomic_load_relaxed((_Atomic(int64_t)*)p); + while (current < x && !mi_atomic_cas_weak_release((_Atomic(int64_t)*)p, ¤t, x)) { /* nothing */ }; } -// Atomically increment a value; returns the incremented result. -static inline uintptr_t mi_atomic_increment(_Atomic(uintptr_t)* p) { - return mi_atomic_add(p, 1); -} -// Atomically decrement a value; returns the decremented result. -static inline uintptr_t mi_atomic_decrement(_Atomic(uintptr_t)* p) { - return mi_atomic_sub(p, 1); -} +#elif defined(_MSC_VER) -// Atomically add a signed value; returns the previous value. -static inline intptr_t mi_atomic_addi(_Atomic(intptr_t)* p, intptr_t add) { - return (intptr_t)mi_atomic_add((_Atomic(uintptr_t)*)p, (uintptr_t)add); -} - -// Atomically subtract a signed value; returns the previous value. -static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)* p, intptr_t sub) { - return (intptr_t)mi_atomic_addi(p,-sub); -} - -// Atomically compare and exchange a void pointer; returns `true` if successful. May fail spuriously. -// Memory order is release. 
(like a write) -static inline bool mi_atomic_cas_weak_voidp(_Atomic(void*)* p, void** expected, void* desired, void* unused1, void* unused2) { - (void)unused1; (void)unused2; // for extra type check - return mi_atomic_cas_weak((_Atomic(uintptr_t)*)p, (uintptr_t*)expected, (uintptr_t)desired); -} - -// Atomically read a void pointer; Memory order is relaxed (i.e. no fence, only atomic). -static inline void* mi_atomic_read_voidp(const _Atomic(void*)* p, void* unused) { - (void)unused; // for extra type check - return (void*)mi_atomic_read((const _Atomic(uintptr_t)*) p); -} - -// Atomically read a void pointer; Memory order is acquire. -static inline void* mi_atomic_read_voidp_relaxed(const _Atomic(void*)*p, void* unused) { - (void)unused; // for extra type check - return (void*)mi_atomic_read_relaxed((const _Atomic(uintptr_t)*) p); -} - -// Atomically write a void pointer; Memory order is acquire. -static inline void mi_atomic_write_voidp(_Atomic(void*)* p, void* exchange, void* unused) { - (void)unused; // for extra type check - mi_atomic_write((_Atomic(uintptr_t)*) p, (uintptr_t)exchange); -} - -// Atomically exchange a void pointer; Memory order is release-acquire. -static inline void* mi_atomic_exchange_voidp(_Atomic(void*)*p, void* exchange, void* unused) { - (void)unused; // for extra type check - return (void*)mi_atomic_exchange((_Atomic(uintptr_t)*) p, (uintptr_t)exchange); -} - -// Atomically compare and exchange a pointer; returns `true` if successful. May fail spuriously. -// Memory order is release. (like a write) -#define mi_atomic_cas_ptr_weak(T,p,expected,desired) \ - mi_atomic_cas_weak_voidp((_Atomic(void*)*)(p), (void**)(expected), desired, *(p), *(expected)) - -// Atomically read a pointer; Memory order is relaxed (i.e. no fence, only atomic). -#define mi_atomic_read_ptr_relaxed(T,p) \ - (T*)(mi_atomic_read_voidp_relaxed((const _Atomic(void*)*)(p), *(p))) - -// Atomically read a pointer; Memory order is acquire. -#define mi_atomic_read_ptr(T,p) \ - (T*)(mi_atomic_read_voidp((const _Atomic(void*)*)(p), *(p))) - -// Atomically write a pointer; Memory order is acquire. -#define mi_atomic_write_ptr(T,p,x) \ - mi_atomic_write_voidp((_Atomic(void*)*)(p), x, *(p)) - -// Atomically exchange a pointer value. -#define mi_atomic_exchange_ptr(T,p,exchange) \ - (T*)(mi_atomic_exchange_voidp((_Atomic(void*)*)(p), exchange, *(p))) - - - -#if !defined(__cplusplus) && defined(_MSC_VER) +// MSVC C compilation wrapper that uses Interlocked operations to model C11 atomics. 
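One convention makes the wrapper below line up with C11: the Interlocked intrinsics, like the C11 fetch operations, return the value the object held before the update. Sketched for the add case (the `MI_64` macro below selects the 64- or 32-bit intrinsic; `v` and `old` are illustrative variables):

//   C11    : old = atomic_fetch_add_explicit(&v, 1, memory_order_relaxed);   // returns the previous value
//   MSVC/C : old = _InterlockedExchangeAdd64((volatile __int64*)&v, 1);      // also returns the previous value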
#define WIN32_LEAN_AND_MEAN #include #include @@ -162,16 +109,29 @@ typedef LONG64 msc_intptr_t; typedef LONG msc_intptr_t; #define MI_64(f) f #endif -static inline uintptr_t mi_atomic_add(_Atomic(uintptr_t)* p, uintptr_t add) { + +typedef enum mi_memory_order_e { + mi_memory_order_relaxed, + mi_memory_order_consume, + mi_memory_order_acquire, + mi_memory_order_release, + mi_memory_order_acq_rel, + mi_memory_order_seq_cst +} mi_memory_order; + +static inline uintptr_t mi_atomic_fetch_add_explicit(_Atomic(uintptr_t)* p, uintptr_t add, mi_memory_order mo) { return (uintptr_t)MI_64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, (msc_intptr_t)add); } -static inline uintptr_t mi_atomic_and(_Atomic(uintptr_t)* p, uintptr_t x) { +static inline uintptr_t mi_atomic_fetch_sub_explicit(_Atomic(uintptr_t)*p, uintptr_t sub, mi_memory_order mo) { + return (uintptr_t)MI_64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, -((msc_intptr_t)sub)); +} +static inline uintptr_t mi_atomic_fetch_and_explicit(_Atomic(uintptr_t)* p, uintptr_t x, mi_memory_order mo) { return (uintptr_t)MI_64(_InterlockedAnd)((volatile msc_intptr_t*)p, (msc_intptr_t)x); } -static inline uintptr_t mi_atomic_or(_Atomic(uintptr_t)* p, uintptr_t x) { +static inline uintptr_t mi_atomic_fetch_or_explicit(_Atomic(uintptr_t)* p, uintptr_t x, mi_memory_order mo) { return (uintptr_t)MI_64(_InterlockedOr)((volatile msc_intptr_t*)p, (msc_intptr_t)x); } -static inline bool mi_atomic_cas_strong(_Atomic(uintptr_t)* p, uintptr_t* expected, uintptr_t desired) { +static inline bool mi_atomic_compare_exchange_strong_explicit(_Atomic(uintptr_t)* p, uintptr_t* expected, uintptr_t desired, mi_memory_order mo1, mi_memory_order mo2) { uintptr_t read = (uintptr_t)MI_64(_InterlockedCompareExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)desired, (msc_intptr_t)(*expected)); if (read == *expected) { return true; @@ -181,28 +141,36 @@ static inline bool mi_atomic_cas_strong(_Atomic(uintptr_t)* p, uintptr_t* expect return false; } } -static inline bool mi_atomic_cas_weak(_Atomic(uintptr_t)* p, uintptr_t* expected, uintptr_t desired) { - return mi_atomic_cas_strong(p,expected,desired); +static inline bool mi_atomic_compare_exchange_weak_explicit(_Atomic(uintptr_t)*p, uintptr_t* expected, uintptr_t desired, mi_memory_order mo1, mi_memory_order mo2) { + return mi_atomic_compare_exchange_strong_explicit(p, expected, desired, mo1, mo2); } -static inline bool mi_atomic_cas_weak_acq_rel(_Atomic(uintptr_t)*p, uintptr_t* expected, uintptr_t desired) { - return mi_atomic_cas_strong(p, expected, desired); -} -static inline uintptr_t mi_atomic_exchange(_Atomic(uintptr_t)* p, uintptr_t exchange) { +static inline uintptr_t mi_atomic_exchange_explicit(_Atomic(uintptr_t)* p, uintptr_t exchange, mi_memory_order mo) { return (uintptr_t)MI_64(_InterlockedExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)exchange); } -static inline uintptr_t mi_atomic_read(_Atomic(uintptr_t) const* p) { - return *p; +static inline mi_atomic_thread_fence(mi_memory_order mo) { + _Atomic(uintptr_t)x = 0; + mi_atomic_exchange_explicit(&x, 1, mo); } -static inline uintptr_t mi_atomic_read_relaxed(_Atomic(uintptr_t) const* p) { +static inline uintptr_t mi_atomic_load_explicit(_Atomic(uintptr_t) const* p, mi_memory_order mo) { + #if defined(_M_IX86) || defined(_M_X64) return *p; + #else + uintptr_t x = *p; + if (mo > mi_memory_order_relaxed) { + while (!mi_atomic_compare_exchange_weak_explicit(p, &x, x, mo, mi_memory_order_relaxed)) { /* nothing */ }; + } + return x; + #endif } -static inline 
void mi_atomic_write(_Atomic(uintptr_t)* p, uintptr_t x) { +static inline void mi_atomic_store_explicit(_Atomic(uintptr_t)* p, uintptr_t x, mi_memory_order mo) { #if defined(_M_IX86) || defined(_M_X64) *p = x; #else - mi_atomic_exchange(p,x); + mi_atomic_exchange_explicit(p,x,mo); #endif } + +// These are used by the statistics static inline int64_t mi_atomic_addi64_relaxed(volatile _Atomic(int64_t)* p, int64_t add) { #ifdef _WIN64 return (int64_t)mi_atomic_addi((int64_t*)p,add); @@ -216,7 +184,6 @@ static inline int64_t mi_atomic_addi64_relaxed(volatile _Atomic(int64_t)* p, int return current; #endif } - static inline void mi_atomic_maxi64_relaxed(volatile _Atomic(int64_t)*p, int64_t x) { int64_t current; do { @@ -224,63 +191,31 @@ static inline void mi_atomic_maxi64_relaxed(volatile _Atomic(int64_t)*p, int64_t } while (current < x && _InterlockedCompareExchange64(p, x, current) != current); } -#else -#ifdef __cplusplus -#define MI_USING_STD using namespace std; -#else -#define MI_USING_STD -#endif -static inline uintptr_t mi_atomic_add(_Atomic(uintptr_t)* p, uintptr_t add) { - MI_USING_STD - return atomic_fetch_add_explicit(p, add, memory_order_acq_rel); -} -static inline uintptr_t mi_atomic_and(_Atomic(uintptr_t)* p, uintptr_t x) { - MI_USING_STD - return atomic_fetch_and_explicit(p, x, memory_order_acq_rel); -} -static inline uintptr_t mi_atomic_or(_Atomic(uintptr_t)* p, uintptr_t x) { - MI_USING_STD - return atomic_fetch_or_explicit(p, x, memory_order_acq_rel); -} -static inline bool mi_atomic_cas_weak(_Atomic(uintptr_t)* p, uintptr_t* expected, uintptr_t desired) { - MI_USING_STD - return atomic_compare_exchange_weak_explicit(p, expected, desired, memory_order_release, memory_order_relaxed); -} -static inline bool mi_atomic_cas_weak_acq_rel(_Atomic(uintptr_t)*p, uintptr_t* expected, uintptr_t desired) { - MI_USING_STD - return atomic_compare_exchange_weak_explicit(p, expected, desired, memory_order_acq_rel, memory_order_acquire); -} -static inline bool mi_atomic_cas_strong(_Atomic(uintptr_t)* p, uintptr_t* expected, uintptr_t desired) { - MI_USING_STD - return atomic_compare_exchange_strong_explicit(p, expected, desired, memory_order_acq_rel, memory_order_acquire); -} -static inline uintptr_t mi_atomic_exchange(_Atomic(uintptr_t)* p, uintptr_t exchange) { - MI_USING_STD - return atomic_exchange_explicit(p, exchange, memory_order_acq_rel); -} -static inline uintptr_t mi_atomic_read_relaxed(const _Atomic(uintptr_t)* p) { - MI_USING_STD - return atomic_load_explicit((_Atomic(uintptr_t)*) p, memory_order_relaxed); -} -static inline uintptr_t mi_atomic_read(const _Atomic(uintptr_t)* p) { - MI_USING_STD - return atomic_load_explicit((_Atomic(uintptr_t)*) p, memory_order_acquire); -} -static inline void mi_atomic_write(_Atomic(uintptr_t)* p, uintptr_t x) { - MI_USING_STD - return atomic_store_explicit(p, x, memory_order_release); -} -static inline int64_t mi_atomic_addi64_relaxed(volatile int64_t* p, int64_t add) { - MI_USING_STD - return atomic_fetch_add_explicit((_Atomic(int64_t)*)p, add, memory_order_relaxed); -} -static inline void mi_atomic_maxi64_relaxed(volatile int64_t* p, int64_t x) { - MI_USING_STD - int64_t current = atomic_load_explicit((_Atomic(int64_t)*)p, memory_order_relaxed); - while (current < x && !atomic_compare_exchange_weak_explicit((_Atomic(int64_t)*)p, ¤t, x, memory_order_release, memory_order_relaxed)) { /* nothing */ }; -} +// The pointer macros cast to `uintptr_t`. 
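In this C-compilation fallback the `tp` argument of the pointer macros below only affects the cast applied to the result of loads and exchanges; the operands are always routed through `_Atomic(uintptr_t)*`. Call sites therefore read the same as in the C11/C++ path, for example:

//   mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free);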
+#define mi_atomic_load_ptr_acquire(tp,p) (tp*)mi_atomic_load_acquire((_Atomic(uintptr_t)*)(p)) +#define mi_atomic_load_ptr_relaxed(tp,p) (tp*)mi_atomic_load_relaxed((_Atomic(uintptr_t)*)(p)) +#define mi_atomic_store_ptr_release(tp,p,x) mi_atomic_store_release((_Atomic(uintptr_t)*)(p),(uintptr_t)(x)) +#define mi_atomic_store_ptr_relaxed(tp,p,x) mi_atomic_store_relaxed((_Atomic(uintptr_t)*)(p),(uintptr_t)(x)) +#define mi_atomic_cas_ptr_weak_release(tp,p,exp,des) mi_atomic_cas_weak_release((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des) +#define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des) mi_atomic_cas_weak_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des) +#define mi_atomic_cas_ptr_strong_release(tp,p,exp,des) mi_atomic_cas_strong_release((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des) +#define mi_atomic_exchange_ptr_release(tp,p,x) (tp*)mi_atomic_exchange_release((_Atomic(uintptr_t)*)(p),(uintptr_t)x) +#define mi_atomic_exchange_ptr_acq_rel(tp,p,x) (tp*)mi_atomic_exchange_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t)x) + #endif + +// Atomically add a signed value; returns the previous value. +static inline intptr_t mi_atomic_addi(_Atomic(intptr_t)*p, intptr_t add) { + return (intptr_t)mi_atomic_add_acq_rel((_Atomic(uintptr_t)*)p, (uintptr_t)add); +} + +// Atomically subtract a signed value; returns the previous value. +static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub) { + return (intptr_t)mi_atomic_addi(p, -sub); +} + +// Yield #if defined(__cplusplus) #include static inline void mi_atomic_yield(void) { diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index 2dc7e36a..1afdae9c 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -448,21 +448,21 @@ static inline size_t mi_page_usable_block_size(const mi_page_t* page) { // Thread free access static inline mi_block_t* mi_page_thread_free(const mi_page_t* page) { - return (mi_block_t*)(mi_atomic_read_relaxed(&page->xthread_free) & ~3); + return (mi_block_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free) & ~3); } static inline mi_delayed_t mi_page_thread_free_flag(const mi_page_t* page) { - return (mi_delayed_t)(mi_atomic_read_relaxed(&page->xthread_free) & 3); + return (mi_delayed_t)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free) & 3); } // Heap access static inline mi_heap_t* mi_page_heap(const mi_page_t* page) { - return (mi_heap_t*)(mi_atomic_read_relaxed(&page->xheap)); + return (mi_heap_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xheap)); } static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING); - mi_atomic_write(&page->xheap,(uintptr_t)heap); + mi_atomic_store_release(&page->xheap,(uintptr_t)heap); } // Thread free flag helpers diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h index 17b33bc6..18c415eb 100644 --- a/include/mimalloc-types.h +++ b/include/mimalloc-types.h @@ -259,9 +259,9 @@ typedef struct mi_segment_s { bool mem_is_committed; // `true` if the whole segment is eagerly committed // segment fields - struct mi_segment_s* next; // must be the first segment field -- see `segment.c:segment_alloc` - struct mi_segment_s* prev; _Atomic(struct mi_segment_s*) abandoned_next; + struct mi_segment_s* next; // must be the first segment field after abandoned_next -- see `segment.c:segment_init` + struct mi_segment_s* prev; size_t abandoned; // abandoned pages (i.e. 
the original owning thread stopped) (`abandoned <= used`) size_t abandoned_visits; // count how often this segment is visited in the abandoned list (to force reclaim it it is too long) diff --git a/src/alloc.c b/src/alloc.c index e1c54bed..ebf90ebc 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -307,7 +307,7 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc // Try to put the block on either the page-local thread free list, or the heap delayed free list. mi_thread_free_t tfreex; bool use_delayed; - mi_thread_free_t tfree = mi_atomic_read_relaxed(&page->xthread_free); + mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free); do { use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE); if (mi_unlikely(use_delayed)) { @@ -319,27 +319,27 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc mi_block_set_next(page, block, mi_tf_block(tfree)); tfreex = mi_tf_set_block(tfree,block); } - } while (!mi_atomic_cas_weak(&page->xthread_free, &tfree, tfreex)); + } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); if (mi_unlikely(use_delayed)) { // racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`) - mi_heap_t* const heap = (mi_heap_t*)(mi_atomic_read(&page->xheap)); //mi_page_heap(page); + mi_heap_t* const heap = (mi_heap_t*)(mi_atomic_load_acquire(&page->xheap)); //mi_page_heap(page); mi_assert_internal(heap != NULL); if (heap != NULL) { // add to the delayed free list of this heap. (do this atomically as the lock only protects heap memory validity) - mi_block_t* dfree = mi_atomic_read_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); + mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); do { mi_block_set_nextx(heap,block,dfree, heap->keys); - } while (!mi_atomic_cas_ptr_weak(mi_block_t,&heap->thread_delayed_free, &dfree, block)); + } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block)); } // and reset the MI_DELAYED_FREEING flag - tfree = mi_atomic_read_relaxed(&page->xthread_free); + tfree = mi_atomic_load_relaxed(&page->xthread_free); do { tfreex = tfree; mi_assert_internal(mi_tf_delayed(tfree) == MI_DELAYED_FREEING); tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE); - } while (!mi_atomic_cas_weak(&page->xthread_free, &tfree, tfreex)); + } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); } } diff --git a/src/arena.c b/src/arena.c index 1c1fc1a0..73a7e704 100644 --- a/src/arena.c +++ b/src/arena.c @@ -105,12 +105,12 @@ static size_t mi_block_count_of_size(size_t size) { static bool mi_arena_alloc(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx) { const size_t fcount = arena->field_count; - size_t idx = mi_atomic_read(&arena->search_idx); // start from last search + size_t idx = mi_atomic_load_acquire(&arena->search_idx); // start from last search for (size_t visited = 0; visited < fcount; visited++, idx++) { if (idx >= fcount) idx = 0; // wrap around // try to atomically claim a range of bits if (mi_bitmap_try_find_claim_field(arena->blocks_inuse, idx, blocks, bitmap_idx)) { - mi_atomic_write(&arena->search_idx, idx); // start search from here next time + mi_atomic_store_release(&arena->search_idx, idx); // start search from here next time return true; } } @@ -175,7 +175,7 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, mi_assert_internal(size <= bcount*MI_ARENA_BLOCK_SIZE); // try numa affine 
allocation for (size_t i = 0; i < MI_MAX_ARENAS; i++) { - mi_arena_t* arena = mi_atomic_read_ptr_relaxed(mi_arena_t, &mi_arenas[i]); + mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); if (arena==NULL) break; // end reached if ((arena->numa_node<0 || arena->numa_node==numa_node) && // numa local? (*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages @@ -187,7 +187,7 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, } // try from another numa node instead.. for (size_t i = 0; i < MI_MAX_ARENAS; i++) { - mi_arena_t* arena = mi_atomic_read_ptr_relaxed(mi_arena_t, &mi_arenas[i]); + mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); if (arena==NULL) break; // end reached if ((arena->numa_node>=0 && arena->numa_node!=numa_node) && // not numa local! (*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages @@ -228,7 +228,7 @@ void _mi_arena_free(void* p, size_t size, size_t memid, bool all_committed, mi_s size_t bitmap_idx; mi_arena_id_indices(memid, &arena_idx, &bitmap_idx); mi_assert_internal(arena_idx < MI_MAX_ARENAS); - mi_arena_t* arena = mi_atomic_read_ptr_relaxed(mi_arena_t,&mi_arenas[arena_idx]); + mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t,&mi_arenas[arena_idx]); mi_assert_internal(arena != NULL); if (arena == NULL) { _mi_error_message(EINVAL, "trying to free from non-existent arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); @@ -254,15 +254,15 @@ void _mi_arena_free(void* p, size_t size, size_t memid, bool all_committed, mi_s static bool mi_arena_add(mi_arena_t* arena) { mi_assert_internal(arena != NULL); - mi_assert_internal((uintptr_t)mi_atomic_read_ptr_relaxed(uint8_t,&arena->start) % MI_SEGMENT_ALIGN == 0); + mi_assert_internal((uintptr_t)mi_atomic_load_ptr_relaxed(uint8_t,&arena->start) % MI_SEGMENT_ALIGN == 0); mi_assert_internal(arena->block_count > 0); - uintptr_t i = mi_atomic_increment(&mi_arena_count); + uintptr_t i = mi_atomic_increment_acq_rel(&mi_arena_count); if (i >= MI_MAX_ARENAS) { - mi_atomic_decrement(&mi_arena_count); + mi_atomic_decrement_acq_rel(&mi_arena_count); return false; } - mi_atomic_write_ptr(mi_arena_t,&mi_arenas[i], arena); + mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena); return true; } diff --git a/src/bitmap.inc.c b/src/bitmap.inc.c index b9953a4f..2d6df46e 100644 --- a/src/bitmap.inc.c +++ b/src/bitmap.inc.c @@ -121,9 +121,9 @@ static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t bitmap_f mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields); mi_assert_internal(bitidx + count <= MI_BITMAP_FIELD_BITS); - uintptr_t field = mi_atomic_read_relaxed(&bitmap[idx]); + uintptr_t field = mi_atomic_load_relaxed(&bitmap[idx]); if ((field & mask) == 0) { // free? - if (mi_atomic_cas_strong(&bitmap[idx], &field, (field|mask))) { + if (mi_atomic_cas_strong_acq_rel(&bitmap[idx], &field, (field|mask))) { // claimed! return true; } @@ -138,7 +138,7 @@ static inline bool mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx { mi_assert_internal(bitmap_idx != NULL); _Atomic(uintptr_t)* field = &bitmap[idx]; - uintptr_t map = mi_atomic_read(field); + uintptr_t map = mi_atomic_load_relaxed(field); if (map==MI_BITMAP_FIELD_FULL) return false; // short cut // search for 0-bit sequence of length count @@ -158,7 +158,7 @@ static inline bool mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx mi_assert_internal((m >> bitidx) == mask); // no overflow? 
const uintptr_t newmap = map | m; mi_assert_internal((newmap^map) >> bitidx == mask); - if (!mi_atomic_cas_weak(field, &map, newmap)) { // TODO: use strong cas here? + if (!mi_atomic_cas_weak_acq_rel(field, &map, newmap)) { // TODO: use strong cas here? // no success, another thread claimed concurrently.. keep going (with updated `map`) continue; } @@ -204,7 +204,7 @@ static inline bool mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, s const uintptr_t mask = mi_bitmap_mask_(count, bitidx); mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields); // mi_assert_internal((bitmap[idx] & mask) == mask); - uintptr_t prev = mi_atomic_and(&bitmap[idx], ~mask); + uintptr_t prev = mi_atomic_and_acq_rel(&bitmap[idx], ~mask); return ((prev & mask) == mask); } @@ -217,7 +217,7 @@ static inline bool mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, siz const uintptr_t mask = mi_bitmap_mask_(count, bitidx); mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields); //mi_assert_internal(any_zero != NULL || (bitmap[idx] & mask) == 0); - uintptr_t prev = mi_atomic_or(&bitmap[idx], mask); + uintptr_t prev = mi_atomic_or_acq_rel(&bitmap[idx], mask); if (any_zero != NULL) *any_zero = ((prev & mask) != mask); return ((prev & mask) == 0); } @@ -228,7 +228,7 @@ static inline bool mi_bitmap_is_claimedx(mi_bitmap_t bitmap, size_t bitmap_field const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); const uintptr_t mask = mi_bitmap_mask_(count, bitidx); mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields); - uintptr_t field = mi_atomic_read_relaxed(&bitmap[idx]); + uintptr_t field = mi_atomic_load_relaxed(&bitmap[idx]); if (any_ones != NULL) *any_ones = ((field & mask) != 0); return ((field & mask) == mask); } diff --git a/src/heap.c b/src/heap.c index 5d0d4b8a..526c93ed 100644 --- a/src/heap.c +++ b/src/heap.c @@ -143,7 +143,7 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) // collect all pages owned by this thread mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL); - mi_assert_internal( collect != MI_ABANDON || mi_atomic_read_ptr(mi_block_t,&heap->thread_delayed_free) == NULL ); + mi_assert_internal( collect != MI_ABANDON || mi_atomic_load_ptr_acquire(mi_block_t,&heap->thread_delayed_free) == NULL ); // collect segment caches if (collect >= MI_FORCE) { diff --git a/src/options.c b/src/options.c index 78c01456..85cbf7f6 100644 --- a/src/options.c +++ b/src/options.c @@ -173,11 +173,11 @@ static _Atomic(uintptr_t) out_len; static void mi_out_buf(const char* msg, void* arg) { UNUSED(arg); if (msg==NULL) return; - if (mi_atomic_read_relaxed(&out_len)>=MI_MAX_DELAY_OUTPUT) return; + if (mi_atomic_load_relaxed(&out_len)>=MI_MAX_DELAY_OUTPUT) return; size_t n = strlen(msg); if (n==0) return; // claim space - uintptr_t start = mi_atomic_add(&out_len, n); + uintptr_t start = mi_atomic_add_acq_rel(&out_len, n); if (start >= MI_MAX_DELAY_OUTPUT) return; // check bound if (start+n >= MI_MAX_DELAY_OUTPUT) { @@ -189,7 +189,7 @@ static void mi_out_buf(const char* msg, void* arg) { static void mi_out_buf_flush(mi_output_fun* out, bool no_more_buf, void* arg) { if (out==NULL) return; // claim (if `no_more_buf == true`, no more output will be added after this point) - size_t count = mi_atomic_add(&out_len, (no_more_buf ? MI_MAX_DELAY_OUTPUT : 1)); + size_t count = mi_atomic_add_acq_rel(&out_len, (no_more_buf ? 
MI_MAX_DELAY_OUTPUT : 1)); // and output the current contents if (count>MI_MAX_DELAY_OUTPUT) count = MI_MAX_DELAY_OUTPUT; out_buf[count] = 0; @@ -220,14 +220,14 @@ static mi_output_fun* volatile mi_out_default; // = NULL static _Atomic(void*) mi_out_arg; // = NULL static mi_output_fun* mi_out_get_default(void** parg) { - if (parg != NULL) { *parg = mi_atomic_read_ptr(void,&mi_out_arg); } + if (parg != NULL) { *parg = mi_atomic_load_ptr_acquire(void,&mi_out_arg); } mi_output_fun* out = mi_out_default; return (out == NULL ? &mi_out_buf : out); } void mi_register_output(mi_output_fun* out, void* arg) mi_attr_noexcept { mi_out_default = (out == NULL ? &mi_out_stderr : out); // stop using the delayed output buffer - mi_atomic_write_ptr(void,&mi_out_arg, arg); + mi_atomic_store_ptr_release(void,&mi_out_arg, arg); if (out!=NULL) mi_out_buf_flush(out,true,arg); // output all the delayed output now } @@ -313,13 +313,13 @@ void _mi_verbose_message(const char* fmt, ...) { static void mi_show_error_message(const char* fmt, va_list args) { if (!mi_option_is_enabled(mi_option_show_errors) && !mi_option_is_enabled(mi_option_verbose)) return; - if (mi_atomic_increment(&error_count) > mi_max_error_count) return; + if (mi_atomic_increment_acq_rel(&error_count) > mi_max_error_count) return; mi_vfprintf(NULL, NULL, "mimalloc: error: ", fmt, args); } void _mi_warning_message(const char* fmt, ...) { if (!mi_option_is_enabled(mi_option_show_errors) && !mi_option_is_enabled(mi_option_verbose)) return; - if (mi_atomic_increment(&error_count) > mi_max_error_count) return; + if (mi_atomic_increment_acq_rel(&error_count) > mi_max_error_count) return; va_list args; va_start(args,fmt); mi_vfprintf(NULL, NULL, "mimalloc: warning: ", fmt, args); @@ -365,7 +365,7 @@ static void mi_error_default(int err) { void mi_register_error(mi_error_fun* fun, void* arg) { mi_error_handler = fun; // can be NULL - mi_atomic_write_ptr(void,&mi_error_arg, arg); + mi_atomic_store_ptr_release(void,&mi_error_arg, arg); } void _mi_error_message(int err, const char* fmt, ...) { @@ -376,7 +376,7 @@ void _mi_error_message(int err, const char* fmt, ...) { va_end(args); // and call the error handler which may abort (or return normally) if (mi_error_handler != NULL) { - mi_error_handler(err, mi_atomic_read_ptr(void,&mi_error_arg)); + mi_error_handler(err, mi_atomic_load_ptr_acquire(void,&mi_error_arg)); } else { mi_error_default(err); diff --git a/src/os.c b/src/os.c index 0b959a9c..8d0c8237 100644 --- a/src/os.c +++ b/src/os.c @@ -270,11 +270,11 @@ static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment, void* p = NULL; if ((large_only || use_large_os_page(size, try_alignment)) && allow_large && (flags&MEM_COMMIT)!=0 && (flags&MEM_RESERVE)!=0) { - uintptr_t try_ok = mi_atomic_read(&large_page_try_ok); + uintptr_t try_ok = mi_atomic_load_acquire(&large_page_try_ok); if (!large_only && try_ok > 0) { // if a large page allocation fails, it seems the calls to VirtualAlloc get very expensive. // therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times. - mi_atomic_cas_strong(&large_page_try_ok, &try_ok, try_ok - 1); + mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - 1); } else { // large OS pages must always reserve and commit. @@ -283,7 +283,7 @@ static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment, if (large_only) return p; // fall back to non-large page allocation on error (`p == NULL`). 
if (p == NULL) { - mi_atomic_write(&large_page_try_ok,10); // on error, don't try again for the next N allocations + mi_atomic_store_release(&large_page_try_ok,10); // on error, don't try again for the next N allocations } } } @@ -361,13 +361,13 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro #endif if ((large_only || use_large_os_page(size, try_alignment)) && allow_large) { static _Atomic(uintptr_t) large_page_try_ok; // = 0; - uintptr_t try_ok = mi_atomic_read(&large_page_try_ok); + uintptr_t try_ok = mi_atomic_load_acquire(&large_page_try_ok); if (!large_only && try_ok > 0) { // If the OS is not configured for large OS pages, or the user does not have // enough permission, the `mmap` will always fail (but it might also fail for other reasons). // Therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times // to avoid too many failing calls to mmap. - mi_atomic_cas_strong(&large_page_try_ok, &try_ok, try_ok - 1); + mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - 1); } else { int lflags = flags & ~MAP_NORESERVE; // using NORESERVE on huge pages seems to fail on Linux @@ -407,7 +407,7 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro #endif if (large_only) return p; if (p == NULL) { - mi_atomic_write(&large_page_try_ok, 10); // on error, don't try again for the next N allocations + mi_atomic_store_release(&large_page_try_ok, 10); // on error, don't try again for the next N allocations } } } @@ -455,7 +455,7 @@ static mi_decl_cache_align _Atomic(uintptr_t) aligned_base; static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size) { if (try_alignment == 0 || try_alignment > MI_SEGMENT_SIZE) return NULL; if ((size%MI_SEGMENT_SIZE) != 0) return NULL; - uintptr_t hint = mi_atomic_add(&aligned_base, size); + uintptr_t hint = mi_atomic_add_acq_rel(&aligned_base, size); if (hint == 0 || hint > ((intptr_t)30<<40)) { // try to wrap around after 30TiB (area after 32TiB is used for huge OS pages) uintptr_t init = ((uintptr_t)4 << 40); // start at 4TiB area #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of aligned allocations unless in debug mode @@ -463,8 +463,8 @@ static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size) { init = init + (MI_SEGMENT_SIZE * ((r>>17) & 0xFFFFF)); // (randomly 20 bits)*4MiB == 0 to 4TiB #endif uintptr_t expected = hint + size; - mi_atomic_cas_strong(&aligned_base, &expected, init); - hint = mi_atomic_add(&aligned_base, size); // this may still give 0 or > 30TiB but that is ok, it is a hint after all + mi_atomic_cas_strong_acq_rel(&aligned_base, &expected, init); + hint = mi_atomic_add_acq_rel(&aligned_base, size); // this may still give 0 or > 30TiB but that is ok, it is a hint after all } if (hint%try_alignment != 0) return NULL; return (void*)hint; @@ -760,10 +760,10 @@ static bool mi_os_resetx(void* addr, size_t size, bool reset, mi_stats_t* stats) #else #if defined(MADV_FREE) static _Atomic(uintptr_t) advice = ATOMIC_VAR_INIT(MADV_FREE); - int err = madvise(start, csize, (int)mi_atomic_read_relaxed(&advice)); + int err = madvise(start, csize, (int)mi_atomic_load_relaxed(&advice)); if (err != 0 && errno == EINVAL && advice == MADV_FREE) { // if MADV_FREE is not supported, fall back to MADV_DONTNEED from now on - mi_atomic_write(&advice, MADV_DONTNEED); + mi_atomic_store_release(&advice, MADV_DONTNEED); err = madvise(start, csize, MADV_DONTNEED); } #elif defined(__wasi__) @@ -970,7 +970,7 @@ static uint8_t* 
mi_os_claim_huge_pages(size_t pages, size_t* total_size) { uintptr_t start = 0; uintptr_t end = 0; - uintptr_t huge_start = mi_atomic_read_relaxed(&mi_huge_start); + uintptr_t huge_start = mi_atomic_load_relaxed(&mi_huge_start); do { start = huge_start; if (start == 0) { @@ -983,7 +983,7 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) { } end = start + size; mi_assert_internal(end % MI_SEGMENT_SIZE == 0); - } while (!mi_atomic_cas_strong(&mi_huge_start, &huge_start, end)); + } while (!mi_atomic_cas_strong_acq_rel(&mi_huge_start, &huge_start, end)); if (total_size != NULL) *total_size = size; return (uint8_t*)start; diff --git a/src/page-queue.c b/src/page-queue.c index ea213019..37719e02 100644 --- a/src/page-queue.c +++ b/src/page-queue.c @@ -260,7 +260,7 @@ static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) { heap->page_count--; page->next = NULL; page->prev = NULL; - // mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), NULL); + // mi_atomic_store_ptr_release(mi_atomic_cast(void*, &page->heap), NULL); mi_page_set_in_full(page,false); } @@ -274,7 +274,7 @@ static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_ (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); mi_page_set_in_full(page, mi_page_queue_is_full(queue)); - // mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), heap); + // mi_atomic_store_ptr_release(mi_atomic_cast(void*, &page->heap), heap); page->next = queue->first; page->prev = NULL; if (queue->first != NULL) { @@ -341,7 +341,7 @@ size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue for (mi_page_t* page = append->first; page != NULL; page = page->next) { // inline `mi_page_set_heap` to avoid wrong assertion during absorption; // in this case it is ok to be delayed freeing since both "to" and "from" heap are still alive. - mi_atomic_write(&page->xheap, (uintptr_t)heap); + mi_atomic_store_release(&page->xheap, (uintptr_t)heap); // set the flag to delayed free (not overriding NEVER_DELAYED_FREE) which has as a // side effect that it spins until any DELAYED_FREEING is finished. This ensures // that after appending only the new heap will be used for delayed free operations. 
diff --git a/src/page.c b/src/page.c index 92faf9f2..cd96bb90 100644 --- a/src/page.c +++ b/src/page.c @@ -126,7 +126,7 @@ void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool overrid mi_delayed_t old_delay; mi_thread_free_t tfree; do { - tfree = mi_atomic_read(&page->xthread_free); // note: must acquire as we can break/repeat this loop and not do a CAS; + tfree = mi_atomic_load_acquire(&page->xthread_free); // note: must acquire as we can break/repeat this loop and not do a CAS; tfreex = mi_tf_set_delayed(tfree, delay); old_delay = mi_tf_delayed(tfree); if (mi_unlikely(old_delay == MI_DELAYED_FREEING)) { @@ -140,7 +140,7 @@ void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool overrid break; // leave never-delayed flag set } } while ((old_delay == MI_DELAYED_FREEING) || - !mi_atomic_cas_weak(&page->xthread_free, &tfree, tfreex)); + !mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); } /* ----------------------------------------------------------- @@ -155,7 +155,7 @@ static void _mi_page_thread_free_collect(mi_page_t* page) { mi_block_t* head; mi_thread_free_t tfreex; - mi_thread_free_t tfree = mi_atomic_read_relaxed(&page->xthread_free); + mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free); do { head = mi_tf_block(tfree); tfreex = mi_tf_set_block(tfree,NULL); @@ -273,8 +273,8 @@ static mi_page_t* mi_page_fresh(mi_heap_t* heap, mi_page_queue_t* pq) { ----------------------------------------------------------- */ void _mi_heap_delayed_free(mi_heap_t* heap) { // take over the list (note: no atomic exchange since it is often NULL) - mi_block_t* block = mi_atomic_read_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); - while (block != NULL && !mi_atomic_cas_ptr_weak(mi_block_t, &heap->thread_delayed_free, &block, NULL)) { /* nothing */ }; + mi_block_t* block = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); + while (block != NULL && !mi_atomic_cas_ptr_weak_acq_rel(mi_block_t, &heap->thread_delayed_free, &block, NULL)) { /* nothing */ }; // and free them all while(block != NULL) { @@ -283,10 +283,10 @@ void _mi_heap_delayed_free(mi_heap_t* heap) { if (!_mi_free_delayed_block(block)) { // we might already start delayed freeing while another thread has not yet // reset the delayed_freeing flag; in that case delay it further by reinserting. 
- mi_block_t* dfree = mi_atomic_read_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); + mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); do { mi_block_set_nextx(heap, block, dfree, heap->keys); - } while (!mi_atomic_cas_ptr_weak(mi_block_t,&heap->thread_delayed_free, &dfree, block)); + } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block)); } block = next; } @@ -736,14 +736,14 @@ void _mi_deferred_free(mi_heap_t* heap, bool force) { heap->tld->heartbeat++; if (deferred_free != NULL && !heap->tld->recurse) { heap->tld->recurse = true; - deferred_free(force, heap->tld->heartbeat, mi_atomic_read_ptr_relaxed(void,&deferred_arg)); + deferred_free(force, heap->tld->heartbeat, mi_atomic_load_ptr_relaxed(void,&deferred_arg)); heap->tld->recurse = false; } } void mi_register_deferred_free(mi_deferred_free_fun* fn, void* arg) mi_attr_noexcept { deferred_free = fn; - mi_atomic_write_ptr(void,&deferred_arg, arg); + mi_atomic_store_ptr_release(void,&deferred_arg, arg); } diff --git a/src/random.c b/src/random.c index be95fc46..836f83a2 100644 --- a/src/random.c +++ b/src/random.c @@ -210,11 +210,11 @@ static bool os_random_buf(void* buf, size_t buf_len) { #define GRND_NONBLOCK (1) #endif static _Atomic(uintptr_t) no_getrandom; // = 0 - if (mi_atomic_read(&no_getrandom)==0) { + if (mi_atomic_load_acquire(&no_getrandom)==0) { ssize_t ret = syscall(SYS_getrandom, buf, buf_len, GRND_NONBLOCK); if (ret >= 0) return (buf_len == (size_t)ret); if (ret != ENOSYS) return false; - mi_atomic_write(&no_getrandom,1); // don't call again, and fall back to /dev/urandom + mi_atomic_store_release(&no_getrandom,1); // don't call again, and fall back to /dev/urandom } #endif int flags = O_RDONLY; diff --git a/src/region.c b/src/region.c index d2904687..e916e452 100644 --- a/src/region.c +++ b/src/region.c @@ -123,9 +123,9 @@ static size_t mi_good_commit_size(size_t size) { // Return if a pointer points into a region reserved by us. bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { if (p==NULL) return false; - size_t count = mi_atomic_read_relaxed(®ions_count); + size_t count = mi_atomic_load_relaxed(®ions_count); for (size_t i = 0; i < count; i++) { - uint8_t* start = mi_atomic_read_ptr_relaxed(uint8_t,®ions[i].start); + uint8_t* start = (uint8_t*)mi_atomic_load_ptr_relaxed(uint8_t, ®ions[i].start); if (start != NULL && (uint8_t*)p >= start && (uint8_t*)p < start + MI_REGION_SIZE) return true; } return false; @@ -133,7 +133,7 @@ bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { static void* mi_region_blocks_start(const mem_region_t* region, mi_bitmap_index_t bit_idx) { - uint8_t* start = mi_atomic_read_ptr(uint8_t,®ion->start); + uint8_t* start = (uint8_t*)mi_atomic_load_ptr_acquire(uint8_t, &((mem_region_t*)region)->start); mi_assert_internal(start != NULL); return (start + (bit_idx * MI_SEGMENT_SIZE)); } @@ -171,7 +171,7 @@ static bool mi_memid_is_arena(size_t id, mem_region_t** region, mi_bitmap_index_ static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large, mem_region_t** region, mi_bitmap_index_t* bit_idx, mi_os_tld_t* tld) { // not out of regions yet? 
- if (mi_atomic_read_relaxed(®ions_count) >= MI_REGION_MAX - 1) return false; + if (mi_atomic_load_relaxed(®ions_count) >= MI_REGION_MAX - 1) return false; // try to allocate a fresh region from the OS bool region_commit = (commit && mi_option_is_enabled(mi_option_eager_region_commit)); @@ -184,9 +184,9 @@ static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large, mi_assert_internal(!region_large || region_commit); // claim a fresh slot - const uintptr_t idx = mi_atomic_increment(®ions_count); + const uintptr_t idx = mi_atomic_increment_acq_rel(®ions_count); if (idx >= MI_REGION_MAX) { - mi_atomic_decrement(®ions_count); + mi_atomic_decrement_acq_rel(®ions_count); _mi_arena_free(start, MI_REGION_SIZE, arena_memid, region_commit, tld->stats); _mi_warning_message("maximum regions used: %zu GiB (perhaps recompile with a larger setting for MI_HEAP_REGION_MAX_SIZE)", _mi_divide_up(MI_HEAP_REGION_MAX_SIZE, GiB)); return false; @@ -195,13 +195,13 @@ static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large, // allocated, initialize and claim the initial blocks mem_region_t* r = ®ions[idx]; r->arena_memid = arena_memid; - mi_atomic_write(&r->in_use, 0); - mi_atomic_write(&r->dirty, (is_zero ? 0 : MI_BITMAP_FIELD_FULL)); - mi_atomic_write(&r->commit, (region_commit ? MI_BITMAP_FIELD_FULL : 0)); - mi_atomic_write(&r->reset, 0); + mi_atomic_store_release(&r->in_use, 0); + mi_atomic_store_release(&r->dirty, (is_zero ? 0 : MI_BITMAP_FIELD_FULL)); + mi_atomic_store_release(&r->commit, (region_commit ? MI_BITMAP_FIELD_FULL : 0)); + mi_atomic_store_release(&r->reset, 0); *bit_idx = 0; mi_bitmap_claim(&r->in_use, 1, blocks, *bit_idx, NULL); - mi_atomic_write_ptr(uint8_t*,&r->start, start); + mi_atomic_store_ptr_release(uint8_t*,&r->start, start); // and share it mi_region_info_t info; @@ -209,7 +209,7 @@ static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large, info.x.valid = true; info.x.is_large = region_large; info.x.numa_node = (short)_mi_os_numa_node(tld); - mi_atomic_write(&r->info, info.value); // now make it available to others + mi_atomic_store_release(&r->info, info.value); // now make it available to others *region = r; return true; } @@ -221,7 +221,7 @@ static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large, static bool mi_region_is_suitable(const mem_region_t* region, int numa_node, bool allow_large ) { // initialized at all? mi_region_info_t info; - info.value = mi_atomic_read_relaxed(®ion->info); + info.value = mi_atomic_load_relaxed(&((mem_region_t*)region)->info); if (info.value==0) return false; // numa correct @@ -240,7 +240,7 @@ static bool mi_region_is_suitable(const mem_region_t* region, int numa_node, boo static bool mi_region_try_claim(int numa_node, size_t blocks, bool allow_large, mem_region_t** region, mi_bitmap_index_t* bit_idx, mi_os_tld_t* tld) { // try all regions for a free slot - const size_t count = mi_atomic_read(®ions_count); + const size_t count = mi_atomic_load_acquire(®ions_count); size_t idx = tld->region_idx; // Or start at 0 to reuse low addresses? 
Starting at 0 seems to increase latency though for (size_t visited = 0; visited < count; visited++, idx++) { if (idx >= count) idx = 0; // wrap around @@ -280,8 +280,8 @@ static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* is_large, bo mi_assert_internal(mi_bitmap_is_claimed(®ion->in_use, 1, blocks, bit_idx)); mi_region_info_t info; - info.value = mi_atomic_read(®ion->info); - uint8_t* start = mi_atomic_read_ptr(uint8_t,®ion->start); + info.value = mi_atomic_load_acquire(®ion->info); + uint8_t* start = (uint8_t*)mi_atomic_load_ptr_acquire(uint8_t,®ion->start); mi_assert_internal(!(info.x.is_large && !*is_large)); mi_assert_internal(start != NULL); @@ -400,7 +400,7 @@ void _mi_mem_free(void* p, size_t size, size_t id, bool full_commit, bool any_re const size_t blocks = mi_region_block_count(size); mi_assert_internal(blocks + bit_idx <= MI_BITMAP_FIELD_BITS); mi_region_info_t info; - info.value = mi_atomic_read(®ion->info); + info.value = mi_atomic_load_acquire(®ion->info); mi_assert_internal(info.value != 0); void* blocks_start = mi_region_blocks_start(region, bit_idx); mi_assert_internal(blocks_start == p); // not a pointer in our area? @@ -442,21 +442,21 @@ void _mi_mem_free(void* p, size_t size, size_t id, bool full_commit, bool any_re -----------------------------------------------------------------------------*/ void _mi_mem_collect(mi_os_tld_t* tld) { // free every region that has no segments in use. - uintptr_t rcount = mi_atomic_read_relaxed(®ions_count); + uintptr_t rcount = mi_atomic_load_relaxed(®ions_count); for (size_t i = 0; i < rcount; i++) { mem_region_t* region = ®ions[i]; - if (mi_atomic_read_relaxed(®ion->info) != 0) { + if (mi_atomic_load_relaxed(®ion->info) != 0) { // if no segments used, try to claim the whole region - uintptr_t m = mi_atomic_read_relaxed(®ion->in_use); - while (m == 0 && !mi_atomic_cas_weak(®ion->in_use, &m, MI_BITMAP_FIELD_FULL)) { /* nothing */ }; + uintptr_t m = mi_atomic_load_relaxed(®ion->in_use); + while (m == 0 && !mi_atomic_cas_weak_release(®ion->in_use, &m, MI_BITMAP_FIELD_FULL)) { /* nothing */ }; if (m == 0) { // on success, free the whole region - uint8_t* start = mi_atomic_read_ptr(uint8_t,®ions[i].start); - size_t arena_memid = mi_atomic_read_relaxed(®ions[i].arena_memid); - uintptr_t commit = mi_atomic_read_relaxed(®ions[i].commit); + uint8_t* start = (uint8_t*)mi_atomic_load_ptr_acquire(uint8_t,®ions[i].start); + size_t arena_memid = mi_atomic_load_relaxed(®ions[i].arena_memid); + uintptr_t commit = mi_atomic_load_relaxed(®ions[i].commit); memset(®ions[i], 0, sizeof(mem_region_t)); // and release the whole region - mi_atomic_write(®ion->info, 0); + mi_atomic_store_release(®ion->info, 0); if (start != NULL) { // && !_mi_os_is_huge_reserved(start)) { _mi_abandoned_await_readers(); // ensure no pending reads _mi_arena_free(start, MI_REGION_SIZE, arena_memid, (~commit == 0), tld->stats); diff --git a/src/segment.c b/src/segment.c index b5fd13d3..2416dadd 100644 --- a/src/segment.c +++ b/src/segment.c @@ -628,17 +628,16 @@ static mi_segment_t* mi_segment_init(mi_segment_t* segment, size_t required, mi_ return NULL; } } - atomic_thread_fence(memory_order_acq_rel); segment->memid = memid; segment->mem_is_fixed = mem_large; - segment->mem_is_committed = commit; + segment->mem_is_committed = commit; mi_segments_track_size((long)segment_size, tld); } mi_assert_internal(segment != NULL && (uintptr_t)segment % MI_SEGMENT_SIZE == 0); mi_assert_internal(segment->mem_is_fixed ? 
segment->mem_is_committed : true); + mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL); // tsan if (!pages_still_good) { // zero the segment info (but not the `mem` fields) - atomic_thread_fence(memory_order_release); // with read of `abandoned_next` in `mi_abandoned_pop` ptrdiff_t ofs = offsetof(mi_segment_t, next); memset((uint8_t*)segment + ofs, 0, info_size - ofs); @@ -792,7 +791,6 @@ static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, bool a uint16_t reserved = page->reserved; ptrdiff_t ofs = offsetof(mi_page_t,capacity); memset((uint8_t*)page + ofs, 0, sizeof(*page) - ofs); - atomic_thread_fence(memory_order_release); page->capacity = capacity; page->reserved = reserved; page->xblock_size = block_size; @@ -892,69 +890,69 @@ static mi_decl_cache_align _Atomic(uintptr_t) abandoned_readers; // = // Push on the visited list static void mi_abandoned_visited_push(mi_segment_t* segment) { mi_assert_internal(segment->thread_id == 0); - mi_assert_internal(mi_atomic_read_ptr_relaxed(mi_segment_t,&segment->abandoned_next) == NULL); + mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t,&segment->abandoned_next) == NULL); mi_assert_internal(segment->next == NULL && segment->prev == NULL); mi_assert_internal(segment->used > 0); - mi_segment_t* anext = mi_atomic_read_ptr_relaxed(mi_segment_t, &abandoned_visited); + mi_segment_t* anext = mi_atomic_load_ptr_relaxed(mi_segment_t, &abandoned_visited); do { - mi_atomic_write_ptr(mi_segment_t, &segment->abandoned_next, anext); - } while (!mi_atomic_cas_ptr_weak(mi_segment_t, &abandoned_visited, &anext, segment)); + mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, anext); + } while (!mi_atomic_cas_ptr_weak_release(mi_segment_t, &abandoned_visited, &anext, segment)); } // Move the visited list to the abandoned list. 
static bool mi_abandoned_visited_revisit(void) { // quick check if the visited list is empty - if (mi_atomic_read_ptr_relaxed(mi_segment_t, &abandoned_visited) == NULL) return false; + if (mi_atomic_load_ptr_relaxed(mi_segment_t, &abandoned_visited) == NULL) return false; // grab the whole visited list - mi_segment_t* first = mi_atomic_exchange_ptr(mi_segment_t, &abandoned_visited, NULL); + mi_segment_t* first = mi_atomic_exchange_ptr_acq_rel(mi_segment_t, &abandoned_visited, NULL); if (first == NULL) return false; // first try to swap directly if the abandoned list happens to be NULL mi_tagged_segment_t afirst; - mi_tagged_segment_t ts = mi_atomic_read_relaxed(&abandoned); + mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned); if (mi_tagged_segment_ptr(ts)==NULL) { afirst = mi_tagged_segment(first, ts); - if (mi_atomic_cas_strong(&abandoned, &ts, afirst)) return true; + if (mi_atomic_cas_strong_acq_rel(&abandoned, &ts, afirst)) return true; } // find the last element of the visited list: O(n) mi_segment_t* last = first; mi_segment_t* next; - while ((next = mi_atomic_read_ptr_relaxed(mi_segment_t, &last->abandoned_next)) != NULL) { + while ((next = mi_atomic_load_ptr_relaxed(mi_segment_t, &last->abandoned_next)) != NULL) { last = next; } // and atomically prepend to the abandoned list // (no need to increase the readers as we don't access the abandoned segments) - mi_tagged_segment_t anext = mi_atomic_read_relaxed(&abandoned); + mi_tagged_segment_t anext = mi_atomic_load_relaxed(&abandoned); do { - mi_atomic_write_ptr(mi_segment_t, &last->abandoned_next, mi_tagged_segment_ptr(anext)); + mi_atomic_store_ptr_release(mi_segment_t, &last->abandoned_next, mi_tagged_segment_ptr(anext)); afirst = mi_tagged_segment(first, anext); - } while (!mi_atomic_cas_weak(&abandoned, &anext, afirst)); + } while (!mi_atomic_cas_weak_release(&abandoned, &anext, afirst)); return true; } // Push on the abandoned list. 
static void mi_abandoned_push(mi_segment_t* segment) { mi_assert_internal(segment->thread_id == 0); - mi_assert_internal(mi_atomic_read_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL); + mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL); mi_assert_internal(segment->next == NULL && segment->prev == NULL); mi_assert_internal(segment->used > 0); mi_tagged_segment_t next; - mi_tagged_segment_t ts = mi_atomic_read_relaxed(&abandoned); + mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned); do { - mi_atomic_write_ptr(mi_segment_t, &segment->abandoned_next, mi_tagged_segment_ptr(ts)); + mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, mi_tagged_segment_ptr(ts)); next = mi_tagged_segment(segment, ts); - } while (!mi_atomic_cas_weak(&abandoned, &ts, next)); + } while (!mi_atomic_cas_weak_release(&abandoned, &ts, next)); } // Wait until there are no more pending reads on segments that used to be in the abandoned list void _mi_abandoned_await_readers(void) { uintptr_t n; do { - n = mi_atomic_read(&abandoned_readers); + n = mi_atomic_load_acquire(&abandoned_readers); if (n != 0) mi_atomic_yield(); } while (n != 0); } @@ -963,7 +961,7 @@ void _mi_abandoned_await_readers(void) { static mi_segment_t* mi_abandoned_pop(void) { mi_segment_t* segment; // Check efficiently if it is empty (or if the visited list needs to be moved) - mi_tagged_segment_t ts = mi_atomic_read_relaxed(&abandoned); + mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned); segment = mi_tagged_segment_ptr(ts); if (mi_likely(segment == NULL)) { if (mi_likely(!mi_abandoned_visited_revisit())) { // try to swap in the visited list on NULL @@ -975,19 +973,19 @@ static mi_segment_t* mi_abandoned_pop(void) { // a segment to be decommitted while a read is still pending, // and a tagged pointer to prevent A-B-A link corruption. 
// (this is called from `region.c:_mi_mem_free` for example) - mi_atomic_increment(&abandoned_readers); // ensure no segment gets decommitted + mi_atomic_increment_relaxed(&abandoned_readers); // ensure no segment gets decommitted mi_tagged_segment_t next = 0; - ts = mi_atomic_read(&abandoned); + ts = mi_atomic_load_acquire(&abandoned); do { segment = mi_tagged_segment_ptr(ts); if (segment != NULL) { - mi_segment_t* anext = mi_atomic_read_ptr(mi_segment_t, &segment->abandoned_next); + mi_segment_t* anext = mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next); next = mi_tagged_segment(anext, ts); // note: reads the segment's `abandoned_next` field so should not be decommitted } } while (segment != NULL && !mi_atomic_cas_weak_acq_rel(&abandoned, &ts, next)); - mi_atomic_decrement(&abandoned_readers); // release reader lock + mi_atomic_decrement_relaxed(&abandoned_readers); // release reader lock if (segment != NULL) { - mi_atomic_write_ptr(mi_segment_t, &segment->abandoned_next, NULL); + mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL); } return segment; } @@ -999,7 +997,7 @@ static mi_segment_t* mi_abandoned_pop(void) { static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) { mi_assert_internal(segment->used == segment->abandoned); mi_assert_internal(segment->used > 0); - mi_assert_internal(mi_atomic_read_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL); + mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL); mi_assert_expensive(mi_segment_is_valid(segment, tld)); // remove the segment from the free page queue if needed @@ -1013,7 +1011,7 @@ static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) { mi_segments_track_size(-((long)segment->segment_size), tld); segment->thread_id = 0; segment->abandoned_visits = 0; - mi_atomic_write_ptr(mi_segment_t, &segment->abandoned_next, NULL); + mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL); mi_abandoned_push(segment); } @@ -1077,7 +1075,7 @@ static bool mi_segment_check_free(mi_segment_t* segment, size_t block_size, bool // Reclaim a segment; returns NULL if the segment was freed // set `right_page_reclaimed` to `true` if it reclaimed a page of the right `block_size` that was not full. static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, size_t requested_block_size, bool* right_page_reclaimed, mi_segments_tld_t* tld) { - mi_assert_internal(mi_atomic_read_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL); + mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL); if (right_page_reclaimed != NULL) { *right_page_reclaimed = false; } segment->thread_id = _mi_thread_id(); @@ -1294,13 +1292,13 @@ void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block // huge page segments are always abandoned and can be freed immediately by any thread mi_assert_internal(segment->page_kind==MI_PAGE_HUGE); mi_assert_internal(segment == _mi_page_segment(page)); - mi_assert_internal(mi_atomic_read_relaxed(&segment->thread_id)==0); + mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id)==0); // claim it and free mi_heap_t* heap = mi_heap_get_default(); // issue #221; don't use the internal get_default_heap as we need to ensure the thread is initialized. 
// paranoia: if this it the last reference, the cas should always succeed uintptr_t expected_tid = 0; - if (mi_atomic_cas_strong(&segment->thread_id, &expected_tid, heap->thread_id)) { + if (mi_atomic_cas_strong_acq_rel(&segment->thread_id, &expected_tid, heap->thread_id)) { mi_block_set_next(page, block, page->free); page->free = block; page->used--; From 76a68cd7af539625dea3ce349aa7742bc02e1ebc Mon Sep 17 00:00:00 2001 From: daan Date: Thu, 3 Sep 2020 09:45:53 -0700 Subject: [PATCH 11/11] bump version to 1.6.6 with new atomics --- include/mimalloc-atomic.h | 4 ++-- include/mimalloc.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/mimalloc-atomic.h b/include/mimalloc-atomic.h index e1fdda16..e3e3186d 100644 --- a/include/mimalloc-atomic.h +++ b/include/mimalloc-atomic.h @@ -25,8 +25,8 @@ terms of the MIT license. A copy of the license can be found in the file #define mi_memory_order(name) std::memory_order_##name #elif defined(_MSC_VER) // Use MSVC C wrapper for C11 atomics -#define _Atomic(tp) tp -#define ATOMIC_VAR_INIT(x) x +#define _Atomic(tp) tp +#define ATOMIC_VAR_INIT(x) x #define mi_atomic(name) mi_atomic_##name #define mi_memory_order(name) mi_memory_order_##name #else diff --git a/include/mimalloc.h b/include/mimalloc.h index f44f6d9a..4b0a911f 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file #ifndef MIMALLOC_H #define MIMALLOC_H -#define MI_MALLOC_VERSION 164 // major + 2 digits minor +#define MI_MALLOC_VERSION 166 // major + 2 digits minor // ------------------------------------------------------ // Compiler specific attributes
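
For reference, the hunks above all converge on the same C11-style CAS loop: load the current value, build the desired value from it, and let a failed weak CAS reload `expected` before retrying, with release ordering on the successful store (this is the "cas order of expected/desired" change from patch 02). A minimal standalone sketch against plain C11 atomics, assuming nothing beyond <stdatomic.h>; `node_t` and `lifo_push` are illustrative names, not mimalloc API:

  #include <stdatomic.h>
  #include <stddef.h>

  typedef struct node_s { struct node_s* next; } node_t;

  // Lock-free LIFO push in the same shape as the thread_delayed_free and
  // abandoned-list loops: `expected` comes before `desired`, and a failed
  // weak CAS refreshes `expected` so the body only has to re-link the node.
  static void lifo_push(_Atomic(node_t*)* top, node_t* n) {
    node_t* expected = atomic_load_explicit(top, memory_order_relaxed);
    do {
      n->next = expected;   // link to the currently observed head
    } while (!atomic_compare_exchange_weak_explicit(
                 top, &expected, n,
                 memory_order_release,    // publish `n->next` on success
                 memory_order_relaxed));  // on failure, `expected` was reloaded
  }

The same expected/desired order shows up with plain integers in the `large_page_try_ok` backoff in `mi_unix_mmap`, where a single `mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - 1)` suffices: if it loses the race, another thread already performed the decrement.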
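
The abandoned list itself is guarded two ways, as the comment in `mi_abandoned_pop` notes: a reader count (`abandoned_readers`) that `_mi_abandoned_await_readers` spins on before a segment may be decommitted, and a tag folded into the list head so that a pop/push cycle that reinstalls the same segment pointer still changes the compared word. A rough sketch of that second idea, assuming 64-bit words and heads aligned to at least 16 bytes so the low bits are free (the actual layout of `mi_tagged_segment_t` is not shown in these hunks):

  #include <stdatomic.h>
  #include <stdint.h>

  #define TAG_MASK ((uintptr_t)0xF)  // low alignment bits reused as a version tag

  // Pack a pointer with a counter derived from the previous tagged value;
  // every successful CAS bumps the tag, so a stale `expected` fails even if
  // the same pointer value comes back (the A-B-A case).
  static inline uintptr_t tagged_make(void* p, uintptr_t prev_tagged) {
    uintptr_t tag = (prev_tagged + 1) & TAG_MASK;
    return ((uintptr_t)p & ~TAG_MASK) | tag;
  }

  static inline void* tagged_ptr(uintptr_t tagged) {
    return (void*)(tagged & ~TAG_MASK);
  }

In `mi_abandoned_pop` the loop follows the decoded segment's `abandoned_next` and CASes in the next tagged value built from the old one; because the tag advances on every update, a stale head that happens to hold the same segment pointer no longer compares equal.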