merge from dev

2025-08-23 15:54:47 +03:00 · 2019-08-29 09:52:22 -07:00 · 2019-08-29 09:52:22 -07:00 · f35f643508
commit f35f643508
parent 71121a7457 7bf12c7b5f
23 changed files with 1739 additions and 494 deletions
--- a/include/mimalloc-atomic.h
+++ b/include/mimalloc-atomic.h
@ -9,116 +9,150 @@ terms of the MIT license. A copy of the license can be found in the file
 #define MIMALLOC_ATOMIC_H

 // ------------------------------------------------------
-// Atomics
+// Atomics 
+// We need to be portable between C, C++, and MSVC.
 // ------------------------------------------------------

-// Atomically increment a value; returns the incremented result.
-static inline uintptr_t mi_atomic_increment(volatile uintptr_t* p);
+#if defined(_MSC_VER)
+#define _Atomic(tp)         tp
+#define ATOMIC_VAR_INIT(x)  x
+#elif defined(__cplusplus)
+#include <atomic>
+#define  _Atomic(tp)        std::atomic<tp>
+#else
+#include <stdatomic.h>
+#endif

-// Atomically increment a value; returns the incremented result.
-static inline uint32_t mi_atomic_increment32(volatile uint32_t* p);
+#define mi_atomic_cast(tp,x)  ((volatile _Atomic(tp)*)(x))  // cast `x` to a pointer to atomic `tp`; fully parenthesized so postfix operators apply to the cast result

-// Atomically decrement a value; returns the decremented result.
-static inline uintptr_t mi_atomic_decrement(volatile uintptr_t* p);
+// ------------------------------------------------------
+// Atomic operations specialized for mimalloc
+// ------------------------------------------------------

-// Atomically add a 64-bit value; returns the added result.
-static inline int64_t mi_atomic_add(volatile int64_t* p, int64_t add);
+// Atomically add a 64-bit value; the result is not returned.
+// Note: not using _Atomic(int64_t) as it is only used for statistics.
+static inline void mi_atomic_add64(volatile int64_t* p, int64_t add);

-// Atomically subtract a value; returns the subtracted result.
-static inline uintptr_t mi_atomic_subtract(volatile uintptr_t* p, uintptr_t sub);
+// Atomically add a value; returns the previous value. Memory ordering is relaxed.
+static inline intptr_t mi_atomic_add(volatile _Atomic(intptr_t)* p, intptr_t add);

-// Atomically subtract a value; returns the subtracted result.
-static inline uint32_t mi_atomic_subtract32(volatile uint32_t* p, uint32_t sub);
+// Atomically compare and exchange a value; returns `true` if successful. 
+// May fail spuriously. Memory ordering is release on success, and relaxed on failure.
+// (Note: expected and desired are in opposite order from atomic_compare_exchange)
+static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected);

 // Atomically compare and exchange a value; returns `true` if successful.
-static inline bool mi_atomic_compare_exchange32(volatile uint32_t* p, uint32_t exchange, uint32_t compare);
+// Memory ordering is acquire-release
+// (Note: expected and desired are in opposite order from atomic_compare_exchange)
+static inline bool mi_atomic_cas_strong(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected);

-// Atomically compare and exchange a value; returns `true` if successful.
-static inline bool mi_atomic_compare_exchange(volatile uintptr_t* p, uintptr_t exchange, uintptr_t compare);
+// Atomically exchange a value. Memory ordering is acquire-release.
+static inline uintptr_t mi_atomic_exchange(volatile _Atomic(uintptr_t)* p, uintptr_t exchange);

-// Atomically exchange a value.
-static inline uintptr_t mi_atomic_exchange(volatile uintptr_t* p, uintptr_t exchange);
+// Atomically read a value. Memory ordering is relaxed.
+static inline uintptr_t mi_atomic_read_relaxed(const volatile _Atomic(uintptr_t)* p);

-// Atomically read a value
-static inline uintptr_t mi_atomic_read(volatile uintptr_t* p);
+// Atomically read a value. Memory ordering is acquire.
+static inline uintptr_t mi_atomic_read(const volatile _Atomic(uintptr_t)* p);

-// Atomically write a value
-static inline void mi_atomic_write(volatile uintptr_t* p, uintptr_t x);
-
-// Atomically read a pointer
-static inline void* mi_atomic_read_ptr(volatile void** p) {
-  return (void*)mi_atomic_read( (volatile uintptr_t*)p );
-}
+// Atomically write a value. Memory ordering is release.
+static inline void mi_atomic_write(volatile _Atomic(uintptr_t)* p, uintptr_t x);

+// Yield
 static inline void mi_atomic_yield(void);


+
+// Atomically add a value; returns the previous value.
+static inline uintptr_t mi_atomic_addu(volatile _Atomic(uintptr_t)* p, uintptr_t add) {
+  return (uintptr_t)mi_atomic_add((volatile _Atomic(intptr_t)*)p, (intptr_t)add);
+}
+// Atomically subtract a value; returns the previous value. (Negation is done in unsigned arithmetic so it cannot overflow.)
+static inline uintptr_t mi_atomic_subu(volatile _Atomic(uintptr_t)* p, uintptr_t sub) {
+  return (uintptr_t)mi_atomic_add((volatile _Atomic(intptr_t)*)p, (intptr_t)((uintptr_t)0 - sub));
+}
+
+// Atomically increment a value; returns the previous value (the result of `mi_atomic_addu`), not the incremented one.
+static inline uintptr_t mi_atomic_increment(volatile _Atomic(uintptr_t)* p) {
+  return mi_atomic_addu(p, 1);
+}
+
+// Atomically decrement a value; returns the previous value (the result of `mi_atomic_subu`), not the decremented one.
+static inline uintptr_t mi_atomic_decrement(volatile _Atomic(uintptr_t)* p) {
+  return mi_atomic_subu(p, 1);
+}
+
+// Atomically read a pointer; Memory order is relaxed.
+static inline void* mi_atomic_read_ptr_relaxed(volatile _Atomic(void*) const * p) {
+  return (void*)mi_atomic_read_relaxed((const volatile _Atomic(uintptr_t)*)p);
+}
+
+// Atomically read a pointer; Memory order is acquire.
+static inline void* mi_atomic_read_ptr(volatile _Atomic(void*) const * p) {
+  return (void*)mi_atomic_read((const volatile _Atomic(uintptr_t)*)p);
+}
+
 // Atomically write a pointer
-static inline void mi_atomic_write_ptr(volatile void** p, void* x) {
-  mi_atomic_write((volatile uintptr_t*)p, (uintptr_t)x );
+static inline void mi_atomic_write_ptr(volatile _Atomic(void*)* p, void* x) {
+  mi_atomic_write((volatile _Atomic(uintptr_t)*)p, (uintptr_t)x );
+}
+
+// Atomically compare and exchange a pointer; returns `true` if successful. May fail spuriously.
+// (Note: expected and desired are in opposite order from atomic_compare_exchange)
+static inline bool mi_atomic_cas_ptr_weak(volatile _Atomic(void*)* p, void* desired, void* expected) {
+  return mi_atomic_cas_weak((volatile _Atomic(uintptr_t)*)p, (uintptr_t)desired, (uintptr_t)expected);
 }

 // Atomically compare and exchange a pointer; returns `true` if successful.
-static inline bool mi_atomic_compare_exchange_ptr(volatile void** p, void* newp, void* compare) {
-  return mi_atomic_compare_exchange((volatile uintptr_t*)p, (uintptr_t)newp, (uintptr_t)compare);
+// (Note: expected and desired are in opposite order from atomic_compare_exchange)
+static inline bool mi_atomic_cas_ptr_strong(volatile _Atomic(void*)* p, void* desired, void* expected) {
+  return mi_atomic_cas_strong((volatile _Atomic(uintptr_t)*)p, (uintptr_t)desired, (uintptr_t)expected);
 }

 // Atomically exchange a pointer value.
-static inline void* mi_atomic_exchange_ptr(volatile void** p, void* exchange) {
-  return (void*)mi_atomic_exchange((volatile uintptr_t*)p, (uintptr_t)exchange);
+static inline void* mi_atomic_exchange_ptr(volatile _Atomic(void*)* p, void* exchange) {
+  return (void*)mi_atomic_exchange((volatile _Atomic(uintptr_t)*)p, (uintptr_t)exchange);
 }

-static inline intptr_t mi_atomic_iread(volatile intptr_t* p) {
-  return (intptr_t)mi_atomic_read( (volatile uintptr_t*)p );
-}

 #ifdef _MSC_VER
 #define WIN32_LEAN_AND_MEAN
 #include <windows.h>
 #include <intrin.h>
-#if (MI_INTPTR_SIZE==8)
+#ifdef _WIN64
 typedef LONG64   msc_intptr_t;
 #define RC64(f)  f##64
 #else
 typedef LONG     msc_intptr_t;
 #define RC64(f)  f
 #endif
-static inline uintptr_t mi_atomic_increment(volatile uintptr_t* p) {
-  return (uintptr_t)RC64(_InterlockedIncrement)((volatile msc_intptr_t*)p);
+static inline intptr_t mi_atomic_add(volatile _Atomic(intptr_t)* p, intptr_t add) {
+  return (intptr_t)RC64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, (msc_intptr_t)add);
 }
-static inline uint32_t mi_atomic_increment32(volatile uint32_t* p) {
-  return (uint32_t)_InterlockedIncrement((volatile LONG*)p);
+static inline bool mi_atomic_cas_strong(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) {
+  return (expected == RC64(_InterlockedCompareExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)desired, (msc_intptr_t)expected));
 }
-static inline uintptr_t mi_atomic_decrement(volatile uintptr_t* p) {
-  return (uintptr_t)RC64(_InterlockedDecrement)((volatile msc_intptr_t*)p);
+static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) {
+  return mi_atomic_cas_strong(p,desired,expected);
 }
-static inline uintptr_t mi_atomic_subtract(volatile uintptr_t* p, uintptr_t sub) {
-  return (uintptr_t)RC64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, -((msc_intptr_t)sub)) - sub;
-}
-static inline uint32_t mi_atomic_subtract32(volatile uint32_t* p, uint32_t sub) {
-  return (uint32_t)_InterlockedExchangeAdd((volatile LONG*)p, -((LONG)sub)) - sub;
-}
-static inline bool mi_atomic_compare_exchange32(volatile uint32_t* p, uint32_t exchange, uint32_t compare) {
-  return ((int32_t)compare == _InterlockedCompareExchange((volatile LONG*)p, (LONG)exchange, (LONG)compare));
-}
-static inline bool mi_atomic_compare_exchange(volatile uintptr_t* p, uintptr_t exchange, uintptr_t compare) {
-  return (compare == RC64(_InterlockedCompareExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)exchange, (msc_intptr_t)compare));
-}
-static inline uintptr_t mi_atomic_exchange(volatile uintptr_t* p, uintptr_t exchange) {
+static inline uintptr_t mi_atomic_exchange(volatile _Atomic(uintptr_t)* p, uintptr_t exchange) {
  return (uintptr_t)RC64(_InterlockedExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)exchange);
 }
-static inline uintptr_t mi_atomic_read(volatile uintptr_t* p) {
+static inline uintptr_t mi_atomic_read(volatile _Atomic(uintptr_t) const* p) {
  return *p;
 }
-static inline void mi_atomic_write(volatile uintptr_t* p, uintptr_t x) {
-  *p = x;
+static inline uintptr_t mi_atomic_read_relaxed(volatile _Atomic(uintptr_t) const* p) {
+  return mi_atomic_read(p);
+}
+static inline void mi_atomic_write(volatile _Atomic(uintptr_t)* p, uintptr_t x) {
+  mi_atomic_exchange(p,x);
 }
 static inline void mi_atomic_yield(void) {
  YieldProcessor();
 }
-static inline int64_t mi_atomic_add(volatile int64_t* p, int64_t add) {
-  #if (MI_INTPTR_SIZE==8)
-  return _InterlockedExchangeAdd64(p, add) + add;
+static inline void mi_atomic_add64(volatile _Atomic(int64_t)* p, int64_t add) {
+  #ifdef _WIN64
+  mi_atomic_add(p,add);
  #else
  int64_t current;
  int64_t sum;
@ -126,62 +160,46 @@ static inline int64_t mi_atomic_add(volatile int64_t* p, int64_t add) {
    current = *p;
    sum = current + add;
  } while (_InterlockedCompareExchange64(p, sum, current) != current);
-  return sum;
  #endif
 }

 #else
 #ifdef __cplusplus
-#include <atomic>
 #define  MI_USING_STD   using namespace std;
-#define  _Atomic(tp)    atomic<tp>
 #else
-#include <stdatomic.h>
 #define  MI_USING_STD
 #endif
-static inline uintptr_t mi_atomic_increment(volatile uintptr_t* p) {
+static inline void mi_atomic_add64(volatile int64_t* p, int64_t add) {
  MI_USING_STD
-  return atomic_fetch_add_explicit((volatile atomic_uintptr_t*)p, (uintptr_t)1, memory_order_relaxed) + 1;
+  atomic_fetch_add_explicit((volatile _Atomic(int64_t)*)p, add, memory_order_relaxed);
 }
-static inline uint32_t mi_atomic_increment32(volatile uint32_t* p) {
+static inline intptr_t mi_atomic_add(volatile _Atomic(intptr_t)* p, intptr_t add) {
  MI_USING_STD
-  return atomic_fetch_add_explicit((volatile _Atomic(uint32_t)*)p, (uint32_t)1, memory_order_relaxed) + 1;
+  return atomic_fetch_add_explicit(p, add, memory_order_relaxed);
 }
-static inline uintptr_t mi_atomic_decrement(volatile uintptr_t* p) {
+static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) {
  MI_USING_STD
-  return atomic_fetch_sub_explicit((volatile atomic_uintptr_t*)p, (uintptr_t)1, memory_order_relaxed) - 1;
+  return atomic_compare_exchange_weak_explicit(p, &expected, desired, memory_order_release, memory_order_relaxed);
 }
-static inline int64_t mi_atomic_add(volatile int64_t* p, int64_t add) {
+static inline bool mi_atomic_cas_strong(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) {
  MI_USING_STD
-  return atomic_fetch_add_explicit((volatile _Atomic(int64_t)*)p, add, memory_order_relaxed) + add;
+  return atomic_compare_exchange_strong_explicit(p, &expected, desired, memory_order_acq_rel, memory_order_relaxed);
 }
-static inline uintptr_t mi_atomic_subtract(volatile uintptr_t* p, uintptr_t sub) {
+static inline uintptr_t mi_atomic_exchange(volatile _Atomic(uintptr_t)* p, uintptr_t exchange) {
  MI_USING_STD
-  return atomic_fetch_sub_explicit((volatile atomic_uintptr_t*)p, sub, memory_order_relaxed) - sub;
+  return atomic_exchange_explicit(p, exchange, memory_order_acq_rel);
 }
-static inline uint32_t mi_atomic_subtract32(volatile uint32_t* p, uint32_t sub) {
+static inline uintptr_t mi_atomic_read_relaxed(const volatile _Atomic(uintptr_t)* p) {
  MI_USING_STD
-  return atomic_fetch_sub_explicit((volatile _Atomic(uint32_t)*)p, sub, memory_order_relaxed) - sub;
+  return atomic_load_explicit((volatile _Atomic(uintptr_t)*) p, memory_order_relaxed);
 }
-static inline bool mi_atomic_compare_exchange32(volatile uint32_t* p, uint32_t exchange, uint32_t compare) {
+static inline uintptr_t mi_atomic_read(const volatile _Atomic(uintptr_t)* p) {
  MI_USING_STD
-  return atomic_compare_exchange_weak_explicit((volatile _Atomic(uint32_t)*)p, &compare, exchange, memory_order_release, memory_order_relaxed);
+  return atomic_load_explicit((volatile _Atomic(uintptr_t)*) p, memory_order_acquire);
 }
-static inline bool mi_atomic_compare_exchange(volatile uintptr_t* p, uintptr_t exchange, uintptr_t compare) {
+static inline void mi_atomic_write(volatile _Atomic(uintptr_t)* p, uintptr_t x) {
  MI_USING_STD
-  return atomic_compare_exchange_weak_explicit((volatile atomic_uintptr_t*)p, &compare, exchange, memory_order_release, memory_order_relaxed);
-}
-static inline uintptr_t mi_atomic_exchange(volatile uintptr_t* p, uintptr_t exchange) {
-  MI_USING_STD
-  return atomic_exchange_explicit((volatile atomic_uintptr_t*)p, exchange, memory_order_acquire);
-}
-static inline uintptr_t mi_atomic_read(volatile uintptr_t* p) {
-  MI_USING_STD
-  return atomic_load_explicit((volatile atomic_uintptr_t*)p, memory_order_relaxed);
-}
-static inline void mi_atomic_write(volatile uintptr_t* p, uintptr_t x) {
-  MI_USING_STD
-  return atomic_store_explicit((volatile atomic_uintptr_t*)p, x, memory_order_relaxed);
+  atomic_store_explicit(p, x, memory_order_release);  // release store of `x`; plain statement, since C forbids `return <expr>;` in a void function
 }

 #if defined(__cplusplus)
--- a/include/mimalloc-internal.h
+++ b/include/mimalloc-internal.h
@ -22,12 +22,13 @@ terms of the MIT license. A copy of the license can be found in the file


 // "options.c"
-void       _mi_fputs(FILE* out, const char* prefix, const char* message);
-void       _mi_fprintf(FILE* out, const char* fmt, ...);
+void       _mi_fputs(mi_output_fun* out, const char* prefix, const char* message);
+void       _mi_fprintf(mi_output_fun* out, const char* fmt, ...);
 void       _mi_error_message(const char* fmt, ...);
 void       _mi_warning_message(const char* fmt, ...);
 void       _mi_verbose_message(const char* fmt, ...);
 void       _mi_trace_message(const char* fmt, ...);
+void       _mi_options_init(void);

 // "init.c"
 extern mi_stats_t       _mi_stats_main;
@ -45,8 +46,7 @@ void*      _mi_os_alloc(size_t size, mi_stats_t* stats);           // to allocat
 void       _mi_os_free(void* p, size_t size, mi_stats_t* stats);   // to free thread local data

 // memory.c
-void*      _mi_mem_alloc_aligned(size_t size, size_t alignment, bool commit, size_t* id, mi_os_tld_t* tld);
-void*      _mi_mem_alloc(size_t size, bool commit, size_t* id, mi_os_tld_t* tld);
+void*      _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, size_t* id, mi_os_tld_t* tld);
 void       _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats);

 bool       _mi_mem_reset(void* p, size_t size, mi_stats_t* stats);
@ -318,39 +318,24 @@ static inline mi_page_queue_t* mi_page_queue(const mi_heap_t* heap, size_t size)
 }


+
 //-----------------------------------------------------------
 // Page flags
 //-----------------------------------------------------------
-static inline uintptr_t mi_page_thread_id(const mi_page_t* page) {
-  return (page->flags & ~MI_PAGE_FLAGS_MASK);
-}
-
-static inline void mi_page_init_flags(mi_page_t* page, uintptr_t thread_id) {
-  mi_assert_internal((thread_id & MI_PAGE_FLAGS_MASK) == 0);
-  page->flags = thread_id;
-}
-
-static inline void mi_page_set_thread_id(mi_page_t* page, uintptr_t thread_id) {
-  mi_assert_internal((thread_id & MI_PAGE_FLAGS_MASK) == 0);
-  page->flags = thread_id | (page->flags & MI_PAGE_FLAGS_MASK);
-}
-
 static inline bool mi_page_is_in_full(const mi_page_t* page) {
-  return ((page->flags & 0x01) != 0);
+  return page->flags.in_full;
 }

 static inline void mi_page_set_in_full(mi_page_t* page, bool in_full) {
-  if (in_full) page->flags |= 0x01;
-          else page->flags &= ~0x01;
+  page->flags.in_full = in_full;
 }

 static inline bool mi_page_has_aligned(const mi_page_t* page) {
-  return ((page->flags & 0x02) != 0);
+  return page->flags.has_aligned;
 }

 static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) {
-  if (has_aligned) page->flags |= 0x02;
-              else page->flags &= ~0x02;
+  page->flags.has_aligned = has_aligned;
 }


--- a/include/mimalloc-types.h
+++ b/include/mimalloc-types.h
@ -10,6 +10,7 @@ terms of the MIT license. A copy of the license can be found in the file

 #include <stddef.h>   // ptrdiff_t
 #include <stdint.h>   // uintptr_t, uint16_t, etc
+#include <mimalloc-atomic.h>  // _Atomic

 // ------------------------------------------------------
 // Variants
@ -91,11 +92,13 @@ terms of the MIT license. A copy of the license can be found in the file
 #define MI_MEDIUM_PAGES_PER_SEGMENT       (MI_SEGMENT_SIZE/MI_MEDIUM_PAGE_SIZE)
 #define MI_LARGE_PAGES_PER_SEGMENT        (MI_SEGMENT_SIZE/MI_LARGE_PAGE_SIZE)

-#define MI_SMALL_OBJ_SIZE_MAX             (MI_SMALL_PAGE_SIZE/4)
-#define MI_MEDIUM_OBJ_SIZE_MAX            (MI_MEDIUM_PAGE_SIZE/4)   // 128kb on 64-bit
-#define MI_LARGE_OBJ_SIZE_MAX             (MI_LARGE_PAGE_SIZE/2)    // 2Mb on 64-bit
-#define MI_LARGE_OBJ_WSIZE_MAX            (MI_LARGE_OBJ_SIZE_MAX>>MI_INTPTR_SHIFT)
-#define MI_HUGE_OBJ_SIZE_MAX              (2*MI_INTPTR_SIZE*MI_SEGMENT_SIZE)  // (must match MI_REGION_MAX_ALLOC_SIZE in memory.c)
+// The max object sizes are checked to not waste more than 12.5% internally over the page sizes.
+// (Except for large pages since huge objects are allocated in 4MiB chunks)
+#define MI_SMALL_OBJ_SIZE_MAX             (MI_SMALL_PAGE_SIZE/4)   // 16kb
+#define MI_MEDIUM_OBJ_SIZE_MAX            (MI_MEDIUM_PAGE_SIZE/4)  // 128kb
+#define MI_LARGE_OBJ_SIZE_MAX             (MI_LARGE_PAGE_SIZE/2)   // 2mb 
+#define MI_LARGE_OBJ_WSIZE_MAX            (MI_LARGE_OBJ_SIZE_MAX/MI_INTPTR_SIZE)     
+#define MI_HUGE_OBJ_SIZE_MAX              (2*MI_INTPTR_SIZE*MI_SEGMENT_SIZE)        // (must match MI_REGION_MAX_ALLOC_SIZE in memory.c)

 // Minimal alignment necessary. On most platforms 16 bytes are needed
 // due to SSE registers for example. This must be at least `MI_INTPTR_SIZE`
@ -124,12 +127,15 @@ typedef enum mi_delayed_e {
 } mi_delayed_t;


-// Use the bottom 2 bits for the `in_full` and `has_aligned` flags
-// and the rest for the threadid (we assume tid's never use those lower 2 bits).
-// This allows a single test in `mi_free` to check for unlikely cases
-// (namely, non-local free, aligned free, or freeing in a full page)
-#define MI_PAGE_FLAGS_MASK  ((uintptr_t)0x03)
-typedef uintptr_t mi_page_flags_t;
+// The `in_full` and `has_aligned` page flags are put in a union to efficiently 
+// test if both are false (`value == 0`) in the `mi_free` routine.
+typedef union mi_page_flags_u {
+  uint16_t value;       // overlays the two flags below; NOTE(review): covers exactly both flags only if `sizeof(bool) == 1` -- confirm per target
+  struct {
+    bool in_full;       // accessed via `mi_page_is_in_full` / `mi_page_set_in_full`
+    bool has_aligned;   // accessed via `mi_page_has_aligned` / `mi_page_set_has_aligned`
+  };
+} mi_page_flags_t;

 // Thread free list.
 // We use the bottom 2 bits of the pointer for mi_delayed_t flags
@ -161,19 +167,19 @@ typedef struct mi_page_s {
  bool                  is_committed:1;    // `true` if the page virtual memory is committed

  // layout like this to optimize access in `mi_malloc` and `mi_free`
-  uint16_t              capacity;          // number of blocks committed
+  uint16_t              capacity;          // number of blocks committed, must be the first field, see `segment.c:page_clear`
  uint16_t              reserved;          // number of blocks reserved in memory
-                                           // 16 bits padding
+  mi_page_flags_t       flags;             // `in_full` and `has_aligned` flags (16 bits)
+
  mi_block_t*           free;              // list of available free blocks (`malloc` allocates from this list)
  #if MI_SECURE
  uintptr_t             cookie;            // random cookie to encode the free lists
  #endif
-  mi_page_flags_t       flags;             // threadid:62 | has_aligned:1 | in_full:1
  size_t                used;              // number of blocks in use (including blocks in `local_free` and `thread_free`)
  
  mi_block_t*           local_free;        // list of deferred free blocks by this thread (migrates to `free`)
-  volatile uintptr_t    thread_freed;      // at least this number of blocks are in `thread_free`
-  volatile mi_thread_free_t thread_free;   // list of deferred free blocks freed by other threads
+  volatile _Atomic(uintptr_t)        thread_freed;  // at least this number of blocks are in `thread_free`
+  volatile _Atomic(mi_thread_free_t) thread_free;   // list of deferred free blocks freed by other threads

  // less accessed info
  size_t                block_size;        // size available in each block (always `>0`)
@ -181,12 +187,11 @@ typedef struct mi_page_s {
  struct mi_page_s*     next;              // next page owned by this thread with the same `block_size`
  struct mi_page_s*     prev;              // previous page owned by this thread with the same `block_size`

-// improve page index calculation
-#if (MI_INTPTR_SIZE==8 && MI_SECURE==0)
-  void*                 padding[1];        // 12 words on 64-bit
-#elif MI_INTPTR_SIZE==4
-  // void*                 padding[1];         // 12 words on 32-bit
-#endif
+  // improve page index calculation
+  // without padding: 10 words on 64-bit, 11 on 32-bit. Secure adds one word
+  #if (MI_INTPTR_SIZE==8 && MI_SECURE>0) || (MI_INTPTR_SIZE==4 && MI_SECURE==0)
+  void*                 padding[1];        // 12 words on 64-bit in secure mode, 12 words on 32-bit plain
+  #endif
 } mi_page_t;


@ -202,20 +207,25 @@ typedef enum mi_page_kind_e {
 // the OS. Inside segments we allocated fixed size _pages_ that
 // contain blocks.
 typedef struct mi_segment_s {
-  struct mi_segment_s* next;
+  // memory fields
+  size_t          memid;            // id for the os-level memory manager
+  bool            mem_is_fixed;     // `true` if we cannot decommit/reset/protect in this memory (i.e. when allocated using large OS pages)    
+  bool            mem_is_committed; // `true` if the whole segment is eagerly committed
+
+  // segment fields
+  struct mi_segment_s* next;   // must be the first segment field -- see `segment.c:segment_alloc`
  struct mi_segment_s* prev;
-  struct mi_segment_s* abandoned_next;
+  volatile _Atomic(struct mi_segment_s*) abandoned_next;
  size_t          abandoned;   // abandoned pages (i.e. the original owning thread stopped) (`abandoned <= used`)
  size_t          used;        // count of pages in use (`used <= capacity`)
  size_t          capacity;    // count of available pages (`#free + used`)
  size_t          segment_size;// for huge pages this may be different from `MI_SEGMENT_SIZE`
  size_t          segment_info_size;  // space we are using from the first page for segment meta-data and possible guard pages.
  uintptr_t       cookie;      // verify addresses in debug mode: `mi_ptr_cookie(segment) == segment->cookie`
-  size_t          memid;       // id for the os-level memory manager

  // layout like this to optimize access in `mi_free`
  size_t          page_shift;  // `1 << page_shift` == the page sizes == `page->block_size * page->reserved` (unless the first page, then `-segment_info_size`).
-  volatile uintptr_t thread_id;   // unique id of the thread owning this segment
+  volatile _Atomic(uintptr_t) thread_id;   // unique id of the thread owning this segment
  mi_page_kind_t  page_kind;   // kind of pages: small, large, or huge
  mi_page_t       pages[1];    // up to `MI_SMALL_PAGES_PER_SEGMENT` pages
 } mi_segment_t;
@ -251,7 +261,7 @@ struct mi_heap_s {
  mi_tld_t*             tld;
  mi_page_t*            pages_free_direct[MI_SMALL_WSIZE_MAX + 2];   // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size.
  mi_page_queue_t       pages[MI_BIN_FULL + 1];                      // queue of pages for each size class (or "bin")
-  volatile mi_block_t*  thread_delayed_free;
+  volatile _Atomic(mi_block_t*) thread_delayed_free;
  uintptr_t             thread_id;                                   // thread this heap belongs too
  uintptr_t             cookie;
  uintptr_t             random;                                      // random number used for secure allocation
--- a/include/mimalloc.h
+++ b/include/mimalloc.h
@ -53,8 +53,8 @@ terms of the MIT license. A copy of the license can be found in the file
  #else
  #define mi_attr_alloc_size(s)       __attribute__((alloc_size(s)))
  #define mi_attr_alloc_size2(s1,s2)  __attribute__((alloc_size(s1,s2)))
-  #define mi_cdecl                    // leads to warnings... __attribute__((cdecl))
  #endif
+  #define mi_cdecl                    // leads to warnings... __attribute__((cdecl))
 #else
  #define mi_decl_thread              __thread
  #define mi_decl_export
@ -69,8 +69,8 @@ terms of the MIT license. A copy of the license can be found in the file
 // Includes
 // ------------------------------------------------------

+#include <stddef.h>     // size_t
 #include <stdbool.h>    // bool
-#include <stdio.h>      // FILE

 #ifdef __cplusplus
 extern "C" {
@ -107,18 +107,23 @@ mi_decl_export mi_decl_allocator void* mi_reallocf(void* p, size_t newsize)
 mi_decl_export size_t mi_usable_size(const void* p)   mi_attr_noexcept;
 mi_decl_export size_t mi_good_size(size_t size)       mi_attr_noexcept;

+typedef void (mi_deferred_free_fun)(bool force, unsigned long long heartbeat);
+mi_decl_export void mi_register_deferred_free(mi_deferred_free_fun* deferred_free) mi_attr_noexcept;
+
+typedef void (mi_output_fun)(const char* msg);
+mi_decl_export void mi_register_output(mi_output_fun* out) mi_attr_noexcept;
+
 mi_decl_export void mi_collect(bool force)    mi_attr_noexcept;
-mi_decl_export void mi_stats_print(FILE* out) mi_attr_noexcept;
-mi_decl_export void mi_stats_reset(void)      mi_attr_noexcept;
 mi_decl_export int  mi_version(void)          mi_attr_noexcept;
+mi_decl_export void mi_stats_reset(void)      mi_attr_noexcept;
+mi_decl_export void mi_stats_merge(void)      mi_attr_noexcept;
+mi_decl_export void mi_stats_print(mi_output_fun* out) mi_attr_noexcept;

 mi_decl_export void mi_process_init(void)     mi_attr_noexcept;
 mi_decl_export void mi_thread_init(void)      mi_attr_noexcept;
 mi_decl_export void mi_thread_done(void)      mi_attr_noexcept;
-mi_decl_export void mi_thread_stats_print(FILE* out) mi_attr_noexcept;
+mi_decl_export void mi_thread_stats_print(mi_output_fun* out) mi_attr_noexcept;

-typedef void (mi_deferred_free_fun)(bool force, unsigned long long heartbeat);
-mi_decl_export void mi_register_deferred_free(mi_deferred_free_fun* deferred_free) mi_attr_noexcept;

 // ------------------------------------------------------
 // Aligned allocation
@ -229,9 +234,13 @@ typedef enum mi_option_e {
  mi_option_eager_region_commit,
  mi_option_large_os_pages,         // implies eager commit
  mi_option_reserve_huge_os_pages,
+  mi_option_segment_cache,
  mi_option_page_reset,
  mi_option_cache_reset,
  mi_option_reset_decommits,
+  mi_option_eager_commit_delay,
+  mi_option_segment_reset,
+  mi_option_os_tag,
  _mi_option_last
 } mi_option_t;