Merge branch 'dev' into kile/stl

commit 97bd204c42
Author: Daan
Date:   2020-01-16 15:33:13 -08:00 (committed by GitHub)
34 changed files with 3055 additions and 1497 deletions


@@ -36,6 +36,13 @@ static inline void mi_atomic_add64(volatile int64_t* p, int64_t add);
// Atomically add a value; returns the previous value. Memory ordering is relaxed.
static inline intptr_t mi_atomic_add(volatile _Atomic(intptr_t)* p, intptr_t add);
// Atomically "and" a value; returns the previous value. Memory ordering is relaxed.
static inline uintptr_t mi_atomic_and(volatile _Atomic(uintptr_t)* p, uintptr_t x);
// Atomically "or" a value; returns the previous value. Memory ordering is relaxed.
static inline uintptr_t mi_atomic_or(volatile _Atomic(uintptr_t)* p, uintptr_t x);
// Atomically compare and exchange a value; returns `true` if successful.
// May fail spuriously. Memory ordering is release on success, and relaxed on failure.
// (Note: expected and desired are in opposite order from atomic_compare_exchange)
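For reference, the semantics documented above map directly onto C11 <stdatomic.h> operations; the following is a standalone sketch (the `demo_` names are hypothetical, not part of mimalloc):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

// Relaxed fetch-and-add that returns the previous value, as documented for `mi_atomic_add`.
static inline intptr_t demo_atomic_add(volatile _Atomic(intptr_t)* p, intptr_t add) {
  return atomic_fetch_add_explicit(p, add, memory_order_relaxed);
}

// Weak compare-and-exchange: may fail spuriously, release ordering on success,
// relaxed on failure. Note that `desired` comes before `expected`, the reverse
// of C11 `atomic_compare_exchange_weak`.
static inline bool demo_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) {
  return atomic_compare_exchange_weak_explicit(p, &expected, desired,
                                               memory_order_release, memory_order_relaxed);
}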
@@ -121,22 +128,28 @@ static inline void* mi_atomic_exchange_ptr(volatile _Atomic(void*)* p, void* exc
#include <intrin.h>
#ifdef _WIN64
typedef LONG64 msc_intptr_t;
#define RC64(f) f##64
#define MI_64(f) f##64
#else
typedef LONG msc_intptr_t;
#define RC64(f) f
#define MI_64(f) f
#endif
static inline intptr_t mi_atomic_add(volatile _Atomic(intptr_t)* p, intptr_t add) {
return (intptr_t)RC64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, (msc_intptr_t)add);
return (intptr_t)MI_64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, (msc_intptr_t)add);
}
static inline uintptr_t mi_atomic_and(volatile _Atomic(uintptr_t)* p, uintptr_t x) {
return (uintptr_t)MI_64(_InterlockedAnd)((volatile msc_intptr_t*)p, (msc_intptr_t)x);
}
static inline uintptr_t mi_atomic_or(volatile _Atomic(uintptr_t)* p, uintptr_t x) {
return (uintptr_t)MI_64(_InterlockedOr)((volatile msc_intptr_t*)p, (msc_intptr_t)x);
}
static inline bool mi_atomic_cas_strong(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) {
return (expected == (uintptr_t)RC64(_InterlockedCompareExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)desired, (msc_intptr_t)expected));
return (expected == (uintptr_t)MI_64(_InterlockedCompareExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)desired, (msc_intptr_t)expected));
}
static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) {
return mi_atomic_cas_strong(p,desired,expected);
}
static inline uintptr_t mi_atomic_exchange(volatile _Atomic(uintptr_t)* p, uintptr_t exchange) {
return (uintptr_t)RC64(_InterlockedExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)exchange);
return (uintptr_t)MI_64(_InterlockedExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)exchange);
}
static inline uintptr_t mi_atomic_read(volatile _Atomic(uintptr_t) const* p) {
return *p;
@@ -177,6 +190,14 @@ static inline intptr_t mi_atomic_add(volatile _Atomic(intptr_t)* p, intptr_t add
MI_USING_STD
return atomic_fetch_add_explicit(p, add, memory_order_relaxed);
}
static inline uintptr_t mi_atomic_and(volatile _Atomic(uintptr_t)* p, uintptr_t x) {
MI_USING_STD
return atomic_fetch_and_explicit(p, x, memory_order_relaxed);
}
static inline uintptr_t mi_atomic_or(volatile _Atomic(uintptr_t)* p, uintptr_t x) {
MI_USING_STD
return atomic_fetch_or_explicit(p, x, memory_order_relaxed);
}
static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) {
MI_USING_STD
return atomic_compare_exchange_weak_explicit(p, &expected, desired, memory_order_release, memory_order_relaxed);


@@ -10,31 +10,31 @@ terms of the MIT license. A copy of the license can be found in the file
#include "mimalloc-types.h"
#if defined(MI_MALLOC_OVERRIDE) && (defined(__APPLE__) || defined(__OpenBSD__))
#if defined(MI_MALLOC_OVERRIDE) && (defined(__APPLE__) || defined(__OpenBSD__) || defined(__DragonFly__))
#define MI_TLS_RECURSE_GUARD
#endif
#if (MI_DEBUG>0)
#define mi_trace_message(...) _mi_trace_message(__VA_ARGS__)
#else
#define mi_trace_message(...)
#endif
#if defined(_MSC_VER)
#define mi_decl_noinline __declspec(noinline)
#define mi_attr_noreturn
#elif defined(__GNUC__) || defined(__clang__)
#define mi_decl_noinline __attribute__((noinline))
#define mi_attr_noreturn __attribute__((noreturn))
#else
#define mi_decl_noinline
#define mi_attr_noreturn
#endif
// "options.c"
void _mi_fputs(mi_output_fun* out, const char* prefix, const char* message);
void _mi_fprintf(mi_output_fun* out, const char* fmt, ...);
void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message);
void _mi_fprintf(mi_output_fun* out, void* arg, const char* fmt, ...);
void _mi_error_message(const char* fmt, ...);
void _mi_warning_message(const char* fmt, ...);
void _mi_verbose_message(const char* fmt, ...);
@@ -42,12 +42,17 @@ void _mi_trace_message(const char* fmt, ...);
void _mi_options_init(void);
void _mi_fatal_error(const char* fmt, ...) mi_attr_noreturn;
// "init.c"
// random.c
void _mi_random_init(mi_random_ctx_t* ctx);
void _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* new_ctx);
uintptr_t _mi_random_next(mi_random_ctx_t* ctx);
uintptr_t _mi_heap_random_next(mi_heap_t* heap);
static inline uintptr_t _mi_random_shuffle(uintptr_t x);
// init.c
extern mi_stats_t _mi_stats_main;
extern const mi_page_t _mi_page_empty;
bool _mi_is_main_thread(void);
uintptr_t _mi_random_shuffle(uintptr_t x);
uintptr_t _mi_random_init(uintptr_t seed /* can be zero */);
bool _mi_preloading(); // true while the C runtime is not ready
// os.c
@@ -59,15 +64,15 @@ size_t _mi_os_good_alloc_size(size_t size);
// memory.c
void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, size_t* id, mi_os_tld_t* tld);
void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats);
void _mi_mem_free(void* p, size_t size, size_t id, bool fully_committed, bool any_reset, mi_os_tld_t* tld);
bool _mi_mem_reset(void* p, size_t size, mi_stats_t* stats);
bool _mi_mem_unreset(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
bool _mi_mem_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
bool _mi_mem_reset(void* p, size_t size, mi_os_tld_t* tld);
bool _mi_mem_unreset(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld);
bool _mi_mem_commit(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld);
bool _mi_mem_protect(void* addr, size_t size);
bool _mi_mem_unprotect(void* addr, size_t size);
void _mi_mem_collect(mi_stats_t* stats);
void _mi_mem_collect(mi_os_tld_t* tld);
// "segment.c"
mi_page_t* _mi_segment_page_alloc(size_t block_wsize, mi_segments_tld_t* tld, mi_os_tld_t* os_tld);
@@ -75,7 +80,7 @@ void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t*
void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld);
bool _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segments_tld_t* tld);
void _mi_segment_thread_collect(mi_segments_tld_t* tld);
uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t block_size, size_t* page_size); // page start for any page
uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t block_size, size_t* page_size, size_t* pre_size); // page start for any page
// "page.c"
void* _mi_malloc_generic(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc;
@@ -85,8 +90,9 @@ void _mi_page_unfull(mi_page_t* page);
void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force); // free the page
void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq); // abandon the page, to be picked up by another thread...
void _mi_heap_delayed_free(mi_heap_t* heap);
void _mi_heap_collect_retired(mi_heap_t* heap, bool force);
void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay);
void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never);
size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append);
void _mi_deferred_free(mi_heap_t* heap, bool force);
@@ -100,13 +106,14 @@ uint8_t _mi_bsr(uintptr_t x); // bit-scan-right, used on BSD i
// "heap.c"
void _mi_heap_destroy_pages(mi_heap_t* heap);
void _mi_heap_collect_abandon(mi_heap_t* heap);
uintptr_t _mi_heap_random(mi_heap_t* heap);
void _mi_heap_set_default_direct(mi_heap_t* heap);
// "stats.c"
void _mi_stats_done(mi_stats_t* stats);
double _mi_clock_end(double start);
double _mi_clock_start(void);
mi_msecs_t _mi_clock_now(void);
mi_msecs_t _mi_clock_end(mi_msecs_t start);
mi_msecs_t _mi_clock_start(void);
// "alloc.c"
void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept; // called from `_mi_malloc_generic`
@@ -143,8 +150,8 @@ bool _mi_page_is_valid(mi_page_t* page);
Inlined definitions
----------------------------------------------------------- */
#define UNUSED(x) (void)(x)
#if (MI_DEBUG>0)
#define UNUSED_RELEASE(x)
#else
#define UNUSED_RELEASE(x) UNUSED(x)
#endif
@@ -159,7 +166,6 @@ bool _mi_page_is_valid(mi_page_t* page);
// Overflow detecting multiply
#define MI_MUL_NO_OVERFLOW ((size_t)1 << (4*sizeof(size_t))) // sqrt(SIZE_MAX)
static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {
#if __has_builtin(__builtin_umul_overflow) || __GNUC__ >= 5
#include <limits.h> // UINT_MAX, ULONG_MAX
@@ -171,6 +177,7 @@ static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {
return __builtin_umulll_overflow(count, size, total);
#endif
#else /* __builtin_umul_overflow is unavailable */
#define MI_MUL_NO_OVERFLOW ((size_t)1 << (4*sizeof(size_t))) // sqrt(SIZE_MAX)
*total = count * size;
return ((size >= MI_MUL_NO_OVERFLOW || count >= MI_MUL_NO_OVERFLOW)
&& size > 0 && (SIZE_MAX / size) < count);
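To see why the fallback above suffices: MI_MUL_NO_OVERFLOW is sqrt(SIZE_MAX), so `count*size` can only overflow when at least one operand reaches that bound, and the division is only paid in that rare case. A usage sketch (the `demo_calloc` wrapper is hypothetical):

#include <stddef.h>
#include <stdlib.h>   // malloc

// Hypothetical calloc-style wrapper that rejects overflowing requests
// by using the overflow-detecting multiply above.
static void* demo_calloc(size_t count, size_t size) {
  size_t total;
  if (mi_mul_overflow(count, size, &total)) return NULL;  // count*size would exceed SIZE_MAX
  return malloc(total);
}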
@@ -184,6 +191,7 @@ static inline bool _mi_is_power_of_two(uintptr_t x) {
// Align upwards
static inline uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) {
mi_assert_internal(alignment != 0);
uintptr_t mask = alignment - 1;
if ((alignment & mask) == 0) { // power of two?
return ((sz + mask) & ~mask);
@@ -193,6 +201,12 @@ static inline uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) {
}
}
// Divide upwards: `s <= _mi_divide_up(s,d)*d < s+d`.
static inline uintptr_t _mi_divide_up(uintptr_t size, size_t divider) {
mi_assert_internal(divider != 0);
return (divider == 0 ? size : ((size + divider - 1) / divider));
}
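A few illustrative checks for the two helpers above (the `demo_` function is hypothetical; values chosen for clarity):

#include <assert.h>
#include <stdint.h>

static void demo_round_up(void) {
  assert(_mi_align_up(100, 64) == 128);   // power-of-two alignment uses the mask trick
  assert(_mi_align_up(128, 64) == 128);   // already-aligned values are unchanged
  assert(_mi_divide_up(100, 64) == 2);    // two 64-byte blocks cover 100 bytes
  assert(_mi_divide_up(128, 64) == 2);    // exact multiples are not rounded further
}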
// Is memory zero initialized?
static inline bool mi_mem_is_zero(void* p, size_t size) {
for (size_t i = 0; i < size; i++) {
@@ -221,7 +235,7 @@ extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate
static inline mi_heap_t* mi_get_default_heap(void) {
#ifdef MI_TLS_RECURSE_GUARD
// on some platforms, like macOS, the dynamic loader calls `malloc`
// on some BSD platforms, like macOS, the dynamic loader calls `malloc`
// to initialize thread local data. To avoid recursion, we need to avoid
// accessing the thread local `_mi_default_heap` until our module is loaded
// and use the statically allocated main heap until that time.
@@ -279,7 +293,7 @@ static inline mi_segment_t* _mi_page_segment(const mi_page_t* page) {
static inline uintptr_t _mi_segment_page_idx_of(const mi_segment_t* segment, const void* p) {
// if (segment->page_size > MI_SEGMENT_SIZE) return &segment->pages[0]; // huge pages
ptrdiff_t diff = (uint8_t*)p - (uint8_t*)segment;
mi_assert_internal(diff >= 0 && diff < MI_SEGMENT_SIZE);
mi_assert_internal(diff >= 0 && (size_t)diff < MI_SEGMENT_SIZE);
uintptr_t idx = (uintptr_t)diff >> segment->page_shift;
mi_assert_internal(idx < segment->capacity);
mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM || idx == 0);
@@ -294,7 +308,9 @@ static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const
// Quick page start for initialized pages
static inline uint8_t* _mi_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size) {
return _mi_segment_page_start(segment, page, page->block_size, page_size);
const size_t bsize = page->xblock_size;
mi_assert_internal(bsize > 0 && (bsize%sizeof(void*)) == 0);
return _mi_segment_page_start(segment, page, bsize, page_size, NULL);
}
// Get the page containing the pointer
@@ -302,7 +318,40 @@ static inline mi_page_t* _mi_ptr_page(void* p) {
return _mi_segment_page_of(_mi_ptr_segment(p), p);
}
// Get the block size of a page (special cased for huge objects)
static inline size_t mi_page_block_size(const mi_page_t* page) {
const size_t bsize = page->xblock_size;
mi_assert_internal(bsize > 0);
if (mi_likely(bsize < MI_HUGE_BLOCK_SIZE)) {
return bsize;
}
else {
size_t psize;
_mi_segment_page_start(_mi_page_segment(page), page, bsize, &psize, NULL);
return psize;
}
}
// Thread free access
static inline mi_block_t* mi_page_thread_free(const mi_page_t* page) {
return (mi_block_t*)(mi_atomic_read_relaxed(&page->xthread_free) & ~3);
}
static inline mi_delayed_t mi_page_thread_free_flag(const mi_page_t* page) {
return (mi_delayed_t)(mi_atomic_read_relaxed(&page->xthread_free) & 3);
}
// Heap access
static inline mi_heap_t* mi_page_heap(const mi_page_t* page) {
return (mi_heap_t*)(mi_atomic_read_relaxed(&page->xheap));
}
static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) {
mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING);
mi_atomic_write(&page->xheap,(uintptr_t)heap);
}
// Thread free flag helpers
static inline mi_block_t* mi_tf_block(mi_thread_free_t tf) {
return (mi_block_t*)(tf & ~0x03);
}
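The `xthread_free` word packs a block pointer together with a 2-bit `mi_delayed_t` flag in its low bits, which are otherwise zero because blocks are at least pointer-aligned. A minimal sketch of that packing; `demo_tf_make` is a hypothetical name, and the decode side mirrors `mi_tf_block` and `mi_page_thread_free_flag` above:

// Pack a block pointer and a 2-bit delayed-free flag into a single word.
static inline mi_thread_free_t demo_tf_make(mi_block_t* block, mi_delayed_t delay) {
  return ((mi_thread_free_t)(uintptr_t)block | (mi_thread_free_t)delay);
}
// Decoding:
//   block = (mi_block_t*)(tf & ~0x03);
//   delay = (mi_delayed_t)(tf & 0x03);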
@@ -322,7 +371,7 @@ static inline mi_thread_free_t mi_tf_set_block(mi_thread_free_t tf, mi_block_t*
// are all blocks in a page freed?
static inline bool mi_page_all_free(const mi_page_t* page) {
mi_assert_internal(page != NULL);
return (page->used - page->thread_freed == 0);
return (page->used == 0);
}
// are there immediately available blocks
@@ -333,8 +382,8 @@ static inline bool mi_page_immediate_available(const mi_page_t* page) {
// are there free blocks in this page?
static inline bool mi_page_has_free(mi_page_t* page) {
mi_assert_internal(page != NULL);
bool hasfree = (mi_page_immediate_available(page) || page->local_free != NULL || (mi_tf_block(page->thread_free) != NULL));
mi_assert_internal(hasfree || page->used - page->thread_freed == page->capacity);
bool hasfree = (mi_page_immediate_available(page) || page->local_free != NULL || (mi_page_thread_free(page) != NULL));
mi_assert_internal(hasfree || page->used == page->capacity);
return hasfree;
}
@@ -348,7 +397,7 @@ static inline bool mi_page_all_used(mi_page_t* page) {
static inline bool mi_page_mostly_used(const mi_page_t* page) {
if (page==NULL) return true;
uint16_t frac = page->reserved / 8U;
return (page->reserved - page->used + page->thread_freed <= frac);
return (page->reserved - page->used <= frac);
}
static inline mi_page_queue_t* mi_page_queue(const mi_heap_t* heap, size_t size) {
@@ -377,12 +426,30 @@ static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) {
}
// -------------------------------------------------------------------
// Encoding/Decoding the free list next pointers
// Note: we pass a `null` value to be used as the `NULL` value for the
// end of a free list. This is to prevent the cookie itself from ever
// being present among user blocks (as `cookie^0==cookie`).
// -------------------------------------------------------------------
/* -------------------------------------------------------------------
Encoding/Decoding the free list next pointers
This is to protect against buffer overflow exploits where the
free list is mutated. Many hardened allocators xor the next pointer `p`
with a secret key `k1`, as `p^k1`. This prevents overwriting with known
values but might still be too weak: if the attacker can guess
the pointer `p`, this can reveal `k1` (since `p^k1^p == k1`).
Moreover, if multiple blocks can be read as well, the attacker can
xor both as `(p1^k1) ^ (p2^k1) == p1^p2` which may reveal a lot
about the pointers (and subsequently `k1`).
Instead mimalloc uses an extra key `k2` and encodes as `((p^k2)<<<k1)+k1`.
Since these operations are not associative, the above approaches no longer
work so well, even if `p` can be guesstimated. For example,
for the read case we can subtract two entries to discard the `+k1` term,
but that leads to `((p1^k2)<<<k1) - ((p2^k2)<<<k1)` at best.
We include the left-rotation since xor and addition are otherwise linear
in the lowest bit. Finally, both keys are unique per page which reduces
the re-use of keys by a large factor.
We also pass a separate `null` value to be used as `NULL`, since otherwise
`(k2<<<k1)+k1` would appear (too) often as a sentinel value.
------------------------------------------------------------------- */
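A self-contained round-trip of the `((p^k2)<<<k1)+k1` scheme described above, using made-up key values; the real encode/decode lives in `mi_block_set_nextx` and `mi_block_nextx` below:

#include <assert.h>
#include <stdint.h>

#define DEMO_BITS (sizeof(uintptr_t)*8)

static uintptr_t demo_rotl(uintptr_t x, uintptr_t s) { s %= DEMO_BITS; return (x << s) | (x >> (DEMO_BITS - s)); }
static uintptr_t demo_rotr(uintptr_t x, uintptr_t s) { s %= DEMO_BITS; return (x >> s) | (x << (DEMO_BITS - s)); }

static void demo_encode_roundtrip(void) {
  uintptr_t k1  = 0x1234567887654321u;           // made-up per-page keys (k1 gives a non-zero rotation)
  uintptr_t k2  = 0x0f0e0d0c0b0a0908u;
  uintptr_t p   = 0x00007f30aabbccd0u;           // pointer value to protect
  uintptr_t enc = demo_rotl(p ^ k2, k1) + k1;    // encode: ((p^k2)<<<k1)+k1
  uintptr_t dec = demo_rotr(enc - k1, k1) ^ k2;  // decode: ((enc-k1)>>>k1)^k2
  assert(dec == p);                              // round-trips back to the original pointer
}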
static inline bool mi_is_in_same_segment(const void* p, const void* q) {
return (_mi_ptr_segment(p) == _mi_ptr_segment(q));
@@ -397,52 +464,103 @@ static inline bool mi_is_in_same_page(const void* p, const void* q) {
return (idxp == idxq);
}
static inline mi_block_t* mi_block_nextx( const void* null, const mi_block_t* block, uintptr_t cookie ) {
static inline uintptr_t mi_rotl(uintptr_t x, uintptr_t shift) {
shift %= MI_INTPTR_BITS;
return ((x << shift) | (x >> (MI_INTPTR_BITS - shift)));
}
static inline uintptr_t mi_rotr(uintptr_t x, uintptr_t shift) {
shift %= MI_INTPTR_BITS;
return ((x >> shift) | (x << (MI_INTPTR_BITS - shift)));
}
static inline mi_block_t* mi_block_nextx( const void* null, const mi_block_t* block, uintptr_t key1, uintptr_t key2 ) {
#ifdef MI_ENCODE_FREELIST
mi_block_t* b = (mi_block_t*)(block->next ^ cookie);
mi_block_t* b = (mi_block_t*)(mi_rotr(block->next - key1, key1) ^ key2);
if (mi_unlikely((void*)b==null)) { b = NULL; }
return b;
#else
UNUSED(cookie); UNUSED(null);
UNUSED(key1); UNUSED(key2); UNUSED(null);
return (mi_block_t*)block->next;
#endif
}
static inline void mi_block_set_nextx(const void* null, mi_block_t* block, const mi_block_t* next, uintptr_t cookie) {
static inline void mi_block_set_nextx(const void* null, mi_block_t* block, const mi_block_t* next, uintptr_t key1, uintptr_t key2) {
#ifdef MI_ENCODE_FREELIST
if (mi_unlikely(next==NULL)) { next = (mi_block_t*)null; }
block->next = (mi_encoded_t)next ^ cookie;
block->next = mi_rotl((uintptr_t)next ^ key2, key1) + key1;
#else
UNUSED(cookie); UNUSED(null);
UNUSED(key1); UNUSED(key2); UNUSED(null);
block->next = (mi_encoded_t)next;
#endif
}
static inline mi_block_t* mi_block_next(const mi_page_t* page, const mi_block_t* block) {
#ifdef MI_ENCODE_FREELIST
mi_block_t* next = mi_block_nextx(page,block,page->cookie);
// check for free list corruption: is `next` at least in our segment range?
mi_block_t* next = mi_block_nextx(page,block,page->key[0],page->key[1]);
// check for free list corruption: is `next` at least in the same page?
// TODO: check if `next` is `page->block_size` aligned?
if (next!=NULL && !mi_is_in_same_page(block, next)) {
_mi_fatal_error("corrupted free list entry of size %zub at %p: value 0x%zx\n", page->block_size, block, (uintptr_t)next);
if (mi_unlikely(next!=NULL && !mi_is_in_same_page(block, next))) {
_mi_fatal_error("corrupted free list entry of size %zub at %p: value 0x%zx\n", mi_page_block_size(page), block, (uintptr_t)next);
next = NULL;
}
}
return next;
#else
UNUSED(page);
return mi_block_nextx(page,block,0);
return mi_block_nextx(page,block,0,0);
#endif
}
static inline void mi_block_set_next(const mi_page_t* page, mi_block_t* block, const mi_block_t* next) {
#ifdef MI_ENCODE_FREELIST
mi_block_set_nextx(page,block,next, page->cookie);
mi_block_set_nextx(page,block,next, page->key[0], page->key[1]);
#else
UNUSED(page);
mi_block_set_nextx(page,block, next,0);
mi_block_set_nextx(page,block, next,0,0);
#endif
}
// -------------------------------------------------------------------
// Fast "random" shuffle
// -------------------------------------------------------------------
static inline uintptr_t _mi_random_shuffle(uintptr_t x) {
if (x==0) { x = 17; } // ensure we don't get stuck in generating zeros
#if (MI_INTPTR_SIZE==8)
// by Sebastiano Vigna, see: <http://xoshiro.di.unimi.it/splitmix64.c>
x ^= x >> 30;
x *= 0xbf58476d1ce4e5b9UL;
x ^= x >> 27;
x *= 0x94d049bb133111ebUL;
x ^= x >> 31;
#elif (MI_INTPTR_SIZE==4)
// by Chris Wellons, see: <https://nullprogram.com/blog/2018/07/31/>
x ^= x >> 16;
x *= 0x7feb352dUL;
x ^= x >> 15;
x *= 0x846ca68bUL;
x ^= x >> 16;
#endif
return x;
}
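As a purely hypothetical illustration (mimalloc itself draws per-page keys and cookies via `_mi_heap_random_next` from the `mi_random_ctx_t` context, not this way), iterating the shuffle turns one seed into several decorrelated values:

// Hypothetical: derive two distinct values from one seed by iterating the shuffle.
static void demo_derive(uintptr_t seed, uintptr_t out[2]) {
  uintptr_t x = _mi_random_shuffle(seed);
  out[0] = x = _mi_random_shuffle(x);
  out[1] = _mi_random_shuffle(x);
}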
// -------------------------------------------------------------------
// Optimize numa node access for the common case (= one node)
// -------------------------------------------------------------------
int _mi_os_numa_node_get(mi_os_tld_t* tld);
size_t _mi_os_numa_node_count_get(void);
extern size_t _mi_numa_node_count;
static inline int _mi_os_numa_node(mi_os_tld_t* tld) {
if (mi_likely(_mi_numa_node_count == 1)) return 0;
else return _mi_os_numa_node_get(tld);
}
static inline size_t _mi_os_numa_node_count(void) {
if (mi_likely(_mi_numa_node_count>0)) return _mi_numa_node_count;
else return _mi_os_numa_node_count_get();
}
// -------------------------------------------------------------------
// Getting the thread id should be performant
// as it is called in the fast path of `_mi_free`,


@@ -46,7 +46,7 @@ terms of the MIT license. A copy of the license can be found in the file
// Encoded free lists allow detection of corrupted free lists
// and can detect buffer overflows and double `free`s.
#if (MI_SECURE>=3 || MI_DEBUG>=1)
#define MI_ENCODE_FREELIST 1
#endif
@@ -76,6 +76,7 @@ terms of the MIT license. A copy of the license can be found in the file
#endif
#define MI_INTPTR_SIZE (1<<MI_INTPTR_SHIFT)
#define MI_INTPTR_BITS (MI_INTPTR_SIZE*8)
#define KiB ((size_t)1024)
#define MiB (KiB*KiB)
@@ -93,12 +94,12 @@ terms of the MIT license. A copy of the license can be found in the file
#define MI_SEGMENT_SHIFT ( MI_LARGE_PAGE_SHIFT) // 4mb
// Derived constants
#define MI_SEGMENT_SIZE (1<<MI_SEGMENT_SHIFT)
#define MI_SEGMENT_SIZE (1UL<<MI_SEGMENT_SHIFT)
#define MI_SEGMENT_MASK ((uintptr_t)MI_SEGMENT_SIZE - 1)
#define MI_SMALL_PAGE_SIZE (1<<MI_SMALL_PAGE_SHIFT)
#define MI_MEDIUM_PAGE_SIZE (1<<MI_MEDIUM_PAGE_SHIFT)
#define MI_LARGE_PAGE_SIZE (1<<MI_LARGE_PAGE_SHIFT)
#define MI_SMALL_PAGE_SIZE (1UL<<MI_SMALL_PAGE_SHIFT)
#define MI_MEDIUM_PAGE_SIZE (1UL<<MI_MEDIUM_PAGE_SHIFT)
#define MI_LARGE_PAGE_SIZE (1UL<<MI_LARGE_PAGE_SHIFT)
#define MI_SMALL_PAGES_PER_SEGMENT (MI_SEGMENT_SIZE/MI_SMALL_PAGE_SIZE)
#define MI_MEDIUM_PAGES_PER_SEGMENT (MI_SEGMENT_SIZE/MI_MEDIUM_PAGE_SIZE)
@@ -108,8 +109,8 @@ terms of the MIT license. A copy of the license can be found in the file
// (Except for large pages since huge objects are allocated in 4MiB chunks)
#define MI_SMALL_OBJ_SIZE_MAX (MI_SMALL_PAGE_SIZE/4) // 16kb
#define MI_MEDIUM_OBJ_SIZE_MAX (MI_MEDIUM_PAGE_SIZE/4) // 128kb
#define MI_LARGE_OBJ_SIZE_MAX (MI_LARGE_PAGE_SIZE/2) // 2mb
#define MI_LARGE_OBJ_WSIZE_MAX (MI_LARGE_OBJ_SIZE_MAX/MI_INTPTR_SIZE)
#define MI_HUGE_OBJ_SIZE_MAX (2*MI_INTPTR_SIZE*MI_SEGMENT_SIZE) // (must match MI_REGION_MAX_ALLOC_SIZE in memory.c)
// Minimal alignment necessary. On most platforms 16 bytes are needed
@@ -123,6 +124,9 @@ terms of the MIT license. A copy of the license can be found in the file
#error "define more bins"
#endif
// Used as a special value to encode block sizes in 32 bits.
#define MI_HUGE_BLOCK_SIZE ((uint32_t)MI_HUGE_OBJ_SIZE_MAX)
// The free lists use encoded next fields
// (Only actually encodes when MI_ENCODE_FREELIST is defined.)
typedef uintptr_t mi_encoded_t;
@@ -135,21 +139,21 @@ typedef struct mi_block_s {
// The delayed flags are used for efficient multi-threaded free-ing
typedef enum mi_delayed_e {
MI_NO_DELAYED_FREE = 0,
MI_USE_DELAYED_FREE = 1,
MI_DELAYED_FREEING = 2,
MI_NEVER_DELAYED_FREE = 3
MI_USE_DELAYED_FREE = 0, // push on the owning heap thread delayed list
MI_DELAYED_FREEING = 1, // temporary: another thread is accessing the owning heap
MI_NO_DELAYED_FREE = 2, // optimize: push on page local thread free queue if another block is already in the heap thread delayed free list
MI_NEVER_DELAYED_FREE = 3 // sticky, only resets on page reclaim
} mi_delayed_t;
// The `in_full` and `has_aligned` page flags are put in a union to efficiently
// test if both are false (`full_aligned == 0`) in the `mi_free` routine.
typedef union mi_page_flags_s {
uint8_t full_aligned;
struct {
uint8_t in_full : 1;
uint8_t has_aligned : 1;
} x;
} mi_page_flags_t;
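The point of the union is that the free path can rule out both special cases with a single byte load; a minimal sketch (the helper name is hypothetical):

// In the spirit of the fast path in `mi_free`: one byte compare shows the page
// is neither in the full queue nor contains aligned blocks.
static inline bool demo_page_is_plain(const mi_page_t* page) {
  return (page->flags.full_aligned == 0);  // in_full == 0 && has_aligned == 0
}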
// Thread free list.
@@ -166,14 +170,28 @@ typedef uintptr_t mi_thread_free_t;
// implement a monotonic heartbeat. The `thread_free` list is needed for
// avoiding atomic operations in the common case.
//
// `used - thread_freed` == actual blocks that are in use (alive)
// `used - thread_freed + |free| + |local_free| == capacity`
//
// note: we don't count `freed` (as |free|) instead of `used` to reduce
// the number of memory accesses in the `mi_page_all_free` function(s).
// note: the funny layout here is due to:
// - access is optimized for `mi_free` and `mi_page_alloc`
// - using `uint16_t` does not seem to slow things down
// `used - |thread_free|` == actual blocks that are in use (alive)
// `used - |thread_free| + |free| + |local_free| == capacity`
//
// We don't count `freed` (as |free|) but use `used` to reduce
// the number of memory accesses in the `mi_page_all_free` function(s).
//
// Notes:
// - Access is optimized for `mi_free` and `mi_page_alloc` (in `alloc.c`)
// - Using `uint16_t` does not seem to slow things down
// - The size is 8 words on 64-bit which helps the page index calculations
// (and 10 words on 32-bit, and encoded free lists add 2 words. Sizes 10
// and 12 are still good for address calculation)
// - To limit the structure size, the `xblock_size` is 32-bits only; for
// blocks > MI_HUGE_BLOCK_SIZE the size is determined from the segment page size
// - `thread_free` uses the bottom bits as delayed-free flags to optimize
// concurrent frees where only the first concurrent free adds to the owning
// heap `thread_delayed_free` list (see `alloc.c:mi_free_block_mt`).
// The invariant is that no-delayed-free is only set if there is
// at least one block that will be added, or has already been added, to
// the owning heap `thread_delayed_free` list. This guarantees that pages
// will be freed correctly even if only other threads free blocks.
typedef struct mi_page_s {
// "owned" by the segment
uint8_t segment_idx; // index in the segment `pages` array, `page == &segment->pages[page->segment_idx]`
@@ -181,34 +199,27 @@ typedef struct mi_page_s {
uint8_t is_reset:1; // `true` if the page memory was reset
uint8_t is_committed:1; // `true` if the page virtual memory is committed
uint8_t is_zero_init:1; // `true` if the page was zero initialized
// layout like this to optimize access in `mi_malloc` and `mi_free`
uint16_t capacity; // number of blocks committed, must be the first field, see `segment.c:page_clear`
uint16_t reserved; // number of blocks reserved in memory
mi_page_flags_t flags; // `in_full` and `has_aligned` flags (8 bits)
bool is_zero; // `true` if the blocks in the free list are zero initialized
uint8_t is_zero:1; // `true` if the blocks in the free list are zero initialized
uint8_t retire_expire:7; // expiration count for retired blocks
mi_block_t* free; // list of available free blocks (`malloc` allocates from this list)
#ifdef MI_ENCODE_FREELIST
uintptr_t cookie; // random cookie to encode the free lists
uintptr_t key[2]; // two random keys to encode the free lists (see `_mi_block_next`)
#endif
size_t used; // number of blocks in use (including blocks in `local_free` and `thread_free`)
mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`)
volatile _Atomic(uintptr_t) thread_freed; // at least this number of blocks are in `thread_free`
volatile _Atomic(mi_thread_free_t) thread_free; // list of deferred free blocks freed by other threads
uint32_t used; // number of blocks in use (including blocks in `local_free` and `thread_free`)
uint32_t xblock_size; // size available in each block (always `>0`)
// less accessed info
size_t block_size; // size available in each block (always `>0`)
mi_heap_t* heap; // the owning heap
mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`)
volatile _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads
volatile _Atomic(uintptr_t) xheap;
struct mi_page_s* next; // next page owned by this thread with the same `block_size`
struct mi_page_s* prev; // previous page owned by this thread with the same `block_size`
// improve page index calculation
// without padding: 10 words on 64-bit, 11 on 32-bit. Secure adds one word
#if (MI_INTPTR_SIZE==8 && defined(MI_ENCODE_FREELIST)) || (MI_INTPTR_SIZE==4 && !defined(MI_ENCODE_FREELIST))
void* padding[1]; // 12 words on 64-bit with cookie, 12 words on 32-bit plain
#endif
} mi_page_t;
@@ -226,19 +237,19 @@ typedef enum mi_page_kind_e {
typedef struct mi_segment_s {
// memory fields
size_t memid; // id for the os-level memory manager
bool mem_is_fixed; // `true` if we cannot decommit/reset/protect in this memory (i.e. when allocated using large OS pages)
bool mem_is_committed; // `true` if the whole segment is eagerly committed
// segment fields
struct mi_segment_s* next; // must be the first segment field -- see `segment.c:segment_alloc`
struct mi_segment_s* prev;
volatile _Atomic(struct mi_segment_s*) abandoned_next;
struct mi_segment_s* abandoned_next;
size_t abandoned; // abandoned pages (i.e. the original owning thread stopped) (`abandoned <= used`)
size_t used; // count of pages in use (`used <= capacity`)
size_t capacity; // count of available pages (`#free + used`)
size_t segment_size;// for huge pages this may be different from `MI_SEGMENT_SIZE`
size_t segment_info_size; // space we are using from the first page for segment meta-data and possible guard pages.
uintptr_t cookie; // verify addresses in debug mode: `mi_ptr_cookie(segment) == segment->cookie`
uintptr_t cookie; // verify addresses in secure mode: `_mi_ptr_cookie(segment) == segment->cookie`
// layout like this to optimize access in `mi_free`
size_t page_shift; // `1 << page_shift` == the page sizes == `page->block_size * page->reserved` (unless the first page, then `-segment_info_size`).
@@ -273,6 +284,14 @@ typedef struct mi_page_queue_s {
#define MI_BIN_FULL (MI_BIN_HUGE+1)
// Random context
typedef struct mi_random_cxt_s {
uint32_t input[16];
uint32_t output[16];
int output_available;
} mi_random_ctx_t;
// A heap owns a set of pages.
struct mi_heap_s {
mi_tld_t* tld;
@@ -280,8 +299,9 @@ struct mi_heap_s {
mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin")
volatile _Atomic(mi_block_t*) thread_delayed_free;
uintptr_t thread_id; // thread this heap belongs to
uintptr_t cookie;
uintptr_t random; // random number used for secure allocation
uintptr_t cookie; // random cookie to verify pointers (see `_mi_ptr_cookie`)
uintptr_t key[2]; // two random keys used to encode the `thread_delayed_free` list
mi_random_ctx_t random; // random number context used for secure allocation
size_t page_count; // total number of pages in the `pages` queues.
bool no_reclaim; // `true` if this heap should not reclaim abandoned pages
};
@@ -384,22 +404,29 @@ void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount);
#define mi_heap_stat_increase(heap,stat,amount) mi_stat_increase( (heap)->tld->stats.stat, amount)
#define mi_heap_stat_decrease(heap,stat,amount) mi_stat_decrease( (heap)->tld->stats.stat, amount)
// ------------------------------------------------------
// Thread Local data
// ------------------------------------------------------
typedef int64_t mi_msecs_t;
// Queue of segments
typedef struct mi_segment_queue_s {
mi_segment_t* first;
mi_segment_t* last;
} mi_segment_queue_t;
// OS thread local data
typedef struct mi_os_tld_s {
size_t region_idx; // start point for next allocation
mi_stats_t* stats; // points to tld stats
} mi_os_tld_t;
// Segments thread local data
typedef struct mi_segments_tld_s {
mi_segment_queue_t small_free; // queue of segments with free small pages
mi_segment_queue_t medium_free; // queue of segments with free medium pages
mi_page_queue_t pages_reset; // queue of freed pages that can be reset
size_t count; // current number of segments;
size_t peak_count; // peak number of segments
size_t current_size; // current size of all segments
@@ -408,14 +435,9 @@ typedef struct mi_segments_tld_s {
size_t cache_size; // total size of all segments in the cache
mi_segment_t* cache; // (small) cache of segments
mi_stats_t* stats; // points to tld stats
mi_os_tld_t* os; // points to os stats
} mi_segments_tld_t;
// OS thread local data
typedef struct mi_os_tld_s {
size_t region_idx; // start point for next allocation
mi_stats_t* stats; // points to tld stats
} mi_os_tld_t;
// Thread local data
struct mi_tld_s {
unsigned long long heartbeat; // monotonic heartbeat count


@@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file
#ifndef MIMALLOC_H
#define MIMALLOC_H
#define MI_MALLOC_VERSION 120 // major + 2 digits minor
#define MI_MALLOC_VERSION 140 // major + 2 digits minor
// ------------------------------------------------------
// Compiler specific attributes
@@ -110,22 +110,23 @@ mi_decl_export mi_decl_allocator void* mi_reallocf(void* p, size_t newsize)
mi_decl_export size_t mi_usable_size(const void* p) mi_attr_noexcept;
mi_decl_export size_t mi_good_size(size_t size) mi_attr_noexcept;
typedef void (mi_deferred_free_fun)(bool force, unsigned long long heartbeat);
mi_decl_export void mi_register_deferred_free(mi_deferred_free_fun* deferred_free) mi_attr_noexcept;
typedef void (mi_cdecl mi_deferred_free_fun)(bool force, unsigned long long heartbeat, void* arg);
mi_decl_export void mi_register_deferred_free(mi_deferred_free_fun* deferred_free, void* arg) mi_attr_noexcept;
typedef void (mi_output_fun)(const char* msg);
mi_decl_export void mi_register_output(mi_output_fun* out) mi_attr_noexcept;
typedef void (mi_cdecl mi_output_fun)(const char* msg, void* arg);
mi_decl_export void mi_register_output(mi_output_fun* out, void* arg) mi_attr_noexcept;
mi_decl_export void mi_collect(bool force) mi_attr_noexcept;
mi_decl_export int mi_version(void) mi_attr_noexcept;
mi_decl_export void mi_stats_reset(void) mi_attr_noexcept;
mi_decl_export void mi_stats_merge(void) mi_attr_noexcept;
mi_decl_export void mi_stats_print(mi_output_fun* out) mi_attr_noexcept;
mi_decl_export void mi_stats_print(void* out) mi_attr_noexcept; // backward compatibility: `out` is ignored and should be NULL
mi_decl_export void mi_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept;
mi_decl_export void mi_process_init(void) mi_attr_noexcept;
mi_decl_export void mi_thread_init(void) mi_attr_noexcept;
mi_decl_export void mi_thread_done(void) mi_attr_noexcept;
mi_decl_export void mi_thread_stats_print(mi_output_fun* out) mi_attr_noexcept;
mi_decl_export void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept;
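The registration functions now take a user-supplied `void* arg` that is passed back on every callback; a small usage sketch of the new signatures (the callback bodies are hypothetical):

#include <mimalloc.h>
#include <stdbool.h>
#include <stdio.h>

// Invoked by mimalloc on its heartbeat; `arg` is whatever was registered.
static void my_deferred_free(bool force, unsigned long long heartbeat, void* arg) {
  (void)force; (void)heartbeat; (void)arg;
  // e.g. trim application-level caches here
}

// Receives mimalloc's messages; here `arg` carries the FILE* to write to.
static void my_output(const char* msg, void* arg) {
  fputs(msg, (FILE*)arg);
}

int main(void) {
  mi_register_deferred_free(&my_deferred_free, NULL);
  mi_register_output(&my_output, stderr);
  mi_stats_print_out(&my_output, stderr);   // new entry point taking an explicit arg
  return 0;
}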
// -------------------------------------------------------------------------------------
@@ -230,9 +231,14 @@ mi_decl_export bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_all_b
// Experimental
mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept;
mi_decl_export int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept;
mi_decl_export bool mi_is_redirected() mi_attr_noexcept;
mi_decl_export int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept;
mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept;
// deprecated
mi_decl_export int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept;
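Usage sketch for the new reservation entry points (page counts, NUMA node and timeout values are illustrative):

#include <mimalloc.h>

static void demo_reserve_huge_pages(void) {
  // Spread 8 huge OS pages over 2 NUMA nodes, giving up after 5000 msecs.
  if (mi_reserve_huge_os_pages_interleave(8, 2, 5000) != 0) {
    // reservation failed or timed out; allocation continues with regular pages
  }
  // Or pin 4 huge pages to NUMA node 0 with a 2000 msec timeout.
  mi_reserve_huge_os_pages_at(4, 0, 2000);
}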
// ------------------------------------------------------
// Convenience
// ------------------------------------------------------
@@ -264,17 +270,20 @@ typedef enum mi_option_e {
// the following options are experimental
mi_option_eager_commit,
mi_option_eager_region_commit,
mi_option_reset_decommits,
mi_option_large_os_pages, // implies eager commit
mi_option_reserve_huge_os_pages,
mi_option_segment_cache,
mi_option_page_reset,
mi_option_cache_reset,
mi_option_reset_decommits,
mi_option_eager_commit_delay,
mi_option_abandoned_page_reset,
mi_option_segment_reset,
mi_option_eager_commit_delay,
mi_option_reset_delay,
mi_option_use_numa_nodes,
mi_option_os_tag,
mi_option_max_errors,
_mi_option_last
_mi_option_last,
mi_option_eager_page_commit = mi_option_eager_commit
} mi_option_t;
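A hedged configuration sketch: it assumes the usual `mi_option_set` and `mi_option_is_enabled` accessors from mimalloc.h, which are not part of this hunk:

#include <mimalloc.h>

static void demo_configure(void) {
  // Roughly equivalent to setting MIMALLOC_PAGE_RESET=1 in the environment.
  mi_option_set(mi_option_page_reset, 1);
  // `mi_option_eager_page_commit` is the new name aliased above to `mi_option_eager_commit`.
  if (mi_option_is_enabled(mi_option_eager_page_commit)) {
    // pages are committed eagerly
  }
}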