mirror of https://github.com/microsoft/mimalloc.git (synced 2025-08-24 08:14:48 +03:00)

commit 8edce30c17: merge from dev3

23 changed files with 357 additions and 262 deletions
@@ -64,11 +64,11 @@ static void* mi_meta_block_start( mi_meta_page_t* mpage, size_t block_idx ) {
 // allocate a fresh meta page and add it to the global list.
 static mi_meta_page_t* mi_meta_page_zalloc(void) {
   // allocate a fresh arena slice
-  // note: careful with _mi_subproc as it may recurse into mi_tld and meta_page_zalloc again..
+  // note: careful with _mi_subproc as it may recurse into mi_tld and meta_page_zalloc again.. (same with _mi_os_numa_node()...)
   mi_memid_t memid;
   uint8_t* base = (uint8_t*)_mi_arenas_alloc_aligned(_mi_subproc(), MI_META_PAGE_SIZE, MI_META_PAGE_ALIGN, 0,
                      true /* commit*/, (MI_SECURE==0) /* allow large? */,
-                     NULL /* req arena */, 0 /* thread_seq */, &memid);
+                     NULL /* req arena */, 0 /* thread_seq */, -1 /* numa node */, &memid);
   if (base == NULL) return NULL;
   mi_assert_internal(_mi_is_aligned(base,MI_META_PAGE_ALIGN));
   if (!memid.initially_zero) {
src/arena.c (72 changed lines)
@@ -335,12 +335,13 @@ static bool mi_arena_reserve(mi_subproc_t* subproc, size_t req_size, bool allow_
   Arena iteration
 ----------------------------------------------------------- */

-static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_t* req_arena, int numa_node, bool allow_pinned) {
+static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_t* req_arena, bool match_numa, int numa_node, bool allow_pinned) {
   if (!allow_pinned && arena->memid.is_pinned) return false;
   if (!mi_arena_id_is_suitable(arena, req_arena)) return false;
   if (req_arena == NULL) { // if not specific, check numa affinity
     const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node);
-    if (!numa_suitable) return false;
+    if (match_numa) { if (!numa_suitable) return false; }
+    else { if (numa_suitable) return false; }
   }
   return true;
 }

@@ -375,9 +376,9 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_t* req_arena
     } \
   }

-#define mi_forall_suitable_arenas(subproc, req_arena, tseq, allow_large, name_arena) \
+#define mi_forall_suitable_arenas(subproc, req_arena, tseq, match_numa, numa_node, allow_large, name_arena) \
   mi_forall_arenas(subproc, req_arena,tseq,name_arena) { \
-    if (mi_arena_is_suitable(name_arena, req_arena, -1 /* todo: numa node */, allow_large)) { \
+    if (mi_arena_is_suitable(name_arena, req_arena, match_numa, numa_node, allow_large)) { \

 #define mi_forall_suitable_arenas_end() \
   }} \

@@ -390,19 +391,28 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_t* req_arena
 // allocate slices from the arenas
 static mi_decl_noinline void* mi_arenas_try_find_free(
   mi_subproc_t* subproc, size_t slice_count, size_t alignment,
-  bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid)
+  bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, int numa_node, mi_memid_t* memid)
 {
   mi_assert_internal(slice_count <= mi_slice_count_of_size(MI_ARENA_MAX_OBJ_SIZE));
   mi_assert(alignment <= MI_ARENA_SLICE_ALIGN);
   if (alignment > MI_ARENA_SLICE_ALIGN) return NULL;

-  // search arena's
-  mi_forall_suitable_arenas(subproc, req_arena, tseq, allow_large, arena)
+  // search arena's
+  mi_forall_suitable_arenas(subproc, req_arena, tseq, true /* only numa matching */, numa_node, allow_large, arena)
   {
     void* p = mi_arena_try_alloc_at(arena, slice_count, commit, tseq, memid);
     if (p != NULL) return p;
   }
   mi_forall_suitable_arenas_end();
+  if (numa_node < 0) return NULL;
+
+  // search again but now regardless of preferred numa affinity
+  mi_forall_suitable_arenas(subproc, req_arena, tseq, false /* numa non-matching now */, numa_node, allow_large, arena)
+  {
+    void* p = mi_arena_try_alloc_at(arena, slice_count, commit, tseq, memid);
+    if (p != NULL) return p;
+  }
+  mi_forall_suitable_arenas_end();
   return NULL;
 }

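Aside (not part of the commit): the two search passes above partition the arenas with the new `match_numa` flag. A minimal standalone sketch of that predicate, using the same suitability test as `mi_arena_is_suitable`:

  #include <stdbool.h>

  // pass 1 (match_numa == true) accepts arenas on the preferred node, or with an unknown node (-1);
  // pass 2 (match_numa == false) accepts exactly the arenas that pass 1 rejected on numa grounds.
  static bool numa_pass_accepts(bool match_numa, int arena_numa_node, int numa_node) {
    const bool numa_suitable = (numa_node < 0 || arena_numa_node < 0 || arena_numa_node == numa_node);
    return (match_numa ? numa_suitable : !numa_suitable);
  }

The second pass runs only when a node was actually requested (`numa_node >= 0`), since otherwise the first pass already visited every arena.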
@@ -411,14 +421,14 @@ static mi_decl_noinline void* mi_arenas_try_alloc(
   mi_subproc_t* subproc,
   size_t slice_count, size_t alignment,
   bool commit, bool allow_large,
-  mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid)
+  mi_arena_t* req_arena, size_t tseq, int numa_node, mi_memid_t* memid)
 {
   mi_assert(slice_count <= MI_ARENA_MAX_OBJ_SLICES);
   mi_assert(alignment <= MI_ARENA_SLICE_ALIGN);
   void* p;

   // try to find free slices in the arena's
-  p = mi_arenas_try_find_free(subproc, slice_count, alignment, commit, allow_large, req_arena, tseq, memid);
+  p = mi_arenas_try_find_free(subproc, slice_count, alignment, commit, allow_large, req_arena, tseq, numa_node, memid);
   if (p != NULL) return p;

   // did we need a specific arena?

@@ -441,7 +451,7 @@ static mi_decl_noinline void* mi_arenas_try_alloc(
   }
   // try once more to allocate in the new arena
   mi_assert_internal(req_arena == NULL);
-  p = mi_arenas_try_find_free(subproc, slice_count, alignment, commit, allow_large, req_arena, tseq, memid);
+  p = mi_arenas_try_find_free(subproc, slice_count, alignment, commit, allow_large, req_arena, tseq, numa_node, memid);
   if (p != NULL) return p;

   return NULL;

@@ -472,21 +482,18 @@ static void* mi_arena_os_alloc_aligned(
 void* _mi_arenas_alloc_aligned( mi_subproc_t* subproc,
   size_t size, size_t alignment, size_t align_offset,
   bool commit, bool allow_large,
-  mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid)
+  mi_arena_t* req_arena, size_t tseq, int numa_node, mi_memid_t* memid)
 {
   mi_assert_internal(memid != NULL);
   mi_assert_internal(size > 0);

-  // *memid = _mi_memid_none();
-  // const int numa_node = _mi_os_numa_node(&tld->os); // current numa node
-
   // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data)
   if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) &&  // is arena allocation allowed?
       size >= MI_ARENA_MIN_OBJ_SIZE && size <= MI_ARENA_MAX_OBJ_SIZE &&  // and not too small/large
       alignment <= MI_ARENA_SLICE_ALIGN && align_offset == 0)  // and good alignment
   {
     const size_t slice_count = mi_slice_count_of_size(size);
-    void* p = mi_arenas_try_alloc(subproc,slice_count, alignment, commit, allow_large, req_arena, tseq, memid);
+    void* p = mi_arenas_try_alloc(subproc,slice_count, alignment, commit, allow_large, req_arena, tseq, numa_node, memid);
     if (p != NULL) return p;
   }

@@ -495,9 +502,9 @@ void* _mi_arenas_alloc_aligned( mi_subproc_t* subproc,
   return p;
 }

-void* _mi_arenas_alloc(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid)
+void* _mi_arenas_alloc(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, int numa_node, mi_memid_t* memid)
 {
-  return _mi_arenas_alloc_aligned(subproc, size, MI_ARENA_SLICE_SIZE, 0, commit, allow_large, req_arena, tseq, memid);
+  return _mi_arenas_alloc_aligned(subproc, size, MI_ARENA_SLICE_SIZE, 0, commit, allow_large, req_arena, tseq, numa_node, memid);
 }

@@ -547,7 +554,9 @@ static mi_page_t* mi_arenas_page_try_find_abandoned(mi_subproc_t* subproc, size_

   // search arena's
   const bool allow_large = true;
-  mi_forall_suitable_arenas(subproc, req_arena, tseq, allow_large, arena)
+  const int any_numa = -1;
+  const bool match_numa = true;
+  mi_forall_suitable_arenas(subproc, req_arena, tseq, match_numa, any_numa, allow_large, arena)
   {
     size_t slice_index;
     mi_bitmap_t* const bitmap = arena->pages_abandoned[bin];

@@ -582,7 +591,7 @@ static mi_page_t* mi_arenas_page_try_find_abandoned(mi_subproc_t* subproc, size_

 // Allocate a fresh page
 static mi_page_t* mi_arenas_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_count, size_t block_size, size_t block_alignment,
-                                             mi_arena_t* req_arena, size_t tseq, bool commit)
+                                             mi_arena_t* req_arena, size_t tseq, int numa_node, bool commit)
 {
   const bool allow_large = (MI_SECURE < 2); // 2 = guard page at end of each arena page
   const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN);

@@ -596,7 +605,7 @@ static mi_page_t* mi_arenas_page_alloc_fresh(mi_subproc_t* subproc, size_t slice
       !os_align &&  // not large alignment
       slice_count <= MI_ARENA_MAX_OBJ_SLICES)  // and not too large
   {
-    page = (mi_page_t*)mi_arenas_try_alloc(subproc, slice_count, page_alignment, commit, allow_large, req_arena, tseq, &memid);
+    page = (mi_page_t*)mi_arenas_try_alloc(subproc, slice_count, page_alignment, commit, allow_large, req_arena, tseq, numa_node, &memid);
     if (page != NULL) {
       mi_assert_internal(mi_bitmap_is_clearN(memid.mem.arena.arena->pages, memid.mem.arena.slice_index, memid.mem.arena.slice_count));
       mi_bitmap_set(memid.mem.arena.arena->pages, memid.mem.arena.slice_index);

@@ -727,7 +736,7 @@ static mi_page_t* mi_arenas_page_regular_alloc(mi_heap_t* heap, size_t slice_cou
     const long commit_on_demand = mi_option_get(mi_option_page_commit_on_demand);
     const bool commit = (slice_count <= mi_slice_count_of_size(MI_PAGE_MIN_COMMIT_SIZE) ||  // always commit small pages
                          (commit_on_demand == 2 && _mi_os_has_overcommit()) || (commit_on_demand == 0));
-    page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, 1, req_arena, tld->thread_seq, commit);
+    page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, 1, req_arena, tld->thread_seq, heap->numa_node, commit);
     if (page != NULL) {
       mi_assert_internal(page->memid.memkind != MI_MEM_ARENA || page->memid.mem.arena.slice_count == slice_count);
       _mi_page_init(heap, page);

@@ -749,7 +758,7 @@ static mi_page_t* mi_arenas_page_singleton_alloc(mi_heap_t* heap, size_t block_s
   const size_t slice_count = mi_slice_count_of_size(_mi_align_up(info_size + block_size, _mi_os_secure_guard_page_size()) + _mi_os_secure_guard_page_size());
   #endif

-  mi_page_t* page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, block_alignment, req_arena, tld->thread_seq, true /* commit singletons always */);
+  mi_page_t* page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, block_alignment, req_arena, tld->thread_seq, heap->numa_node, true /* commit singletons always */);
   if (page == NULL) return NULL;

   mi_assert(page->reserved == 1);

@@ -1375,7 +1384,7 @@ static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, size_t* k,
   return bit_set_count;
 }

-static size_t mi_debug_show_chunks(const char* header1, const char* header2, const char* header3, size_t slice_count, size_t chunk_count, mi_bchunk_t* chunks, _Atomic(uint8_t)* chunk_bins, bool invert, mi_arena_t* arena, bool narrow) {
+static size_t mi_debug_show_chunks(const char* header1, const char* header2, const char* header3, size_t slice_count, size_t chunk_count, mi_bchunk_t* chunks, mi_bchunkmap_t* chunk_bins, bool invert, mi_arena_t* arena, bool narrow) {
   _mi_raw_message("\x1B[37m%s%s%s (use/commit: \x1B[31m0 - 25%%\x1B[33m - 50%%\x1B[36m - 75%%\x1B[32m - 100%%\x1B[0m)\n", header1, header2, header3);
   const size_t fields_per_line = (narrow ? 2 : 4);
   size_t bit_count = 0;

@@ -1391,11 +1400,12 @@ static size_t mi_debug_show_chunks(const char* header1, const char* header2, con

       char chunk_kind = ' ';
       if (chunk_bins != NULL) {
-        switch (mi_atomic_load_relaxed(&chunk_bins[i])) {
+        switch (mi_bbitmap_debug_get_bin(chunk_bins,i)) {
           case MI_BBIN_SMALL:  chunk_kind = 'S'; break;
           case MI_BBIN_MEDIUM: chunk_kind = 'M'; break;
           case MI_BBIN_LARGE:  chunk_kind = 'L'; break;
+          case MI_BBIN_OTHER:  chunk_kind = 'X'; break;
           default: chunk_kind = ' '; break; // suppress warning
           // case MI_BBIN_NONE: chunk_kind = 'N'; break;
         }
       }

@@ -1432,7 +1442,7 @@ static size_t mi_debug_show_chunks(const char* header1, const char* header2, con
   return bit_set_count;
 }

-static size_t mi_debug_show_bitmap_binned(const char* header1, const char* header2, const char* header3, size_t slice_count, mi_bitmap_t* bitmap, _Atomic(uint8_t)* chunk_bins, bool invert, mi_arena_t* arena, bool narrow) {
+static size_t mi_debug_show_bitmap_binned(const char* header1, const char* header2, const char* header3, size_t slice_count, mi_bitmap_t* bitmap, mi_bchunkmap_t* chunk_bins, bool invert, mi_arena_t* arena, bool narrow) {
   return mi_debug_show_chunks(header1, header2, header3, slice_count, mi_bitmap_chunk_count(bitmap), &bitmap->chunks[0], chunk_bins, invert, arena, narrow);
 }

@@ -1463,7 +1473,7 @@ static void mi_debug_show_arenas_ex(bool show_pages, bool narrow) mi_attr_noexce
       const char* header1 = "pages (p:page, f:full, s:singleton, P,F,S:not abandoned, i:arena-info, m:meta-data, ~:free-purgable, _:free-committed, .:free-reserved)";
       const char* header2 = (narrow ? "\n " : " ");
       const char* header3 = "(chunk bin: S:small, M : medium, L : large, X : other)";
-      page_total += mi_debug_show_bitmap_binned(header1, header2, header3, arena->slice_count, arena->pages, arena->slices_free->chunk_bins, false, arena, narrow);
+      page_total += mi_debug_show_bitmap_binned(header1, header2, header3, arena->slice_count, arena->pages, arena->slices_free->chunkmap_bins, false, arena, narrow);
     }
   }
   // if (show_inuse) _mi_raw_message("total inuse slices : %zu\n", slice_total - free_total);

@@ -1515,17 +1525,17 @@ int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t
   if (pages == 0) return 0;

   // pages per numa node
-  size_t numa_count = (numa_nodes > 0 ? numa_nodes : _mi_os_numa_node_count());
-  if (numa_count <= 0) numa_count = 1;
+  int numa_count = (numa_nodes > 0 && numa_nodes <= INT_MAX ? (int)numa_nodes : _mi_os_numa_node_count());
+  if (numa_count <= 0) { numa_count = 1; }
   const size_t pages_per = pages / numa_count;
   const size_t pages_mod = pages % numa_count;
   const size_t timeout_per = (timeout_msecs==0 ? 0 : (timeout_msecs / numa_count) + 50);

   // reserve evenly among numa nodes
-  for (size_t numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) {
+  for (int numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) {
     size_t node_pages = pages_per;  // can be 0
-    if (numa_node < pages_mod) node_pages++;
-    int err = mi_reserve_huge_os_pages_at(node_pages, (int)numa_node, timeout_per);
+    if ((size_t)numa_node < pages_mod) { node_pages++; }
+    int err = mi_reserve_huge_os_pages_at(node_pages, numa_node, timeout_per);
     if (err) return err;
     if (pages < node_pages) {
       pages = 0;
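Aside (standalone, illustrative only): the split above hands out `pages_per` pages per node plus one extra page to the first `pages_mod` nodes; for example, reserving 10 huge pages over 4 NUMA nodes yields 3, 3, 2, 2:

  #include <stdio.h>

  int main(void) {
    const size_t pages = 10, numa_count = 4;
    const size_t pages_per = pages / numa_count;   // 2
    const size_t pages_mod = pages % numa_count;   // 2
    for (size_t node = 0; node < numa_count; node++) {
      const size_t node_pages = pages_per + (node < pages_mod ? 1 : 0);
      printf("node %zu: %zu huge pages\n", node, node_pages);   // prints 3, 3, 2, 2
    }
    return 0;
  }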
src/bitmap.c (121 changed lines)
@@ -218,39 +218,39 @@ static inline bool mi_bfield_atomic_try_clearX(_Atomic(mi_bfield_t)*b, bool* all
 // ------- mi_bfield_atomic_is_set ---------------------------------------

 // Check if a bit is set
-static inline bool mi_bfield_atomic_is_set(_Atomic(mi_bfield_t)*b, const size_t idx) {
+static inline bool mi_bfield_atomic_is_set(const _Atomic(mi_bfield_t)*b, const size_t idx) {
   const mi_bfield_t x = mi_atomic_load_relaxed(b);
   return ((x & mi_bfield_mask(1,idx)) != 0);
 }

 // Check if a bit is clear
-static inline bool mi_bfield_atomic_is_clear(_Atomic(mi_bfield_t)*b, const size_t idx) {
+static inline bool mi_bfield_atomic_is_clear(const _Atomic(mi_bfield_t)*b, const size_t idx) {
   const mi_bfield_t x = mi_atomic_load_relaxed(b);
   return ((x & mi_bfield_mask(1, idx)) == 0);
 }

 // Check if a bit is xset
-static inline bool mi_bfield_atomic_is_xset(mi_xset_t set, _Atomic(mi_bfield_t)*b, const size_t idx) {
+static inline bool mi_bfield_atomic_is_xset(mi_xset_t set, const _Atomic(mi_bfield_t)*b, const size_t idx) {
   if (set) return mi_bfield_atomic_is_set(b, idx);
   else return mi_bfield_atomic_is_clear(b, idx);
 }

 // Check if all bits corresponding to a mask are set.
-static inline bool mi_bfield_atomic_is_set_mask(_Atomic(mi_bfield_t)* b, mi_bfield_t mask) {
+static inline bool mi_bfield_atomic_is_set_mask(const _Atomic(mi_bfield_t)* b, mi_bfield_t mask) {
   mi_assert_internal(mask != 0);
   const mi_bfield_t x = mi_atomic_load_relaxed(b);
   return ((x & mask) == mask);
 }

 // Check if all bits corresponding to a mask are clear.
-static inline bool mi_bfield_atomic_is_clear_mask(_Atomic(mi_bfield_t)* b, mi_bfield_t mask) {
+static inline bool mi_bfield_atomic_is_clear_mask(const _Atomic(mi_bfield_t)* b, mi_bfield_t mask) {
   mi_assert_internal(mask != 0);
   const mi_bfield_t x = mi_atomic_load_relaxed(b);
   return ((x & mask) == 0);
 }

 // Check if all bits corresponding to a mask are set/cleared.
-static inline bool mi_bfield_atomic_is_xset_mask(mi_xset_t set, _Atomic(mi_bfield_t)* b, mi_bfield_t mask) {
+static inline bool mi_bfield_atomic_is_xset_mask(mi_xset_t set, const _Atomic(mi_bfield_t)* b, mi_bfield_t mask) {
   mi_assert_internal(mask != 0);
   if (set) return mi_bfield_atomic_is_set_mask(b, mask);
   else return mi_bfield_atomic_is_clear_mask(b, mask);

@@ -371,7 +371,7 @@ static inline bool mi_bchunk_clearN(mi_bchunk_t* chunk, size_t cidx, size_t n, b

 // Check if a sequence of `n` bits within a chunk are all set/cleared.
 // This can cross bfield's
-mi_decl_noinline static bool mi_bchunk_is_xsetN_(mi_xset_t set, mi_bchunk_t* chunk, size_t field_idx, size_t idx, size_t n) {
+mi_decl_noinline static bool mi_bchunk_is_xsetN_(mi_xset_t set, const mi_bchunk_t* chunk, size_t field_idx, size_t idx, size_t n) {
   mi_assert_internal((field_idx*MI_BFIELD_BITS) + idx + n <= MI_BCHUNK_BITS);
   while (n > 0) {
     size_t m = MI_BFIELD_BITS - idx;   // m is the bits to xset in this field

@@ -391,7 +391,7 @@ mi_decl_noinline static bool mi_bchunk_is_xsetN_(mi_xset_t set, mi_bchunk_t* chu
 }

 // Check if a sequence of `n` bits within a chunk are all set/cleared.
-static inline bool mi_bchunk_is_xsetN(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, size_t n) {
+static inline bool mi_bchunk_is_xsetN(mi_xset_t set, const mi_bchunk_t* chunk, size_t cidx, size_t n) {
   mi_assert_internal(cidx + n <= MI_BCHUNK_BITS);
   mi_assert_internal(n>0);
   if (n==0) return true;

@@ -1413,7 +1413,23 @@ void mi_bbitmap_unsafe_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) {
 // Assign a specific size bin to a chunk
 static void mi_bbitmap_set_chunk_bin(mi_bbitmap_t* bbitmap, size_t chunk_idx, mi_bbin_t bin) {
   mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap));
-  mi_atomic_store_release(&bbitmap->chunk_bins[chunk_idx], (uint8_t)bin);
+  for (mi_bbin_t ibin = MI_BBIN_SMALL; ibin < MI_BBIN_NONE; ibin = mi_bbin_inc(ibin)) {
+    if (ibin == bin) {
+      mi_bchunk_set(& bbitmap->chunkmap_bins[ibin], chunk_idx, NULL);
+    }
+    else {
+      mi_bchunk_clear(&bbitmap->chunkmap_bins[ibin], chunk_idx, NULL);
+    }
+  }
 }

+mi_bbin_t mi_bbitmap_debug_get_bin(const mi_bchunkmap_t* chunkmap_bins, size_t chunk_idx) {
+  for (mi_bbin_t ibin = MI_BBIN_SMALL; ibin < MI_BBIN_NONE; ibin = mi_bbin_inc(ibin)) {
+    if (mi_bchunk_is_xsetN(MI_BIT_SET, &chunkmap_bins[ibin], chunk_idx, 1)) {
+      return ibin;
+    }
+  }
+  return MI_BBIN_NONE;
+}
+
 // Track the index of the highest chunk that is accessed.
@@ -1541,56 +1557,65 @@ static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap,
   mi_assert_internal(MI_BFIELD_BITS >= MI_BCHUNK_FIELDS);
   const mi_bfield_t cmap_mask = mi_bfield_mask(cmap_max_count,0);
   const size_t cmap_cycle = cmap_acc+1;
-  const mi_bbin_t bbin = mi_bbin_of(n);
-  // visit bins from smallest to largest (to reduce fragmentation on the larger blocks)
-  for(mi_bbin_t bin = MI_BBIN_SMALL; bin <= bbin; bin = mi_bbin_inc(bin)) // no need to traverse for MI_BBIN_NONE as anyone can allocate in MI_BBIN_SMALL
-      // (int bin = bbin; bin >= MI_BBIN_SMALL; bin--) // visit bins from largest size bin up to the NONE bin
+  const mi_bbin_t bbin = mi_bbin_of(n);
+  // visit each cmap entry
+  size_t cmap_idx = 0;
+  mi_bfield_cycle_iterate(cmap_mask, tseq, cmap_cycle, cmap_idx, X)
   {
-    size_t cmap_idx = 0;
-    mi_bfield_cycle_iterate(cmap_mask, tseq, cmap_cycle, cmap_idx, X)
-    {
-      // don't search into non-accessed memory until we tried other size bins as well
-      if (bin < bbin && cmap_idx > cmap_acc)
-         // (bin > MI_BBIN_SMALL && cmap_idx > cmap_acc) // large to small
-      {
-        break;
-      }
+    // and for each chunkmap entry we iterate over its bits to find the chunks
+    const mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bbitmap->chunkmap.bfields[cmap_idx]);
+    const size_t cmap_entry_cycle = (cmap_idx != cmap_acc ? MI_BFIELD_BITS : cmap_acc_bits);
+    if (cmap_entry == 0) continue;

-      // and for each chunkmap entry we iterate over its bits to find the chunks
-      const mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bbitmap->chunkmap.bfields[cmap_idx]);
-      const size_t cmap_entry_cycle = (cmap_idx != cmap_acc ? MI_BFIELD_BITS : cmap_acc_bits);
+    // get size bin masks
+    mi_bfield_t cmap_bins[MI_BBIN_COUNT] = { 0 };
+    cmap_bins[MI_BBIN_NONE] = cmap_entry;
+    for (mi_bbin_t ibin = MI_BBIN_SMALL; ibin < MI_BBIN_NONE; ibin = mi_bbin_inc(ibin)) {
+      const mi_bfield_t cmap_bin = mi_atomic_load_relaxed(&bbitmap->chunkmap_bins[ibin].bfields[cmap_idx]);
+      cmap_bins[ibin] = cmap_bin & cmap_entry;
+      cmap_bins[MI_BBIN_NONE] &= ~cmap_bin; // clear bits that are in an assigned size bin
+    }
+
+    // consider only chunks for a particular size bin at a time
+    // this picks the best bin only within a cmap entry (~ 1GiB address space), but avoids multiple
+    // iterations through all entries.
+    mi_assert_internal(bbin < MI_BBIN_NONE);
+    for (mi_bbin_t ibin = MI_BBIN_SMALL; ibin <= MI_BBIN_NONE;
+         // skip from bbin to NONE (so, say, a SMALL will never be placed in a OTHER, MEDIUM, or LARGE chunk to reduce fragmentation)
+         ibin = (ibin == bbin ? MI_BBIN_NONE : mi_bbin_inc(ibin)))
+    {
+      mi_assert_internal(ibin < MI_BBIN_COUNT);
+      const mi_bfield_t cmap_bin = cmap_bins[ibin];
       size_t eidx = 0;
-      mi_bfield_cycle_iterate(cmap_entry, tseq%8, cmap_entry_cycle, eidx, Y) // reduce the tseq to 8 bins to reduce using extra memory (see `mstress`)
+      mi_bfield_cycle_iterate(cmap_bin, tseq, cmap_entry_cycle, eidx, Y)
       {
         mi_assert_internal(eidx <= MI_BFIELD_BITS);
+        // assertion doesn't quite hold as the max_accessed may be out-of-date
+        // mi_assert_internal(cmap_entry_cycle > eidx || ibin == MI_BBIN_NONE);

         // get the chunk
         const size_t chunk_idx = cmap_idx*MI_BFIELD_BITS + eidx;
         mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap));
-        // only in the current size class!
-        const mi_bbin_t chunk_bin = (mi_bbin_t)mi_atomic_load_relaxed(&bbitmap->chunk_bins[chunk_idx]);
-        if ((mi_bbin_t)bin == chunk_bin || (bin == bbin && chunk_bin == MI_BBIN_NONE)) // only allow NONE at the final run
-           // ((mi_bbin_t)bin == chunk_bin || (bin <= MI_BBIN_SMALL && chunk_bin <= MI_BBIN_SMALL)) { largest to smallest
-        {
-          mi_bchunk_t* chunk = &bbitmap->chunks[chunk_idx];
-          size_t cidx;
-          if ((*on_find)(chunk, n, &cidx)) {
-            if (cidx==0 && chunk_bin == MI_BBIN_NONE) { // only the first determines the size bin
-              // this chunk is now reserved for the `bbin` size class
-              mi_bbitmap_set_chunk_bin(bbitmap, chunk_idx, bbin);
-            }
-            *pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx;
-            mi_assert_internal(*pidx + n <= mi_bbitmap_max_bits(bbitmap));
-            return true;
-          }
-          else {
-            /* we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation. */
-            mi_bbitmap_chunkmap_try_clear(bbitmap, chunk_idx);
+        mi_bchunk_t* chunk = &bbitmap->chunks[chunk_idx];
+
+        size_t cidx;
+        if ((*on_find)(chunk, n, &cidx)) {
+          if (cidx==0 && ibin == MI_BBIN_NONE) { // only the first block determines the size bin
+            // this chunk is now reserved for the `bbin` size class
+            mi_bbitmap_set_chunk_bin(bbitmap, chunk_idx, bbin);
+          }
+          *pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx;
+          mi_assert_internal(*pidx + n <= mi_bbitmap_max_bits(bbitmap));
+          return true;
+        }
+        else {
+          // todo: should _on_find_ return a boolen if there is a chance all are clear to avoid calling `try_clear?`
+          // we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation.
+          mi_bbitmap_chunkmap_try_clear(bbitmap, chunk_idx);
         }
       }
       mi_bfield_cycle_iterate_end(Y);
     }
-    mi_bfield_cycle_iterate_end(X);
   }
+  mi_bfield_cycle_iterate_end(X);
   return false;
 }

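Aside (standalone worked example, not mimalloc code): per chunkmap entry the new search derives one mask per size bin, plus a NONE mask for free-but-unassigned chunks. Assuming a 4-chunk entry with one SMALL and two MEDIUM chunks:

  #include <assert.h>
  #include <stdint.h>

  int main(void) {
    const uint64_t cmap_entry      = 0xF;   // 4 chunks with free space
    const uint64_t chunkmap_small  = 0x1;   // chunk 0 assigned to SMALL
    const uint64_t chunkmap_medium = 0x6;   // chunks 1 and 2 assigned to MEDIUM
    const uint64_t cmap_small  = chunkmap_small  & cmap_entry;                      // 0x1
    const uint64_t cmap_medium = chunkmap_medium & cmap_entry;                      // 0x6
    const uint64_t cmap_none   = cmap_entry & ~(chunkmap_small | chunkmap_medium);  // 0x8
    assert(cmap_small == 0x1 && cmap_medium == 0x6 && cmap_none == 0x8);
    return 0;
  }

A MEDIUM request then visits the SMALL, OTHER, and MEDIUM masks before falling back to the NONE mask, so larger requests may reuse smaller-bin chunks while a SMALL request never claims a MEDIUM or LARGE chunk.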
src/bitmap.h (14 changed lines)
@@ -215,18 +215,24 @@ bool _mi_bitmap_forall_setc_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* vis
 // Size bins; larger bins are allowed to go into smaller bins.
 // SMALL can only be in small (and NONE), so they cannot fragment the larger bins.
 typedef enum mi_bbin_e {
-  MI_BBIN_NONE,    // no bin assigned yet (the chunk is completely free)
   MI_BBIN_SMALL,   // slice_count == 1
   MI_BBIN_OTHER,   // slice_count: any other from the other bins, and 1 <= slice_count <= MI_BCHUNK_BITS
   MI_BBIN_MEDIUM,  // slice_count == 8
   MI_BBIN_LARGE,   // slice_count == MI_BFIELD_BITS -- only used if MI_ENABLE_LARGE_PAGES is 1
+  MI_BBIN_NONE,    // no bin assigned yet (the chunk is completely free)
   MI_BBIN_COUNT
 } mi_bbin_t;

 static inline mi_bbin_t mi_bbin_inc(mi_bbin_t bbin) {
+  mi_assert_internal(bbin < MI_BBIN_COUNT);
   return (mi_bbin_t)((int)bbin + 1);
 }

+static inline mi_bbin_t mi_bbin_dec(mi_bbin_t bbin) {
+  mi_assert_internal(bbin > MI_BBIN_NONE);
+  return (mi_bbin_t)((int)bbin - 1);
+}
+
 static inline mi_bbin_t mi_bbin_of(size_t slice_count) {
   if (slice_count==1) return MI_BBIN_SMALL;
   if (slice_count==8) return MI_BBIN_MEDIUM;

@@ -241,8 +247,8 @@ typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bbitmap_s {
   _Atomic(size_t)  chunk_count;         // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS)
   _Atomic(size_t)  chunk_max_accessed;  // max chunk index that was once cleared or set
   size_t           _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 2];  // suppress warning on msvc
-  mi_bchunkmap_t   chunkmap;
-  _Atomic(uint8_t) chunk_bins[MI_BITMAP_MAX_CHUNK_COUNT];  // 512b
+  mi_bchunkmap_t   chunkmap;
+  mi_bchunkmap_t   chunkmap_bins[MI_BBIN_COUNT - 1];  // chunkmaps with bit set if the chunk is in that size class (excluding MI_BBIN_NONE)
   mi_bchunk_t      chunks[MI_BITMAP_DEFAULT_CHUNK_COUNT];  // usually dynamic MI_BITMAP_MAX_CHUNK_COUNT
 } mi_bbitmap_t;

@@ -255,6 +261,8 @@ static inline size_t mi_bbitmap_max_bits(const mi_bbitmap_t* bbitmap) {
   return (mi_bbitmap_chunk_count(bbitmap) * MI_BCHUNK_BITS);
 }

+mi_bbin_t mi_bbitmap_debug_get_bin(const mi_bchunk_t* chunkmap_bins, size_t chunk_idx);
+
 size_t mi_bbitmap_size(size_t bit_count, size_t* chunk_count);

src/heap.c (12 changed lines)
@@ -182,12 +182,13 @@ void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool allow_destroy,
   mi_memid_t memid = heap->memid;
   _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t));
   heap->memid = memid;
-  heap->tld = tld;  // avoid reading the thread-local tld during initialization
+  heap->tld = tld;  // avoid reading the thread-local tld during initialization
+  heap->tag = heap_tag;
+  heap->numa_node = tld->numa_node;
   heap->exclusive_arena = _mi_arena_from_id(arena_id);
   heap->allow_page_reclaim = (!allow_destroy && mi_option_get(mi_option_page_reclaim_on_free) >= 0);
   heap->allow_page_abandon = (!allow_destroy && mi_option_get(mi_option_page_full_retain) >= 0);
   heap->page_full_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32);
-  heap->tag = heap_tag;
   if (heap->tld->is_in_threadpool) {
     // if we run as part of a thread pool it is better to not arbitrarily reclaim abandoned pages into our heap.
     // this is checked in `free.c:mi_free_try_collect_mt`

@@ -227,7 +228,7 @@ mi_heap_t* _mi_heap_create(int heap_tag, bool allow_destroy, mi_arena_id_t arena
   else {
     // heaps associated wita a specific arena are allocated in that arena
     // note: takes up at least one slice which is quite wasteful...
-    heap = (mi_heap_t*)_mi_arenas_alloc(_mi_subproc(), _mi_align_up(sizeof(mi_heap_t),MI_ARENA_MIN_OBJ_SIZE), true, true, _mi_arena_from_id(arena_id), tld->thread_seq, &memid);
+    heap = (mi_heap_t*)_mi_arenas_alloc(_mi_subproc(), _mi_align_up(sizeof(mi_heap_t),MI_ARENA_MIN_OBJ_SIZE), true, true, _mi_arena_from_id(arena_id), tld->thread_seq, tld->numa_node, &memid);
   }
   if (heap==NULL) {
     _mi_error_message(ENOMEM, "unable to allocate heap meta-data\n");

@@ -261,6 +262,11 @@ uintptr_t _mi_heap_random_next(mi_heap_t* heap) {
   return _mi_random_next(&heap->random);
 }

+void mi_heap_set_numa_affinity(mi_heap_t* heap, int numa_node) {
+  if (heap == NULL) return;
+  heap->numa_node = (numa_node < 0 ? -1 : numa_node % _mi_os_numa_node_count());
+}
+
 // zero out the page queues
 static void mi_heap_reset_pages(mi_heap_t* heap) {
   mi_assert_internal(heap != NULL);
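Aside (illustrative usage sketch): assuming the new `mi_heap_set_numa_affinity` is exported in `mimalloc.h` (as its `mi_` prefix suggests), a heap can be pinned to a preferred node before allocating from it, using the standard heap API:

  #include <mimalloc.h>

  int main(void) {
    mi_heap_t* heap = mi_heap_new();
    mi_heap_set_numa_affinity(heap, 1);   // prefer arenas on NUMA node 1 for this heap
    void* p = mi_heap_malloc(heap, 64);   // fresh pages for this heap are searched node-first
    mi_free(p);
    mi_heap_delete(heap);
    return 0;
  }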
src/init.c (55 changed lines)
@@ -104,6 +104,7 @@ static mi_decl_cache_align mi_subproc_t subproc_main
 static mi_decl_cache_align mi_tld_t tld_empty = {
   0,              // thread_id
   0,              // thread_seq
+  0,              // default numa node
   &subproc_main,  // subproc
   NULL,           // heap_backing
   NULL,           // heaps list

@@ -117,6 +118,7 @@ static mi_decl_cache_align mi_tld_t tld_empty = {
 mi_decl_cache_align const mi_heap_t _mi_heap_empty = {
   &tld_empty,             // tld
   NULL,                   // exclusive_arena
+  0,                      // preferred numa node
   0,                      // cookie
   //{ 0, 0 },             // keys
   { {0}, {0}, 0, true },  // random

@@ -141,6 +143,7 @@ extern mi_decl_hidden mi_decl_cache_align mi_heap_t heap_main;
 static mi_decl_cache_align mi_tld_t tld_main = {
   0,              // thread_id
   0,              // thread_seq
+  0,              // numa node
   &subproc_main,  // subproc
   &heap_main,     // heap_backing
   &heap_main,     // heaps list

@@ -154,6 +157,7 @@ static mi_decl_cache_align mi_tld_t tld_main = {
 mi_decl_cache_align mi_heap_t heap_main = {
   &tld_main,      // thread local data
   NULL,           // exclusive arena
+  0,              // preferred numa node
   0,              // initial cookie
   //{ 0, 0 },     // the key of the main heap can be fixed (unlike page keys that need to be secure!)
   { {0x846ca68b}, {0}, 0, true },  // random

@@ -306,6 +310,7 @@ static mi_tld_t* mi_tld_alloc(void) {
     tld->heap_backing = NULL;
     tld->heaps = NULL;
     tld->subproc = &subproc_main;
+    tld->numa_node = _mi_os_numa_node();
     tld->thread_id = _mi_prim_thread_id();
     tld->thread_seq = mi_atomic_add_acq_rel(&thread_total_count, 1);
     tld->is_in_threadpool = _mi_prim_thread_is_in_threadpool();

@@ -647,25 +652,52 @@ void _mi_process_load(void) {
   _mi_random_reinit_if_weak(&heap_main.random);
 }

-#if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64))
-#include <intrin.h>
+// CPU features
 mi_decl_cache_align bool _mi_cpu_has_fsrm = false;
 mi_decl_cache_align bool _mi_cpu_has_erms = false;
+mi_decl_cache_align bool _mi_cpu_has_popcnt = false;
+
+#if (MI_ARCH_X64 || MI_ARCH_X86)
+#if defined(__GNUC__)
+#include <cpuid.h>
+static bool mi_cpuid(uint32_t* regs4, uint32_t level) {
+  return (__get_cpuid(level, &regs4[0], &regs4[1], &regs4[2], &regs4[3]) == 1);
+}
+
+#elif defined(_MSC_VER)
+static bool mi_cpuid(uint32_t* regs4, uint32_t level) {
+  __cpuid((int32_t*)regs4, (int32_t)level);
+  return true;
+}
+#else
+static bool mi_cpuid(uint32_t* regs4, uint32_t level) {
+  MI_UNUSED(regs4); MI_UNUSED(level);
+  return false;
+}
+#endif

 static void mi_detect_cpu_features(void) {
   // FSRM for fast short rep movsb/stosb support (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017))
   // EMRS for fast enhanced rep movsb/stosb support
-  int32_t cpu_info[4];
-  __cpuid(cpu_info, 7);
-  _mi_cpu_has_fsrm = ((cpu_info[3] & (1 << 4)) != 0); // bit 4 of EDX : see <https://en.wikipedia.org/wiki/CPUID#EAX=7,_ECX=0:_Extended_Features>
-  _mi_cpu_has_erms = ((cpu_info[2] & (1 << 9)) != 0); // bit 9 of ECX : see <https://en.wikipedia.org/wiki/CPUID#EAX=7,_ECX=0:_Extended_Features>
+  uint32_t cpu_info[4];
+  if (mi_cpuid(cpu_info, 7)) {
+    _mi_cpu_has_fsrm = ((cpu_info[3] & (1 << 4)) != 0); // bit 4 of EDX : see <https://en.wikipedia.org/wiki/CPUID#EAX=7,_ECX=0:_Extended_Features>
+    _mi_cpu_has_erms = ((cpu_info[1] & (1 << 9)) != 0); // bit 9 of EBX : see <https://en.wikipedia.org/wiki/CPUID#EAX=7,_ECX=0:_Extended_Features>
+  }
+  if (mi_cpuid(cpu_info, 1)) {
+    _mi_cpu_has_popcnt = ((cpu_info[2] & (1 << 23)) != 0); // bit 23 of ECX : see <https://en.wikipedia.org/wiki/CPUID#EAX=1:_Processor_Info_and_Feature_Bits>
+  }
 }

 #else
 static void mi_detect_cpu_features(void) {
-  // nothing
+  #if MI_ARCH_ARM64
+  _mi_cpu_has_popcnt = true;
+  #endif
 }
 #endif


 // Initialize the process; called by thread_init or the process loader
 void mi_process_init(void) mi_attr_noexcept {
   // ensure we are called once
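Aside (standalone sketch, GCC/Clang on x86/x64 only; not mimalloc code): the same `__get_cpuid` helper used above can be exercised directly, for example to test the popcnt bit (bit 23 of ECX for leaf 1):

  #include <cpuid.h>
  #include <stdio.h>

  int main(void) {
    unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
    if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) == 1) {      // returns 1 when the leaf is supported
      printf("popcnt: %s\n", ((ecx & (1u << 23)) != 0) ? "yes" : "no");
    }
    return 0;
  }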
@@ -685,15 +717,6 @@ void mi_process_init(void) mi_attr_noexcept {
   // the following two can potentially allocate (on freeBSD for locks and thread keys)
   mi_subproc_main_init();
   mi_process_setup_auto_thread_done();
-
-  #if MI_DEBUG
-  _mi_verbose_message("debug level : %d\n", MI_DEBUG);
-  #endif
-  _mi_verbose_message("secure level: %d\n", MI_SECURE);
-  _mi_verbose_message("mem tracking: %s\n", MI_TRACK_TOOL);
-  #if MI_TSAN
-  _mi_verbose_message("thread santizer enabled\n");
-  #endif
   mi_thread_init();

   #if defined(_WIN32) && defined(MI_WIN_USE_FLS)
@@ -355,7 +355,6 @@ size_t _mi_clz_generic(size_t x) {

 #endif // bit scan

 #if !MI_HAS_FAST_POPCOUNT
-
 #if MI_SIZE_SIZE == 4
 #define mi_mask_even_bits32 (0x55555555)

@@ -383,7 +382,7 @@ static size_t mi_popcount_generic32(uint32_t x) {
   return mi_byte_sum32(x);
 }

-size_t _mi_popcount_generic(size_t x) {
+mi_decl_noinline size_t _mi_popcount_generic(size_t x) {
   return mi_popcount_generic32(x);
 }

@@ -407,9 +406,8 @@ static size_t mi_popcount_generic64(uint64_t x) {
   return mi_byte_sum64(x);
 }

-size_t _mi_popcount_generic(size_t x) {
+mi_decl_noinline size_t _mi_popcount_generic(size_t x) {
   return mi_popcount_generic64(x);
 }
 #endif
-
 #endif // popcount
@@ -175,7 +175,7 @@ static mi_option_desc_t options[_mi_option_last] =
   { 0, UNINIT, MI_OPTION(max_vabits) },             // max virtual address space bits
   { MI_DEFAULT_PAGEMAP_COMMIT,
        UNINIT, MI_OPTION(pagemap_commit) },         // commit the full pagemap upfront?
-  { 0, UNINIT, MI_OPTION(page_commit_on_demand) },  // commit pages on-demand (2 disables this only on overcommit systems (like Linux))
+  { 1, UNINIT, MI_OPTION(page_commit_on_demand) },  // commit pages on-demand (2 disables this only on overcommit systems (like Linux))
   { 16, UNINIT, MI_OPTION(page_reclaim_max) },      // don't reclaim pages if we already own N pages (in that size class)
 };

src/os.c (34 changed lines)
@@ -694,18 +694,19 @@ static void mi_os_free_huge_os_pages(void* p, size_t size) {
   Support NUMA aware allocation
 -----------------------------------------------------------------------------*/

-_Atomic(size_t) _mi_numa_node_count; // = 0   // cache the node count
+static _Atomic(int) _mi_numa_node_count; // = 0   // cache the node count

-size_t _mi_os_numa_node_count_get(void) {
-  size_t count = mi_atomic_load_acquire(&_mi_numa_node_count);
-  if (count <= 0) {
+int _mi_os_numa_node_count(void) {
+  int count = mi_atomic_load_acquire(&_mi_numa_node_count);
+  if mi_unlikely(count <= 0) {
     long ncount = mi_option_get(mi_option_use_numa_nodes); // given explicitly?
-    if (ncount > 0) {
-      count = (size_t)ncount;
+    if (ncount > 0 && ncount < INT_MAX) {
+      count = (int)ncount;
     }
     else {
-      count = _mi_prim_numa_node_count(); // or detect dynamically
-      if (count == 0) count = 1;
+      const size_t n = _mi_prim_numa_node_count(); // or detect dynamically
+      if (n == 0 || n > INT_MAX) { count = 1; }
+      else { count = (int)n; }
     }
     mi_atomic_store_release(&_mi_numa_node_count, count); // save it
     _mi_verbose_message("using %zd numa regions\n", count);

@@ -713,15 +714,24 @@ size_t _mi_os_numa_node_count_get(void) {
   return count;
 }

-int _mi_os_numa_node_get(void) {
-  size_t numa_count = _mi_os_numa_node_count();
-
+static int mi_os_numa_node_get(void) {
+  int numa_count = _mi_os_numa_node_count();
   if (numa_count<=1) return 0; // optimize on single numa node systems: always node 0
   // never more than the node count and >= 0
-  size_t numa_node = _mi_prim_numa_node();
+  const size_t n = _mi_prim_numa_node();
+  int numa_node = (n < INT_MAX ? (int)n : 0);
   if (numa_node >= numa_count) { numa_node = numa_node % numa_count; }
-  return (int)numa_node;
+  return numa_node;
 }

+int _mi_os_numa_node(void) {
+  if mi_likely(mi_atomic_load_relaxed(&_mi_numa_node_count) == 1) { return 0; }
+  else return mi_os_numa_node_get();
+}
+
+
+
 /* ----------------------------------------------------------------------------
   Public API
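Aside (standalone, illustrative only): the clamping in `mi_os_numa_node_get` above maps any raw node id reported by the OS into the valid range `[0, numa_count)`, with a fast path for single-node systems:

  #include <assert.h>

  static int clamp_numa_node(int numa_node, int numa_count) {
    if (numa_count <= 1) return 0;   // single-node fast path: always node 0
    if (numa_node < 0) return 0;
    return (numa_node >= numa_count ? numa_node % numa_count : numa_node);
  }

  int main(void) {
    assert(clamp_numa_node(5, 4) == 1);   // out-of-range id wraps around
    assert(clamp_numa_node(2, 1) == 0);   // single node: always 0
    return 0;
  }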
@@ -62,8 +62,16 @@ terms of the MIT license. A copy of the license can be found in the file
 #include <sys/syscall.h>
 #endif

+#if !defined(MADV_DONTNEED) && defined(POSIX_MADV_DONTNEED)  // QNX
+#define MADV_DONTNEED POSIX_MADV_DONTNEED
+#endif
+#if !defined(MADV_FREE) && defined(POSIX_MADV_FREE)          // QNX
+#define MADV_FREE POSIX_MADV_FREE
+#endif
+
 #define MI_UNIX_LARGE_PAGE_SIZE (2*MI_MiB) // TODO: can we query the OS for this?

 //------------------------------------------------------------------------------------
 // Use syscalls for some primitives to allow for libraries that override open/read/close etc.
 // and do allocation themselves; using syscalls prevents recursion when mimalloc is
@@ -191,6 +199,8 @@ int _mi_prim_free(void* addr, size_t size ) {
 static int unix_madvise(void* addr, size_t size, int advice) {
   #if defined(__sun)
   int res = madvise((caddr_t)addr, size, advice);  // Solaris needs cast (issue #520)
+  #elif defined(__QNX__)
+  int res = posix_madvise(addr, size, advice);
   #else
   int res = madvise(addr, size, advice);
   #endif
src/stats.c (30 changed lines)
@@ -92,23 +92,23 @@ void __mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount) {


 // must be thread safe as it is called from stats_merge
-static void mi_stat_count_add(mi_stat_count_t* stat, const mi_stat_count_t* src) {
+static void mi_stat_count_add_mt(mi_stat_count_t* stat, const mi_stat_count_t* src) {
   if (stat==src) return;
-  if (src->total!=0)   { mi_atomic_addi64_relaxed(&stat->total, src->total); }
-  if (src->current!=0) { mi_atomic_addi64_relaxed(&stat->current, src->current); }
-  // peak scores do really not work across threads ... we use conservative max
-  if (src->peak > stat->peak) {
-    mi_atomic_maxi64_relaxed(&stat->peak, src->peak); // or: mi_atomic_addi64_relaxed( &stat->peak, src->peak);
-  }
+  mi_atomic_void_addi64_relaxed(&stat->total, &src->total);
+  mi_atomic_void_addi64_relaxed(&stat->current, &src->current);
+  // peak scores do really not work across threads .. we just add them
+  mi_atomic_void_addi64_relaxed( &stat->peak, &src->peak);
+  // or, take the max?
+  // mi_atomic_maxi64_relaxed(&stat->peak, src->peak);
 }

-static void mi_stat_counter_add(mi_stat_counter_t* stat, const mi_stat_counter_t* src) {
+static void mi_stat_counter_add_mt(mi_stat_counter_t* stat, const mi_stat_counter_t* src) {
   if (stat==src) return;
-  if (src->total!=0) { mi_atomic_addi64_relaxed(&stat->total, src->total); }
+  mi_atomic_void_addi64_relaxed(&stat->total, &src->total);
 }

-#define MI_STAT_COUNT(stat)    mi_stat_count_add(&stats->stat, &src->stat);
-#define MI_STAT_COUNTER(stat)  mi_stat_counter_add(&stats->stat, &src->stat);
+#define MI_STAT_COUNT(stat)    mi_stat_count_add_mt(&stats->stat, &src->stat);
+#define MI_STAT_COUNTER(stat)  mi_stat_counter_add_mt(&stats->stat, &src->stat);

 // must be thread safe as it is called from stats_merge
 static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) {

@@ -119,11 +119,11 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) {

   #if MI_STAT>1
   for (size_t i = 0; i <= MI_BIN_HUGE; i++) {
-    mi_stat_count_add(&stats->malloc_bins[i], &src->malloc_bins[i]);
+    mi_stat_count_add_mt(&stats->malloc_bins[i], &src->malloc_bins[i]);
   }
   #endif
   for (size_t i = 0; i <= MI_BIN_HUGE; i++) {
-    mi_stat_count_add(&stats->page_bins[i], &src->page_bins[i]);
+    mi_stat_count_add_mt(&stats->page_bins[i], &src->page_bins[i]);
   }
 }

@@ -318,8 +318,8 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0)
   mi_stat_print(&stats->malloc_normal, "normal", (stats->malloc_normal_count.total == 0 ? 1 : -1), out, arg);
   mi_stat_print(&stats->malloc_huge, "huge", (stats->malloc_huge_count.total == 0 ? 1 : -1), out, arg);
   mi_stat_count_t total = { 0,0,0 };
-  mi_stat_count_add(&total, &stats->malloc_normal);
-  mi_stat_count_add(&total, &stats->malloc_huge);
+  mi_stat_count_add_mt(&total, &stats->malloc_normal);
+  mi_stat_count_add_mt(&total, &stats->malloc_huge);
   mi_stat_print_ex(&total, "total", 1, out, arg, "");
   #endif
   #if MI_STAT>1