merge from dev

daan 2020-01-20 19:06:08 -08:00
commit 394a7a92ab
92 changed files with 2994 additions and 1010 deletions

View file

@ -79,7 +79,7 @@ mi_decl_allocator void* mi_heap_zalloc_aligned(mi_heap_t* heap, size_t size, siz
mi_decl_allocator void* mi_heap_calloc_aligned_at(mi_heap_t* heap, size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
size_t total;
if (mi_mul_overflow(count, size, &total)) return NULL;
if (mi_count_size_overflow(count, size, &total)) return NULL;
return mi_heap_zalloc_aligned_at(heap, total, alignment, offset);
}
@ -168,13 +168,13 @@ mi_decl_allocator void* mi_heap_rezalloc_aligned(mi_heap_t* heap, void* p, size_
mi_decl_allocator void* mi_heap_recalloc_aligned_at(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
size_t total;
if (mi_mul_overflow(newcount, size, &total)) return NULL;
if (mi_count_size_overflow(newcount, size, &total)) return NULL;
return mi_heap_rezalloc_aligned_at(heap, p, total, alignment, offset);
}
mi_decl_allocator void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept {
size_t total;
if (mi_mul_overflow(newcount, size, &total)) return NULL;
if (mi_count_size_overflow(newcount, size, &total)) return NULL;
return mi_heap_rezalloc_aligned(heap, p, total, alignment);
}
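
The hunks above switch the count*size computations from a raw `mi_mul_overflow` to `mi_count_size_overflow`. As a hedged sketch (not the actual mimalloc code), such a helper typically wraps a checked multiply so every calloc-style entry point can bail out with NULL on overflow; the `my_` names below are illustrative only.

#include <stdbool.h>
#include <stddef.h>

// Checked multiply: returns true on overflow (illustrative, not mimalloc's helper).
static inline bool my_mul_overflow(size_t count, size_t size, size_t* total) {
#if defined(__GNUC__) || defined(__clang__)
  return __builtin_mul_overflow(count, size, total);
#else
  *total = count * size;
  return (size != 0 && *total / size != count);  // detect wrap-around by division
#endif
}

// count*size with overflow check, in the spirit of `mi_count_size_overflow`.
static inline bool my_count_size_overflow(size_t count, size_t size, size_t* total) {
  if (count == 1) { *total = size; return false; }  // common case: no multiply needed
  return my_mul_overflow(count, size, total);       // true means: caller should return NULL
}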

View file

@ -98,7 +98,7 @@ terms of the MIT license. A copy of the license can be found in the file
void operator delete[](void* p, std::size_t n) MI_FORWARD02(mi_free_size,p,n);
#endif
#if (__cplusplus > 201402L || defined(__cpp_aligned_new))
#if (__cplusplus > 201402L || defined(__cpp_aligned_new)) && (!defined(__GNUC__) || (__GNUC__ > 5))
void operator delete (void* p, std::align_val_t al) noexcept { mi_free_aligned(p, static_cast<size_t>(al)); }
void operator delete[](void* p, std::align_val_t al) noexcept { mi_free_aligned(p, static_cast<size_t>(al)); }
void operator delete (void* p, std::size_t n, std::align_val_t al) noexcept { mi_free_size_aligned(p, n, static_cast<size_t>(al)); };

View file

@ -22,7 +22,7 @@ terms of the MIT license. A copy of the license can be found in the file
// Fast allocation in a page: just pop from the free list.
// Fall back to generic allocation only if the list is empty.
extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept {
mi_assert_internal(page->block_size==0||page->block_size >= size);
mi_assert_internal(page->xblock_size==0||mi_page_block_size(page) >= size);
mi_block_t* block = page->free;
if (mi_unlikely(block == NULL)) {
return _mi_malloc_generic(heap, size); // slow path
@ -92,18 +92,18 @@ extern inline mi_decl_allocator void* mi_malloc(size_t size) mi_attr_noexcept {
void _mi_block_zero_init(const mi_page_t* page, void* p, size_t size) {
// note: we need to initialize the whole block to zero, not just the requested size,
// or the recalloc/rezalloc functions cannot safely expand in place (see issue #63)
UNUSED(size);
UNUSED_RELEASE(size);
mi_assert_internal(p != NULL);
mi_assert_internal(size > 0 && page->block_size >= size);
mi_assert_internal(mi_page_block_size(page) >= size); // size can be zero
mi_assert_internal(_mi_ptr_page(p)==page);
if (page->is_zero) {
// already zero initialized memory?
((mi_block_t*)p)->next = 0; // clear the free list pointer
mi_assert_expensive(mi_mem_is_zero(p,page->block_size));
mi_assert_expensive(mi_mem_is_zero(p, mi_page_block_size(page)));
}
else {
// otherwise memset
memset(p, 0, page->block_size);
memset(p, 0, mi_page_block_size(page));
}
}
@ -142,12 +142,11 @@ static bool mi_list_contains(const mi_page_t* page, const mi_block_t* list, cons
static mi_decl_noinline bool mi_check_is_double_freex(const mi_page_t* page, const mi_block_t* block) {
// The decoded value is in the same page (or NULL).
// Walk the free lists to verify positively if it is already freed
mi_thread_free_t tf = (mi_thread_free_t)mi_atomic_read_relaxed(mi_atomic_cast(uintptr_t, &page->thread_free));
if (mi_list_contains(page, page->free, block) ||
mi_list_contains(page, page->local_free, block) ||
mi_list_contains(page, mi_tf_block(tf), block))
mi_list_contains(page, mi_page_thread_free(page), block))
{
_mi_fatal_error("double free detected of block %p with size %zu\n", block, page->block_size);
_mi_error_message(EAGAIN, "double free detected of block %p with size %zu\n", block, mi_page_block_size(page));
return true;
}
return false;
@ -177,39 +176,50 @@ static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block
// Free
// ------------------------------------------------------
// free huge block from another thread
static mi_decl_noinline void mi_free_huge_block_mt(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) {
// huge page segments are always abandoned and can be freed immediately
mi_assert_internal(segment->kind==MI_SEGMENT_HUGE);
mi_assert_internal(segment == _mi_page_segment(page));
mi_assert_internal(mi_atomic_read_relaxed(&segment->thread_id)==0);
// claim it and free
mi_heap_t* heap = mi_get_default_heap();
// paranoia: if this is the last reference, the CAS should always succeed
if (mi_atomic_cas_strong(&segment->thread_id, heap->thread_id, 0)) {
mi_block_set_next(page, block, page->free);
page->free = block;
page->used--;
page->is_zero = false;
mi_assert(page->used == 0);
mi_tld_t* tld = heap->tld;
const size_t bsize = mi_page_block_size(page);
if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
_mi_stat_decrease(&tld->stats.large, bsize);
}
else {
_mi_stat_decrease(&tld->stats.huge, bsize);
}
_mi_segment_page_free(page, true, &tld->segments);
}
}
// multi-threaded free
static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* block)
{
mi_thread_free_t tfree;
mi_thread_free_t tfreex;
bool use_delayed;
// huge page segments are always abandoned and can be freed immediately
mi_segment_t* segment = _mi_page_segment(page);
if (segment->kind==MI_SEGMENT_HUGE) {
// huge page segments are always abandoned and can be freed immediately
mi_assert_internal(mi_atomic_read_relaxed(&segment->thread_id)==0);
mi_assert_internal(mi_atomic_read_ptr_relaxed(mi_atomic_cast(void*,&segment->abandoned_next))==NULL);
// claim it and free
mi_heap_t* heap = mi_get_default_heap();
// paranoia: if this is the last reference, the CAS should always succeed
if (mi_atomic_cas_strong(&segment->thread_id,heap->thread_id,0)) {
mi_block_set_next(page, block, page->free);
page->free = block;
page->used--;
page->is_zero = false;
mi_assert(page->used == 0);
mi_tld_t* tld = heap->tld;
_mi_stat_decrease(&tld->stats.huge, page->block_size);
_mi_segment_page_free(page,true,&tld->segments);
}
mi_free_huge_block_mt(segment, page, block);
return;
}
mi_thread_free_t tfree;
mi_thread_free_t tfreex;
bool use_delayed;
do {
tfree = page->thread_free;
use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE ||
(mi_tf_delayed(tfree) == MI_NO_DELAYED_FREE && page->used == mi_atomic_read_relaxed(&page->thread_freed)+1) // data-race but ok, just optimizes early release of the page
);
tfree = mi_atomic_read_relaxed(&page->xthread_free);
use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE);
if (mi_unlikely(use_delayed)) {
// unlikely: this only happens on the first concurrent free in a page that is in the full list
tfreex = mi_tf_set_delayed(tfree,MI_DELAYED_FREEING);
@ -219,15 +229,11 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc
mi_block_set_next(page, block, mi_tf_block(tfree));
tfreex = mi_tf_set_block(tfree,block);
}
} while (!mi_atomic_cas_weak(mi_atomic_cast(uintptr_t,&page->thread_free), tfreex, tfree));
} while (!mi_atomic_cas_weak(&page->xthread_free, tfreex, tfree));
if (mi_likely(!use_delayed)) {
// increment the thread free count and return
mi_atomic_increment(&page->thread_freed);
}
else {
if (mi_unlikely(use_delayed)) {
// racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`)
mi_heap_t* heap = (mi_heap_t*)mi_atomic_read_ptr(mi_atomic_cast(void*, &page->heap));
mi_heap_t* heap = mi_page_heap(page);
mi_assert_internal(heap != NULL);
if (heap != NULL) {
// add to the delayed free list of this heap. (do this atomically as the lock only protects heap memory validity)
@ -240,10 +246,10 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc
// and reset the MI_DELAYED_FREEING flag
do {
tfreex = tfree = page->thread_free;
mi_assert_internal(mi_tf_delayed(tfree) == MI_NEVER_DELAYED_FREE || mi_tf_delayed(tfree) == MI_DELAYED_FREEING);
if (mi_tf_delayed(tfree) != MI_NEVER_DELAYED_FREE) tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE);
} while (!mi_atomic_cas_weak(mi_atomic_cast(uintptr_t,&page->thread_free), tfreex, tfree));
tfreex = tfree = mi_atomic_read_relaxed(&page->xthread_free);
mi_assert_internal(mi_tf_delayed(tfree) == MI_DELAYED_FREEING);
tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE);
} while (!mi_atomic_cas_weak(&page->xthread_free, tfreex, tfree));
}
}
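
Both CAS loops in `_mi_free_block_mt` above follow the same retry shape: read the current `xthread_free` word, build the desired value, and retry the weak compare-and-swap until no other thread raced in between. A minimal sketch of that shape as a lock-free push onto a plain atomic pointer (C11 atomics; `node_t` and `stack_push` are illustrative names, and the real word additionally encodes the delayed-free flag, which this sketch omits):

#include <stdatomic.h>
#include <stddef.h>

typedef struct node_s { struct node_s* next; } node_t;

static void stack_push(_Atomic(node_t*)* top, node_t* n) {
  node_t* old = atomic_load_explicit(top, memory_order_relaxed);
  do {
    n->next = old;  // link to the current head (reloaded by the CAS on failure)
  } while (!atomic_compare_exchange_weak_explicit(top, &old, n,
             memory_order_release, memory_order_relaxed));
}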
@ -252,7 +258,7 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc
static inline void _mi_free_block(mi_page_t* page, bool local, mi_block_t* block)
{
#if (MI_DEBUG)
memset(block, MI_DEBUG_FREED, page->block_size);
memset(block, MI_DEBUG_FREED, mi_page_block_size(page));
#endif
// and push it on the free list
@ -279,7 +285,7 @@ static inline void _mi_free_block(mi_page_t* page, bool local, mi_block_t* block
mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p) {
mi_assert_internal(page!=NULL && p!=NULL);
size_t diff = (uint8_t*)p - _mi_page_start(segment, page, NULL);
size_t adjust = (diff % page->block_size);
size_t adjust = (diff % mi_page_block_size(page));
return (mi_block_t*)((uintptr_t)p - adjust);
}
@ -294,7 +300,7 @@ void mi_free(void* p) mi_attr_noexcept
{
#if (MI_DEBUG>0)
if (mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0)) {
_mi_error_message("trying to free an invalid (unaligned) pointer: %p\n", p);
_mi_error_message(EINVAL, "trying to free an invalid (unaligned) pointer: %p\n", p);
return;
}
#endif
@ -304,16 +310,16 @@ void mi_free(void* p) mi_attr_noexcept
#if (MI_DEBUG!=0)
if (mi_unlikely(!mi_is_in_heap_region(p))) {
_mi_warning_message("possibly trying to free a pointer that does not point to a valid heap region: 0x%p\n"
_mi_warning_message("possibly trying to free a pointer that does not point to a valid heap region: %p\n"
"(this may still be a valid very large allocation (over 64MiB))\n", p);
if (mi_likely(_mi_ptr_cookie(segment) == segment->cookie)) {
_mi_warning_message("(yes, the previous pointer 0x%p was valid after all)\n", p);
_mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p);
}
}
#endif
#if (MI_DEBUG!=0 || MI_SECURE>=4)
if (mi_unlikely(_mi_ptr_cookie(segment) != segment->cookie)) {
_mi_error_message("trying to free a pointer that does not point to a valid heap space: %p\n", p);
_mi_error_message(EINVAL, "trying to free a pointer that does not point to a valid heap space: %p\n", p);
return;
}
#endif
@ -324,8 +330,8 @@ void mi_free(void* p) mi_attr_noexcept
#if (MI_STAT>1)
mi_heap_t* heap = mi_heap_get_default();
mi_heap_stat_decrease(heap, malloc, mi_usable_size(p));
if (page->block_size <= MI_LARGE_OBJ_SIZE_MAX) {
mi_heap_stat_decrease(heap, normal[_mi_bin(page->block_size)], 1);
if (page->xblock_size <= MI_LARGE_OBJ_SIZE_MAX) {
mi_heap_stat_decrease(heap, normal[_mi_bin(page->xblock_size)], 1);
}
// huge page stat is accounted for in `_mi_page_retire`
#endif
@ -337,7 +343,9 @@ void mi_free(void* p) mi_attr_noexcept
mi_block_set_next(page, block, page->local_free);
page->local_free = block;
page->used--;
if (mi_unlikely(mi_page_all_free(page))) { _mi_page_retire(page); }
if (mi_unlikely(mi_page_all_free(page))) {
_mi_page_retire(page);
}
}
else {
// non-local, aligned blocks, or a full page; use the more generic path
@ -351,13 +359,19 @@ bool _mi_free_delayed_block(mi_block_t* block) {
mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie);
mi_assert_internal(_mi_thread_id() == segment->thread_id);
mi_page_t* page = _mi_segment_page_of(segment, block);
if (mi_tf_delayed(page->thread_free) == MI_DELAYED_FREEING) {
// we might already start delayed freeing while another thread has not yet
// reset the delayed_freeing flag; in that case don't free it quite yet if
// this is the last block remaining.
if (page->used - page->thread_freed == 1) return false;
}
_mi_free_block(page,true,block);
// Clear the no-delayed flag so delayed freeing is used again for this page.
// This must be done before collecting the free lists on this page -- otherwise
// some blocks may end up in the page `thread_free` list with no blocks in the
// heap `thread_delayed_free` list, which may cause the page to never be freed!
// (it would only be freed if we happen to scan it in `mi_page_queue_find_free_ex`)
_mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, false /* dont overwrite never delayed */);
// collect all other non-local frees to ensure up-to-date `used` count
_mi_page_free_collect(page, false);
// and free the block (possibly freeing the page as well since used is updated)
_mi_free_block(page, true, block);
return true;
}
@ -366,7 +380,7 @@ size_t mi_usable_size(const void* p) mi_attr_noexcept {
if (p==NULL) return 0;
const mi_segment_t* segment = _mi_ptr_segment(p);
const mi_page_t* page = _mi_segment_page_of(segment,p);
size_t size = page->block_size;
size_t size = mi_page_block_size(page);
if (mi_unlikely(mi_page_has_aligned(page))) {
ptrdiff_t adjust = (uint8_t*)p - (uint8_t*)_mi_page_ptr_unalign(segment,page,p);
mi_assert_internal(adjust >= 0 && (size_t)adjust <= size);
@ -418,7 +432,7 @@ void mi_free_aligned(void* p, size_t alignment) mi_attr_noexcept {
extern inline mi_decl_allocator void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept {
size_t total;
if (mi_mul_overflow(count,size,&total)) return NULL;
if (mi_count_size_overflow(count,size,&total)) return NULL;
return mi_heap_zalloc(heap,total);
}
@ -429,7 +443,7 @@ mi_decl_allocator void* mi_calloc(size_t count, size_t size) mi_attr_noexcept {
// Uninitialized `calloc`
extern mi_decl_allocator void* mi_heap_mallocn(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept {
size_t total;
if (mi_mul_overflow(count,size,&total)) return NULL;
if (mi_count_size_overflow(count, size, &total)) return NULL;
return mi_heap_malloc(heap, total);
}
@ -470,7 +484,7 @@ mi_decl_allocator void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize
mi_decl_allocator void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept {
size_t total;
if (mi_mul_overflow(count, size, &total)) return NULL;
if (mi_count_size_overflow(count, size, &total)) return NULL;
return mi_heap_realloc(heap, p, total);
}
@ -488,7 +502,7 @@ mi_decl_allocator void* mi_heap_rezalloc(mi_heap_t* heap, void* p, size_t newsiz
mi_decl_allocator void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept {
size_t total;
if (mi_mul_overflow(count, size, &total)) return NULL;
if (mi_count_size_overflow(count, size, &total)) return NULL;
return mi_heap_rezalloc(heap, p, total);
}
@ -556,7 +570,6 @@ char* mi_strndup(const char* s, size_t n) mi_attr_noexcept {
#define PATH_MAX MAX_PATH
#endif
#include <windows.h>
#include <errno.h>
char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept {
// todo: use GetFullPathNameW to allow longer file names
char buf[PATH_MAX];
@ -631,10 +644,6 @@ static bool mi_try_new_handler(bool nothrow) {
}
}
#else
#include <errno.h>
#ifndef ENOMEM
#define ENOMEM 12
#endif
typedef void (*std_new_handler_t)();
#if (defined(__GNUC__) || defined(__clang__))
@ -654,7 +663,7 @@ std_new_handler_t mi_get_new_handler() {
static bool mi_try_new_handler(bool nothrow) {
std_new_handler_t h = mi_get_new_handler();
if (h==NULL) {
if (!nothrow) exit(ENOMEM);
if (!nothrow) exit(ENOMEM); // cannot throw in plain C, use exit as we are out of memory anyway.
return false;
}
else {
@ -664,36 +673,70 @@ static bool mi_try_new_handler(bool nothrow) {
}
#endif
static mi_decl_noinline void* mi_try_new(size_t n, bool nothrow ) {
static mi_decl_noinline void* mi_try_new(size_t size, bool nothrow ) {
void* p = NULL;
while(p == NULL && mi_try_new_handler(nothrow)) {
p = mi_malloc(n);
p = mi_malloc(size);
}
return p;
}
void* mi_new(size_t n) {
void* p = mi_malloc(n);
if (mi_unlikely(p == NULL)) return mi_try_new(n,false);
void* mi_new(size_t size) {
void* p = mi_malloc(size);
if (mi_unlikely(p == NULL)) return mi_try_new(size,false);
return p;
}
void* mi_new_aligned(size_t n, size_t alignment) {
void* mi_new_nothrow(size_t size) {
void* p = mi_malloc(size);
if (mi_unlikely(p == NULL)) return mi_try_new(size, true);
return p;
}
void* mi_new_aligned(size_t size, size_t alignment) {
void* p;
do { p = mi_malloc_aligned(n, alignment); }
do {
p = mi_malloc_aligned(size, alignment);
}
while(p == NULL && mi_try_new_handler(false));
return p;
}
void* mi_new_nothrow(size_t n) {
void* p = mi_malloc(n);
if (mi_unlikely(p == NULL)) return mi_try_new(n,true);
void* mi_new_aligned_nothrow(size_t size, size_t alignment) {
void* p;
do {
p = mi_malloc_aligned(size, alignment);
}
while(p == NULL && mi_try_new_handler(true));
return p;
}
void* mi_new_aligned_nothrow(size_t n, size_t alignment) {
void* p;
do { p = mi_malloc_aligned(n, alignment); }
while (p == NULL && mi_try_new_handler(true));
return p;
void* mi_new_n(size_t count, size_t size) {
size_t total;
if (mi_unlikely(mi_count_size_overflow(count, size, &total))) {
mi_try_new_handler(false); // on overflow we invoke the try_new_handler once to potentially throw std::bad_alloc
return NULL;
}
else {
return mi_new(total);
}
}
void* mi_new_realloc(void* p, size_t newsize) {
void* q;
do {
q = mi_realloc(p, newsize);
} while (q == NULL && mi_try_new_handler(false));
return q;
}
void* mi_new_reallocn(void* p, size_t newcount, size_t size) {
size_t total;
if (mi_unlikely(mi_count_size_overflow(newcount, size, &total))) {
mi_try_new_handler(false); // on overflow we invoke the try_new_handler once to potentially throw std::bad_alloc
return NULL;
}
else {
return mi_new_realloc(p, total);
}
}
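
The `mi_new`, `mi_new_nothrow`, `mi_new_n`, `mi_new_realloc`, and `mi_new_reallocn` entry points above implement C++ `new`-style failure handling (retry via the installed new handler) in plain C. A hedged C usage sketch of the array form, assuming these entry points are exported via `mimalloc.h`:

#include <stddef.h>
#include <mimalloc.h>

typedef struct { double x, y, z; } point_t;

point_t* make_points(size_t count) {
  // like calloc's count*size overflow check, but failures go through the
  // new-handler path shown above (in plain C with no handler, exhaustion exits).
  return mi_new_n(count, sizeof(point_t));
}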

View file

@ -381,18 +381,18 @@ void _mi_arena_free(void* p, size_t size, size_t memid, bool is_committed, bool
mi_arena_t* arena = (mi_arena_t*)mi_atomic_read_ptr_relaxed(mi_atomic_cast(void*, &mi_arenas[arena_idx]));
mi_assert_internal(arena != NULL);
if (arena == NULL) {
_mi_fatal_error("trying to free from non-existent arena: %p, size %zu, memid: 0x%zx\n", p, size, memid);
_mi_error_message(EINVAL, "trying to free from non-existent arena: %p, size %zu, memid: 0x%zx\n", p, size, memid);
return;
}
mi_assert_internal(arena->field_count > mi_bitmap_index_field(bitmap_idx));
if (arena->field_count <= mi_bitmap_index_field(bitmap_idx)) {
_mi_fatal_error("trying to free from non-existent arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid);
_mi_error_message(EINVAL, "trying to free from non-existent arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid);
return;
}
const size_t blocks = mi_block_count_of_size(size);
bool ones = mi_bitmap_unclaim(arena->blocks_inuse, arena->field_count, blocks, bitmap_idx);
if (!ones) {
_mi_fatal_error("trying to free an already freed block: %p, size %zu\n", p, size);
_mi_error_message(EAGAIN, "trying to free an already freed block: %p, size %zu\n", p, size);
return;
};
}

View file

@ -113,7 +113,7 @@ static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t bitmap_f
mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields);
mi_assert_internal(bitidx + count <= MI_BITMAP_FIELD_BITS);
mi_bitmap_field_t field = mi_atomic_read_relaxed(&bitmap[idx]);
uintptr_t field = mi_atomic_read_relaxed(&bitmap[idx]);
if ((field & mask) == 0) { // free?
if (mi_atomic_cas_strong(&bitmap[idx], (field|mask), field)) {
// claimed!
@ -221,7 +221,7 @@ static inline bool mi_bitmap_is_claimedx(mi_bitmap_t bitmap, size_t bitmap_field
const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
const uintptr_t mask = mi_bitmap_mask_(count, bitidx);
mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields);
mi_bitmap_field_t field = mi_atomic_read_relaxed(&bitmap[idx]);
uintptr_t field = mi_atomic_read_relaxed(&bitmap[idx]);
if (any_ones != NULL) *any_ones = ((field & mask) != 0);
return ((field & mask) == mask);
}
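
Both hunks above only change the local variable type from `mi_bitmap_field_t` to `uintptr_t`; the claim pattern itself (test the mask, then CAS the bits in) stays the same. A minimal standalone sketch of that pattern with C11 atomics (the names and the single-field restriction are assumptions for illustration):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define MY_FIELD_BITS (8 * sizeof(uintptr_t))

// Try to claim `count` consecutive bits starting at `bitidx` within one field.
// Returns true if all bits were zero and are now set atomically.
static bool my_bitmap_try_claim_field(_Atomic(uintptr_t)* field, size_t bitidx, size_t count) {
  const uintptr_t mask = (count >= MY_FIELD_BITS) ? ~(uintptr_t)0
                                                  : ((((uintptr_t)1 << count) - 1) << bitidx);
  uintptr_t expected = atomic_load_explicit(field, memory_order_relaxed);
  if ((expected & mask) != 0) return false;  // some bit in range already claimed
  return atomic_compare_exchange_strong(field, &expected, expected | mask);
}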

View file

@ -34,7 +34,7 @@ static bool mi_heap_visit_pages(mi_heap_t* heap, heap_page_visitor_fun* fn, void
mi_page_t* page = pq->first;
while(page != NULL) {
mi_page_t* next = page->next; // save next in case the page gets removed from the queue
mi_assert_internal(page->heap == heap);
mi_assert_internal(mi_page_heap(page) == heap);
count++;
if (!fn(heap, pq, page, arg1, arg2)) return false;
page = next; // and continue
@ -50,13 +50,14 @@ static bool mi_heap_page_is_valid(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_
UNUSED(arg1);
UNUSED(arg2);
UNUSED(pq);
mi_assert_internal(page->heap == heap);
mi_assert_internal(mi_page_heap(page) == heap);
mi_segment_t* segment = _mi_page_segment(page);
mi_assert_internal(segment->thread_id == heap->thread_id);
mi_assert_expensive(_mi_page_is_valid(page));
return true;
}
#endif
#if MI_DEBUG>=3
static bool mi_heap_is_valid(mi_heap_t* heap) {
mi_assert_internal(heap!=NULL);
mi_heap_visit_pages(heap, &mi_heap_page_is_valid, NULL, NULL);
@ -111,20 +112,25 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect)
{
if (!mi_heap_is_initialized(heap)) return;
_mi_deferred_free(heap, collect > NORMAL);
// collect (some) abandoned pages
if (collect >= NORMAL && !heap->no_reclaim) {
if (collect == NORMAL) {
// this may free some segments (but also take ownership of abandoned pages)
_mi_segment_try_reclaim_abandoned(heap, false, &heap->tld->segments);
}
#if MI_DEBUG
else if (collect == ABANDON && _mi_is_main_thread() && mi_heap_is_backing(heap)) {
else if (
#ifdef NDEBUG
collect == FORCE
#else
collect >= FORCE
#endif
&& _mi_is_main_thread() && mi_heap_is_backing(heap))
{
// the main thread is abandoned, try to free all abandoned segments.
// if all memory is freed by now, all segments should be freed.
_mi_segment_try_reclaim_abandoned(heap, true, &heap->tld->segments);
}
#endif
}
// if abandoning, mark all pages to no longer add to delayed_free
@ -193,7 +199,7 @@ mi_heap_t* mi_heap_new(void) {
heap->tld = bheap->tld;
heap->thread_id = _mi_thread_id();
_mi_random_split(&bheap->random, &heap->random);
heap->cookie = _mi_heap_random_next(heap) | 1;
heap->key[0] = _mi_heap_random_next(heap);
heap->key[1] = _mi_heap_random_next(heap);
heap->no_reclaim = true; // don't reclaim abandoned pages or otherwise destroy is unsafe
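
The hunk above initializes a fresh heap's cookie and keys from its split-off random state. A hedged usage sketch of the first-class heap API that this code backs (all functions shown appear elsewhere in this commit or in `mimalloc.h`):

#include <mimalloc.h>

void heap_example(void) {
  mi_heap_t* heap = mi_heap_new();         // fresh heap bound to this thread
  if (heap == NULL) return;
  void* p = mi_heap_malloc(heap, 128);     // allocate from this heap only
  void* q = mi_heap_calloc(heap, 10, 64);  // count*size is overflow-checked
  mi_free(p);                              // frees find the owning heap via the page
  mi_free(q);
  mi_heap_delete(heap);                    // remaining pages migrate to the default heap
}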
@ -242,28 +248,30 @@ static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_
UNUSED(pq);
// ensure no more thread_delayed_free will be added
_mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false);
// stats
if (page->block_size > MI_MEDIUM_OBJ_SIZE_MAX) {
if (page->block_size <= MI_LARGE_OBJ_SIZE_MAX) {
_mi_stat_decrease(&heap->tld->stats.large,page->block_size);
const size_t bsize = mi_page_block_size(page);
if (bsize > MI_MEDIUM_OBJ_SIZE_MAX) {
if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
_mi_stat_decrease(&heap->tld->stats.large,bsize);
}
else {
_mi_stat_decrease(&heap->tld->stats.huge, page->block_size);
_mi_stat_decrease(&heap->tld->stats.huge, bsize);
}
}
#if (MI_STAT>1)
size_t inuse = page->used - page->thread_freed;
if (page->block_size <= MI_LARGE_OBJ_SIZE_MAX) {
mi_heap_stat_decrease(heap,normal[_mi_bin(page->block_size)], inuse);
#if (MI_STAT>1)
_mi_page_free_collect(page, false); // update used count
const size_t inuse = page->used;
if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
mi_heap_stat_decrease(heap, normal[_mi_bin(bsize)], inuse);
}
mi_heap_stat_decrease(heap,malloc, page->block_size * inuse); // todo: off for aligned blocks...
#endif
mi_heap_stat_decrease(heap, malloc, bsize * inuse); // todo: off for aligned blocks...
#endif
// pretend it is all free now
mi_assert_internal(page->thread_freed<=0xFFFF);
page->used = (uint16_t)page->thread_freed;
/// pretend it is all free now
mi_assert_internal(mi_page_thread_free(page) == NULL);
page->used = 0;
// and free the page
_mi_segment_page_free(page,false /* no force? */, &heap->tld->segments);
@ -355,7 +363,7 @@ mi_heap_t* mi_heap_set_default(mi_heap_t* heap) {
mi_assert(mi_heap_is_initialized(heap));
if (!mi_heap_is_initialized(heap)) return NULL;
mi_assert_expensive(mi_heap_is_valid(heap));
mi_heap_t* old = mi_get_default_heap();
_mi_heap_set_default_direct(heap);
return old;
}
@ -374,7 +382,7 @@ static mi_heap_t* mi_heap_of_block(const void* p) {
bool valid = (_mi_ptr_cookie(segment) == segment->cookie);
mi_assert_internal(valid);
if (mi_unlikely(!valid)) return NULL;
return _mi_segment_page_of(segment,p)->heap;
return mi_page_heap(_mi_segment_page_of(segment,p));
}
bool mi_heap_contains_block(mi_heap_t* heap, const void* p) {
@ -390,7 +398,7 @@ static bool mi_heap_page_check_owned(mi_heap_t* heap, mi_page_queue_t* pq, mi_pa
bool* found = (bool*)vfound;
mi_segment_t* segment = _mi_page_segment(page);
void* start = _mi_page_start(segment, page, NULL);
void* end = (uint8_t*)start + (page->capacity * page->block_size);
void* end = (uint8_t*)start + (page->capacity * mi_page_block_size(page));
*found = (p >= start && p < end);
return (!*found); // continue if not found
}
@ -432,13 +440,14 @@ static bool mi_heap_area_visit_blocks(const mi_heap_area_ex_t* xarea, mi_block_v
mi_assert_internal(page->local_free == NULL);
if (page->used == 0) return true;
const size_t bsize = mi_page_block_size(page);
size_t psize;
uint8_t* pstart = _mi_page_start(_mi_page_segment(page), page, &psize);
if (page->capacity == 1) {
// optimize page with one block
mi_assert_internal(page->used == 1 && page->free == NULL);
return visitor(page->heap, area, pstart, page->block_size, arg);
return visitor(mi_page_heap(page), area, pstart, bsize, arg);
}
// create a bitmap of free blocks.
@ -451,8 +460,8 @@ static bool mi_heap_area_visit_blocks(const mi_heap_area_ex_t* xarea, mi_block_v
free_count++;
mi_assert_internal((uint8_t*)block >= pstart && (uint8_t*)block < (pstart + psize));
size_t offset = (uint8_t*)block - pstart;
mi_assert_internal(offset % page->block_size == 0);
size_t blockidx = offset / page->block_size; // Todo: avoid division?
mi_assert_internal(offset % bsize == 0);
size_t blockidx = offset / bsize; // Todo: avoid division?
mi_assert_internal( blockidx < MI_MAX_BLOCKS);
size_t bitidx = (blockidx / sizeof(uintptr_t));
size_t bit = blockidx - (bitidx * sizeof(uintptr_t));
@ -471,8 +480,8 @@ static bool mi_heap_area_visit_blocks(const mi_heap_area_ex_t* xarea, mi_block_v
}
else if ((m & ((uintptr_t)1 << bit)) == 0) {
used_count++;
uint8_t* block = pstart + (i * page->block_size);
if (!visitor(page->heap, area, block, page->block_size, arg)) return false;
uint8_t* block = pstart + (i * bsize);
if (!visitor(mi_page_heap(page), area, block, bsize, arg)) return false;
}
}
mi_assert_internal(page->used == used_count);
@ -487,12 +496,13 @@ static bool mi_heap_visit_areas_page(mi_heap_t* heap, mi_page_queue_t* pq, mi_pa
UNUSED(pq);
mi_heap_area_visit_fun* fun = (mi_heap_area_visit_fun*)vfun;
mi_heap_area_ex_t xarea;
const size_t bsize = mi_page_block_size(page);
xarea.page = page;
xarea.area.reserved = page->reserved * page->block_size;
xarea.area.committed = page->capacity * page->block_size;
xarea.area.reserved = page->reserved * bsize;
xarea.area.committed = page->capacity * bsize;
xarea.area.blocks = _mi_page_start(_mi_page_segment(page), page, NULL);
xarea.area.used = page->used - page->thread_freed; // race is ok
xarea.area.block_size = page->block_size;
xarea.area.used = page->used;
xarea.area.block_size = bsize;
return fun(heap, &xarea, arg);
}
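
The visitor plumbing above feeds the public `mi_heap_visit_blocks` API. A hedged usage sketch: the callback shape mirrors how the hunk invokes `visitor(heap, area, block, block_size, arg)`, and the callback may also be invoked once per area with a NULL block, so it checks for that.

#include <stdbool.h>
#include <stdio.h>
#include <mimalloc.h>

static bool print_block(const mi_heap_t* heap, const mi_heap_area_t* area,
                        void* block, size_t block_size, void* arg) {
  (void)heap; (void)arg;
  if (block != NULL) {   // a NULL block denotes the area itself
    printf("block %p of size %zu (bin size %zu)\n", block, block_size, area->block_size);
  }
  return true;  // keep visiting
}

void dump_heap_blocks(mi_heap_t* heap) {
  mi_heap_visit_blocks(heap, true /* visit individual blocks too */, &print_block, NULL);
}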

View file

@ -23,12 +23,11 @@ const mi_page_t _mi_page_empty = {
{ 0, 0 },
#endif
0, // used
NULL,
ATOMIC_VAR_INIT(0), ATOMIC_VAR_INIT(0),
0, NULL, NULL, NULL
#if (MI_INTPTR_SIZE==8)
, { NULL } // padding
#endif
0, // xblock_size
NULL, // local_free
ATOMIC_VAR_INIT(0), // xthread_free
ATOMIC_VAR_INIT(0), // xheap
NULL, NULL
};
#define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty)
@ -132,7 +131,7 @@ static mi_tld_t tld_main = {
&_mi_heap_main,
{ MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, 0, 0, NULL, tld_main_stats, tld_main_os }, // segments
{ 0, tld_main_stats }, // os
{ MI_STATS_NULL } // stats
};
#if MI_INTPTR_SIZE==8
@ -145,7 +144,7 @@ mi_heap_t _mi_heap_main = {
&tld_main,
MI_SMALL_PAGES_EMPTY,
MI_PAGE_QUEUES_EMPTY,
NULL,
ATOMIC_VAR_INIT(NULL),
0, // thread id
MI_INIT_COOKIE, // initial cookie
{ MI_INIT_COOKIE, MI_INIT_COOKIE }, // the key of the main heap can be fixed (unlike page keys that need to be secure!)
@ -180,7 +179,7 @@ static bool _mi_heap_init(void) {
// use `_mi_os_alloc` to allocate directly from the OS
mi_thread_data_t* td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t),&_mi_stats_main); // Todo: more efficient allocation?
if (td == NULL) {
_mi_error_message("failed to allocate thread local heap memory\n");
_mi_error_message(ENOMEM, "failed to allocate thread local heap memory\n");
return false;
}
mi_tld_t* tld = &td->tld;
@ -415,7 +414,7 @@ static void mi_process_load(void) {
const char* msg = NULL;
mi_allocator_init(&msg);
if (msg != NULL && (mi_option_is_enabled(mi_option_verbose) || mi_option_is_enabled(mi_option_show_errors))) {
_mi_fputs(NULL,NULL,msg);
_mi_fputs(NULL,NULL,NULL,msg);
}
}

View file

@ -80,7 +80,7 @@ typedef union mi_region_info_u {
bool valid;
bool is_large;
short numa_node;
};
} x;
} mi_region_info_t;
@ -204,9 +204,9 @@ static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large,
// and share it
mi_region_info_t info;
info.valid = true;
info.is_large = region_large;
info.numa_node = _mi_os_numa_node(tld);
info.x.valid = true;
info.x.is_large = region_large;
info.x.numa_node = (short)_mi_os_numa_node(tld);
mi_atomic_write(&r->info, info.value); // now make it available to others
*region = r;
return true;
@ -224,12 +224,12 @@ static bool mi_region_is_suitable(const mem_region_t* region, int numa_node, boo
// numa correct
if (numa_node >= 0) { // use negative numa node to always succeed
int rnode = info.numa_node;
int rnode = info.x.numa_node;
if (rnode >= 0 && rnode != numa_node) return false;
}
// check allow-large
if (!allow_large && info.is_large) return false;
if (!allow_large && info.x.is_large) return false;
return true;
}
@ -278,11 +278,11 @@ static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* is_large, bo
mi_region_info_t info;
info.value = mi_atomic_read(&region->info);
void* start = mi_atomic_read_ptr(&region->start);
mi_assert_internal(!(info.is_large && !*is_large));
mi_assert_internal(!(info.x.is_large && !*is_large));
mi_assert_internal(start != NULL);
*is_zero = mi_bitmap_unclaim(&region->dirty, 1, blocks, bit_idx);
*is_large = info.is_large;
*is_large = info.x.is_large;
*memid = mi_memid_create(region, bit_idx);
void* p = (uint8_t*)start + (mi_bitmap_index_bit_in_field(bit_idx) * MI_SEGMENT_SIZE);
@ -292,7 +292,7 @@ static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* is_large, bo
bool any_uncommitted;
mi_bitmap_claim(&region->commit, 1, blocks, bit_idx, &any_uncommitted);
if (any_uncommitted) {
mi_assert_internal(!info.is_large);
mi_assert_internal(!info.x.is_large);
bool commit_zero;
_mi_mem_commit(p, blocks * MI_SEGMENT_SIZE, &commit_zero, tld);
if (commit_zero) *is_zero = true;
@ -307,7 +307,7 @@ static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* is_large, bo
// unreset reset blocks
if (mi_bitmap_is_any_claimed(&region->reset, 1, blocks, bit_idx)) {
// some blocks are still reset
mi_assert_internal(!info.is_large);
mi_assert_internal(!info.x.is_large);
mi_assert_internal(!mi_option_is_enabled(mi_option_eager_commit) || *commit || mi_option_get(mi_option_eager_commit_delay) > 0);
mi_bitmap_unclaim(&region->reset, 1, blocks, bit_idx);
if (*commit || !mi_option_is_enabled(mi_option_reset_decommits)) { // only if needed
@ -412,7 +412,7 @@ void _mi_mem_free(void* p, size_t size, size_t id, bool full_commit, bool any_re
}
// reset the blocks to reduce the working set.
if (!info.is_large && mi_option_is_enabled(mi_option_segment_reset)
if (!info.x.is_large && mi_option_is_enabled(mi_option_segment_reset)
&& (mi_option_is_enabled(mi_option_eager_commit) ||
mi_option_is_enabled(mi_option_reset_decommits))) // cannot reset halfway committed segments, use only `option_page_reset` instead
{

View file

@ -67,7 +67,8 @@ static mi_option_desc_t options[_mi_option_last] =
{ 0, UNINIT, MI_OPTION(large_os_pages) }, // use large OS pages, use only with eager commit to prevent fragmentation of VMA's
{ 0, UNINIT, MI_OPTION(reserve_huge_os_pages) },
{ 0, UNINIT, MI_OPTION(segment_cache) }, // cache N segments per thread
{ 0, UNINIT, MI_OPTION(page_reset) }, // reset pages on free
{ 0, UNINIT, MI_OPTION(page_reset) }, // reset page memory on free
{ 0, UNINIT, MI_OPTION(abandoned_page_reset) },// reset free page memory when a thread terminates
{ 0, UNINIT, MI_OPTION(segment_reset) }, // reset segment memory on free (needs eager commit)
{ 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed
{ 0, UNINIT, MI_OPTION(allow_decommit) }, // decommit pages when not eager committed
@ -142,7 +143,8 @@ void mi_option_disable(mi_option_t option) {
}
static void mi_out_stderr(const char* msg) {
static void mi_out_stderr(const char* msg, void* arg) {
UNUSED(arg);
#ifdef _WIN32
// on windows with redirection, the C runtime cannot handle locale dependent output
// after the main thread closes so we use direct console output.
@ -162,7 +164,8 @@ static void mi_out_stderr(const char* msg) {
static char out_buf[MI_MAX_DELAY_OUTPUT+1];
static _Atomic(uintptr_t) out_len;
static void mi_out_buf(const char* msg) {
static void mi_out_buf(const char* msg, void* arg) {
UNUSED(arg);
if (msg==NULL) return;
if (mi_atomic_read_relaxed(&out_len)>=MI_MAX_DELAY_OUTPUT) return;
size_t n = strlen(msg);
@ -177,14 +180,14 @@ static void mi_out_buf(const char* msg) {
memcpy(&out_buf[start], msg, n);
}
static void mi_out_buf_flush(mi_output_fun* out, bool no_more_buf) {
static void mi_out_buf_flush(mi_output_fun* out, bool no_more_buf, void* arg) {
if (out==NULL) return;
// claim (if `no_more_buf == true`, no more output will be added after this point)
size_t count = mi_atomic_addu(&out_len, (no_more_buf ? MI_MAX_DELAY_OUTPUT : 1));
// and output the current contents
if (count>MI_MAX_DELAY_OUTPUT) count = MI_MAX_DELAY_OUTPUT;
out_buf[count] = 0;
out(out_buf);
out(out_buf,arg);
if (!no_more_buf) {
out_buf[count] = '\n'; // if we continue to use the buffer, insert a newline
}
@ -193,9 +196,9 @@ static void mi_out_buf_flush(mi_output_fun* out, bool no_more_buf) {
// Once this module is loaded, switch to this routine
// which outputs to stderr and the delayed output buffer.
static void mi_out_buf_stderr(const char* msg) {
mi_out_stderr(msg);
mi_out_buf(msg);
static void mi_out_buf_stderr(const char* msg, void* arg) {
mi_out_stderr(msg,arg);
mi_out_buf(msg,arg);
}
@ -208,21 +211,25 @@ static void mi_out_buf_stderr(const char* msg) {
// For now, don't register output from multiple threads.
#pragma warning(suppress:4180)
static mi_output_fun* volatile mi_out_default; // = NULL
static volatile _Atomic(void*) mi_out_arg; // = NULL
static mi_output_fun* mi_out_get_default(void) {
static mi_output_fun* mi_out_get_default(void** parg) {
if (parg != NULL) { *parg = mi_atomic_read_ptr(&mi_out_arg); }
mi_output_fun* out = mi_out_default;
return (out == NULL ? &mi_out_buf : out);
}
void mi_register_output(mi_output_fun* out) mi_attr_noexcept {
void mi_register_output(mi_output_fun* out, void* arg) mi_attr_noexcept {
mi_out_default = (out == NULL ? &mi_out_stderr : out); // stop using the delayed output buffer
if (out!=NULL) mi_out_buf_flush(out,true); // output all the delayed output now
mi_atomic_write_ptr(&mi_out_arg, arg);
if (out!=NULL) mi_out_buf_flush(out,true,arg); // output all the delayed output now
}
// add stderr to the delayed output after the module is loaded
static void mi_add_stderr_output() {
mi_out_buf_flush(&mi_out_stderr, false); // flush current contents to stderr
mi_out_default = &mi_out_buf_stderr; // and add stderr to the delayed output
mi_assert_internal(mi_out_default == NULL);
mi_out_buf_flush(&mi_out_stderr, false, NULL); // flush current contents to stderr
mi_out_default = &mi_out_buf_stderr; // and add stderr to the delayed output
}
// --------------------------------------------------------
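
The hunks above thread a user `void* arg` through the output machinery and the new `mi_register_output(out, arg)` signature. A hedged usage sketch that redirects mimalloc's messages to a log file; the `mi_output_fun` shape of `(const char* msg, void* arg)` is taken from how the diff calls `out(msg, arg)`.

#include <stdio.h>
#include <mimalloc.h>

static void log_to_file(const char* msg, void* arg) {
  fputs(msg, (FILE*)arg);   // arg carries the FILE* given at registration
}

int main(void) {
  FILE* log = fopen("mimalloc.log", "w");
  if (log != NULL) {
    mi_register_output(&log_to_file, log);  // stop buffering, send messages here
  }
  mi_option_enable(mi_option_verbose);      // verbose output now reaches the sink
  void* p = mi_malloc(42);
  mi_free(p);
  mi_stats_print(NULL);                     // uses the registered output as well
  return 0;
}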
@ -234,33 +241,35 @@ static volatile _Atomic(uintptr_t) error_count; // = 0; // when MAX_ERROR_COUNT
// inside the C runtime causes another message.
static mi_decl_thread bool recurse = false;
void _mi_fputs(mi_output_fun* out, const char* prefix, const char* message) {
void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message) {
if (recurse) return;
if (out==NULL || (FILE*)out==stdout || (FILE*)out==stderr) out = mi_out_get_default();
if (out==NULL || (FILE*)out==stdout || (FILE*)out==stderr) { // TODO: use mi_out_stderr for stderr?
out = mi_out_get_default(&arg);
}
recurse = true;
if (prefix != NULL) out(prefix);
out(message);
if (prefix != NULL) out(prefix,arg);
out(message,arg);
recurse = false;
return;
}
// Define our own limited `fprintf` that avoids memory allocation.
// We do this using `snprintf` with a limited buffer.
static void mi_vfprintf( mi_output_fun* out, const char* prefix, const char* fmt, va_list args ) {
static void mi_vfprintf( mi_output_fun* out, void* arg, const char* prefix, const char* fmt, va_list args ) {
char buf[512];
if (fmt==NULL) return;
if (recurse) return;
recurse = true;
vsnprintf(buf,sizeof(buf)-1,fmt,args);
recurse = false;
_mi_fputs(out,prefix,buf);
_mi_fputs(out,arg,prefix,buf);
}
void _mi_fprintf( mi_output_fun* out, const char* fmt, ... ) {
void _mi_fprintf( mi_output_fun* out, void* arg, const char* fmt, ... ) {
va_list args;
va_start(args,fmt);
mi_vfprintf(out,NULL,fmt,args);
mi_vfprintf(out,arg,NULL,fmt,args);
va_end(args);
}
@ -268,7 +277,7 @@ void _mi_trace_message(const char* fmt, ...) {
if (mi_option_get(mi_option_verbose) <= 1) return; // only with verbose level 2 or higher
va_list args;
va_start(args, fmt);
mi_vfprintf(NULL, "mimalloc: ", fmt, args);
mi_vfprintf(NULL, NULL, "mimalloc: ", fmt, args);
va_end(args);
}
@ -276,18 +285,14 @@ void _mi_verbose_message(const char* fmt, ...) {
if (!mi_option_is_enabled(mi_option_verbose)) return;
va_list args;
va_start(args,fmt);
mi_vfprintf(NULL, "mimalloc: ", fmt, args);
mi_vfprintf(NULL, NULL, "mimalloc: ", fmt, args);
va_end(args);
}
void _mi_error_message(const char* fmt, ...) {
static void mi_show_error_message(const char* fmt, va_list args) {
if (!mi_option_is_enabled(mi_option_show_errors) && !mi_option_is_enabled(mi_option_verbose)) return;
if (mi_atomic_increment(&error_count) > mi_max_error_count) return;
va_list args;
va_start(args,fmt);
mi_vfprintf(NULL, "mimalloc: error: ", fmt, args);
va_end(args);
mi_assert(false);
mi_vfprintf(NULL, NULL, "mimalloc: error: ", fmt, args);
}
void _mi_warning_message(const char* fmt, ...) {
@ -295,26 +300,52 @@ void _mi_warning_message(const char* fmt, ...) {
if (mi_atomic_increment(&error_count) > mi_max_error_count) return;
va_list args;
va_start(args,fmt);
mi_vfprintf(NULL, "mimalloc: warning: ", fmt, args);
mi_vfprintf(NULL, NULL, "mimalloc: warning: ", fmt, args);
va_end(args);
}
#if MI_DEBUG
void _mi_assert_fail(const char* assertion, const char* fname, unsigned line, const char* func ) {
_mi_fprintf(NULL,"mimalloc: assertion failed: at \"%s\":%u, %s\n assertion: \"%s\"\n", fname, line, (func==NULL?"":func), assertion);
_mi_fprintf(NULL, NULL, "mimalloc: assertion failed: at \"%s\":%u, %s\n assertion: \"%s\"\n", fname, line, (func==NULL?"":func), assertion);
abort();
}
#endif
mi_attr_noreturn void _mi_fatal_error(const char* fmt, ...) {
// --------------------------------------------------------
// Errors
// --------------------------------------------------------
static mi_error_fun* volatile mi_error_handler; // = NULL
static volatile _Atomic(void*) mi_error_arg; // = NULL
static void mi_error_default(int err) {
UNUSED(err);
#if (MI_SECURE>0)
if (err==EFAULT) { // abort on serious errors in secure mode (corrupted meta-data)
abort();
}
#endif
}
void mi_register_error(mi_error_fun* fun, void* arg) {
mi_error_handler = fun; // can be NULL
mi_atomic_write_ptr(&mi_error_arg, arg);
}
void _mi_error_message(int err, const char* fmt, ...) {
// show detailed error message
va_list args;
va_start(args, fmt);
mi_vfprintf(NULL, "mimalloc: fatal: ", fmt, args);
mi_show_error_message(fmt, args);
va_end(args);
#if (MI_SECURE>=0)
abort();
#endif
// and call the error handler which may abort (or return normally)
if (mi_error_handler != NULL) {
mi_error_handler(err, mi_atomic_read_ptr(&mi_error_arg));
}
else {
mi_error_default(err);
}
}
// --------------------------------------------------------
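
With errors now reported through `_mi_error_message(err, ...)`, applications can intercept them via the `mi_register_error` shown above. A hedged usage sketch; the handler signature `(int err, void* arg)` matches the call `mi_error_handler(err, ...)` in the hunk.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <mimalloc.h>

static void on_mimalloc_error(int err, void* arg) {
  (void)arg;
  fprintf(stderr, "mimalloc error %d\n", err);
  if (err == EFAULT) {
    abort();   // EFAULT signals corrupted meta-data (e.g. a corrupted free list)
  }
}

int main(void) {
  mi_register_error(&on_mimalloc_error, NULL);
  // ... application code; allocation errors now reach the handler ...
  return 0;
}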

View file

@ -13,7 +13,7 @@ terms of the MIT license. A copy of the license can be found in the file
#include "mimalloc-atomic.h"
#include <string.h> // strerror
#include <errno.h>
#if defined(_WIN32)
#include <windows.h>
@ -654,7 +654,7 @@ static bool mi_os_commitx(void* addr, size_t size, bool commit, bool conservativ
if (err != 0) { err = errno; }
#endif
if (err != 0) {
_mi_warning_message("%s error: start: 0x%p, csize: 0x%x, err: %i\n", commit ? "commit" : "decommit", start, csize, err);
_mi_warning_message("%s error: start: %p, csize: 0x%x, err: %i\n", commit ? "commit" : "decommit", start, csize, err);
mi_mprotect_hint(err);
}
mi_assert_internal(err == 0);
@ -718,7 +718,7 @@ static bool mi_os_resetx(void* addr, size_t size, bool reset, mi_stats_t* stats)
int err = madvise(start, csize, MADV_DONTNEED);
#endif
if (err != 0) {
_mi_warning_message("madvise reset error: start: 0x%p, csize: 0x%x, errno: %i\n", start, csize, errno);
_mi_warning_message("madvise reset error: start: %p, csize: 0x%x, errno: %i\n", start, csize, errno);
}
//mi_assert(err == 0);
if (err != 0) return false;
@ -773,7 +773,7 @@ static bool mi_os_protectx(void* addr, size_t size, bool protect) {
if (err != 0) { err = errno; }
#endif
if (err != 0) {
_mi_warning_message("mprotect error: start: 0x%p, csize: 0x%x, err: %i\n", start, csize, err);
_mi_warning_message("mprotect error: start: %p, csize: 0x%x, err: %i\n", start, csize, err);
mi_mprotect_hint(err);
}
return (err == 0);
@ -960,7 +960,7 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse
if (p != addr) {
// no success, issue a warning and break
if (p != NULL) {
_mi_warning_message("could not allocate contiguous huge page %zu at 0x%p\n", page, addr);
_mi_warning_message("could not allocate contiguous huge page %zu at %p\n", page, addr);
_mi_os_free(p, MI_HUGE_OS_PAGE_SIZE, &_mi_stats_main);
}
break;

View file

@ -178,20 +178,20 @@ static bool mi_heap_contains_queue(const mi_heap_t* heap, const mi_page_queue_t*
#endif
static mi_page_queue_t* mi_page_queue_of(const mi_page_t* page) {
uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : _mi_bin(page->block_size));
mi_heap_t* heap = page->heap;
uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : _mi_bin(page->xblock_size));
mi_heap_t* heap = mi_page_heap(page);
mi_assert_internal(heap != NULL && bin <= MI_BIN_FULL);
mi_page_queue_t* pq = &heap->pages[bin];
mi_assert_internal(bin >= MI_BIN_HUGE || page->block_size == pq->block_size);
mi_assert_internal(bin >= MI_BIN_HUGE || page->xblock_size == pq->block_size);
mi_assert_expensive(mi_page_queue_contains(pq, page));
return pq;
}
static mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t* page) {
uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : _mi_bin(page->block_size));
uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : _mi_bin(page->xblock_size));
mi_assert_internal(bin <= MI_BIN_FULL);
mi_page_queue_t* pq = &heap->pages[bin];
mi_assert_internal(mi_page_is_in_full(page) || page->block_size == pq->block_size);
mi_assert_internal(mi_page_is_in_full(page) || page->xblock_size == pq->block_size);
return pq;
}
@ -246,35 +246,37 @@ static bool mi_page_queue_is_empty(mi_page_queue_t* queue) {
static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) {
mi_assert_internal(page != NULL);
mi_assert_expensive(mi_page_queue_contains(queue, page));
mi_assert_internal(page->block_size == queue->block_size || (page->block_size > MI_MEDIUM_OBJ_SIZE_MAX && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue)));
mi_assert_internal(page->xblock_size == queue->block_size || (page->xblock_size > MI_MEDIUM_OBJ_SIZE_MAX && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue)));
mi_heap_t* heap = mi_page_heap(page);
if (page->prev != NULL) page->prev->next = page->next;
if (page->next != NULL) page->next->prev = page->prev;
if (page == queue->last) queue->last = page->prev;
if (page == queue->first) {
queue->first = page->next;
// update first
mi_heap_t* heap = page->heap;
mi_assert_internal(mi_heap_contains_queue(heap, queue));
mi_heap_queue_first_update(heap,queue);
}
page->heap->page_count--;
heap->page_count--;
page->next = NULL;
page->prev = NULL;
mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), NULL);
// mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), NULL);
mi_page_set_in_full(page,false);
}
static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_t* page) {
mi_assert_internal(page->heap == NULL);
mi_assert_internal(mi_page_heap(page) == heap);
mi_assert_internal(!mi_page_queue_contains(queue, page));
mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE);
mi_assert_internal(page->block_size == queue->block_size ||
(page->block_size > MI_MEDIUM_OBJ_SIZE_MAX && mi_page_queue_is_huge(queue)) ||
mi_assert_internal(page->xblock_size == queue->block_size ||
(page->xblock_size > MI_MEDIUM_OBJ_SIZE_MAX) ||
(mi_page_is_in_full(page) && mi_page_queue_is_full(queue)));
mi_page_set_in_full(page, mi_page_queue_is_full(queue));
mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), heap);
// mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), heap);
page->next = queue->first;
page->prev = NULL;
if (queue->first != NULL) {
@ -296,19 +298,20 @@ static void mi_page_queue_enqueue_from(mi_page_queue_t* to, mi_page_queue_t* fro
mi_assert_internal(page != NULL);
mi_assert_expensive(mi_page_queue_contains(from, page));
mi_assert_expensive(!mi_page_queue_contains(to, page));
mi_assert_internal((page->block_size == to->block_size && page->block_size == from->block_size) ||
(page->block_size == to->block_size && mi_page_queue_is_full(from)) ||
(page->block_size == from->block_size && mi_page_queue_is_full(to)) ||
(page->block_size > MI_MEDIUM_OBJ_SIZE_MAX && mi_page_queue_is_huge(to)) ||
(page->block_size > MI_MEDIUM_OBJ_SIZE_MAX && mi_page_queue_is_full(to)));
mi_assert_internal((page->xblock_size == to->block_size && page->xblock_size == from->block_size) ||
(page->xblock_size == to->block_size && mi_page_queue_is_full(from)) ||
(page->xblock_size == from->block_size && mi_page_queue_is_full(to)) ||
(page->xblock_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_huge(to)) ||
(page->xblock_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_full(to)));
mi_heap_t* heap = mi_page_heap(page);
if (page->prev != NULL) page->prev->next = page->next;
if (page->next != NULL) page->next->prev = page->prev;
if (page == from->last) from->last = page->prev;
if (page == from->first) {
from->first = page->next;
// update first
mi_heap_t* heap = page->heap;
mi_assert_internal(mi_heap_contains_queue(heap, from));
mi_heap_queue_first_update(heap, from);
}
@ -316,14 +319,14 @@ static void mi_page_queue_enqueue_from(mi_page_queue_t* to, mi_page_queue_t* fro
page->prev = to->last;
page->next = NULL;
if (to->last != NULL) {
mi_assert_internal(page->heap == to->last->heap);
mi_assert_internal(heap == mi_page_heap(to->last));
to->last->next = page;
to->last = page;
}
else {
to->first = page;
to->last = page;
mi_heap_queue_first_update(page->heap, to);
mi_heap_queue_first_update(heap, to);
}
mi_page_set_in_full(page, mi_page_queue_is_full(to));
@ -338,7 +341,7 @@ size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue
// set append pages to new heap and count
size_t count = 0;
for (mi_page_t* page = append->first; page != NULL; page = page->next) {
mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), heap);
mi_page_set_heap(page,heap);
count++;
}

View file

@ -29,10 +29,11 @@ terms of the MIT license. A copy of the license can be found in the file
----------------------------------------------------------- */
// Index a block in a page
static inline mi_block_t* mi_page_block_at(const mi_page_t* page, void* page_start, size_t i) {
static inline mi_block_t* mi_page_block_at(const mi_page_t* page, void* page_start, size_t block_size, size_t i) {
UNUSED(page);
mi_assert_internal(page != NULL);
mi_assert_internal(i <= page->reserved);
return (mi_block_t*)((uint8_t*)page_start + (i * page->block_size));
return (mi_block_t*)((uint8_t*)page_start + (i * block_size));
}
static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t size, mi_tld_t* tld);
@ -69,15 +70,16 @@ static bool mi_page_list_is_valid(mi_page_t* page, mi_block_t* p) {
}
static bool mi_page_is_valid_init(mi_page_t* page) {
mi_assert_internal(page->block_size > 0);
mi_assert_internal(page->xblock_size > 0);
mi_assert_internal(page->used <= page->capacity);
mi_assert_internal(page->capacity <= page->reserved);
const size_t bsize = mi_page_block_size(page);
mi_segment_t* segment = _mi_page_segment(page);
uint8_t* start = _mi_page_start(segment,page,NULL);
mi_assert_internal(start == _mi_segment_page_start(segment,page,NULL));
//mi_assert_internal(start + page->capacity*page->block_size == page->top);
mi_assert_internal(mi_page_list_is_valid(page,page->free));
mi_assert_internal(mi_page_list_is_valid(page,page->local_free));
@ -89,10 +91,10 @@ static bool mi_page_is_valid_init(mi_page_t* page) {
}
#endif
mi_block_t* tfree = mi_tf_block(page->thread_free);
mi_block_t* tfree = mi_page_thread_free(page);
mi_assert_internal(mi_page_list_is_valid(page, tfree));
size_t tfree_count = mi_page_list_count(page, tfree);
mi_assert_internal(tfree_count <= page->thread_freed + 1);
//size_t tfree_count = mi_page_list_count(page, tfree);
//mi_assert_internal(tfree_count <= page->thread_freed + 1);
size_t free_count = mi_page_list_count(page, page->free) + mi_page_list_count(page, page->local_free);
mi_assert_internal(page->used + free_count == page->capacity);
@ -105,14 +107,15 @@ bool _mi_page_is_valid(mi_page_t* page) {
#if MI_SECURE
mi_assert_internal(page->key != 0);
#endif
if (page->heap!=NULL) {
if (mi_page_heap(page)!=NULL) {
mi_segment_t* segment = _mi_page_segment(page);
mi_assert_internal(!_mi_process_is_initialized || segment->thread_id==0 || segment->thread_id == page->heap->thread_id);
mi_assert_internal(!_mi_process_is_initialized || segment->thread_id==0 || segment->thread_id == mi_page_heap(page)->thread_id);
if (segment->kind != MI_SEGMENT_HUGE) {
mi_page_queue_t* pq = mi_page_queue_of(page);
mi_assert_internal(mi_page_queue_contains(pq, page));
mi_assert_internal(pq->block_size==page->block_size || page->block_size > MI_MEDIUM_OBJ_SIZE_MAX || mi_page_is_in_full(page));
mi_assert_internal(mi_heap_contains_queue(page->heap,pq));
mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_block_size(page) > MI_MEDIUM_OBJ_SIZE_MAX || mi_page_is_in_full(page));
mi_assert_internal(mi_heap_contains_queue(mi_page_heap(page),pq));
}
}
return true;
@ -124,20 +127,20 @@ void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool overrid
mi_thread_free_t tfreex;
mi_delayed_t old_delay;
do {
tfree = mi_atomic_read_relaxed(&page->thread_free);
tfree = mi_atomic_read(&page->xthread_free);
tfreex = mi_tf_set_delayed(tfree, delay);
old_delay = mi_tf_delayed(tfree);
if (mi_unlikely(old_delay == MI_DELAYED_FREEING)) {
mi_atomic_yield(); // delay until outstanding MI_DELAYED_FREEING are done.
// mi_atomic_yield(); // delay until outstanding MI_DELAYED_FREEING are done.
tfree = mi_tf_set_delayed(tfree, MI_NO_DELAYED_FREE); // will cause CAS to busy fail
}
else if (delay == old_delay) {
break; // avoid atomic operation if already equal
}
else if (!override_never && old_delay == MI_NEVER_DELAYED_FREE) {
break; // leave never set
break; // leave never-delayed flag set
}
} while ((old_delay == MI_DELAYED_FREEING) ||
!mi_atomic_cas_weak(mi_atomic_cast(uintptr_t, &page->thread_free), tfreex, tfree));
} while (!mi_atomic_cas_weak(&page->xthread_free, tfreex, tfree));
}
/* -----------------------------------------------------------
@ -154,17 +157,17 @@ static void _mi_page_thread_free_collect(mi_page_t* page)
mi_thread_free_t tfree;
mi_thread_free_t tfreex;
do {
tfree = page->thread_free;
tfree = mi_atomic_read_relaxed(&page->xthread_free);
head = mi_tf_block(tfree);
tfreex = mi_tf_set_block(tfree,NULL);
} while (!mi_atomic_cas_weak(mi_atomic_cast(uintptr_t,&page->thread_free), tfreex, tfree));
} while (!mi_atomic_cas_weak(&page->xthread_free, tfreex, tfree));
// return if the list is empty
if (head == NULL) return;
// find the tail -- also to get a proper count (without data races)
uintptr_t max_count = page->capacity; // cannot collect more than capacity
uintptr_t count = 1;
uint32_t max_count = page->capacity; // cannot collect more than capacity
uint32_t count = 1;
mi_block_t* tail = head;
mi_block_t* next;
while ((next = mi_block_next(page,tail)) != NULL && count <= max_count) {
@ -173,7 +176,7 @@ static void _mi_page_thread_free_collect(mi_page_t* page)
}
// if `count > max_count` there was a memory corruption (possibly infinite list due to double multi-threaded free)
if (count > max_count) {
_mi_fatal_error("corrupted thread-free list\n");
_mi_error_message(EFAULT, "corrupted thread-free list\n");
return; // the thread-free items cannot be freed
}
@ -182,7 +185,6 @@ static void _mi_page_thread_free_collect(mi_page_t* page)
page->local_free = head;
// update counts now
mi_atomic_subu(&page->thread_freed, count);
page->used -= count;
}
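
The collect step above detaches the whole `xthread_free` list in one CAS loop; it cannot use a plain atomic exchange because the same word also encodes the delayed-free flag, so only the block-pointer part is cleared. For a plain pointer, the same "take everything at once" idea reduces to the sketch below (illustrative names, C11 atomics):

#include <stdatomic.h>
#include <stddef.h>

typedef struct node_s { struct node_s* next; } node_t;

// Atomically detach the entire list and hand the old head to the caller.
static node_t* list_take_all(_Atomic(node_t*)* top) {
  return atomic_exchange_explicit(top, NULL, memory_order_acquire);
}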
@ -190,7 +192,7 @@ void _mi_page_free_collect(mi_page_t* page, bool force) {
mi_assert_internal(page!=NULL);
// collect the thread free list
if (force || mi_tf_block(page->thread_free) != NULL) { // quick test to avoid an atomic operation
if (force || mi_page_thread_free(page) != NULL) { // quick test to avoid an atomic operation
_mi_page_thread_free_collect(page);
}
@ -228,17 +230,14 @@ void _mi_page_free_collect(mi_page_t* page, bool force) {
// called from segments when reclaiming abandoned pages
void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) {
mi_assert_expensive(mi_page_is_valid_init(page));
mi_assert_internal(page->heap == NULL);
mi_assert_internal(mi_page_heap(page) == heap);
mi_assert_internal(mi_page_thread_free_flag(page) != MI_NEVER_DELAYED_FREE);
mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE);
mi_assert_internal(!page->is_reset);
mi_assert_internal(mi_tf_delayed(page->thread_free) == MI_NEVER_DELAYED_FREE);
mi_assert_internal(!page->is_reset);
mi_page_queue_t* pq = mi_page_queue(heap, mi_page_block_size(page));
_mi_page_free_collect(page,false);
mi_page_queue_t* pq = mi_page_queue(heap, page->block_size);
mi_page_queue_push(heap, pq, page);
mi_assert_internal(page->heap != NULL);
_mi_page_use_delayed_free(page, MI_NO_DELAYED_FREE, true); // override never (after push so heap is set)
mi_assert_expensive(_mi_page_is_valid(page));
}
@ -272,8 +271,8 @@ static mi_page_t* mi_page_fresh(mi_heap_t* heap, mi_page_queue_t* pq) {
// otherwise allocate the page
page = mi_page_fresh_alloc(heap, pq, pq->block_size);
if (page==NULL) return NULL;
mi_assert_internal(pq->block_size==page->block_size);
mi_assert_internal(pq==mi_page_queue(heap,page->block_size));
mi_assert_internal(pq->block_size==mi_page_block_size(page));
mi_assert_internal(pq==mi_page_queue(heap, mi_page_block_size(page)));
return page;
}
@ -314,11 +313,9 @@ void _mi_page_unfull(mi_page_t* page) {
mi_assert_internal(page != NULL);
mi_assert_expensive(_mi_page_is_valid(page));
mi_assert_internal(mi_page_is_in_full(page));
_mi_page_use_delayed_free(page, MI_NO_DELAYED_FREE, false);
if (!mi_page_is_in_full(page)) return;
mi_heap_t* heap = page->heap;
mi_heap_t* heap = mi_page_heap(page);
mi_page_queue_t* pqfull = &heap->pages[MI_BIN_FULL];
mi_page_set_in_full(page, false); // to get the right queue
mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page);
@@ -331,10 +328,8 @@ static void mi_page_to_full(mi_page_t* page, mi_page_queue_t* pq) {
mi_assert_internal(!mi_page_immediate_available(page));
mi_assert_internal(!mi_page_is_in_full(page));
_mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, false);
if (mi_page_is_in_full(page)) return;
mi_page_queue_enqueue_from(&page->heap->pages[MI_BIN_FULL], pq, page);
mi_page_queue_enqueue_from(&mi_page_heap(page)->pages[MI_BIN_FULL], pq, page);
_mi_page_free_collect(page,false); // try to collect right away in case another thread freed just before MI_USE_DELAYED_FREE was set
}
@@ -347,18 +342,17 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) {
mi_assert_internal(page != NULL);
mi_assert_expensive(_mi_page_is_valid(page));
mi_assert_internal(pq == mi_page_queue_of(page));
mi_assert_internal(page->heap != NULL);
mi_assert_internal(mi_page_heap(page) != NULL);
#if MI_DEBUG > 1
mi_heap_t* pheap = (mi_heap_t*)mi_atomic_read_ptr(mi_atomic_cast(void*, &page->heap));
#endif
mi_heap_t* pheap = mi_page_heap(page);
// remove from our page list
mi_segments_tld_t* segments_tld = &page->heap->tld->segments;
mi_segments_tld_t* segments_tld = &pheap->tld->segments;
mi_page_queue_remove(pq, page);
// page is no longer associated with our heap
mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), NULL);
mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE);
mi_page_set_heap(page, NULL);
#if MI_DEBUG>1
// check there are no references left..
@@ -368,7 +362,7 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) {
#endif
// and abandon it
mi_assert_internal(page->heap == NULL);
mi_assert_internal(mi_page_heap(page) == NULL);
_mi_segment_page_abandon(page,segments_tld);
}
@@ -379,32 +373,18 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) {
mi_assert_expensive(_mi_page_is_valid(page));
mi_assert_internal(pq == mi_page_queue_of(page));
mi_assert_internal(mi_page_all_free(page));
#if MI_DEBUG>1
// check if we can safely free
mi_thread_free_t free = mi_tf_set_delayed(page->thread_free,MI_NEVER_DELAYED_FREE);
free = mi_atomic_exchange(&page->thread_free, free);
mi_assert_internal(mi_tf_delayed(free) != MI_DELAYED_FREEING);
#endif
mi_assert_internal(mi_page_thread_free_flag(page)!=MI_DELAYED_FREEING);
// no more aligned blocks in here
mi_page_set_has_aligned(page, false);
// account for huge pages here
if (page->block_size > MI_MEDIUM_OBJ_SIZE_MAX) {
if (page->block_size <= MI_LARGE_OBJ_SIZE_MAX) {
_mi_stat_decrease(&page->heap->tld->stats.large, page->block_size);
}
else {
_mi_stat_decrease(&page->heap->tld->stats.huge, page->block_size);
}
}
// remove from the page list
// (no need to do _mi_heap_delayed_free first as all blocks are already free)
mi_segments_tld_t* segments_tld = &page->heap->tld->segments;
mi_segments_tld_t* segments_tld = &mi_page_heap(page)->tld->segments;
mi_page_queue_remove(pq, page);
// and free it
mi_assert_internal(page->heap == NULL);
mi_page_set_heap(page,NULL);
_mi_segment_page_free(page, force, segments_tld);
}
@@ -428,10 +408,10 @@ void _mi_page_retire(mi_page_t* page) {
// how to check this efficiently though...
// for now, we don't retire if it is the only page left of this size class.
mi_page_queue_t* pq = mi_page_queue_of(page);
if (mi_likely(page->block_size <= MI_SMALL_SIZE_MAX)) {
if (mi_likely(page->xblock_size <= MI_SMALL_SIZE_MAX && !mi_page_is_in_full(page))) {
if (pq->last==page && pq->first==page) { // the only page in the queue?
mi_stat_counter_increase(_mi_stats_main.page_no_retire,1);
page->retire_expire = 4;
page->retire_expire = 16;
mi_assert_internal(mi_page_all_free(page));
      return; // don't free after all
}
@@ -469,15 +449,15 @@ void _mi_heap_collect_retired(mi_heap_t* heap, bool force) {
#define MI_MAX_SLICES (1UL << MI_MAX_SLICE_SHIFT)
#define MI_MIN_SLICES (2)
static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* const page, const size_t extend, mi_stats_t* const stats) {
static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* const page, const size_t bsize, const size_t extend, mi_stats_t* const stats) {
UNUSED(stats);
#if (MI_SECURE<=2)
mi_assert_internal(page->free == NULL);
mi_assert_internal(page->local_free == NULL);
#endif
mi_assert_internal(page->capacity + extend <= page->reserved);
mi_assert_internal(bsize == mi_page_block_size(page));
void* const page_area = _mi_page_start(_mi_page_segment(page), page, NULL);
const size_t bsize = page->block_size;
// initialize a randomized free list
// set up `slice_count` slices to alternate between
@@ -491,7 +471,7 @@ static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* co
mi_block_t* blocks[MI_MAX_SLICES]; // current start of the slice
size_t counts[MI_MAX_SLICES]; // available objects in the slice
for (size_t i = 0; i < slice_count; i++) {
blocks[i] = mi_page_block_at(page, page_area, page->capacity + i*slice_extend);
blocks[i] = mi_page_block_at(page, page_area, bsize, page->capacity + i*slice_extend);
counts[i] = slice_extend;
}
counts[slice_count-1] += (extend % slice_count); // final slice holds the modulus too (todo: distribute evenly?)
@@ -526,7 +506,7 @@ static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* co
page->free = free_start;
}
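The secure variant above builds the free list in a randomized order: the freshly extended area is split into a few contiguous slices and each next block is drawn from a randomly chosen slice. A standalone sketch of that idea, assuming a small fixed slice bound and a hypothetical `rand_u32` helper (mimalloc's own version also bounds the number of random draws it makes):

#include <stdint.h>
#include <stddef.h>

typedef struct blk_s { struct blk_s* next; } blk_t;

static blk_t* blk_at(uint8_t* area, size_t bsize, size_t i) {
  return (blk_t*)(area + i*bsize);
}

// Link `n` blocks of size `bsize` into a free list in randomized order by
// interleaving `k` contiguous slices (k is assumed to be at most 8 here).
static blk_t* extend_randomized(uint8_t* area, size_t bsize, size_t n, size_t k,
                                uint32_t (*rand_u32)(void)) {
  if (n == 0 || k == 0 || k > 8) return NULL;
  blk_t* cur[8];                       // next unused block in each slice
  size_t left[8];                      // blocks remaining in each slice
  const size_t per = n / k;
  for (size_t s = 0; s < k; s++) {
    cur[s]  = blk_at(area, bsize, s*per);
    left[s] = per;
  }
  left[k-1] += n % k;                  // last slice absorbs the remainder
  size_t s;
  do { s = rand_u32() % k; } while (left[s] == 0);    // random starting slice
  blk_t* head = cur[s];
  blk_t* tail = head;
  cur[s] = blk_at((uint8_t*)cur[s], bsize, 1); left[s]--;
  for (size_t i = 1; i < n; i++) {
    do { s = rand_u32() % k; } while (left[s] == 0);  // pick a non-empty slice
    tail->next = cur[s];
    tail = cur[s];
    cur[s] = blk_at((uint8_t*)cur[s], bsize, 1);
    left[s]--;
  }
  tail->next = NULL;                   // terminate the list
  return head;
}

Randomizing the initial free-list order makes the addresses returned by consecutive allocations harder to predict, which is the point of the MI_SECURE build.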
static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, const size_t extend, mi_stats_t* const stats)
static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, const size_t bsize, const size_t extend, mi_stats_t* const stats)
{
UNUSED(stats);
#if (MI_SECURE <= 2)
@@ -534,12 +514,13 @@ static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, co
mi_assert_internal(page->local_free == NULL);
#endif
mi_assert_internal(page->capacity + extend <= page->reserved);
mi_assert_internal(bsize == mi_page_block_size(page));
void* const page_area = _mi_page_start(_mi_page_segment(page), page, NULL );
const size_t bsize = page->block_size;
mi_block_t* const start = mi_page_block_at(page, page_area, page->capacity);
mi_block_t* const start = mi_page_block_at(page, page_area, bsize, page->capacity);
// initialize a sequential free list
mi_block_t* const last = mi_page_block_at(page, page_area, page->capacity + extend - 1);
mi_block_t* const last = mi_page_block_at(page, page_area, bsize, page->capacity + extend - 1);
mi_block_t* block = start;
while(block <= last) {
mi_block_t* next = (mi_block_t*)((uint8_t*)block + bsize);
@@ -582,8 +563,9 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld)
mi_stat_counter_increase(tld->stats.pages_extended, 1);
// calculate the extend count
const size_t bsize = (page->xblock_size < MI_HUGE_BLOCK_SIZE ? page->xblock_size : page_size);
size_t extend = page->reserved - page->capacity;
size_t max_extend = (page->block_size >= MI_MAX_EXTEND_SIZE ? MI_MIN_EXTEND : MI_MAX_EXTEND_SIZE/(uint32_t)page->block_size);
size_t max_extend = (bsize >= MI_MAX_EXTEND_SIZE ? MI_MIN_EXTEND : MI_MAX_EXTEND_SIZE/(uint32_t)bsize);
if (max_extend < MI_MIN_EXTEND) max_extend = MI_MIN_EXTEND;
if (extend > max_extend) {
@@ -597,14 +579,14 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld)
  // and append the extend to the free list
if (extend < MI_MIN_SLICES || MI_SECURE==0) { //!mi_option_is_enabled(mi_option_secure)) {
mi_page_free_list_extend(page, extend, &tld->stats );
mi_page_free_list_extend(page, bsize, extend, &tld->stats );
}
else {
mi_page_free_list_extend_secure(heap, page, extend, &tld->stats);
mi_page_free_list_extend_secure(heap, page, bsize, extend, &tld->stats);
}
// enable the new free list
page->capacity += (uint16_t)extend;
mi_stat_increase(tld->stats.page_committed, extend * page->block_size);
mi_stat_increase(tld->stats.page_committed, extend * bsize);
// extension into zero initialized memory preserves the zero'd free list
if (!page->is_zero_init) {
@@ -620,12 +602,11 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
mi_assert(segment != NULL);
mi_assert_internal(block_size > 0);
// set fields
mi_page_set_heap(page, heap);
size_t page_size;
_mi_segment_page_start(segment, page, &page_size);
page->block_size = block_size;
mi_assert_internal(page->block_size <= page_size);
page->xblock_size = (block_size < MI_HUGE_BLOCK_SIZE ? (uint32_t)block_size : MI_HUGE_BLOCK_SIZE);
mi_assert_internal(mi_page_block_size(page) <= page_size);
mi_assert_internal(page_size <= page->slice_count*MI_SEGMENT_SLICE_SIZE);
mi_assert_internal(page_size / block_size < (1L<<16));
page->reserved = (uint16_t)(page_size / block_size);
@@ -638,14 +619,14 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
mi_assert_internal(page->capacity == 0);
mi_assert_internal(page->free == NULL);
mi_assert_internal(page->used == 0);
mi_assert_internal(page->thread_free == 0);
mi_assert_internal(page->thread_freed == 0);
mi_assert_internal(page->xthread_free == 0);
mi_assert_internal(page->next == NULL);
mi_assert_internal(page->prev == NULL);
mi_assert_internal(page->retire_expire == 0);
mi_assert_internal(!mi_page_has_aligned(page));
#if (MI_ENCODE_FREELIST)
mi_assert_internal(page->key != 0);
mi_assert_internal(page->key[0] != 0);
mi_assert_internal(page->key[1] != 0);
#endif
mi_assert_expensive(mi_page_is_valid_init(page));
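Because the new `xblock_size` field is only 32 bits, block sizes at or above `MI_HUGE_BLOCK_SIZE` are stored as that sentinel value and the true size must be recovered from the page itself. A sketch of how an accessor like `mi_page_block_size` can do this, following the calls used in this diff (simplified, not the verbatim implementation):

// Sketch: recover the actual block size from the clamped xblock_size.
// A huge page holds exactly one block, so its page size is the block size.
static inline size_t page_block_size(const mi_page_t* page) {
  const size_t bsize = page->xblock_size;        // clamped at MI_HUGE_BLOCK_SIZE
  if (bsize < MI_HUGE_BLOCK_SIZE) return bsize;  // normal pages: stored directly
  size_t psize;
  _mi_segment_page_start(_mi_page_segment(page), page, &psize);
  return psize;                                  // huge pages: one block spans the page
}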
@@ -663,34 +644,19 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* pq)
{
// search through the pages in "next fit" order
mi_page_t* rpage = NULL;
size_t count = 0;
size_t page_free_count = 0;
mi_page_t* page = pq->first;
while( page != NULL)
while (page != NULL)
{
mi_page_t* next = page->next; // remember next
count++;
// 0. collect freed blocks by us and other threads
_mi_page_free_collect(page,false);
_mi_page_free_collect(page, false);
// 1. if the page contains free blocks, we are done
if (mi_page_immediate_available(page)) {
// If all blocks are free, we might retire this page instead.
// do this at most 8 times to bound allocation time.
// (note: this can happen if a page was earlier not retired due
// to having neighbours that were mostly full or due to concurrent frees)
if (page_free_count < 8 && mi_page_all_free(page)) {
page_free_count++;
if (rpage != NULL) _mi_page_free(rpage,pq,false);
rpage = page;
page = next;
continue; // and keep looking
}
else {
break; // pick this one
}
break; // pick this one
}
// 2. Try to extend
@@ -703,22 +669,15 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p
// 3. If the page is completely full, move it to the `mi_pages_full`
// queue so we don't visit long-lived pages too often.
mi_assert_internal(!mi_page_is_in_full(page) && !mi_page_immediate_available(page));
mi_page_to_full(page,pq);
mi_page_to_full(page, pq);
page = next;
} // for each page
mi_stat_counter_increase(heap->tld->stats.searches,count);
if (page == NULL) {
page = rpage;
rpage = NULL;
}
if (rpage != NULL) {
_mi_page_free(rpage,pq,false);
}
mi_stat_counter_increase(heap->tld->stats.searches, count);
if (page == NULL) {
_mi_heap_collect_retired(heap, false); // perhaps make a page available
page = mi_page_fresh(heap, pq);
}
else {
@@ -727,12 +686,11 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p
}
mi_assert_internal(page == NULL || mi_page_immediate_available(page));
// finally collect retired pages
_mi_heap_collect_retired(heap,false);
return page;
}
// Find a page with free blocks of `size`.
static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, size_t size) {
mi_page_queue_t* pq = mi_page_queue(heap,size);
@@ -763,18 +721,20 @@ static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, size_t size) {
----------------------------------------------------------- */
static mi_deferred_free_fun* volatile deferred_free = NULL;
static volatile _Atomic(void*) deferred_arg; // = NULL
void _mi_deferred_free(mi_heap_t* heap, bool force) {
heap->tld->heartbeat++;
if (deferred_free != NULL && !heap->tld->recurse) {
heap->tld->recurse = true;
deferred_free(force, heap->tld->heartbeat);
deferred_free(force, heap->tld->heartbeat, mi_atomic_read_ptr_relaxed(&deferred_arg));
heap->tld->recurse = false;
}
}
void mi_register_deferred_free(mi_deferred_free_fun* fn) mi_attr_noexcept {
void mi_register_deferred_free(mi_deferred_free_fun* fn, void* arg) mi_attr_noexcept {
deferred_free = fn;
mi_atomic_write_ptr(&deferred_arg, arg);
}
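The deferred-free hook now receives a caller-supplied `void*` as its third argument, matching the call in `_mi_deferred_free` above. A small usage sketch (the `my_cache_t` state and the heartbeat policy are illustrative assumptions, not part of this change):

#include <mimalloc.h>

typedef struct my_cache_s { long trims; } my_cache_t;

// Invoked by mimalloc on allocation heartbeats; `arg` is whatever was
// passed to mi_register_deferred_free.
static void my_deferred_free(bool force, unsigned long long heartbeat, void* arg) {
  my_cache_t* cache = (my_cache_t*)arg;
  if (force || (heartbeat % 100) == 0) {
    // release application-owned cached memory here so mimalloc can reuse it
    cache->trims++;
  }
}

// during start-up:
//   static my_cache_t cache;
//   mi_register_deferred_free(&my_deferred_free, &cache);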
@@ -794,20 +754,21 @@ static mi_page_t* mi_large_huge_page_alloc(mi_heap_t* heap, size_t size) {
mi_page_queue_t* pq = (is_huge ? NULL : mi_page_queue(heap, block_size));
mi_page_t* page = mi_page_fresh_alloc(heap, pq, block_size);
if (page != NULL) {
const size_t bsize = mi_page_block_size(page);
mi_assert_internal(mi_page_immediate_available(page));
mi_assert_internal(page->block_size == block_size);
mi_assert_internal(bsize >= size);
if (pq == NULL) {
// huge pages are directly abandoned
mi_assert_internal(_mi_page_segment(page)->kind == MI_SEGMENT_HUGE);
mi_assert_internal(_mi_page_segment(page)->used==1);
mi_assert_internal(_mi_page_segment(page)->thread_id==0); // abandoned, not in the huge queue
page->heap = NULL;
mi_page_set_heap(page, NULL);
}
else {
mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE);
}
if (page->block_size <= MI_LARGE_OBJ_SIZE_MAX) {
if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
_mi_stat_increase(&heap->tld->stats.large, block_size);
_mi_stat_counter_increase(&heap->tld->stats.large_count, 1);
}
@@ -842,7 +803,8 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size) mi_attr_noexcept
mi_page_t* page;
if (mi_unlikely(size > MI_MEDIUM_OBJ_SIZE_MAX)) {
if (mi_unlikely(size > PTRDIFF_MAX)) {
page = NULL;
_mi_error_message(EOVERFLOW, "allocation request is too large (%zu b requested)\n", size);
return NULL;
}
else {
page = mi_large_huge_page_alloc(heap,size);
@@ -852,10 +814,13 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size) mi_attr_noexcept
// otherwise find a page with free blocks in our size segregated queues
page = mi_find_free_page(heap,size);
}
if (page == NULL) return NULL; // out of memory
if (mi_unlikely(page == NULL)) { // out of memory
_mi_error_message(ENOMEM, "cannot allocate memory (%zu bytes requested)\n", size);
return NULL;
}
mi_assert_internal(mi_page_immediate_available(page));
mi_assert_internal(page->block_size >= size);
mi_assert_internal(mi_page_block_size(page) >= size);
// and try again, this time succeeding! (i.e. this should never recurse)
return _mi_page_malloc(heap, page, size);

View file

@@ -33,6 +33,7 @@ static void mi_segment_delayed_decommit(mi_segment_t* segment, bool force, mi_st
Slices
----------------------------------------------------------- */
static const mi_slice_t* mi_segment_slices_end(const mi_segment_t* segment) {
return &segment->slices[segment->slice_entries];
}
@@ -106,7 +107,7 @@ static void mi_span_queue_push(mi_span_queue_t* sq, mi_slice_t* slice) {
sq->first = slice;
if (slice->next != NULL) slice->next->prev = slice;
else sq->last = slice;
slice->block_size = 0; // free
slice->xblock_size = 0; // free
}
static mi_span_queue_t* mi_span_queue_for(size_t slice_count, mi_segments_tld_t* tld) {
@@ -117,7 +118,7 @@ static mi_span_queue_t* mi_span_queue_for(size_t slice_count, mi_segments_tld_t*
}
static void mi_span_queue_delete(mi_span_queue_t* sq, mi_slice_t* slice) {
mi_assert_internal(slice->block_size==0 && slice->slice_count>0 && slice->slice_offset==0);
mi_assert_internal(slice->xblock_size==0 && slice->slice_count>0 && slice->slice_offset==0);
// should work too if the queue does not contain slice (which can happen during reclaim)
if (slice->prev != NULL) slice->prev->next = slice->next;
if (slice == sq->first) sq->first = slice->next;
@@ -125,7 +126,7 @@ static void mi_span_queue_delete(mi_span_queue_t* sq, mi_slice_t* slice) {
if (slice == sq->last) sq->last = slice->prev;
slice->prev = NULL;
slice->next = NULL;
slice->block_size = 1; // no more free
slice->xblock_size = 1; // no more free
}
@@ -156,26 +157,26 @@ static bool mi_segment_is_valid(mi_segment_t* segment, mi_segments_tld_t* tld) {
mi_assert_internal(slice->slice_offset == 0);
size_t index = mi_slice_index(slice);
size_t maxindex = (index + slice->slice_count >= segment->slice_entries ? segment->slice_entries : index + slice->slice_count) - 1;
if (slice->block_size > 0) { // a page in use, we need at least MAX_SLICE_OFFSET valid back offsets
if (slice->xblock_size > 0) { // a page in use, we need at least MAX_SLICE_OFFSET valid back offsets
used_count++;
for (size_t i = 0; i <= MI_MAX_SLICE_OFFSET && index + i <= maxindex; i++) {
mi_assert_internal(segment->slices[index + i].slice_offset == i*sizeof(mi_slice_t));
mi_assert_internal(i==0 || segment->slices[index + i].slice_count == 0);
mi_assert_internal(i==0 || segment->slices[index + i].block_size == 1);
mi_assert_internal(i==0 || segment->slices[index + i].xblock_size == 1);
}
// and the last entry as well (for coalescing)
const mi_slice_t* last = slice + slice->slice_count - 1;
if (last > slice && last < mi_segment_slices_end(segment)) {
mi_assert_internal(last->slice_offset == (slice->slice_count-1)*sizeof(mi_slice_t));
mi_assert_internal(last->slice_count == 0);
mi_assert_internal(last->block_size == 1);
mi_assert_internal(last->xblock_size == 1);
}
}
else { // free range of slices; only last slice needs a valid back offset
mi_slice_t* last = &segment->slices[maxindex];
mi_assert_internal((uint8_t*)slice == (uint8_t*)last - last->slice_offset);
mi_assert_internal(slice == last || last->slice_count == 0 );
mi_assert_internal(last->block_size == 0);
mi_assert_internal(last->xblock_size == 0);
      if (segment->kind == MI_SEGMENT_NORMAL && segment->thread_id != 0) { // segment is not huge or abandoned
sq = mi_span_queue_for(slice->slice_count,tld);
mi_assert_internal(mi_span_queue_contains(sq,slice));
@@ -234,7 +235,7 @@ uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* pa
*/
if (page_size != NULL) *page_size = psize;
mi_assert_internal(page->block_size == 0 || _mi_ptr_page(p) == page);
mi_assert_internal(page->xblock_size == 0 || _mi_ptr_page(p) == page);
mi_assert_internal(_mi_ptr_segment(p) == segment);
return p;
}
@@ -351,7 +352,7 @@ void _mi_segment_thread_collect(mi_segments_tld_t* tld) {
mi_segment_os_free(segment, tld);
}
mi_assert_internal(tld->cache_count == 0);
mi_assert_internal(tld->cache == NULL);
mi_assert_internal(tld->cache == NULL);
}
@@ -489,7 +490,7 @@ static void mi_segment_span_free(mi_segment_t* segment, size_t slice_index, size
mi_slice_t* last = &segment->slices[slice_index + slice_count - 1];
last->slice_count = 0;
last->slice_offset = (uint32_t)(sizeof(mi_page_t)*(slice_count - 1));
last->block_size = 0;
last->xblock_size = 0;
}
// perhaps decommit
@@ -497,19 +498,19 @@ static void mi_segment_span_free(mi_segment_t* segment, size_t slice_index, size
// and push it on the free page queue (if it was not a huge page)
if (sq != NULL) mi_span_queue_push( sq, slice );
else slice->block_size = 0; // mark huge page as free anyways
else slice->xblock_size = 0; // mark huge page as free anyways
}
// called from reclaim to add existing free spans
static void mi_segment_span_add_free(mi_slice_t* slice, mi_segments_tld_t* tld) {
mi_segment_t* segment = _mi_ptr_segment(slice);
mi_assert_internal(slice->block_size==0 && slice->slice_count>0 && slice->slice_offset==0);
mi_assert_internal(slice->xblock_size==0 && slice->slice_count>0 && slice->slice_offset==0);
size_t slice_index = mi_slice_index(slice);
mi_segment_span_free(segment,slice_index,slice->slice_count,tld);
}
static void mi_segment_span_remove_from_queue(mi_slice_t* slice, mi_segments_tld_t* tld) {
mi_assert_internal(slice->slice_count > 0 && slice->slice_offset==0 && slice->block_size==0);
mi_assert_internal(slice->slice_count > 0 && slice->slice_offset==0 && slice->xblock_size==0);
mi_assert_internal(_mi_ptr_segment(slice)->kind != MI_SEGMENT_HUGE);
mi_span_queue_t* sq = mi_span_queue_for(slice->slice_count, tld);
mi_span_queue_delete(sq, slice);
@@ -517,7 +518,7 @@ static void mi_segment_span_remove_from_queue(mi_slice_t* slice, mi_segments_tld
static mi_slice_t* mi_segment_span_free_coalesce(mi_slice_t* slice, mi_segments_tld_t* tld) {
mi_assert_internal(slice != NULL && slice->slice_count > 0 && slice->slice_offset == 0 && slice->block_size > 0);
mi_assert_internal(slice != NULL && slice->slice_count > 0 && slice->slice_offset == 0 && slice->xblock_size > 0);
mi_segment_t* segment = _mi_ptr_segment(slice);
mi_assert_internal(segment->used > 0);
segment->used--;
@@ -525,7 +526,7 @@ static mi_slice_t* mi_segment_span_free_coalesce(mi_slice_t* slice, mi_segments_
// for huge pages, just mark as free but don't add to the queues
if (segment->kind == MI_SEGMENT_HUGE) {
mi_assert_internal(segment->used == 0);
slice->block_size = 0; // mark as free anyways
slice->xblock_size = 0; // mark as free anyways
return slice;
}
@@ -533,7 +534,7 @@ static mi_slice_t* mi_segment_span_free_coalesce(mi_slice_t* slice, mi_segments_
size_t slice_count = slice->slice_count;
mi_slice_t* next = slice + slice->slice_count;
mi_assert_internal(next <= mi_segment_slices_end(segment));
if (next < mi_segment_slices_end(segment) && next->block_size==0) {
if (next < mi_segment_slices_end(segment) && next->xblock_size==0) {
// free next block -- remove it from free and merge
mi_assert_internal(next->slice_count > 0 && next->slice_offset==0);
slice_count += next->slice_count; // extend
@@ -542,7 +543,7 @@ static mi_slice_t* mi_segment_span_free_coalesce(mi_slice_t* slice, mi_segments_
if (slice > segment->slices) {
mi_slice_t* prev = mi_slice_first(slice - 1);
mi_assert_internal(prev >= segment->slices);
if (prev->block_size==0) {
if (prev->xblock_size==0) {
// free previous slice -- remove it from free and merge
mi_assert_internal(prev->slice_count > 0 && prev->slice_offset==0);
slice_count += prev->slice_count;
@@ -561,7 +562,7 @@ static mi_slice_t* mi_segment_span_free_coalesce(mi_slice_t* slice, mi_segments_
static void mi_segment_slice_split(mi_segment_t* segment, mi_slice_t* slice, size_t slice_count, mi_segments_tld_t* tld) {
mi_assert_internal(_mi_ptr_segment(slice)==segment);
mi_assert_internal(slice->slice_count >= slice_count);
mi_assert_internal(slice->block_size > 0); // no more in free queue
mi_assert_internal(slice->xblock_size > 0); // no more in free queue
if (slice->slice_count <= slice_count) return;
mi_assert_internal(segment->kind != MI_SEGMENT_HUGE);
size_t next_index = mi_slice_index(slice) + slice_count;
@@ -574,12 +575,14 @@ static void mi_segment_slice_split(mi_segment_t* segment, mi_slice_t* slice, siz
static mi_page_t* mi_segment_span_allocate(mi_segment_t* segment, size_t slice_index, size_t slice_count, mi_segments_tld_t* tld) {
mi_assert_internal(slice_index < segment->slice_entries);
mi_slice_t* slice = &segment->slices[slice_index];
mi_assert_internal(slice->block_size==0 || slice->block_size==1);
mi_assert_internal(slice->xblock_size==0 || slice->xblock_size==1);
slice->slice_offset = 0;
slice->slice_count = (uint32_t)slice_count;
mi_assert_internal(slice->slice_count == slice_count);
slice->block_size = slice_count * MI_SEGMENT_SLICE_SIZE;
const size_t bsize = slice_count * MI_SEGMENT_SLICE_SIZE;
slice->xblock_size = (uint32_t)(bsize >= MI_HUGE_BLOCK_SIZE ? MI_HUGE_BLOCK_SIZE : bsize);
mi_page_t* page = mi_slice_to_page(slice);
mi_assert_internal(mi_page_block_size(page) == bsize);
// set slice back pointers for the first MI_MAX_SLICE_OFFSET entries
size_t extra = slice_count-1;
@@ -589,7 +592,7 @@ static mi_page_t* mi_segment_span_allocate(mi_segment_t* segment, size_t slice_i
for (size_t i = 1; i <= extra; i++, slice++) {
slice->slice_offset = (uint32_t)(sizeof(mi_slice_t)*i);
slice->slice_count = 0;
slice->block_size = 1;
slice->xblock_size = 1;
}
// and also for the last one (if not set already) (the last one is needed for coalescing)
@@ -597,7 +600,7 @@ static mi_page_t* mi_segment_span_allocate(mi_segment_t* segment, size_t slice_i
if (last < mi_segment_slices_end(segment) && last >= slice) {
last->slice_offset = (uint32_t)(sizeof(mi_slice_t)*(slice_count-1));
last->slice_count = 0;
last->block_size = 1;
last->xblock_size = 1;
}
// ensure the memory is committed
@@ -621,7 +624,7 @@ static mi_page_t* mi_segments_page_find_and_allocate(size_t slice_count, mi_segm
if (slice->slice_count > slice_count) {
mi_segment_slice_split(segment, slice, slice_count, tld);
}
mi_assert_internal(slice != NULL && slice->slice_count == slice_count && slice->block_size > 0);
mi_assert_internal(slice != NULL && slice->slice_count == slice_count && slice->xblock_size > 0);
return mi_segment_span_allocate(segment, mi_slice_index(slice), slice->slice_count, tld);
}
}
@@ -746,8 +749,8 @@ static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t
while (slice < end) {
mi_assert_internal(slice->slice_count > 0);
mi_assert_internal(slice->slice_offset == 0);
mi_assert_internal(mi_slice_index(slice)==0 || slice->block_size == 0); // no more used pages ..
if (slice->block_size == 0 && segment->kind != MI_SEGMENT_HUGE) {
mi_assert_internal(mi_slice_index(slice)==0 || slice->xblock_size == 0); // no more used pages ..
if (slice->xblock_size == 0 && segment->kind != MI_SEGMENT_HUGE) {
mi_segment_span_remove_from_queue(slice, tld);
}
page_count++;
@@ -800,15 +803,16 @@ static mi_page_t* mi_segments_page_alloc(mi_page_kind_t page_kind, size_t requir
static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld);
static mi_slice_t* mi_segment_page_clear(mi_page_t* page, mi_segments_tld_t* tld) {
mi_assert_internal(page->block_size > 0);
mi_assert_internal(page->xblock_size > 0);
mi_assert_internal(mi_page_all_free(page));
mi_segment_t* segment = _mi_ptr_segment(page);
size_t inuse = page->capacity * page->block_size;
size_t inuse = page->capacity * mi_page_block_size(page);
_mi_stat_decrease(&tld->stats->page_committed, inuse);
_mi_stat_decrease(&tld->stats->pages, 1);
// reset the page memory to reduce memory pressure?
if (!segment->mem_is_fixed && !page->is_reset && mi_option_is_enabled(mi_option_page_reset)) {
size_t psize;
uint8_t* start = _mi_page_start(segment, page, &psize);
@@ -820,7 +824,7 @@ static mi_slice_t* mi_segment_page_clear(mi_page_t* page, mi_segments_tld_t* tld
page->is_zero_init = false;
ptrdiff_t ofs = offsetof(mi_page_t, capacity);
memset((uint8_t*)page + ofs, 0, sizeof(*page) - ofs);
page->block_size = 1;
page->xblock_size = 1;
// and free it
return mi_segment_span_free_coalesce(mi_page_to_slice(page), tld);
@@ -891,9 +895,9 @@ static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) {
while (slice < end) {
mi_assert_internal(slice->slice_count > 0);
mi_assert_internal(slice->slice_offset == 0);
if (slice->block_size == 0) { // a free page
if (slice->xblock_size == 0) { // a free page
mi_segment_span_remove_from_queue(slice,tld);
slice->block_size = 0; // but keep it free
slice->xblock_size = 0; // but keep it free
}
slice = slice + slice->slice_count;
}
@@ -912,7 +916,10 @@ static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) {
void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld) {
mi_assert(page != NULL);
mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE);
mi_assert_internal(mi_page_heap(page) == NULL);
mi_segment_t* segment = _mi_page_segment(page);
mi_assert_expensive(mi_segment_is_valid(segment,tld));
segment->abandoned++;
_mi_stat_increase(&tld->stats->pages_abandoned, 1);
@@ -958,42 +965,48 @@ bool _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segmen
mi_atomic_decrement(&abandoned_count);
mi_assert_expensive(mi_segment_is_valid(segment, tld));
segment->abandoned_next = NULL;
segment->thread_id = _mi_thread_id();
mi_segments_track_size((long)mi_segment_size(segment),tld);
mi_assert_internal(segment->next == NULL);
_mi_stat_decrease(&tld->stats->segments_abandoned,1);
//mi_assert_internal(segment->decommit_mask == 0);
mi_slice_t* slice = &segment->slices[0];
const mi_slice_t* end = mi_segment_slices_end(segment);
mi_assert_internal(slice->slice_count>0 && slice->block_size>0); // segment allocated page
mi_assert_internal(slice->slice_count>0 && slice->xblock_size>0); // segment allocated page
slice = slice + slice->slice_count; // skip the first segment allocated page
while (slice < end) {
mi_assert_internal(slice->slice_count > 0);
mi_assert_internal(slice->slice_offset == 0);
if (slice->block_size == 0) { // a free page, add it to our lists
if (slice->xblock_size == 0) { // a free page, add it to our lists
mi_segment_span_add_free(slice,tld);
}
slice = slice + slice->slice_count;
}
slice = &segment->slices[0];
mi_assert_internal(slice->slice_count>0 && slice->block_size>0); // segment allocated page
mi_assert_internal(slice->slice_count>0 && slice->xblock_size>0); // segment allocated page
slice = slice + slice->slice_count; // skip the first segment allocated page
while (slice < end) {
mi_assert_internal(slice->slice_count > 0);
mi_assert_internal(slice->slice_offset == 0);
mi_page_t* page = mi_slice_to_page(slice);
if (page->block_size > 0) { // a used page
if (page->xblock_size > 0) { // a used page
mi_assert_internal(page->next == NULL && page->prev==NULL);
_mi_stat_decrease(&tld->stats->pages_abandoned, 1);
segment->abandoned--;
// set the heap again and allow delayed free again
mi_page_set_heap(page, heap);
_mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, true); // override never (after heap is set)
_mi_page_free_collect(page, false); // ensure used count is up to date
if (mi_page_all_free(page)) {
// if everything free by now, free the page
        slice = mi_segment_page_clear(page, tld); // set slice again due to coalescing
}
else {
// otherwise reclaim it
// otherwise reclaim it into the heap
_mi_page_reclaim(heap,page);
}
}
@@ -1024,7 +1037,7 @@ static mi_page_t* mi_segment_huge_page_alloc(size_t size, mi_segments_tld_t* tld
mi_segment_t* segment = mi_segment_alloc(size,tld,os_tld,&page);
if (segment == NULL || page==NULL) return NULL;
mi_assert_internal(segment->used==1);
mi_assert_internal(page->block_size >= size);
mi_assert_internal(mi_page_block_size(page) >= size);
segment->thread_id = 0; // huge segments are immediately abandoned
return page;
}
@@ -1167,5 +1180,11 @@ static void* mi_segment_range_of(const void* p, size_t* size) {
if (size != NULL) *size = segment->segment_size;
return segment;
}
mi_assert_expensive(page == NULL || mi_segment_is_valid(_mi_page_segment(page),tld));
mi_assert_internal(page == NULL || (mi_segment_page_size(_mi_page_segment(page)) - (MI_SECURE == 0 ? 0 : _mi_os_page_size())) >= block_size);
mi_reset_delayed(tld);
mi_assert_internal(page == NULL || mi_page_not_in_queue(page, tld));
return page;
>>>>>>> dev
}
*/

View file

@@ -126,7 +126,7 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) {
// unit > 0 : size in binary bytes
// unit == 0: count as decimal
// unit < 0 : count in binary
static void mi_printf_amount(int64_t n, int64_t unit, mi_output_fun* out, const char* fmt) {
static void mi_printf_amount(int64_t n, int64_t unit, mi_output_fun* out, void* arg, const char* fmt) {
char buf[32];
int len = 32;
const char* suffix = (unit <= 0 ? " " : "b");
@@ -147,75 +147,75 @@ static void mi_printf_amount(int64_t n, int64_t unit, mi_output_fun* out, const
const long frac1 = (long)(tens%10);
snprintf(buf, len, "%ld.%ld %s%s", whole, frac1, magnitude, suffix);
}
_mi_fprintf(out, (fmt==NULL ? "%11s" : fmt), buf);
_mi_fprintf(out, arg, (fmt==NULL ? "%11s" : fmt), buf);
}
static void mi_print_amount(int64_t n, int64_t unit, mi_output_fun* out) {
mi_printf_amount(n,unit,out,NULL);
static void mi_print_amount(int64_t n, int64_t unit, mi_output_fun* out, void* arg) {
mi_printf_amount(n,unit,out,arg,NULL);
}
static void mi_print_count(int64_t n, int64_t unit, mi_output_fun* out) {
if (unit==1) _mi_fprintf(out,"%11s"," ");
else mi_print_amount(n,0,out);
static void mi_print_count(int64_t n, int64_t unit, mi_output_fun* out, void* arg) {
if (unit==1) _mi_fprintf(out, arg, "%11s"," ");
else mi_print_amount(n,0,out,arg);
}
static void mi_stat_print(const mi_stat_count_t* stat, const char* msg, int64_t unit, mi_output_fun* out ) {
_mi_fprintf(out,"%10s:", msg);
static void mi_stat_print(const mi_stat_count_t* stat, const char* msg, int64_t unit, mi_output_fun* out, void* arg ) {
_mi_fprintf(out, arg,"%10s:", msg);
if (unit>0) {
mi_print_amount(stat->peak, unit, out);
mi_print_amount(stat->allocated, unit, out);
mi_print_amount(stat->freed, unit, out);
mi_print_amount(unit, 1, out);
mi_print_count(stat->allocated, unit, out);
mi_print_amount(stat->peak, unit, out, arg);
mi_print_amount(stat->allocated, unit, out, arg);
mi_print_amount(stat->freed, unit, out, arg);
mi_print_amount(unit, 1, out, arg);
mi_print_count(stat->allocated, unit, out, arg);
if (stat->allocated > stat->freed)
_mi_fprintf(out, " not all freed!\n");
_mi_fprintf(out, arg, " not all freed!\n");
else
_mi_fprintf(out, " ok\n");
_mi_fprintf(out, arg, " ok\n");
}
else if (unit<0) {
mi_print_amount(stat->peak, -1, out);
mi_print_amount(stat->allocated, -1, out);
mi_print_amount(stat->freed, -1, out);
mi_print_amount(stat->peak, -1, out, arg);
mi_print_amount(stat->allocated, -1, out, arg);
mi_print_amount(stat->freed, -1, out, arg);
if (unit==-1) {
_mi_fprintf(out, "%22s", "");
_mi_fprintf(out, arg, "%22s", "");
}
else {
mi_print_amount(-unit, 1, out);
mi_print_count((stat->allocated / -unit), 0, out);
mi_print_amount(-unit, 1, out, arg);
mi_print_count((stat->allocated / -unit), 0, out, arg);
}
if (stat->allocated > stat->freed)
_mi_fprintf(out, " not all freed!\n");
_mi_fprintf(out, arg, " not all freed!\n");
else
_mi_fprintf(out, " ok\n");
_mi_fprintf(out, arg, " ok\n");
}
else {
mi_print_amount(stat->peak, 1, out);
mi_print_amount(stat->allocated, 1, out);
_mi_fprintf(out, "\n");
mi_print_amount(stat->peak, 1, out, arg);
mi_print_amount(stat->allocated, 1, out, arg);
_mi_fprintf(out, arg, "\n");
}
}
static void mi_stat_counter_print(const mi_stat_counter_t* stat, const char* msg, mi_output_fun* out ) {
_mi_fprintf(out, "%10s:", msg);
mi_print_amount(stat->total, -1, out);
_mi_fprintf(out, "\n");
static void mi_stat_counter_print(const mi_stat_counter_t* stat, const char* msg, mi_output_fun* out, void* arg ) {
_mi_fprintf(out, arg, "%10s:", msg);
mi_print_amount(stat->total, -1, out, arg);
_mi_fprintf(out, arg, "\n");
}
static void mi_stat_counter_print_avg(const mi_stat_counter_t* stat, const char* msg, mi_output_fun* out) {
static void mi_stat_counter_print_avg(const mi_stat_counter_t* stat, const char* msg, mi_output_fun* out, void* arg) {
const int64_t avg_tens = (stat->count == 0 ? 0 : (stat->total*10 / stat->count));
const long avg_whole = (long)(avg_tens/10);
const long avg_frac1 = (long)(avg_tens%10);
_mi_fprintf(out, "%10s: %5ld.%ld avg\n", msg, avg_whole, avg_frac1);
_mi_fprintf(out, arg, "%10s: %5ld.%ld avg\n", msg, avg_whole, avg_frac1);
}
static void mi_print_header(mi_output_fun* out ) {
_mi_fprintf(out,"%10s: %10s %10s %10s %10s %10s\n", "heap stats", "peak ", "total ", "freed ", "unit ", "count ");
static void mi_print_header(mi_output_fun* out, void* arg ) {
_mi_fprintf(out, arg, "%10s: %10s %10s %10s %10s %10s\n", "heap stats", "peak ", "total ", "freed ", "unit ", "count ");
}
#if MI_STAT>1
static void mi_stats_print_bins(mi_stat_count_t* all, const mi_stat_count_t* bins, size_t max, const char* fmt, mi_output_fun* out) {
static void mi_stats_print_bins(mi_stat_count_t* all, const mi_stat_count_t* bins, size_t max, const char* fmt, mi_output_fun* out, void* arg) {
bool found = false;
char buf[64];
for (size_t i = 0; i <= max; i++) {
@@ -224,14 +224,14 @@ static void mi_stats_print_bins(mi_stat_count_t* all, const mi_stat_count_t* bin
int64_t unit = _mi_bin_size((uint8_t)i);
snprintf(buf, 64, "%s %3zu", fmt, i);
mi_stat_add(all, &bins[i], unit);
mi_stat_print(&bins[i], buf, unit, out);
mi_stat_print(&bins[i], buf, unit, out, arg);
}
}
//snprintf(buf, 64, "%s all", fmt);
//mi_stat_print(all, buf, 1);
if (found) {
_mi_fprintf(out, "\n");
mi_print_header(out);
_mi_fprintf(out, arg, "\n");
mi_print_header(out, arg);
}
}
#endif
@@ -239,40 +239,40 @@ static void mi_stats_print_bins(mi_stat_count_t* all, const mi_stat_count_t* bin
static void mi_process_info(mi_msecs_t* utime, mi_msecs_t* stime, size_t* peak_rss, size_t* page_faults, size_t* page_reclaim, size_t* peak_commit);
static void _mi_stats_print(mi_stats_t* stats, mi_msecs_t elapsed, mi_output_fun* out) mi_attr_noexcept {
mi_print_header(out);
static void _mi_stats_print(mi_stats_t* stats, mi_msecs_t elapsed, mi_output_fun* out, void* arg) mi_attr_noexcept {
mi_print_header(out,arg);
#if MI_STAT>1
mi_stat_count_t normal = { 0,0,0,0 };
mi_stats_print_bins(&normal, stats->normal, MI_BIN_HUGE, "normal",out);
mi_stat_print(&normal, "normal", 1, out);
mi_stat_print(&stats->large, "large", (stats->large_count.count == 0 ? 1 : -(stats->large.allocated / stats->large_count.count)), out);
mi_stat_print(&stats->huge, "huge", (stats->huge_count.count == 0 ? 1 : -(stats->huge.allocated / stats->huge_count.count)), out);
mi_stats_print_bins(&normal, stats->normal, MI_BIN_HUGE, "normal",out,arg);
mi_stat_print(&normal, "normal", 1, out, arg);
mi_stat_print(&stats->large, "large", (stats->large_count.count == 0 ? 1 : -(stats->large.allocated / stats->large_count.count)), out, arg);
mi_stat_print(&stats->huge, "huge", (stats->huge_count.count == 0 ? 1 : -(stats->huge.allocated / stats->huge_count.count)), out, arg);
mi_stat_count_t total = { 0,0,0,0 };
mi_stat_add(&total, &normal, 1);
mi_stat_add(&total, &stats->huge, 1);
mi_stat_add(&total, &stats->large, 1);
mi_stat_print(&total, "total", 1, out);
_mi_fprintf(out, "malloc requested: ");
mi_print_amount(stats->malloc.allocated, 1, out);
_mi_fprintf(out, "\n\n");
mi_stat_add(&total, &stats->huge, 1);
mi_stat_print(&total, "total", 1, out, arg);
_mi_fprintf(out, arg, "malloc requested: ");
mi_print_amount(stats->malloc.allocated, 1, out, arg);
_mi_fprintf(out, arg, "\n\n");
#endif
mi_stat_print(&stats->reserved, "reserved", 1, out);
mi_stat_print(&stats->committed, "committed", 1, out);
mi_stat_print(&stats->reset, "reset", 1, out);
mi_stat_print(&stats->page_committed, "touched", 1, out);
mi_stat_print(&stats->segments, "segments", -1, out);
mi_stat_print(&stats->segments_abandoned, "-abandoned", -1, out);
mi_stat_print(&stats->segments_cache, "-cached", -1, out);
mi_stat_print(&stats->pages, "pages", -1, out);
mi_stat_print(&stats->pages_abandoned, "-abandoned", -1, out);
mi_stat_counter_print(&stats->pages_extended, "-extended", out);
mi_stat_counter_print(&stats->page_no_retire, "-noretire", out);
mi_stat_counter_print(&stats->mmap_calls, "mmaps", out);
mi_stat_counter_print(&stats->commit_calls, "commits", out);
mi_stat_print(&stats->threads, "threads", -1, out);
mi_stat_counter_print_avg(&stats->searches, "searches", out);
_mi_fprintf(out, "%10s: %7i\n", "numa nodes", _mi_os_numa_node_count());
if (elapsed > 0) _mi_fprintf(out, "%10s: %7ld.%03ld s\n", "elapsed", elapsed/1000, elapsed%1000);
mi_stat_print(&stats->reserved, "reserved", 1, out, arg);
mi_stat_print(&stats->committed, "committed", 1, out, arg);
mi_stat_print(&stats->reset, "reset", 1, out, arg);
mi_stat_print(&stats->page_committed, "touched", 1, out, arg);
mi_stat_print(&stats->segments, "segments", -1, out, arg);
mi_stat_print(&stats->segments_abandoned, "-abandoned", -1, out, arg);
mi_stat_print(&stats->segments_cache, "-cached", -1, out, arg);
mi_stat_print(&stats->pages, "pages", -1, out, arg);
mi_stat_print(&stats->pages_abandoned, "-abandoned", -1, out, arg);
mi_stat_counter_print(&stats->pages_extended, "-extended", out, arg);
mi_stat_counter_print(&stats->page_no_retire, "-noretire", out, arg);
mi_stat_counter_print(&stats->mmap_calls, "mmaps", out, arg);
mi_stat_counter_print(&stats->commit_calls, "commits", out, arg);
mi_stat_print(&stats->threads, "threads", -1, out, arg);
mi_stat_counter_print_avg(&stats->searches, "searches", out, arg);
_mi_fprintf(out, arg, "%10s: %7i\n", "numa nodes", _mi_os_numa_node_count());
if (elapsed > 0) _mi_fprintf(out, arg, "%10s: %7ld.%03ld s\n", "elapsed", elapsed/1000, elapsed%1000);
mi_msecs_t user_time;
mi_msecs_t sys_time;
@@ -281,13 +281,13 @@ static void _mi_stats_print(mi_stats_t* stats, mi_msecs_t elapsed, mi_output_fun
size_t page_reclaim;
size_t peak_commit;
mi_process_info(&user_time, &sys_time, &peak_rss, &page_faults, &page_reclaim, &peak_commit);
_mi_fprintf(out,"%10s: user: %ld.%03ld s, system: %ld.%03ld s, faults: %lu, reclaims: %lu, rss: ", "process", user_time/1000, user_time%1000, sys_time/1000, sys_time%1000, (unsigned long)page_faults, (unsigned long)page_reclaim );
mi_printf_amount((int64_t)peak_rss, 1, out, "%s");
_mi_fprintf(out, arg, "%10s: user: %ld.%03ld s, system: %ld.%03ld s, faults: %lu, reclaims: %lu, rss: ", "process", user_time/1000, user_time%1000, sys_time/1000, sys_time%1000, (unsigned long)page_faults, (unsigned long)page_reclaim );
mi_printf_amount((int64_t)peak_rss, 1, out, arg, "%s");
if (peak_commit > 0) {
_mi_fprintf(out,", commit charge: ");
mi_printf_amount((int64_t)peak_commit, 1, out, "%s");
_mi_fprintf(out, arg, ", commit charge: ");
mi_printf_amount((int64_t)peak_commit, 1, out, arg, "%s");
}
_mi_fprintf(out,"\n");
_mi_fprintf(out, arg, "\n");
}
static mi_msecs_t mi_time_start; // = 0
@@ -319,20 +319,20 @@ void _mi_stats_done(mi_stats_t* stats) { // called from `mi_thread_done`
mi_stats_merge_from(stats);
}
static void mi_stats_print_ex(mi_stats_t* stats, mi_msecs_t elapsed, mi_output_fun* out) {
mi_stats_merge_from(stats);
_mi_stats_print(&_mi_stats_main, elapsed, out);
void mi_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept {
mi_msecs_t elapsed = _mi_clock_end(mi_time_start);
mi_stats_merge_from(mi_stats_get_default());
_mi_stats_print(&_mi_stats_main, elapsed, out, arg);
}
void mi_stats_print(mi_output_fun* out) mi_attr_noexcept {
mi_msecs_t elapsed = _mi_clock_end(mi_time_start);
mi_stats_print_ex(mi_stats_get_default(),elapsed,out);
void mi_stats_print(void* out) mi_attr_noexcept {
// for compatibility there is an `out` parameter (which can be `stdout` or `stderr`)
mi_stats_print_out((mi_output_fun*)out, NULL);
}
void mi_thread_stats_print(mi_output_fun* out) mi_attr_noexcept {
void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept {
mi_msecs_t elapsed = _mi_clock_end(mi_time_start);
_mi_stats_print(mi_stats_get_default(), elapsed, out);
_mi_stats_print(mi_stats_get_default(), elapsed, out, arg);
}
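With the extra `void* arg` threaded through every printing helper, `mi_stats_print_out` can direct the statistics to any sink. A small usage sketch of an output function that appends each chunk to a caller-owned buffer (the buffer type and size are illustrative):

#include <string.h>
#include <mimalloc.h>

typedef struct stat_buf_s { char data[16*1024]; size_t used; } stat_buf_t;

// mi_output_fun: receives each formatted piece of output plus the registered argument.
static void buffer_output(const char* msg, void* arg) {
  stat_buf_t* buf = (stat_buf_t*)arg;
  size_t n = strlen(msg);
  if (buf->used + n + 1 <= sizeof(buf->data)) {
    memcpy(buf->data + buf->used, msg, n);
    buf->used += n;
    buf->data[buf->used] = '\0';
  }
}

// usage:
//   static stat_buf_t buf;
//   mi_stats_print_out(&buffer_output, &buf);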