merge from dev

daan 2020-03-16 16:41:21 -07:00
commit 1f396e64a0
34 changed files with 2041 additions and 554 deletions


@ -18,20 +18,22 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t
// note: we don't require `size > offset`, we just guarantee that
// the address at offset is aligned regardless of the allocated size.
mi_assert(alignment > 0 && alignment % sizeof(void*) == 0);
if (mi_unlikely(size > PTRDIFF_MAX)) return NULL; // we don't allocate more than PTRDIFF_MAX (see <https://sourceware.org/ml/libc-announce/2019/msg00001.html>)
if (mi_unlikely(alignment==0 || !_mi_is_power_of_two(alignment))) return NULL; // require power-of-two (see <https://en.cppreference.com/w/c/memory/aligned_alloc>)
const uintptr_t align_mask = alignment-1; // for any x, `(x & align_mask) == (x % alignment)`
// try if there is a small block available with just the right alignment
if (mi_likely(size <= MI_SMALL_SIZE_MAX)) {
mi_page_t* page = _mi_heap_get_free_small_page(heap,size);
const size_t padsize = size + MI_PADDING_SIZE;
if (mi_likely(padsize <= MI_SMALL_SIZE_MAX)) {
mi_page_t* page = _mi_heap_get_free_small_page(heap,padsize);
const bool is_aligned = (((uintptr_t)page->free+offset) & align_mask)==0;
if (mi_likely(page->free != NULL && is_aligned))
{
#if MI_STAT>1
mi_heap_stat_increase( heap, malloc, size);
#endif
void* p = _mi_page_malloc(heap,page,size); // TODO: inline _mi_page_malloc
void* p = _mi_page_malloc(heap,page,padsize); // TODO: inline _mi_page_malloc
mi_assert_internal(p != NULL);
mi_assert_internal(((uintptr_t)p + offset) % alignment == 0);
if (zero) _mi_block_zero_init(page,p,size);
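A minimal standalone sketch of the power-of-two mask trick the hunk above relies on: when `alignment` is a power of two, `x & (alignment-1)` equals `x % alignment`, so the fast path can test whether `page->free + offset` is already suitably aligned without a division. (Self-contained illustration, not mimalloc code.)

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Check that an address is aligned at a given offset, using the mask form
   of the modulo. Assumes `alignment` is a non-zero power of two. */
static int is_aligned_at(uintptr_t addr, size_t offset, size_t alignment) {
  const uintptr_t align_mask = alignment - 1;  /* x & align_mask == x % alignment */
  return (((addr + offset) & align_mask) == 0);
}

int main(void) {
  assert( is_aligned_at(0x1000, 0, 64));  /* 0x1000 % 64 == 0        */
  assert( is_aligned_at(0x0FF8, 8, 16));  /* (0x0FF8 + 8) % 16 == 0  */
  assert(!is_aligned_at(0x1001, 0, 8));
  return 0;
}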
@ -40,7 +42,7 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t
}
// use regular allocation if it is guaranteed to fit the alignment constraints
if (offset==0 && alignment<=size && size<=MI_MEDIUM_OBJ_SIZE_MAX && (size&align_mask)==0) {
if (offset==0 && alignment<=padsize && padsize<=MI_MEDIUM_OBJ_SIZE_MAX && (padsize&align_mask)==0) {
void* p = _mi_heap_malloc_zero(heap, size, zero);
mi_assert_internal(p == NULL || ((uintptr_t)p % alignment) == 0);
return p;
@ -61,53 +63,53 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t
}
mi_decl_allocator void* mi_heap_malloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
mi_decl_restrict void* mi_heap_malloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
return mi_heap_malloc_zero_aligned_at(heap, size, alignment, offset, false);
}
mi_decl_allocator void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept {
mi_decl_restrict void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept {
return mi_heap_malloc_aligned_at(heap, size, alignment, 0);
}
mi_decl_allocator void* mi_heap_zalloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
mi_decl_restrict void* mi_heap_zalloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
return mi_heap_malloc_zero_aligned_at(heap, size, alignment, offset, true);
}
mi_decl_allocator void* mi_heap_zalloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept {
mi_decl_restrict void* mi_heap_zalloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept {
return mi_heap_zalloc_aligned_at(heap, size, alignment, 0);
}
mi_decl_allocator void* mi_heap_calloc_aligned_at(mi_heap_t* heap, size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
mi_decl_restrict void* mi_heap_calloc_aligned_at(mi_heap_t* heap, size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
size_t total;
if (mi_count_size_overflow(count, size, &total)) return NULL;
return mi_heap_zalloc_aligned_at(heap, total, alignment, offset);
}
mi_decl_allocator void* mi_heap_calloc_aligned(mi_heap_t* heap, size_t count, size_t size, size_t alignment) mi_attr_noexcept {
mi_decl_restrict void* mi_heap_calloc_aligned(mi_heap_t* heap, size_t count, size_t size, size_t alignment) mi_attr_noexcept {
return mi_heap_calloc_aligned_at(heap,count,size,alignment,0);
}
mi_decl_allocator void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
mi_decl_restrict void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
return mi_heap_malloc_aligned_at(mi_get_default_heap(), size, alignment, offset);
}
mi_decl_allocator void* mi_malloc_aligned(size_t size, size_t alignment) mi_attr_noexcept {
mi_decl_restrict void* mi_malloc_aligned(size_t size, size_t alignment) mi_attr_noexcept {
return mi_heap_malloc_aligned(mi_get_default_heap(), size, alignment);
}
mi_decl_allocator void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
mi_decl_restrict void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
return mi_heap_zalloc_aligned_at(mi_get_default_heap(), size, alignment, offset);
}
mi_decl_allocator void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept {
mi_decl_restrict void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept {
return mi_heap_zalloc_aligned(mi_get_default_heap(), size, alignment);
}
mi_decl_allocator void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
mi_decl_restrict void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
return mi_heap_calloc_aligned_at(mi_get_default_heap(), count, size, alignment, offset);
}
mi_decl_allocator void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept {
mi_decl_restrict void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept {
return mi_heap_calloc_aligned(mi_get_default_heap(), count, size, alignment);
}
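All of the `*calloc_aligned*` wrappers above funnel `count * size` through `mi_count_size_overflow` before allocating. A hedged sketch of the usual overflow-checked multiply behind such a helper (the exact mimalloc implementation may differ):

#include <stdbool.h>
#include <stddef.h>

/* Overflow-checked multiply for calloc-style wrappers.
   Returns true on overflow; on success *total receives count*size. */
static bool count_size_overflow(size_t count, size_t size, size_t* total) {
  if (count == 0 || size == 0) { *total = 0; return false; }
  *total = count * size;
  return (*total / size != count);  /* overflow iff the division does not round-trip */
}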
@ -150,55 +152,55 @@ static void* mi_heap_realloc_zero_aligned(mi_heap_t* heap, void* p, size_t newsi
return mi_heap_realloc_zero_aligned_at(heap,p,newsize,alignment,offset,zero);
}
mi_decl_allocator void* mi_heap_realloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
void* mi_heap_realloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
return mi_heap_realloc_zero_aligned_at(heap,p,newsize,alignment,offset,false);
}
mi_decl_allocator void* mi_heap_realloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
void* mi_heap_realloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
return mi_heap_realloc_zero_aligned(heap,p,newsize,alignment,false);
}
mi_decl_allocator void* mi_heap_rezalloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
void* mi_heap_rezalloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
return mi_heap_realloc_zero_aligned_at(heap, p, newsize, alignment, offset, true);
}
mi_decl_allocator void* mi_heap_rezalloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
void* mi_heap_rezalloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
return mi_heap_realloc_zero_aligned(heap, p, newsize, alignment, true);
}
mi_decl_allocator void* mi_heap_recalloc_aligned_at(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
void* mi_heap_recalloc_aligned_at(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
size_t total;
if (mi_count_size_overflow(newcount, size, &total)) return NULL;
return mi_heap_rezalloc_aligned_at(heap, p, total, alignment, offset);
}
mi_decl_allocator void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept {
void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept {
size_t total;
if (mi_count_size_overflow(newcount, size, &total)) return NULL;
return mi_heap_rezalloc_aligned(heap, p, total, alignment);
}
mi_decl_allocator void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
return mi_heap_realloc_aligned_at(mi_get_default_heap(), p, newsize, alignment, offset);
}
mi_decl_allocator void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
return mi_heap_realloc_aligned(mi_get_default_heap(), p, newsize, alignment);
}
mi_decl_allocator void* mi_rezalloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
void* mi_rezalloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
return mi_heap_rezalloc_aligned_at(mi_get_default_heap(), p, newsize, alignment, offset);
}
mi_decl_allocator void* mi_rezalloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
void* mi_rezalloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
return mi_heap_rezalloc_aligned(mi_get_default_heap(), p, newsize, alignment);
}
mi_decl_allocator void* mi_recalloc_aligned_at(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
void* mi_recalloc_aligned_at(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
return mi_heap_recalloc_aligned_at(mi_get_default_heap(), p, newcount, size, alignment, offset);
}
mi_decl_allocator void* mi_recalloc_aligned(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept {
void* mi_recalloc_aligned(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept {
return mi_heap_recalloc_aligned(mi_get_default_heap(), p, newcount, size, alignment);
}


@ -17,6 +17,12 @@ terms of the MIT license. A copy of the license can be found in the file
/* ------------------------------------------------------
Override system malloc on macOS
This is done through the malloc zone interface.
It seems we also need to interpose (see `alloc-override.c`)
or otherwise we get zone errors as there are usually
already allocations done by the time we take over the
zone. Unfortunately, that means we need to replace
the `free` with a checked free (`cfree`) impacting
performance.
------------------------------------------------------ */
#include <AvailabilityMacros.h>
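The comment above explains why `free` itself has to be interposed with a checked variant: by the time the zone is registered, some blocks were already handed out by the system allocator, so blindly freeing them through mimalloc would be wrong. A sketch of the idea with illustrative helper names; mimalloc's `mi_cfree` (added further below in this diff) instead "checks if pointers are from us" before freeing:

#include <stdint.h>
#include <stdlib.h>

extern uint8_t heap_lo[], heap_hi[];  /* bounds of our managed region (illustrative) */
extern void    our_free(void* p);     /* our allocator's free (illustrative)         */

/* Only reclaim pointers we know we allocated; forward the rest to the
   original free, since they may predate the zone takeover. */
void checked_free(void* p) {
  if (p == NULL) return;
  if ((uint8_t*)p >= heap_lo && (uint8_t*)p < heap_hi) {
    our_free(p);
  }
  else {
    free(p);
  }
}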
@ -35,34 +41,42 @@ extern malloc_zone_t* malloc_default_purgeable_zone(void) __attribute__((weak_im
------------------------------------------------------ */
static size_t zone_size(malloc_zone_t* zone, const void* p) {
UNUSED(zone); UNUSED(p);
return 0; // as we cannot guarantee that `p` comes from us, just return 0
}
static void* zone_malloc(malloc_zone_t* zone, size_t size) {
UNUSED(zone);
return mi_malloc(size);
}
static void* zone_calloc(malloc_zone_t* zone, size_t count, size_t size) {
UNUSED(zone);
return mi_calloc(count, size);
}
static void* zone_valloc(malloc_zone_t* zone, size_t size) {
UNUSED(zone);
return mi_malloc_aligned(size, _mi_os_page_size());
}
static void zone_free(malloc_zone_t* zone, void* p) {
UNUSED(zone);
return mi_free(p);
}
static void* zone_realloc(malloc_zone_t* zone, void* p, size_t newsize) {
UNUSED(zone);
return mi_realloc(p, newsize);
}
static void* zone_memalign(malloc_zone_t* zone, size_t alignment, size_t size) {
UNUSED(zone);
return mi_malloc_aligned(size,alignment);
}
static void zone_destroy(malloc_zone_t* zone) {
UNUSED(zone);
// todo: ignore for now?
}
@ -83,11 +97,13 @@ static void zone_batch_free(malloc_zone_t* zone, void** ps, unsigned count) {
}
static size_t zone_pressure_relief(malloc_zone_t* zone, size_t size) {
UNUSED(zone); UNUSED(size);
mi_collect(false);
return 0;
}
static void zone_free_definite_size(malloc_zone_t* zone, void* p, size_t size) {
UNUSED(size);
zone_free(zone,p);
}
@ -102,34 +118,43 @@ static kern_return_t intro_enumerator(task_t task, void* p,
vm_range_recorder_t recorder)
{
// todo: enumerate all memory
UNUSED(task); UNUSED(p); UNUSED(type_mask); UNUSED(zone_address);
UNUSED(reader); UNUSED(recorder);
return KERN_SUCCESS;
}
static size_t intro_good_size(malloc_zone_t* zone, size_t size) {
UNUSED(zone);
return mi_good_size(size);
}
static boolean_t intro_check(malloc_zone_t* zone) {
UNUSED(zone);
return true;
}
static void intro_print(malloc_zone_t* zone, boolean_t verbose) {
UNUSED(zone); UNUSED(verbose);
mi_stats_print(NULL);
}
static void intro_log(malloc_zone_t* zone, void* p) {
UNUSED(zone); UNUSED(p);
// todo?
}
static void intro_force_lock(malloc_zone_t* zone) {
UNUSED(zone);
// todo?
}
static void intro_force_unlock(malloc_zone_t* zone) {
UNUSED(zone);
// todo?
}
static void intro_statistics(malloc_zone_t* zone, malloc_statistics_t* stats) {
UNUSED(zone);
// todo...
stats->blocks_in_use = 0;
stats->size_in_use = 0;
@ -138,6 +163,7 @@ static void intro_statistics(malloc_zone_t* zone, malloc_statistics_t* stats) {
}
static boolean_t intro_zone_locked(malloc_zone_t* zone) {
UNUSED(zone);
return false;
}
@ -161,7 +187,6 @@ static malloc_zone_t* mi_get_default_zone()
}
}
static void __attribute__((constructor)) _mi_macos_override_malloc()
{
static malloc_introspection_t intro;
@ -201,6 +226,7 @@ static void __attribute__((constructor)) _mi_macos_override_malloc()
zone.free_definite_size = &zone_free_definite_size;
zone.pressure_relief = &zone_pressure_relief;
intro.zone_locked = &intro_zone_locked;
intro.statistics = &intro_statistics;
// force the purgeable zone to exist to avoid strange bugs
if (malloc_default_purgeable_zone) {
@ -225,6 +251,7 @@ static void __attribute__((constructor)) _mi_macos_override_malloc()
malloc_zone_unregister(purgeable_zone);
malloc_zone_register(purgeable_zone);
}
}
#endif // MI_MALLOC_OVERRIDE


@ -13,7 +13,7 @@ terms of the MIT license. A copy of the license can be found in the file
#error "It is only possible to override "malloc" on Windows when building as a DLL (and linking the C runtime as a DLL)"
#endif
#if defined(MI_MALLOC_OVERRIDE) && !defined(_WIN32)
#if defined(MI_MALLOC_OVERRIDE) && !(defined(_WIN32)) // || (defined(__MACH__) && !defined(MI_INTERPOSE)))
// ------------------------------------------------------
// Override system malloc
@ -47,26 +47,31 @@ terms of the MIT license. A copy of the license can be found in the file
const void* replacement;
const void* target;
};
#define MI_INTERPOSEX(oldfun,newfun) { (const void*)&newfun, (const void*)&oldfun }
#define MI_INTERPOSE_MI(fun) MI_INTERPOSEX(fun,mi_##fun)
#define MI_INTERPOSE_FUN(oldfun,newfun) { (const void*)&newfun, (const void*)&oldfun }
#define MI_INTERPOSE_MI(fun) MI_INTERPOSE_FUN(fun,mi_##fun)
__attribute__((used)) static struct mi_interpose_s _mi_interposes[] __attribute__((section("__DATA, __interpose"))) =
{
MI_INTERPOSE_MI(malloc),
MI_INTERPOSE_MI(calloc),
MI_INTERPOSE_MI(realloc),
MI_INTERPOSE_MI(free),
MI_INTERPOSE_MI(strdup),
MI_INTERPOSE_MI(strndup)
MI_INTERPOSE_MI(strndup),
MI_INTERPOSE_MI(realpath),
MI_INTERPOSE_MI(posix_memalign),
MI_INTERPOSE_MI(reallocf),
MI_INTERPOSE_MI(valloc),
// some code allocates from a zone but deallocates using plain free :-( (like NxHashResizeToCapacity <https://github.com/nneonneo/osx-10.9-opensource/blob/master/objc4-551.1/runtime/hashtable2.mm>)
MI_INTERPOSE_FUN(free,mi_cfree), // use safe free that checks if pointers are from us
};
#elif defined(_MSC_VER)
// cannot override malloc unless using a dll.
// we just override new/delete which does work in a static library.
#else
// On all other systems forward to our API
void* malloc(size_t size) mi_attr_noexcept MI_FORWARD1(mi_malloc, size);
void* calloc(size_t size, size_t n) mi_attr_noexcept MI_FORWARD2(mi_calloc, size, n);
void* realloc(void* p, size_t newsize) mi_attr_noexcept MI_FORWARD2(mi_realloc, p, newsize);
void free(void* p) mi_attr_noexcept MI_FORWARD0(mi_free, p);
void* malloc(size_t size) MI_FORWARD1(mi_malloc, size);
void* calloc(size_t size, size_t n) MI_FORWARD2(mi_calloc, size, n);
void* realloc(void* p, size_t newsize) MI_FORWARD2(mi_realloc, p, newsize);
void free(void* p) MI_FORWARD0(mi_free, p);
#endif
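For context, `__DATA,__interpose` is dyld's standard interposition mechanism: every entry is a (replacement, target) pair that the dynamic loader rewires at load time, which is why the table above can grow entries like `MI_INTERPOSE_FUN(free, mi_cfree)` without a flat-namespace override. A minimal generic example of the same pattern, independent of mimalloc (references to `malloc` inside the interposing image itself are typically left untouched, so the wrapper does not recurse):

#include <stdlib.h>

/* Route malloc through a counting wrapper. Build as a dylib and load it
   with DYLD_INSERT_LIBRARIES. */
static size_t malloc_calls = 0;

static void* counting_malloc(size_t size) {
  malloc_calls++;
  return malloc(size);
}

__attribute__((used)) static struct { const void* replacement; const void* target; }
  interposers[] __attribute__((section("__DATA,__interpose"))) = {
  { (const void*)&counting_malloc, (const void*)&malloc },
};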
#if (defined(__GNUC__) || defined(__clang__)) && !defined(__MACH__)
@ -94,8 +99,8 @@ terms of the MIT license. A copy of the license can be found in the file
void* operator new[](std::size_t n, const std::nothrow_t& tag) noexcept { UNUSED(tag); return mi_new_nothrow(n); }
#if (__cplusplus >= 201402L || _MSC_VER >= 1916)
void operator delete (void* p, std::size_t n) MI_FORWARD02(mi_free_size,p,n);
void operator delete[](void* p, std::size_t n) MI_FORWARD02(mi_free_size,p,n);
void operator delete (void* p, std::size_t n) noexcept MI_FORWARD02(mi_free_size,p,n);
void operator delete[](void* p, std::size_t n) noexcept MI_FORWARD02(mi_free_size,p,n);
#endif
#if (__cplusplus > 201402L || defined(__cpp_aligned_new)) && (!defined(__GNUC__) || (__GNUC__ > 5))
@ -194,4 +199,3 @@ int posix_memalign(void** p, size_t alignment, size_t size) { return mi_posix_me
#endif
#endif // MI_MALLOC_OVERRIDE && !_WIN32


@ -9,7 +9,6 @@ terms of the MIT license. A copy of the license can be found in the file
// mi prefixed publi definitions of various Posix, Unix, and C++ functions
// for convenience and used when overriding these functions.
// ------------------------------------------------------------------------
#include "mimalloc.h"
#include "mimalloc-internal.h"
@ -47,33 +46,38 @@ int mi_posix_memalign(void** p, size_t alignment, size_t size) mi_attr_noexcept
// Note: The spec dictates we should not modify `*p` on an error. (issue#27)
// <http://man7.org/linux/man-pages/man3/posix_memalign.3.html>
if (p == NULL) return EINVAL;
if (alignment % sizeof(void*) != 0) return EINVAL; // natural alignment
if (alignment % sizeof(void*) != 0) return EINVAL; // natural alignment
if (!_mi_is_power_of_two(alignment)) return EINVAL; // not a power of 2
void* q = mi_malloc_aligned(size, alignment);
void* q = (mi_malloc_satisfies_alignment(alignment, size) ? mi_malloc(size) : mi_malloc_aligned(size, alignment));
if (q==NULL && size != 0) return ENOMEM;
mi_assert_internal(((uintptr_t)q % alignment) == 0);
*p = q;
return 0;
}
void* mi_memalign(size_t alignment, size_t size) mi_attr_noexcept {
return mi_malloc_aligned(size, alignment);
mi_decl_restrict void* mi_memalign(size_t alignment, size_t size) mi_attr_noexcept {
void* p = (mi_malloc_satisfies_alignment(alignment,size) ? mi_malloc(size) : mi_malloc_aligned(size, alignment));
mi_assert_internal(((uintptr_t)p % alignment) == 0);
return p;
}
void* mi_valloc(size_t size) mi_attr_noexcept {
return mi_malloc_aligned(size, _mi_os_page_size());
mi_decl_restrict void* mi_valloc(size_t size) mi_attr_noexcept {
return mi_memalign( _mi_os_page_size(), size );
}
void* mi_pvalloc(size_t size) mi_attr_noexcept {
mi_decl_restrict void* mi_pvalloc(size_t size) mi_attr_noexcept {
size_t psize = _mi_os_page_size();
if (size >= SIZE_MAX - psize) return NULL; // overflow
size_t asize = ((size + psize - 1) / psize) * psize;
size_t asize = _mi_align_up(size, psize);
return mi_malloc_aligned(asize, psize);
}
void* mi_aligned_alloc(size_t alignment, size_t size) mi_attr_noexcept {
mi_decl_restrict void* mi_aligned_alloc(size_t alignment, size_t size) mi_attr_noexcept {
if (alignment==0 || !_mi_is_power_of_two(alignment)) return NULL;
if ((size&(alignment-1)) != 0) return NULL; // C11 requires integral multiple, see <https://en.cppreference.com/w/c/memory/aligned_alloc>
return mi_malloc_aligned(size, alignment);
void* p = (mi_malloc_satisfies_alignment(alignment, size) ? mi_malloc(size) : mi_malloc_aligned(size, alignment));
mi_assert_internal(((uintptr_t)p % alignment) == 0);
return p;
}
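Two small things about the rewritten POSIX wrappers above: `_mi_align_up(size, psize)` replaces the spelled-out `((size + psize - 1) / psize) * psize` round-up, and `mi_malloc_satisfies_alignment` lets `posix_memalign`/`memalign`/`aligned_alloc` skip the aligned-allocation path whenever a plain `mi_malloc` already guarantees the requested alignment. A hedged sketch of both helpers (the exact predicate mimalloc uses may differ, e.g. it may also depend on the size):

#include <stdbool.h>
#include <stddef.h>

#define MAX_ALIGN_GUARANTEE 16  /* illustrative: natural alignment a plain malloc provides */

/* Round `size` up to a multiple of the power-of-two `alignment`. */
static size_t align_up(size_t size, size_t alignment) {
  return (size + alignment - 1) & ~(alignment - 1);
}

/* True when a plain allocation already satisfies the requested alignment,
   so the aligned slow path can be skipped. Illustrative only. */
static bool malloc_satisfies_alignment(size_t alignment, size_t size) {
  (void)size;  /* mimalloc's real check may also consider the size */
  return (alignment != 0 && alignment <= MAX_ALIGN_GUARANTEE);
}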
void* mi_reallocarray( void* p, size_t count, size_t size ) mi_attr_noexcept { // BSD
@ -88,7 +92,7 @@ void* mi__expand(void* p, size_t newsize) mi_attr_noexcept { // Microsoft
return res;
}
unsigned short* mi_wcsdup(const unsigned short* s) mi_attr_noexcept {
mi_decl_restrict unsigned short* mi_wcsdup(const unsigned short* s) mi_attr_noexcept {
if (s==NULL) return NULL;
size_t len;
for(len = 0; s[len] != 0; len++) { }
@ -100,7 +104,7 @@ unsigned short* mi_wcsdup(const unsigned short* s) mi_attr_noexcept {
return p;
}
unsigned char* mi_mbsdup(const unsigned char* s) mi_attr_noexcept {
mi_decl_restrict unsigned char* mi_mbsdup(const unsigned char* s) mi_attr_noexcept {
return (unsigned char*)mi_strdup((const char*)s);
}


@ -21,92 +21,120 @@ terms of the MIT license. A copy of the license can be found in the file
// Fast allocation in a page: just pop from the free list.
// Fall back to generic allocation only if the list is empty.
extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept {
extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept {
mi_assert_internal(page->xblock_size==0||mi_page_block_size(page) >= size);
mi_block_t* block = page->free;
if (mi_unlikely(block == NULL)) {
return _mi_malloc_generic(heap, size); // slow path
return _mi_malloc_generic(heap, size);
}
mi_assert_internal(block != NULL && _mi_ptr_page(block) == page);
// pop from the free list
page->free = mi_block_next(page,block);
page->free = mi_block_next(page, block);
page->used++;
mi_assert_internal(page->free == NULL || _mi_ptr_page(page->free) == page);
#if (MI_DEBUG!=0)
#if (MI_DEBUG>0)
if (!page->is_zero) { memset(block, MI_DEBUG_UNINIT, size); }
#elif (MI_SECURE!=0)
block->next = 0; // don't leak internal data
#endif
#if (MI_STAT>1)
if(size <= MI_LARGE_OBJ_SIZE_MAX) {
size_t bin = _mi_bin(size);
mi_heap_stat_increase(heap,normal[bin], 1);
const size_t bsize = mi_page_usable_block_size(page);
if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
const size_t bin = _mi_bin(bsize);
mi_heap_stat_increase(heap, normal[bin], 1);
}
#endif
#if defined(MI_PADDING) && defined(MI_ENCODE_FREELIST)
mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + mi_page_usable_block_size(page));
ptrdiff_t delta = ((uint8_t*)padding - (uint8_t*)block - (size - MI_PADDING_SIZE));
mi_assert_internal(delta >= 0 && mi_page_usable_block_size(page) >= (size - MI_PADDING_SIZE + delta));
padding->canary = (uint32_t)(mi_ptr_encode(page,block,page->keys));
padding->delta = (uint32_t)(delta);
uint8_t* fill = (uint8_t*)padding - delta;
const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? MI_MAX_ALIGN_SIZE : delta); // set at most N initial padding bytes
for (size_t i = 0; i < maxpad; i++) { fill[i] = MI_DEBUG_PADDING; }
#endif
return block;
}
// allocate a small block
extern inline mi_decl_allocator void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept {
mi_assert(size <= MI_SMALL_SIZE_MAX);
mi_page_t* page = _mi_heap_get_free_small_page(heap,size);
return _mi_page_malloc(heap, page, size);
}
extern inline mi_decl_allocator void* mi_malloc_small(size_t size) mi_attr_noexcept {
return mi_heap_malloc_small(mi_get_default_heap(), size);
}
// zero initialized small block
mi_decl_allocator void* mi_zalloc_small(size_t size) mi_attr_noexcept {
void* p = mi_malloc_small(size);
if (p != NULL) { memset(p, 0, size); }
return p;
}
// The main allocation function
extern inline mi_decl_allocator void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept {
extern inline mi_decl_restrict void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept {
mi_assert(heap!=NULL);
mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local
void* p;
if (mi_likely(size <= MI_SMALL_SIZE_MAX)) {
p = mi_heap_malloc_small(heap, size);
}
else {
p = _mi_malloc_generic(heap, size);
mi_assert(size <= MI_SMALL_SIZE_MAX);
#if (MI_PADDING)
if (size == 0) {
size = sizeof(void*);
}
#endif
mi_page_t* page = _mi_heap_get_free_small_page(heap,size + MI_PADDING_SIZE);
void* p = _mi_page_malloc(heap, page, size + MI_PADDING_SIZE);
mi_assert_internal(p==NULL || mi_usable_size(p) >= size);
#if MI_STAT>1
if (p != NULL) {
if (!mi_heap_is_initialized(heap)) { heap = mi_get_default_heap(); }
mi_heap_stat_increase( heap, malloc, mi_good_size(size) ); // overestimate for aligned sizes
mi_heap_stat_increase(heap, malloc, mi_usable_size(p));
}
#endif
return p;
}
extern inline mi_decl_allocator void* mi_malloc(size_t size) mi_attr_noexcept {
extern inline mi_decl_restrict void* mi_malloc_small(size_t size) mi_attr_noexcept {
return mi_heap_malloc_small(mi_get_default_heap(), size);
}
// The main allocation function
extern inline mi_decl_restrict void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept {
if (mi_likely(size <= MI_SMALL_SIZE_MAX)) {
return mi_heap_malloc_small(heap, size);
}
else {
mi_assert(heap!=NULL);
mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local
void* const p = _mi_malloc_generic(heap, size + MI_PADDING_SIZE); // note: size can overflow but it is detected in malloc_generic
mi_assert_internal(p == NULL || mi_usable_size(p) >= size);
#if MI_STAT>1
if (p != NULL) {
if (!mi_heap_is_initialized(heap)) { heap = mi_get_default_heap(); }
mi_heap_stat_increase(heap, malloc, mi_usable_size(p));
}
#endif
return p;
}
}
extern inline mi_decl_restrict void* mi_malloc(size_t size) mi_attr_noexcept {
return mi_heap_malloc(mi_get_default_heap(), size);
}
void _mi_block_zero_init(const mi_page_t* page, void* p, size_t size) {
// note: we need to initialize the whole block to zero, not just size
// note: we need to initialize the whole usable block size to zero, not just the requested size,
// or the recalloc/rezalloc functions cannot safely expand in place (see issue #63)
UNUSED_RELEASE(size);
UNUSED(size);
mi_assert_internal(p != NULL);
mi_assert_internal(mi_page_block_size(page) >= size); // size can be zero
mi_assert_internal(mi_usable_size(p) >= size); // size can be zero
mi_assert_internal(_mi_ptr_page(p)==page);
if (page->is_zero) {
// already zero initialized memory?
if (page->is_zero && size > sizeof(mi_block_t)) {
// already zero initialized memory
((mi_block_t*)p)->next = 0; // clear the free list pointer
mi_assert_expensive(mi_mem_is_zero(p, mi_page_block_size(page)));
mi_assert_expensive(mi_mem_is_zero(p, mi_usable_size(p)));
}
else {
// otherwise memset
memset(p, 0, mi_page_block_size(page));
memset(p, 0, mi_usable_size(p));
}
}
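The issue #63 note above is easiest to see from the caller's side: `mi_rezalloc` may satisfy a grow request in place when the usable block is already large enough, and it then relies on the tail of the block already being zero. A small usage sketch with the public API that appears elsewhere in this diff (assuming the allocations succeed):

#include <mimalloc.h>
#include <assert.h>
#include <stddef.h>

int main(void) {
  char* p = (char*)mi_zalloc(10);      /* ask for 10 zeroed bytes     */
  assert(p != NULL);
  size_t usable = mi_usable_size(p);   /* the block is usually larger */

  /* Growing up to the full usable size can be done in place; bytes 10..usable-1
     must then already be zero, which only holds because _mi_block_zero_init
     clears the whole usable block rather than just the 10 requested bytes. */
  char* q = (char*)mi_rezalloc(p, usable);
  assert(q != NULL);
  for (size_t i = 0; i < usable; i++) { assert(q[i] == 0); }

  mi_free(q);
  return 0;
}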
// zero initialized small block
mi_decl_restrict void* mi_zalloc_small(size_t size) mi_attr_noexcept {
void* p = mi_malloc_small(size);
if (p != NULL) {
_mi_block_zero_init(_mi_ptr_page(p), p, size); // todo: can we avoid getting the page again?
}
return p;
}
void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) {
void* p = mi_heap_malloc(heap,size);
if (zero && p != NULL) {
@ -115,11 +143,11 @@ void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) {
return p;
}
extern inline mi_decl_allocator void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept {
extern inline mi_decl_restrict void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept {
return _mi_heap_malloc_zero(heap, size, true);
}
mi_decl_allocator void* mi_zalloc(size_t size) mi_attr_noexcept {
mi_decl_restrict void* mi_zalloc(size_t size) mi_attr_noexcept {
return mi_heap_zalloc(mi_get_default_heap(),size);
}
@ -153,7 +181,7 @@ static mi_decl_noinline bool mi_check_is_double_freex(const mi_page_t* page, con
}
static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) {
mi_block_t* n = mi_block_nextx(page, block, page->key[0], page->key[1]); // pretend it is freed, and get the decoded first field
mi_block_t* n = mi_block_nextx(page, block, page->keys); // pretend it is freed, and get the decoded first field
if (((uintptr_t)n & (MI_INTPTR_SIZE-1))==0 && // quick check: aligned pointer?
(n==NULL || mi_is_in_same_page(block, n))) // quick check: in same page or NULL?
{
@ -171,6 +199,88 @@ static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block
}
#endif
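Both the double-free check above and the padding canary below lean on the `keys` arrays that this commit renames from `key[0]`/`key[1]`: free-list next pointers are stored in an encoded form derived from per-page (or per-heap) secrets, so an overwritten or forged field decodes to garbage instead of a usable pointer. A generic sketch of the idea only; mimalloc's actual encoding also mixes in a rotation and two keys:

#include <stdint.h>

/* Encode/decode a link pointer with a secret key. A corrupted encoded value
   decodes to an implausible pointer, which checks like the "aligned pointer?"
   and "same page?" tests above can then reject. */
static inline uintptr_t ptr_encode(uintptr_t next, uintptr_t key) {
  return next ^ key;
}
static inline uintptr_t ptr_decode(uintptr_t enc, uintptr_t key) {
  return enc ^ key;
}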
// ---------------------------------------------------------------------------
// Check for heap block overflow by setting up padding at the end of the block
// ---------------------------------------------------------------------------
#if defined(MI_PADDING) && defined(MI_ENCODE_FREELIST)
static bool mi_page_decode_padding(const mi_page_t* page, const mi_block_t* block, size_t* delta, size_t* bsize) {
*bsize = mi_page_usable_block_size(page);
const mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + *bsize);
*delta = padding->delta;
return ((uint32_t)mi_ptr_encode(page,block,page->keys) == padding->canary && *delta <= *bsize);
}
// Return the exact usable size of a block.
static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) {
size_t bsize;
size_t delta;
bool ok = mi_page_decode_padding(page, block, &delta, &bsize);
mi_assert_internal(ok); mi_assert_internal(delta <= bsize);
return (ok ? bsize - delta : 0);
}
static bool mi_verify_padding(const mi_page_t* page, const mi_block_t* block, size_t* size, size_t* wrong) {
size_t bsize;
size_t delta;
bool ok = mi_page_decode_padding(page, block, &delta, &bsize);
*size = *wrong = bsize;
if (!ok) return false;
mi_assert_internal(bsize >= delta);
*size = bsize - delta;
uint8_t* fill = (uint8_t*)block + bsize - delta;
const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? MI_MAX_ALIGN_SIZE : delta); // check at most the first N padding bytes
for (size_t i = 0; i < maxpad; i++) {
if (fill[i] != MI_DEBUG_PADDING) {
*wrong = bsize - delta + i;
return false;
}
}
return true;
}
static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) {
size_t size;
size_t wrong;
if (!mi_verify_padding(page,block,&size,&wrong)) {
_mi_error_message(EFAULT, "buffer overflow in heap block %p of size %zu: write after %zu bytes\n", block, size, wrong );
}
}
// When a non-thread-local block is freed, it becomes part of the thread delayed free
// list that is freed later by the owning heap. If the exact usable size is too small to
// contain the pointer for the delayed list, then shrink the padding (by decreasing delta)
// so it will later not trigger an overflow error in `mi_free_block`.
static void mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) {
size_t bsize;
size_t delta;
bool ok = mi_page_decode_padding(page, block, &delta, &bsize);
mi_assert_internal(ok);
if (!ok || (bsize - delta) >= min_size) return; // usually already enough space
mi_assert_internal(bsize >= min_size);
if (bsize < min_size) return; // should never happen
size_t new_delta = (bsize - min_size);
mi_assert_internal(new_delta < bsize);
mi_padding_t* padding = (mi_padding_t*)((uint8_t*)block + bsize);
padding->delta = (uint32_t)new_delta;
}
#else
static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) {
UNUSED(page);
UNUSED(block);
}
static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) {
UNUSED(block);
return mi_page_usable_block_size(page);
}
static void mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) {
UNUSED(page);
UNUSED(block);
UNUSED(min_size);
}
#endif
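The padding scheme added in this hunk is easiest to picture as a small trailer behind the usable block: `_mi_page_malloc` writes a canary (an encoded page/block value) plus a `delta` recording how many of the usable bytes the caller did not actually request, fills at most `MI_MAX_ALIGN_SIZE` of the slack bytes with `MI_DEBUG_PADDING`, and `mi_free` later re-checks both. A sketch of the layout and of how the exact requested size is recovered (struct packing is illustrative; field names follow the diff):

#include <stdint.h>
#include <stddef.h>

/* Trailer written just past the usable block. */
typedef struct padding_s {
  uint32_t canary;  /* encoded (page,block) value, validated on free   */
  uint32_t delta;   /* bsize - requested size, i.e. unused tail bytes  */
} padding_t;

/* Layout (bsize = usable block size, trailer excluded):
     [ requested bytes ............ ][ delta slack bytes ][ canary | delta ]
     |<--------------------- bsize --------------------->|                  */

/* Recover the exact requested size on free; assumes the canary was already
   validated (mi_page_decode_padding does both steps in the diff). */
static size_t exact_usable_size(const uint8_t* block, size_t bsize) {
  const padding_t* pad = (const padding_t*)(block + bsize);
  return bsize - pad->delta;
}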
// ------------------------------------------------------
// Free
@ -208,6 +318,14 @@ static mi_decl_noinline void mi_free_huge_block_mt(mi_segment_t* segment, mi_pag
// multi-threaded free
static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* block)
{
// The padding check may access the non-thread-owned page for the key values.
// that is safe as these are constant and the page won't be freed (as the block is not freed yet).
mi_check_padding(page, block);
mi_padding_shrink(page, block, sizeof(mi_block_t)); // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection
#if (MI_DEBUG!=0)
memset(block, MI_DEBUG_FREED, mi_usable_size(block));
#endif
// huge page segments are always abandoned and can be freed immediately
mi_segment_t* segment = _mi_page_segment(page);
if (segment->kind==MI_SEGMENT_HUGE) {
@ -215,6 +333,7 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc
return;
}
// Try to put the block on either the page-local thread free list, or the heap delayed free list.
mi_thread_free_t tfree;
mi_thread_free_t tfreex;
bool use_delayed;
@ -234,14 +353,14 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc
if (mi_unlikely(use_delayed)) {
// racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`)
mi_heap_t* heap = mi_page_heap(page);
mi_heap_t* const heap = mi_page_heap(page);
mi_assert_internal(heap != NULL);
if (heap != NULL) {
// add to the delayed free list of this heap. (do this atomically as the lock only protects heap memory validity)
mi_block_t* dfree;
do {
dfree = mi_atomic_read_ptr_relaxed(mi_block_t,&heap->thread_delayed_free);
mi_block_set_nextx(heap,block,dfree, heap->key[0], heap->key[1]);
mi_block_set_nextx(heap,block,dfree, heap->keys);
} while (!mi_atomic_cas_ptr_weak(mi_block_t,&heap->thread_delayed_free, block, dfree));
}
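The delayed-free insertion above is the classic lock-free push onto an intrusive singly linked list: read the head, link the new block in front of it, and retry the compare-and-swap until no other thread raced in between. The same pattern in plain C11 atomics, stripped of mimalloc's encoded next pointers (a sketch, not the library's code):

#include <stdatomic.h>
#include <stddef.h>

typedef struct node_s {
  struct node_s* next;
} node_t;

/* Push `n` onto the list headed by `*head` without taking a lock. */
static void lockfree_push(_Atomic(node_t*)* head, node_t* n) {
  node_t* old = atomic_load_explicit(head, memory_order_relaxed);
  do {
    n->next = old;  /* link in front of the current head */
  } while (!atomic_compare_exchange_weak_explicit(
               head, &old, n, memory_order_release, memory_order_relaxed));
  /* on failure the CAS reloads `old`, so the loop simply retries */
}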
@ -258,14 +377,14 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc
// regular free
static inline void _mi_free_block(mi_page_t* page, bool local, mi_block_t* block)
{
#if (MI_DEBUG)
memset(block, MI_DEBUG_FREED, mi_page_block_size(page));
#endif
// and push it on the free list
if (mi_likely(local)) {
// owning thread can free a block directly
if (mi_unlikely(mi_check_is_double_free(page, block))) return;
mi_check_padding(page, block);
#if (MI_DEBUG!=0)
memset(block, MI_DEBUG_FREED, mi_page_block_size(page));
#endif
mi_block_set_next(page, block, page->local_free);
page->local_free = block;
page->used--;
@ -285,15 +404,15 @@ static inline void _mi_free_block(mi_page_t* page, bool local, mi_block_t* block
// Adjust a block that was allocated aligned, to the actual start of the block in the page.
mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p) {
mi_assert_internal(page!=NULL && p!=NULL);
size_t diff = (uint8_t*)p - _mi_page_start(segment, page, NULL);
size_t adjust = (diff % mi_page_block_size(page));
const size_t diff = (uint8_t*)p - _mi_page_start(segment, page, NULL);
const size_t adjust = (diff % mi_page_block_size(page));
return (mi_block_t*)((uintptr_t)p - adjust);
}
static void mi_decl_noinline mi_free_generic(const mi_segment_t* segment, bool local, void* p) {
mi_page_t* page = _mi_segment_page_of(segment, p);
mi_block_t* block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(segment, page, p) : (mi_block_t*)p);
mi_page_t* const page = _mi_segment_page_of(segment, p);
mi_block_t* const block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(segment, page, p) : (mi_block_t*)p);
_mi_free_block(page, local, block);
}
@ -316,7 +435,7 @@ void mi_free(void* p) mi_attr_noexcept
"(this may still be a valid very large allocation (over 64MiB))\n", p);
if (mi_likely(_mi_ptr_cookie(segment) == segment->cookie)) {
_mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p);
}
}
}
#endif
#if (MI_DEBUG!=0 || MI_SECURE>=4)
@ -328,20 +447,24 @@ void mi_free(void* p) mi_attr_noexcept
const uintptr_t tid = _mi_thread_id();
mi_page_t* const page = _mi_segment_page_of(segment, p);
#if (MI_STAT>1)
mi_heap_t* heap = mi_heap_get_default();
mi_heap_stat_decrease(heap, malloc, mi_usable_size(p));
if (page->xblock_size <= MI_LARGE_OBJ_SIZE_MAX) {
mi_heap_stat_decrease(heap, normal[_mi_bin(page->xblock_size)], 1);
mi_heap_t* const heap = mi_heap_get_default();
const size_t bsize = mi_page_usable_block_size(page);
mi_heap_stat_decrease(heap, malloc, bsize);
if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { // huge page stats are accounted for in `_mi_page_retire`
mi_heap_stat_decrease(heap, normal[_mi_bin(bsize)], 1);
}
// huge page stat is accounted for in `_mi_page_retire`
#endif
if (mi_likely(tid == segment->thread_id && page->flags.full_aligned == 0)) { // the thread id matches and it is not a full page, nor has aligned blocks
// local, and not full or aligned
mi_block_t* const block = (mi_block_t*)p;
mi_block_t* block = (mi_block_t*)(p);
if (mi_unlikely(mi_check_is_double_free(page,block))) return;
mi_check_padding(page, block);
#if (MI_DEBUG!=0)
memset(block, MI_DEBUG_FREED, mi_page_block_size(page));
#endif
mi_block_set_next(page, block, page->local_free);
page->local_free = block;
page->used--;
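The fast path above hinges on the single test `page->flags.full_aligned == 0`: the "page is full" and "page has aligned blocks" bits sit side by side so one compare rules out both slow cases at once, next to the owning-thread check. A sketch of that flag packing (field names are illustrative; mimalloc uses a small union for this):

#include <stdbool.h>
#include <stdint.h>

/* Two rarely-set flags packed into one byte so the hot free path can test
   both with a single comparison against zero. */
typedef union page_flags_s {
  uint8_t full_aligned;        /* == 0 iff neither flag below is set              */
  struct {
    uint8_t in_full     : 1;   /* page sits on the "full" queue                   */
    uint8_t has_aligned : 1;   /* page contains offset-adjusted (aligned) blocks  */
  } x;
} page_flags_t;

static bool take_fast_free_path(page_flags_t flags, uintptr_t page_tid, uintptr_t tid) {
  return (tid == page_tid) && (flags.full_aligned == 0);
}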
@ -358,10 +481,10 @@ void mi_free(void* p) mi_attr_noexcept
bool _mi_free_delayed_block(mi_block_t* block) {
// get segment and page
const mi_segment_t* segment = _mi_ptr_segment(block);
const mi_segment_t* const segment = _mi_ptr_segment(block);
mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie);
mi_assert_internal(_mi_thread_id() == segment->thread_id);
mi_page_t* page = _mi_segment_page_of(segment, block);
mi_page_t* const page = _mi_segment_page_of(segment, block);
// Clear the no-delayed flag so delayed freeing is used again for this page.
// This must be done before collecting the free lists on this page -- otherwise
@ -381,11 +504,12 @@ bool _mi_free_delayed_block(mi_block_t* block) {
// Bytes available in a block
size_t mi_usable_size(const void* p) mi_attr_noexcept {
if (p==NULL) return 0;
const mi_segment_t* segment = _mi_ptr_segment(p);
const mi_page_t* page = _mi_segment_page_of(segment,p);
size_t size = mi_page_block_size(page);
const mi_segment_t* const segment = _mi_ptr_segment(p);
const mi_page_t* const page = _mi_segment_page_of(segment, p);
const mi_block_t* const block = (const mi_block_t*)p;
const size_t size = mi_page_usable_size_of(page, block);
if (mi_unlikely(mi_page_has_aligned(page))) {
ptrdiff_t adjust = (uint8_t*)p - (uint8_t*)_mi_page_ptr_unalign(segment,page,p);
ptrdiff_t const adjust = (uint8_t*)p - (uint8_t*)_mi_page_ptr_unalign(segment,page,p);
mi_assert_internal(adjust >= 0 && (size_t)adjust <= size);
return (size - adjust);
}
@ -433,29 +557,29 @@ void mi_free_aligned(void* p, size_t alignment) mi_attr_noexcept {
mi_free(p);
}
extern inline mi_decl_allocator void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept {
extern inline mi_decl_restrict void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept {
size_t total;
if (mi_count_size_overflow(count,size,&total)) return NULL;
return mi_heap_zalloc(heap,total);
}
mi_decl_allocator void* mi_calloc(size_t count, size_t size) mi_attr_noexcept {
mi_decl_restrict void* mi_calloc(size_t count, size_t size) mi_attr_noexcept {
return mi_heap_calloc(mi_get_default_heap(),count,size);
}
// Uninitialized `calloc`
extern mi_decl_allocator void* mi_heap_mallocn(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept {
extern mi_decl_restrict void* mi_heap_mallocn(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept {
size_t total;
if (mi_count_size_overflow(count, size, &total)) return NULL;
return mi_heap_malloc(heap, total);
}
mi_decl_allocator void* mi_mallocn(size_t count, size_t size) mi_attr_noexcept {
mi_decl_restrict void* mi_mallocn(size_t count, size_t size) mi_attr_noexcept {
return mi_heap_mallocn(mi_get_default_heap(),count,size);
}
// Expand in place or fail
mi_decl_allocator void* mi_expand(void* p, size_t newsize) mi_attr_noexcept {
void* mi_expand(void* p, size_t newsize) mi_attr_noexcept {
if (p == NULL) return NULL;
size_t size = mi_usable_size(p);
if (newsize > size) return NULL;
@ -481,11 +605,11 @@ void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero)
return newp;
}
mi_decl_allocator void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept {
void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept {
return _mi_heap_realloc_zero(heap, p, newsize, false);
}
mi_decl_allocator void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept {
void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept {
size_t total;
if (mi_count_size_overflow(count, size, &total)) return NULL;
return mi_heap_realloc(heap, p, total);
@ -493,41 +617,41 @@ mi_decl_allocator void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count,
// Reallocate but free `p` on errors
mi_decl_allocator void* mi_heap_reallocf(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept {
void* mi_heap_reallocf(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept {
void* newp = mi_heap_realloc(heap, p, newsize);
if (newp==NULL && p!=NULL) mi_free(p);
return newp;
}
mi_decl_allocator void* mi_heap_rezalloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept {
void* mi_heap_rezalloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept {
return _mi_heap_realloc_zero(heap, p, newsize, true);
}
mi_decl_allocator void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept {
void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept {
size_t total;
if (mi_count_size_overflow(count, size, &total)) return NULL;
return mi_heap_rezalloc(heap, p, total);
}
mi_decl_allocator void* mi_realloc(void* p, size_t newsize) mi_attr_noexcept {
void* mi_realloc(void* p, size_t newsize) mi_attr_noexcept {
return mi_heap_realloc(mi_get_default_heap(),p,newsize);
}
mi_decl_allocator void* mi_reallocn(void* p, size_t count, size_t size) mi_attr_noexcept {
void* mi_reallocn(void* p, size_t count, size_t size) mi_attr_noexcept {
return mi_heap_reallocn(mi_get_default_heap(),p,count,size);
}
// Reallocate but free `p` on errors
mi_decl_allocator void* mi_reallocf(void* p, size_t newsize) mi_attr_noexcept {
void* mi_reallocf(void* p, size_t newsize) mi_attr_noexcept {
return mi_heap_reallocf(mi_get_default_heap(),p,newsize);
}
mi_decl_allocator void* mi_rezalloc(void* p, size_t newsize) mi_attr_noexcept {
void* mi_rezalloc(void* p, size_t newsize) mi_attr_noexcept {
return mi_heap_rezalloc(mi_get_default_heap(), p, newsize);
}
mi_decl_allocator void* mi_recalloc(void* p, size_t count, size_t size) mi_attr_noexcept {
void* mi_recalloc(void* p, size_t count, size_t size) mi_attr_noexcept {
return mi_heap_recalloc(mi_get_default_heap(), p, count, size);
}
@ -538,7 +662,7 @@ mi_decl_allocator void* mi_recalloc(void* p, size_t count, size_t size) mi_attr_
// ------------------------------------------------------
// `strdup` using mi_malloc
char* mi_heap_strdup(mi_heap_t* heap, const char* s) mi_attr_noexcept {
mi_decl_restrict char* mi_heap_strdup(mi_heap_t* heap, const char* s) mi_attr_noexcept {
if (s == NULL) return NULL;
size_t n = strlen(s);
char* t = (char*)mi_heap_malloc(heap,n+1);
@ -546,12 +670,12 @@ char* mi_heap_strdup(mi_heap_t* heap, const char* s) mi_attr_noexcept {
return t;
}
char* mi_strdup(const char* s) mi_attr_noexcept {
mi_decl_restrict char* mi_strdup(const char* s) mi_attr_noexcept {
return mi_heap_strdup(mi_get_default_heap(), s);
}
// `strndup` using mi_malloc
char* mi_heap_strndup(mi_heap_t* heap, const char* s, size_t n) mi_attr_noexcept {
mi_decl_restrict char* mi_heap_strndup(mi_heap_t* heap, const char* s, size_t n) mi_attr_noexcept {
if (s == NULL) return NULL;
size_t m = strlen(s);
if (n > m) n = m;
@ -562,7 +686,7 @@ char* mi_heap_strndup(mi_heap_t* heap, const char* s, size_t n) mi_attr_noexcept
return t;
}
char* mi_strndup(const char* s, size_t n) mi_attr_noexcept {
mi_decl_restrict char* mi_strndup(const char* s, size_t n) mi_attr_noexcept {
return mi_heap_strndup(mi_get_default_heap(),s,n);
}
@ -573,7 +697,7 @@ char* mi_strndup(const char* s, size_t n) mi_attr_noexcept {
#define PATH_MAX MAX_PATH
#endif
#include <windows.h>
char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept {
mi_decl_restrict char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept {
// todo: use GetFullPathNameW to allow longer file names
char buf[PATH_MAX];
DWORD res = GetFullPathNameA(fname, PATH_MAX, (resolved_name == NULL ? buf : resolved_name), NULL);
@ -619,7 +743,7 @@ char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name)
}
#endif
char* mi_realpath(const char* fname, char* resolved_name) mi_attr_noexcept {
mi_decl_restrict char* mi_realpath(const char* fname, char* resolved_name) mi_attr_noexcept {
return mi_heap_realpath(mi_get_default_heap(),fname,resolved_name);
}
#endif
@ -684,19 +808,19 @@ static mi_decl_noinline void* mi_try_new(size_t size, bool nothrow ) {
return p;
}
void* mi_new(size_t size) {
mi_decl_restrict void* mi_new(size_t size) {
void* p = mi_malloc(size);
if (mi_unlikely(p == NULL)) return mi_try_new(size,false);
return p;
}
void* mi_new_nothrow(size_t size) {
mi_decl_restrict void* mi_new_nothrow(size_t size) mi_attr_noexcept {
void* p = mi_malloc(size);
if (mi_unlikely(p == NULL)) return mi_try_new(size, true);
return p;
}
void* mi_new_aligned(size_t size, size_t alignment) {
mi_decl_restrict void* mi_new_aligned(size_t size, size_t alignment) {
void* p;
do {
p = mi_malloc_aligned(size, alignment);
@ -705,7 +829,7 @@ void* mi_new_aligned(size_t size, size_t alignment) {
return p;
}
void* mi_new_aligned_nothrow(size_t size, size_t alignment) {
mi_decl_restrict void* mi_new_aligned_nothrow(size_t size, size_t alignment) mi_attr_noexcept {
void* p;
do {
p = mi_malloc_aligned(size, alignment);
@ -714,7 +838,7 @@ void* mi_new_aligned_nothrow(size_t size, size_t alignment) {
return p;
}
void* mi_new_n(size_t count, size_t size) {
mi_decl_restrict void* mi_new_n(size_t count, size_t size) {
size_t total;
if (mi_unlikely(mi_count_size_overflow(count, size, &total))) {
mi_try_new_handler(false); // on overflow we invoke the try_new_handler once to potentially throw std::bad_alloc


@ -448,7 +448,7 @@ int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msec
_mi_warning_message("failed to reserve %zu gb huge pages\n", pages);
return ENOMEM;
}
_mi_verbose_message("reserved %zu gb huge pages on numa node %i (of the %zu gb requested)\n", pages_reserved, numa_node, pages);
_mi_verbose_message("numa node %i: reserved %zu gb huge pages (of the %zu gb requested)\n", numa_node, pages_reserved, pages);
size_t bcount = mi_block_count_of_size(hsize);
size_t fields = _mi_divide_up(bcount, MI_BITMAP_FIELD_BITS);


@ -138,6 +138,9 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect)
// (if abandoning, after this there are no more thread-delayed references into the pages.)
_mi_heap_delayed_free(heap);
// collect retired pages
_mi_heap_collect_retired(heap, collect >= MI_FORCE);
// collect all pages owned by this thread
mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL);
mi_assert_internal( collect != MI_ABANDON || mi_atomic_read_ptr(mi_block_t,&heap->thread_delayed_free) == NULL );
@ -188,16 +191,19 @@ mi_heap_t* mi_heap_get_backing(void) {
mi_heap_t* mi_heap_new(void) {
mi_heap_t* bheap = mi_heap_get_backing();
mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t);
mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t); // todo: OS allocate in secure mode?
if (heap==NULL) return NULL;
memcpy(heap, &_mi_heap_empty, sizeof(mi_heap_t));
heap->tld = bheap->tld;
heap->thread_id = _mi_thread_id();
_mi_random_split(&bheap->random, &heap->random);
heap->cookie = _mi_heap_random_next(heap) | 1;
heap->key[0] = _mi_heap_random_next(heap);
heap->key[1] = _mi_heap_random_next(heap);
heap->cookie = _mi_heap_random_next(heap) | 1;
heap->keys[0] = _mi_heap_random_next(heap);
heap->keys[1] = _mi_heap_random_next(heap);
heap->no_reclaim = true; // don't reclaim abandoned pages or otherwise destroy is unsafe
// push on the thread local heaps list
heap->next = heap->tld->heaps;
heap->tld->heaps = heap;
return heap;
}
@ -220,6 +226,7 @@ static void mi_heap_reset_pages(mi_heap_t* heap) {
// called from `mi_heap_destroy` and `mi_heap_delete` to free the internal heap resources.
static void mi_heap_free(mi_heap_t* heap) {
mi_assert(heap != NULL);
mi_assert_internal(mi_heap_is_initialized(heap));
if (mi_heap_is_backing(heap)) return; // dont free the backing heap
@ -227,6 +234,22 @@ static void mi_heap_free(mi_heap_t* heap) {
if (mi_heap_is_default(heap)) {
_mi_heap_set_default_direct(heap->tld->heap_backing);
}
// remove ourselves from the thread local heaps list
// linear search but we expect the number of heaps to be relatively small
mi_heap_t* prev = NULL;
mi_heap_t* curr = heap->tld->heaps;
while (curr != heap && curr != NULL) {
prev = curr;
curr = curr->next;
}
mi_assert_internal(curr == heap);
if (curr == heap) {
if (prev != NULL) { prev->next = heap->next; }
else { heap->tld->heaps = heap->next; }
}
mi_assert_internal(heap->tld->heaps != NULL);
// and free the used memory
mi_free(heap);
}
@ -283,6 +306,7 @@ void _mi_heap_destroy_pages(mi_heap_t* heap) {
}
void mi_heap_destroy(mi_heap_t* heap) {
mi_assert(heap != NULL);
mi_assert(mi_heap_is_initialized(heap));
mi_assert(heap->no_reclaim);
mi_assert_expensive(mi_heap_is_valid(heap));
@ -309,38 +333,37 @@ static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) {
mi_assert_internal(heap!=NULL);
if (from==NULL || from->page_count == 0) return;
// unfull all full pages in the `from` heap
mi_page_t* page = from->pages[MI_BIN_FULL].first;
while (page != NULL) {
mi_page_t* next = page->next;
_mi_page_unfull(page);
page = next;
}
mi_assert_internal(from->pages[MI_BIN_FULL].first == NULL);
// free outstanding thread delayed free blocks
// reduce the size of the delayed frees
_mi_heap_delayed_free(from);
// transfer all pages by appending the queues; this will set
// a new heap field which is ok as all pages are unfull'd and thus
// other threads won't access this field anymore (see `mi_free_block_mt`)
for (size_t i = 0; i < MI_BIN_FULL; i++) {
// transfer all pages by appending the queues; this will set a new heap field
// so threads may do delayed frees in either heap for a while.
// note: appending waits for each page to not be in the `MI_DELAYED_FREEING` state
// so after this only the new heap will get delayed frees
for (size_t i = 0; i <= MI_BIN_FULL; i++) {
mi_page_queue_t* pq = &heap->pages[i];
mi_page_queue_t* append = &from->pages[i];
size_t pcount = _mi_page_queue_append(heap, pq, append);
heap->page_count += pcount;
from->page_count -= pcount;
}
mi_assert_internal(from->thread_delayed_free == NULL);
mi_assert_internal(from->page_count == 0);
// and do outstanding delayed frees in the `from` heap
// note: be careful here as the `heap` field in all those pages no longer point to `from`,
// turns out to be ok as `_mi_heap_delayed_free` only visits the list and calls a
// the regular `_mi_free_delayed_block` which is safe.
_mi_heap_delayed_free(from);
mi_assert_internal(from->thread_delayed_free == NULL);
// and reset the `from` heap
mi_heap_reset_pages(from);
mi_heap_reset_pages(from);
}
// Safe delete a heap without freeing any still allocated blocks in that heap.
void mi_heap_delete(mi_heap_t* heap)
{
mi_assert(heap != NULL);
mi_assert(mi_heap_is_initialized(heap));
mi_assert_expensive(mi_heap_is_valid(heap));
if (!mi_heap_is_initialized(heap)) return;


@ -34,8 +34,14 @@ const mi_page_t _mi_page_empty = {
};
#define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty)
#define MI_SMALL_PAGES_EMPTY \
{ MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY(), MI_PAGE_EMPTY() }
#if defined(MI_PADDING) && (MI_INTPTR_SIZE >= 8)
#define MI_SMALL_PAGES_EMPTY { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY(), MI_PAGE_EMPTY() }
#elif defined(MI_PADDING)
#define MI_SMALL_PAGES_EMPTY { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY(), MI_PAGE_EMPTY(), MI_PAGE_EMPTY() }
#else
#define MI_SMALL_PAGES_EMPTY { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY() }
#endif
// Empty page queues for every bin
@ -106,6 +112,8 @@ const mi_heap_t _mi_heap_empty = {
{ 0, 0 }, // keys
{ {0}, {0}, 0 },
0, // page count
MI_BIN_FULL, 0, // page retired min/max
NULL, // next
false
};
@ -115,7 +123,7 @@ const mi_heap_t _mi_heap_empty = {
static const mi_tld_t tld_empty = {
0,
false,
NULL,
NULL, NULL,
{ MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, 0, 0, NULL, tld_empty_stats, tld_empty_os }, // segments
{ 0, tld_empty_stats }, // os
{ MI_STATS_NULL } // stats
@ -129,30 +137,28 @@ mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty;
#define tld_main_stats ((mi_stats_t*)((uint8_t*)&tld_main + offsetof(mi_tld_t,stats)))
#define tld_main_os ((mi_os_tld_t*)((uint8_t*)&tld_main + offsetof(mi_tld_t,os)))
extern mi_heap_t _mi_heap_main;
static mi_tld_t tld_main = {
0, false,
&_mi_heap_main,
&_mi_heap_main, & _mi_heap_main,
{ MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, 0, 0, NULL, tld_main_stats, tld_main_os }, // segments
{ 0, tld_main_stats }, // os
{ MI_STATS_NULL } // stats
};
#if MI_INTPTR_SIZE==8
#define MI_INIT_COOKIE (0xCDCDCDCDCDCDCDCDUL)
#else
#define MI_INIT_COOKIE (0xCDCDCDCDUL)
#endif
mi_heap_t _mi_heap_main = {
&tld_main,
MI_SMALL_PAGES_EMPTY,
MI_PAGE_QUEUES_EMPTY,
ATOMIC_VAR_INIT(NULL),
0, // thread id
MI_INIT_COOKIE, // initial cookie
{ MI_INIT_COOKIE, MI_INIT_COOKIE }, // the key of the main heap can be fixed (unlike page keys that need to be secure!)
{ {0}, {0}, 0 }, // random
0, // initial cookie
{ 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!)
{ {0x846ca68b}, {0}, 0 }, // random
0, // page count
MI_BIN_FULL, 0, // page retired min/max
NULL, // next heap
false // can reclaim
};
@ -161,6 +167,22 @@ bool _mi_process_is_initialized = false; // set to `true` in `mi_process_init`.
mi_stats_t _mi_stats_main = { MI_STATS_NULL };
static void mi_heap_main_init(void) {
if (_mi_heap_main.cookie == 0) {
_mi_heap_main.thread_id = _mi_thread_id();
_mi_heap_main.cookie = _os_random_weak((uintptr_t)&mi_heap_main_init);
_mi_random_init(&_mi_heap_main.random);
_mi_heap_main.keys[0] = _mi_heap_random_next(&_mi_heap_main);
_mi_heap_main.keys[1] = _mi_heap_random_next(&_mi_heap_main);
}
}
mi_heap_t* _mi_heap_main_get(void) {
mi_heap_main_init();
return &_mi_heap_main;
}
/* -----------------------------------------------------------
Initialization and freeing of the thread local heaps
----------------------------------------------------------- */
@ -173,14 +195,16 @@ typedef struct mi_thread_data_s {
// Initialize the thread local default heap, called from `mi_thread_init`
static bool _mi_heap_init(void) {
if (mi_heap_is_initialized(_mi_heap_default)) return true;
if (mi_heap_is_initialized(mi_get_default_heap())) return true;
if (_mi_is_main_thread()) {
// mi_assert_internal(_mi_heap_main.thread_id != 0); // can happen on freeBSD where alloc is called before any initialization
// the main heap is statically allocated
mi_heap_main_init();
_mi_heap_set_default_direct(&_mi_heap_main);
mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_get_default_heap());
//mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_get_default_heap());
}
else {
// use `_mi_os_alloc` to allocate directly from the OS
// use `_mi_os_alloc` to allocate directly from the OS
mi_thread_data_t* td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t),&_mi_stats_main); // Todo: more efficient allocation?
if (td == NULL) {
_mi_error_message(ENOMEM, "failed to allocate thread local heap memory\n");
@ -193,11 +217,12 @@ static bool _mi_heap_init(void) {
memcpy(heap, &_mi_heap_empty, sizeof(*heap));
heap->thread_id = _mi_thread_id();
_mi_random_init(&heap->random);
heap->cookie = _mi_heap_random_next(heap) | 1;
heap->key[0] = _mi_heap_random_next(heap);
heap->key[1] = _mi_heap_random_next(heap);
heap->tld = tld;
heap->cookie = _mi_heap_random_next(heap) | 1;
heap->keys[0] = _mi_heap_random_next(heap);
heap->keys[1] = _mi_heap_random_next(heap);
heap->tld = tld;
tld->heap_backing = heap;
tld->heaps = heap;
tld->segments.stats = &tld->stats;
tld->segments.os = &tld->os;
tld->os.stats = &tld->stats;
@ -213,12 +238,24 @@ static bool _mi_heap_done(mi_heap_t* heap) {
// reset default heap
_mi_heap_set_default_direct(_mi_is_main_thread() ? &_mi_heap_main : (mi_heap_t*)&_mi_heap_empty);
// todo: delete all non-backing heaps?
// switch to backing heap and free it
// switch to backing heap
heap = heap->tld->heap_backing;
if (!mi_heap_is_initialized(heap)) return false;
// delete all non-backing heaps in this thread
mi_heap_t* curr = heap->tld->heaps;
while (curr != NULL) {
mi_heap_t* next = curr->next; // save `next` as `curr` will be freed
if (curr != heap) {
mi_assert_internal(!mi_heap_is_backing(curr));
mi_heap_delete(curr);
}
curr = next;
}
mi_assert_internal(heap->tld->heaps == heap && heap->next == NULL);
mi_assert_internal(mi_heap_is_backing(heap));
// collect if not the main thread
if (heap != &_mi_heap_main) {
_mi_heap_collect_abandon(heap);
@ -232,7 +269,9 @@ static bool _mi_heap_done(mi_heap_t* heap) {
mi_assert_internal(heap->tld->segments.count == 0);
_mi_os_free(heap, sizeof(mi_thread_data_t), &_mi_stats_main);
}
#if (MI_DEBUG > 0)
#if 0
// never free the main thread heap, even in debug mode; if a DLL is linked statically with mimalloc,
// there may still be delete/free calls after `mi_fls_done` is called. Issue #207
else {
_mi_heap_destroy_pages(heap);
mi_assert_internal(heap->tld->heap_backing == &_mi_heap_main);
@ -273,14 +312,15 @@ static void _mi_thread_done(mi_heap_t* default_heap);
// use thread local storage keys to detect thread ending
#include <windows.h>
#include <fibersapi.h>
static DWORD mi_fls_key;
static DWORD mi_fls_key = (DWORD)(-1);
static void NTAPI mi_fls_done(PVOID value) {
if (value!=NULL) _mi_thread_done((mi_heap_t*)value);
}
#elif defined(MI_USE_PTHREADS)
// use pthread locol storage keys to detect thread ending
// use pthread local storage keys to detect thread ending
// (and used with MI_TLS_PTHREADS for the default heap)
#include <pthread.h>
static pthread_key_t mi_pthread_key;
pthread_key_t _mi_heap_default_key = (pthread_key_t)(-1);
static void mi_pthread_done(void* value) {
if (value!=NULL) _mi_thread_done((mi_heap_t*)value);
}
@ -300,8 +340,10 @@ static void mi_process_setup_auto_thread_done(void) {
#elif defined(_WIN32) && !defined(MI_SHARED_LIB)
mi_fls_key = FlsAlloc(&mi_fls_done);
#elif defined(MI_USE_PTHREADS)
pthread_key_create(&mi_pthread_key, &mi_pthread_done);
mi_assert_internal(_mi_heap_default_key == (pthread_key_t)(-1));
pthread_key_create(&_mi_heap_default_key, &mi_pthread_done);
#endif
_mi_heap_set_default_direct(&_mi_heap_main);
}
@ -343,21 +385,31 @@ static void _mi_thread_done(mi_heap_t* heap) {
void _mi_heap_set_default_direct(mi_heap_t* heap) {
mi_assert_internal(heap != NULL);
#if defined(MI_TLS_SLOT)
mi_tls_slot_set(MI_TLS_SLOT,heap);
#elif defined(MI_TLS_PTHREAD_SLOT_OFS)
*mi_tls_pthread_heap_slot() = heap;
#elif defined(MI_TLS_PTHREAD)
// we use _mi_heap_default_key
#else
_mi_heap_default = heap;
#endif
// ensure the default heap is passed to `_mi_thread_done`
// setting to a non-NULL value also ensures `mi_thread_done` is called.
#if defined(_WIN32) && defined(MI_SHARED_LIB)
// nothing to do as it is done in DllMain
#elif defined(_WIN32) && !defined(MI_SHARED_LIB)
mi_assert_internal(mi_fls_key != 0);
FlsSetValue(mi_fls_key, heap);
#elif defined(MI_USE_PTHREADS)
pthread_setspecific(mi_pthread_key, heap);
if (_mi_heap_default_key != (pthread_key_t)(-1)) { // can happen during recursive invocation on freeBSD
pthread_setspecific(_mi_heap_default_key, heap);
}
#endif
}
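For the pthread branch above, the stored value matters beyond locating the heap: pthread key destructors only run at thread exit for keys whose value is non-NULL, so setting the heap pointer is also what arms `mi_pthread_done`. A minimal standalone sketch of that mechanism (hypothetical names, not the mimalloc code):

#include <pthread.h>
#include <stdio.h>

static pthread_key_t  tls_key;
static pthread_once_t tls_once = PTHREAD_ONCE_INIT;

// pthreads calls this at thread exit, but only if the thread's value
// for `tls_key` is non-NULL at that point.
static void thread_done(void* value) {
  printf("cleaning up per-thread state %p\n", value);
}

static void tls_init(void) {
  pthread_key_create(&tls_key, &thread_done);
}

static void* worker(void* state) {
  // storing a non-NULL value is what arms the destructor for this thread
  pthread_setspecific(tls_key, state);
  return NULL;
}

int main(void) {
  pthread_once(&tls_once, &tls_init);
  int state = 0;                        // stand-in for a per-thread heap
  pthread_t t;
  pthread_create(&t, NULL, &worker, &state);
  pthread_join(&t, NULL);               // thread_done has run by the time join returns
  return 0;
}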
// --------------------------------------------------------
// Run functions on process init/done, and thread init/done
// --------------------------------------------------------
@ -409,11 +461,16 @@ static void mi_allocator_done() {
// Called once by the process loader
static void mi_process_load(void) {
mi_heap_main_init();
#if defined(MI_TLS_RECURSE_GUARD)
volatile mi_heap_t* dummy = _mi_heap_default; // access TLS to allocate it before setting tls_initialized to true;
UNUSED(dummy);
#endif
os_preloading = false;
atexit(&mi_process_done);
_mi_options_init();
mi_process_init();
//mi_stats_reset();
//mi_stats_reset();
if (mi_redirected) _mi_verbose_message("malloc is redirected.\n");
// show message from the redirector (if present)
@ -428,22 +485,12 @@ static void mi_process_load(void) {
void mi_process_init(void) mi_attr_noexcept {
// ensure we are called once
if (_mi_process_is_initialized) return;
// access _mi_heap_default before setting _mi_process_is_initialized to ensure
// that the TLS slot is allocated without getting into recursion on macOS
// when using dynamic linking with interpose.
mi_get_default_heap();
_mi_process_is_initialized = true;
_mi_heap_main.thread_id = _mi_thread_id();
_mi_verbose_message("process init: 0x%zx\n", _mi_heap_main.thread_id);
_mi_random_init(&_mi_heap_main.random);
#ifndef __APPLE__ // TODO: fix this? cannot update cookie if allocation already happened..
_mi_heap_main.cookie = _mi_heap_random_next(&_mi_heap_main);
_mi_heap_main.key[0] = _mi_heap_random_next(&_mi_heap_main);
_mi_heap_main.key[1] = _mi_heap_random_next(&_mi_heap_main);
#endif
mi_process_setup_auto_thread_done();
_mi_verbose_message("process init: 0x%zx\n", _mi_thread_id());
_mi_os_init();
mi_heap_main_init();
#if (MI_DEBUG)
_mi_verbose_message("debug level : %d\n", MI_DEBUG);
#endif
@ -466,6 +513,10 @@ static void mi_process_done(void) {
if (process_done) return;
process_done = true;
#if defined(_WIN32) && !defined(MI_SHARED_LIB)
FlsSetValue(mi_fls_key, NULL); // don't call main-thread callback
FlsFree(mi_fls_key); // call thread-done on all threads to prevent dangling callback pointer if statically linked with a DLL; Issue #208
#endif
#ifndef NDEBUG
mi_collect(true);
#endif
@ -473,7 +524,7 @@ static void mi_process_done(void) {
mi_option_is_enabled(mi_option_verbose)) {
mi_stats_print(NULL);
}
mi_allocator_done();
mi_allocator_done();
_mi_verbose_message("process done: 0x%zx\n", _mi_heap_main.thread_id);
os_preloading = true; // don't call the C runtime anymore
}

View file

@ -70,7 +70,11 @@ static mi_option_desc_t options[_mi_option_last] =
{ 0, UNINIT, MI_OPTION(page_reset) }, // reset page memory on free
{ 0, UNINIT, MI_OPTION(abandoned_page_reset) },// reset free page memory when a thread terminates
{ 0, UNINIT, MI_OPTION(segment_reset) }, // reset segment memory on free (needs eager commit)
#if defined(__NetBSD__)
{ 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed
#else
{ 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed
#endif
{ 1, UNINIT, MI_OPTION(allow_decommit) }, // decommit pages when not eager committed
{ 100, UNINIT, MI_OPTION(reset_delay) }, // reset delay in milli-seconds
{ 1000, UNINIT, MI_OPTION(arena_reset_delay) }, // reset delay in milli-seconds
@ -87,7 +91,7 @@ void _mi_options_init(void) {
mi_add_stderr_output(); // now it is safe to use stderr for output
for(int i = 0; i < _mi_option_last; i++ ) {
mi_option_t option = (mi_option_t)i;
mi_option_get(option); // initialize
long l = mi_option_get(option); UNUSED(l); // initialize
if (option != mi_option_verbose) {
mi_option_desc_t* desc = &options[option];
_mi_verbose_message("option '%s': %ld\n", desc->name, desc->value);
@ -241,16 +245,30 @@ static volatile _Atomic(uintptr_t) error_count; // = 0; // when MAX_ERROR_COUNT
// inside the C runtime causes another message.
static mi_decl_thread bool recurse = false;
static bool mi_recurse_enter(void) {
#ifdef MI_TLS_RECURSE_GUARD
if (_mi_preloading()) return true;
#endif
if (recurse) return false;
recurse = true;
return true;
}
static void mi_recurse_exit(void) {
#ifdef MI_TLS_RECURSE_GUARD
if (_mi_preloading()) return;
#endif
recurse = false;
}
void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message) {
if (recurse) return;
if (!mi_recurse_enter()) return;
if (out==NULL || (FILE*)out==stdout || (FILE*)out==stderr) { // TODO: use mi_out_stderr for stderr?
out = mi_out_get_default(&arg);
}
recurse = true;
if (prefix != NULL) out(prefix,arg);
out(message,arg);
recurse = false;
return;
mi_recurse_exit();
}
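The guard above exists because producing a message can itself fault or allocate (for example inside `vsnprintf`), which would try to produce another message; the thread-local flag breaks that cycle. A small standalone sketch of the pattern, with hypothetical names:

#include <stdbool.h>
#include <stdio.h>

static _Thread_local bool in_report = false;   // per-thread recursion guard

static bool report_enter(void) {
  if (in_report) return false;   // already reporting on this thread: drop the message
  in_report = true;
  return true;
}

static void report_exit(void) {
  in_report = false;
}

// may itself trigger errors that try to report again; the guard stops the recursion
static void report(const char* msg) {
  if (!report_enter()) return;
  fputs(msg, stderr);
  report_exit();
}

int main(void) {
  report("error reporting without re-entrancy\n");
  return 0;
}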
// Define our own limited `fprintf` that avoids memory allocation.
@ -258,14 +276,12 @@ void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* me
static void mi_vfprintf( mi_output_fun* out, void* arg, const char* prefix, const char* fmt, va_list args ) {
char buf[512];
if (fmt==NULL) return;
if (recurse) return;
recurse = true;
if (!mi_recurse_enter()) return;
vsnprintf(buf,sizeof(buf)-1,fmt,args);
recurse = false;
mi_recurse_exit();
_mi_fputs(out,arg,prefix,buf);
}
void _mi_fprintf( mi_output_fun* out, void* arg, const char* fmt, ... ) {
va_list args;
va_start(args,fmt);
@ -292,7 +308,7 @@ void _mi_verbose_message(const char* fmt, ...) {
static void mi_show_error_message(const char* fmt, va_list args) {
if (!mi_option_is_enabled(mi_option_show_errors) && !mi_option_is_enabled(mi_option_verbose)) return;
if (mi_atomic_increment(&error_count) > mi_max_error_count) return;
mi_vfprintf(NULL, NULL, "mimalloc: error: ", fmt, args);
mi_vfprintf(NULL, NULL, "mimalloc: error: ", fmt, args);
}
void _mi_warning_message(const char* fmt, ...) {
@ -321,6 +337,14 @@ static volatile _Atomic(void*) mi_error_arg; // = NULL
static void mi_error_default(int err) {
UNUSED(err);
#if (MI_DEBUG>0)
if (err==EFAULT) {
#ifdef _MSC_VER
__debugbreak();
#endif
abort();
}
#endif
#if (MI_SECURE>0)
if (err==EFAULT) { // abort on serious errors in secure mode (corrupted meta-data)
abort();

View file

@ -188,7 +188,7 @@ static bool mi_os_mem_free(void* addr, size_t size, bool was_committed, mi_stats
if (was_committed) _mi_stat_decrease(&stats->committed, size);
_mi_stat_decrease(&stats->reserved, size);
if (err) {
#pragma warning(suppress:4996)
#pragma warning(suppress:4996)
_mi_warning_message("munmap failed: %s, addr 0x%8li, size %lu\n", strerror(errno), (size_t)addr, size);
return false;
}
@ -208,15 +208,20 @@ static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment
// on 64-bit systems, try to use the virtual address area after 4TiB for 4MiB aligned allocations
void* hint;
if (addr == NULL && (hint = mi_os_get_aligned_hint(try_alignment,size)) != NULL) {
return VirtualAlloc(hint, size, flags, PAGE_READWRITE);
void* p = VirtualAlloc(hint, size, flags, PAGE_READWRITE);
if (p != NULL) return p;
DWORD err = GetLastError();
if (err != ERROR_INVALID_ADDRESS) { // if linked with multiple instances, we may have tried to allocate at an already allocated area
return NULL;
}
}
#endif
#if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS)
// on modern Windows try to use VirtualAlloc2 for aligned allocation
if (try_alignment > 0 && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) {
MEM_ADDRESS_REQUIREMENTS reqs = { 0 };
MEM_ADDRESS_REQUIREMENTS reqs = { 0, 0, 0 };
reqs.Alignment = try_alignment;
MEM_EXTENDED_PARAMETER param = { 0 };
MEM_EXTENDED_PARAMETER param = { {0, 0}, {0} };
param.Type = MemExtendedParameterAddressRequirements;
param.Pointer = &reqs;
return (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, &param, 1);
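The retry in the hinted path matters because the hint can point into an area that is already reserved (for example when several allocator instances live in one process). A rough standalone sketch of the try-at-hint-then-fall-back pattern (illustrative only, not the full `mi_win_virtual_allocx` logic):

#ifdef _WIN32
#include <windows.h>

// Try to reserve+commit `size` bytes at a hinted address; if that exact
// address range is unavailable, let the OS pick the location instead.
void* alloc_with_hint(void* hint, size_t size) {
  void* p = VirtualAlloc(hint, size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
  if (p != NULL) return p;
  if (GetLastError() != ERROR_INVALID_ADDRESS) return NULL;  // genuine failure
  // the hinted area was already taken: retry without an address hint
  return VirtualAlloc(NULL, size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
}
#endif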
@ -284,6 +289,7 @@ static void* mi_unix_mmapx(void* addr, size_t size, size_t try_alignment, int pr
}
#else
UNUSED(try_alignment);
UNUSED(mi_os_get_aligned_hint);
#endif
if (p==NULL) {
p = mmap(addr,size,protect_flags,flags,fd,0);
@ -826,7 +832,7 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node)
mi_win_enable_large_os_pages();
#if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS)
MEM_EXTENDED_PARAMETER params[3] = { {0,0},{0,0},{0,0} };
MEM_EXTENDED_PARAMETER params[3] = { {{0,0},{0}},{{0,0},{0}},{{0,0},{0}} };
// on modern Windows try to use NtAllocateVirtualMemoryEx for 1GiB huge pages
static bool mi_huge_pages_available = true;
if (pNtAllocateVirtualMemoryEx != NULL && mi_huge_pages_available) {
@ -850,7 +856,7 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node)
else {
// fall back to regular large pages
mi_huge_pages_available = false; // don't try further huge pages
_mi_warning_message("unable to allocate using huge (1GiB) pages, trying large (2MiB) pages instead (status 0x%lx)\n", err);
_mi_warning_message("unable to allocate using huge (1gb) pages, trying large (2mb) pages instead (status 0x%lx)\n", err);
}
}
// on modern Windows try to use VirtualAlloc2 for numa aware large OS page allocation
@ -891,7 +897,7 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node)
// see: <https://lkml.org/lkml/2017/2/9/875>
long err = mi_os_mbind(p, size, MPOL_PREFERRED, &numa_mask, 8*MI_INTPTR_SIZE, 0);
if (err != 0) {
_mi_warning_message("failed to bind huge (1GiB) pages to NUMA node %d: %s\n", numa_node, strerror(errno));
_mi_warning_message("failed to bind huge (1gb) pages to numa node %d: %s\n", numa_node, strerror(errno));
}
}
return p;
@ -1075,4 +1081,3 @@ int _mi_os_numa_node_get(mi_os_tld_t* tld) {
if (numa_node >= numa_count) { numa_node = numa_node % numa_count; }
return (int)numa_node;
}

View file

@ -332,6 +332,7 @@ static void mi_page_queue_enqueue_from(mi_page_queue_t* to, mi_page_queue_t* fro
mi_page_set_in_full(page, mi_page_queue_is_full(to));
}
// Only called from `mi_heap_absorb`.
size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append) {
mi_assert_internal(mi_heap_contains_queue(heap,pq));
mi_assert_internal(pq->block_size == append->block_size);
@ -341,7 +342,13 @@ size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue
// set append pages to new heap and count
size_t count = 0;
for (mi_page_t* page = append->first; page != NULL; page = page->next) {
mi_page_set_heap(page,heap);
// inline `mi_page_set_heap` to avoid a wrong assertion during absorption;
// in this case delayed freeing is ok since both the "to" and "from" heaps are still alive.
mi_atomic_write(&page->xheap, (uintptr_t)heap);
// set the flag to delayed free (without overriding NEVER_DELAYED_FREE), which as a
// side effect spins until any DELAYED_FREEING is finished. This ensures
// that after appending only the new heap will be used for delayed free operations.
_mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, false);
count++;
}
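The spin mentioned in the comment comes from the flag transition itself: the delayed-free state can only move to the "use delayed free" state once no other thread is mid-way through a delayed push. A simplified sketch of such a transition on a 2-bit state stored in an atomic word (illustrative, not the real `_mi_page_use_delayed_free`):

#include <stdatomic.h>

typedef enum {
  USE_DELAYED_FREE   = 0,  // cross-thread frees go on the heap's delayed list
  DELAYED_FREEING    = 1,  // a thread is currently pushing onto that list
  NO_DELAYED_FREE    = 2,
  NEVER_DELAYED_FREE = 3   // sticky state, never overridden
} delayed_t;

// Move `flag` to USE_DELAYED_FREE, waiting out any DELAYED_FREEING in progress
// and leaving NEVER_DELAYED_FREE untouched. (Simplified sketch.)
static void set_use_delayed_free(_Atomic(int)* flag) {
  int old = atomic_load(flag);
  for (;;) {
    if (old == NEVER_DELAYED_FREE) return;    // do not override the sticky state
    if (old == DELAYED_FREEING) {             // another thread is mid-push:
      old = atomic_load(flag);                // spin until it finishes
      continue;
    }
    if (atomic_compare_exchange_weak(flag, &old, USE_DELAYED_FREE)) return;
    // CAS failed; `old` now holds the refreshed value, try again
  }
}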

View file

@ -7,7 +7,7 @@ terms of the MIT license. A copy of the license can be found in the file
/* -----------------------------------------------------------
The core of the allocator. Every segment contains
pages of a certain block size. The main function
pages of a certain block size. The main function
exported is `mi_malloc_generic`.
----------------------------------------------------------- */
@ -105,7 +105,7 @@ static bool mi_page_is_valid_init(mi_page_t* page) {
bool _mi_page_is_valid(mi_page_t* page) {
mi_assert_internal(mi_page_is_valid_init(page));
#if MI_SECURE
mi_assert_internal(page->key != 0);
mi_assert_internal(page->keys[0] != 0);
#endif
if (mi_page_heap(page)!=NULL) {
mi_segment_t* segment = _mi_page_segment(page);
@ -281,7 +281,7 @@ void _mi_heap_delayed_free(mi_heap_t* heap) {
// and free them all
while(block != NULL) {
mi_block_t* next = mi_block_nextx(heap,block, heap->key[0], heap->key[1]);
mi_block_t* next = mi_block_nextx(heap,block, heap->keys);
// use internal free instead of regular one to keep stats etc correct
if (!_mi_free_delayed_block(block)) {
// we might already start delayed freeing while another thread has not yet
@ -289,7 +289,7 @@ void _mi_heap_delayed_free(mi_heap_t* heap) {
mi_block_t* dfree;
do {
dfree = mi_atomic_read_ptr_relaxed(mi_block_t,&heap->thread_delayed_free);
mi_block_set_nextx(heap, block, dfree, heap->key[0], heap->key[1]);
mi_block_set_nextx(heap, block, dfree, heap->keys);
} while (!mi_atomic_cas_ptr_weak(mi_block_t,&heap->thread_delayed_free, block, dfree));
}
block = next;
@ -348,7 +348,7 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) {
#if MI_DEBUG>1
// check there are no references left..
for (mi_block_t* block = (mi_block_t*)pheap->thread_delayed_free; block != NULL; block = mi_block_nextx(pheap, block, pheap->key[0], pheap->key[1])) {
for (mi_block_t* block = (mi_block_t*)pheap->thread_delayed_free; block != NULL; block = mi_block_nextx(pheap, block, pheap->keys)) {
mi_assert_internal(_mi_ptr_page(block) != page);
}
#endif
@ -393,7 +393,8 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) {
_mi_segment_page_free(page, force, segments_tld);
}
#define MI_MAX_RETIRE_SIZE (4*MI_SMALL_SIZE_MAX)
#define MI_MAX_RETIRE_SIZE (MI_MEDIUM_OBJ_SIZE_MAX)
#define MI_RETIRE_CYCLES (16)
// Retire a page with no more used blocks
// Important to not retire too quickly though as new
@ -418,7 +419,13 @@ void _mi_page_retire(mi_page_t* page) {
if (mi_likely(page->xblock_size <= MI_MAX_RETIRE_SIZE && !mi_page_is_in_full(page))) {
if (pq->last==page && pq->first==page) { // the only page in the queue?
mi_stat_counter_increase(_mi_stats_main.page_no_retire,1);
page->retire_expire = 16;
page->retire_expire = MI_RETIRE_CYCLES;
mi_heap_t* heap = mi_page_heap(page);
mi_assert_internal(pq >= heap->pages);
const size_t index = pq - heap->pages;
mi_assert_internal(index < MI_BIN_FULL && index < MI_BIN_HUGE);
if (index < heap->page_retired_min) heap->page_retired_min = index;
if (index > heap->page_retired_max) heap->page_retired_max = index;
mi_assert_internal(mi_page_all_free(page));
return; // don't free after all
}
@ -427,22 +434,32 @@ void _mi_page_retire(mi_page_t* page) {
}
// free retired pages: we don't need to look at the entire queues
// since we only retire pages that are the last one in a queue.
// since we only retire pages that are at the head position in a queue.
void _mi_heap_collect_retired(mi_heap_t* heap, bool force) {
for(mi_page_queue_t* pq = heap->pages; pq->block_size <= MI_MAX_RETIRE_SIZE; pq++) {
mi_page_t* page = pq->first;
size_t min = MI_BIN_FULL;
size_t max = 0;
for(size_t bin = heap->page_retired_min; bin <= heap->page_retired_max; bin++) {
mi_page_queue_t* pq = &heap->pages[bin];
mi_page_t* page = pq->first;
if (page != NULL && page->retire_expire != 0) {
if (mi_page_all_free(page)) {
page->retire_expire--;
if (force || page->retire_expire == 0) {
_mi_page_free(pq->first, pq, force);
}
else {
// keep retired, update min/max
if (bin < min) min = bin;
if (bin > max) max = bin;
}
}
else {
page->retire_expire = 0;
}
}
}
heap->page_retired_min = min;
heap->page_retired_max = max;
}
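The `page_retired_min`/`page_retired_max` fields above implement a small dirty-range optimization: collection only walks the bins that may still hold a retired page, and the range is re-tightened as pages expire or get freed. A generic sketch of the idea with hypothetical names (not mimalloc's data structures):

#include <stddef.h>
#include <stdbool.h>

#define NBINS 64

typedef struct {
  int  expire;        // countdown until the slot may actually be reclaimed
  bool retired;
} slot_t;

typedef struct {
  slot_t slots[NBINS];
  size_t retired_min; // smallest bin index that may hold a retired slot
  size_t retired_max; // largest such index; min > max means "none retired"
} pool_t;
// a fresh pool_t starts with retired_min = NBINS and retired_max = 0 (empty range)

static void mark_retired(pool_t* p, size_t bin, int cycles) {
  p->slots[bin].retired = true;
  p->slots[bin].expire  = cycles;
  if (bin < p->retired_min) p->retired_min = bin;  // widen the dirty range
  if (bin > p->retired_max) p->retired_max = bin;
}

static void collect_retired(pool_t* p, bool force) {
  size_t min = NBINS, max = 0;                     // start from an empty range
  for (size_t bin = p->retired_min; bin <= p->retired_max && bin < NBINS; bin++) {
    slot_t* s = &p->slots[bin];
    if (!s->retired) continue;
    if (force || --s->expire == 0) {
      s->retired = false;                          // reclaim the slot
    }
    else {
      if (bin < min) min = bin;                    // still retired: stays in range
      if (bin > max) max = bin;
    }
  }
  p->retired_min = min;                            // tighten the range for next time
  p->retired_max = max;
}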
@ -618,8 +635,8 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
mi_assert_internal(page_size / block_size < (1L<<16));
page->reserved = (uint16_t)(page_size / block_size);
#ifdef MI_ENCODE_FREELIST
page->key[0] = _mi_heap_random_next(heap);
page->key[1] = _mi_heap_random_next(heap);
page->keys[0] = _mi_heap_random_next(heap);
page->keys[1] = _mi_heap_random_next(heap);
#endif
page->is_zero = page->is_zero_init;
@ -634,8 +651,8 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
mi_assert_internal(page->retire_expire == 0);
mi_assert_internal(!mi_page_has_aligned(page));
#if (MI_ENCODE_FREELIST)
mi_assert_internal(page->key[0] != 0);
mi_assert_internal(page->key[1] != 0);
mi_assert_internal(page->keys[0] != 0);
mi_assert_internal(page->keys[1] != 0);
#endif
mi_assert_expensive(mi_page_is_valid_init(page));
@ -760,13 +777,13 @@ void mi_register_deferred_free(mi_deferred_free_fun* fn, void* arg) mi_attr_noex
// just that page, we always treat them as abandoned and any thread
// that frees the block can free the whole page and segment directly.
static mi_page_t* mi_large_huge_page_alloc(mi_heap_t* heap, size_t size) {
size_t block_size = _mi_wsize_from_size(size) * sizeof(uintptr_t);
size_t block_size = _mi_os_good_alloc_size(size);
mi_assert_internal(_mi_bin(block_size) == MI_BIN_HUGE);
bool is_huge = (block_size > MI_LARGE_OBJ_SIZE_MAX);
mi_page_queue_t* pq = (is_huge ? NULL : mi_page_queue(heap, block_size));
mi_page_t* page = mi_page_fresh_alloc(heap, pq, block_size);
if (page != NULL) {
const size_t bsize = mi_page_block_size(page);
const size_t bsize = mi_page_usable_block_size(page);
mi_assert_internal(mi_page_immediate_available(page));
mi_assert_internal(bsize >= size);
@ -794,6 +811,7 @@ static mi_page_t* mi_large_huge_page_alloc(mi_heap_t* heap, size_t size) {
// Generic allocation routine if the fast path (`alloc.c:mi_page_malloc`) does not succeed.
// Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed.
void* _mi_malloc_generic(mi_heap_t* heap, size_t size) mi_attr_noexcept
{
mi_assert_internal(heap != NULL);
@ -813,9 +831,10 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size) mi_attr_noexcept
// huge allocation?
mi_page_t* page;
if (mi_unlikely(size > MI_MEDIUM_OBJ_SIZE_MAX)) {
if (mi_unlikely(size > PTRDIFF_MAX)) {
_mi_error_message(EOVERFLOW, "allocation request is too large (%zu b requested)\n", size);
const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size`
if (mi_unlikely(req_size > (MI_MEDIUM_OBJ_SIZE_MAX - MI_PADDING_SIZE))) {
if (mi_unlikely(req_size > PTRDIFF_MAX)) { // we don't allocate more than PTRDIFF_MAX (see <https://sourceware.org/ml/libc-announce/2019/msg00001.html>)
_mi_error_message(EOVERFLOW, "allocation request is too large (%zu b requested)\n", req_size);
return NULL;
}
else {
@ -824,6 +843,7 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size) mi_attr_noexcept
}
else {
// otherwise find a page with free blocks in our size segregated queues
mi_assert_internal(size >= MI_PADDING_SIZE);
page = mi_find_free_page(heap,size);
}
if (mi_unlikely(page == NULL)) { // out of memory

View file

@ -11,7 +11,7 @@ terms of the MIT license. A copy of the license can be found in the file
/* ----------------------------------------------------------------------------
We use our own PRNG to keep predictable performance of random number generation
and to avoid implementations that use a lock. We only use the OS provided
and to avoid implementations that use a lock. We only use the OS provided
random source to initialize the initial seeds. Since we do not need ultimate
performance but we do rely on the security (for secret cookies in secure mode)
we use a cryptographically secure generator (chacha20).
@ -21,11 +21,11 @@ we use a cryptographically secure generator (chacha20).
/* ----------------------------------------------------------------------------
Chacha20 implementation as the original algorithm with a 64-bit nonce
Chacha20 implementation as the original algorithm with a 64-bit nonce
and counter: https://en.wikipedia.org/wiki/Salsa20
The input matrix has sixteen 32-bit values:
Position 0 to 3: constant key
Position 4 to 11: the key
Position 4 to 11: the key
Position 12 to 13: the counter.
Position 14 to 15: the nonce.
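The word layout described above maps directly onto code; a minimal sketch of filling the sixteen 32-bit input words (assuming a little-endian key load, as the file's `read32` does; `load32_le` here is an illustrative helper):

#include <stdint.h>
#include <string.h>

// "expand 32-byte k" -- the standard ChaCha constant words
static const uint32_t sigma[4] = { 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 };

static uint32_t load32_le(const uint8_t* p) {
  return (uint32_t)p[0] | ((uint32_t)p[1] << 8) | ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

// Fill the 16-word ChaCha input block: constant | key | counter | nonce.
static void chacha_state_init(uint32_t input[16], const uint8_t key[32],
                              uint64_t counter, uint64_t nonce) {
  memcpy(input, sigma, sizeof(sigma));       // words  0..3 : constant
  for (size_t i = 0; i < 8; i++) {
    input[4 + i] = load32_le(key + 4*i);     // words  4..11: 256-bit key
  }
  input[12] = (uint32_t)counter;             // words 12..13: 64-bit block counter
  input[13] = (uint32_t)(counter >> 32);
  input[14] = (uint32_t)nonce;               // words 14..15: 64-bit nonce
  input[15] = (uint32_t)(nonce >> 32);
}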
@ -44,8 +44,8 @@ static inline void qround(uint32_t x[16], size_t a, size_t b, size_t c, size_t d
x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 7);
}
static void chacha_block(mi_random_ctx_t* ctx)
{
static void chacha_block(mi_random_ctx_t* ctx)
{
// scramble into `x`
uint32_t x[16];
for (size_t i = 0; i < 16; i++) {
@ -72,8 +72,8 @@ static void chacha_block(mi_random_ctx_t* ctx)
ctx->input[12] += 1;
if (ctx->input[12] == 0) {
ctx->input[13] += 1;
if (ctx->input[13] == 0) { // and keep increasing into the nonce
ctx->input[14] += 1;
if (ctx->input[13] == 0) { // and keep increasing into the nonce
ctx->input[14] += 1;
}
}
}
@ -83,7 +83,7 @@ static uint32_t chacha_next32(mi_random_ctx_t* ctx) {
chacha_block(ctx);
ctx->output_available = 16; // (assign again to suppress static analysis warning)
}
const uint32_t x = ctx->output[16 - ctx->output_available];
const uint32_t x = ctx->output[16 - ctx->output_available];
ctx->output[16 - ctx->output_available] = 0; // reset once the data is handed out
ctx->output_available--;
return x;
@ -94,9 +94,9 @@ static inline uint32_t read32(const uint8_t* p, size_t idx32) {
return ((uint32_t)p[i+0] | (uint32_t)p[i+1] << 8 | (uint32_t)p[i+2] << 16 | (uint32_t)p[i+3] << 24);
}
static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t nonce)
static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t nonce)
{
// since we only use chacha for randomness (and not encryption) we
// since we only use chacha for randomness (and not encryption) we
// do not _need_ to read 32-bit values as little endian but we do anyways
// just for being compatible :-)
memset(ctx, 0, sizeof(*ctx));
@ -110,7 +110,7 @@ static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t no
ctx->input[12] = 0;
ctx->input[13] = 0;
ctx->input[14] = (uint32_t)nonce;
ctx->input[15] = (uint32_t)(nonce >> 32);
ctx->input[15] = (uint32_t)(nonce >> 32);
}
static void chacha_split(mi_random_ctx_t* ctx, uint64_t nonce, mi_random_ctx_t* ctx_new) {
@ -184,7 +184,7 @@ static bool os_random_buf(void* buf, size_t buf_len) {
arc4random_buf(buf, buf_len);
return true;
}
#elif defined(__linux__)
#elif defined(__linux__)
#include <sys/syscall.h>
#include <unistd.h>
#include <sys/types.h>
@ -241,8 +241,8 @@ static bool os_random_buf(void* buf, size_t buf_len) {
#include <time.h>
#endif
static uintptr_t os_random_weak(uintptr_t extra_seed) {
uintptr_t x = (uintptr_t)&os_random_weak ^ extra_seed; // ASLR makes the address random
uintptr_t _os_random_weak(uintptr_t extra_seed) {
uintptr_t x = (uintptr_t)&_os_random_weak ^ extra_seed; // ASLR makes the address random
#if defined(_WIN32)
LARGE_INTEGER pcount;
QueryPerformanceCounter(&pcount);
@ -267,10 +267,10 @@ static uintptr_t os_random_weak(uintptr_t extra_seed) {
void _mi_random_init(mi_random_ctx_t* ctx) {
uint8_t key[32];
if (!os_random_buf(key, sizeof(key))) {
// if we fail to get random data from the OS, we fall back to a
// if we fail to get random data from the OS, we fall back to a
// weak random source based on the current time
_mi_warning_message("unable to use secure randomness\n");
uintptr_t x = os_random_weak(0);
uintptr_t x = _os_random_weak(0);
for (size_t i = 0; i < 8; i++) { // key is eight 32-bit words.
x = _mi_random_shuffle(x);
((uint32_t*)key)[i] = (uint32_t)x;
@ -280,7 +280,7 @@ void _mi_random_init(mi_random_ctx_t* ctx) {
}
/* --------------------------------------------------------
test vectors from <https://tools.ietf.org/html/rfc8439>
test vectors from <https://tools.ietf.org/html/rfc8439>
----------------------------------------------------------- */
/*
static bool array_equals(uint32_t* x, uint32_t* y, size_t n) {

View file

@ -861,12 +861,12 @@ Note: the current implementation is one possible design;
another way might be to keep track of abandoned segments
in the regions. This would have the advantage of keeping
all concurrent code in one place and not needing to deal
with ABA issues. The drawback is that it is unclear how to
scan abandoned segments efficiently in that case as they
with ABA issues. The drawback is that it is unclear how to
scan abandoned segments efficiently in that case as they
would be spread among all other segments in the regions.
----------------------------------------------------------- */
// Use the bottom 20-bits (on 64-bit) of the aligned segment pointers
// Use the bottom 20-bits (on 64-bit) of the aligned segment pointers
// to put in a tag that increments on update to avoid the A-B-A problem.
#define MI_TAGGED_MASK MI_SEGMENT_MASK
typedef uintptr_t mi_tagged_segment_t;
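A tagged pointer of this kind needs only a few lines: because segments are `MI_SEGMENT_SIZE`-aligned, the low bits of the pointer are known to be zero and can carry a counter that changes on every update, so a stale compare-and-swap can be detected. A simplified sketch (the alignment value and names here are illustrative, not mimalloc's):

#include <stdint.h>
#include <assert.h>

#define SEG_ALIGN ((uintptr_t)1 << 20)   // example alignment: leaves 20 low bits for the tag
#define TAG_MASK  (SEG_ALIGN - 1)

typedef uintptr_t tagged_ptr_t;

static void* tagged_ptr(tagged_ptr_t ts) {
  return (void*)(ts & ~TAG_MASK);        // strip the tag to recover the pointer
}

// Combine a pointer with a tag derived from the previous tagged value, so every
// successful update changes the word even when the same pointer is re-pushed (A-B-A).
static tagged_ptr_t tagged_make(void* p, tagged_ptr_t prev) {
  assert(((uintptr_t)p & TAG_MASK) == 0); // the pointer must be suitably aligned
  uintptr_t tag = (prev + 1) & TAG_MASK;
  return (uintptr_t)p | tag;
}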
@ -882,7 +882,7 @@ static mi_tagged_segment_t mi_tagged_segment(mi_segment_t* segment, mi_tagged_se
}
// This is a list of visited abandoned pages that were full at the time.
// this list migrates to `abandoned` when that becomes NULL. The use of
// this list migrates to `abandoned` when that becomes NULL. The use of
// this list reduces contention and the rate at which segments are visited.
static mi_decl_cache_align volatile _Atomic(mi_segment_t*) abandoned_visited; // = NULL
@ -908,7 +908,7 @@ static void mi_abandoned_visited_push(mi_segment_t* segment) {
}
// Move the visited list to the abandoned list.
static bool mi_abandoned_visited_revisit(void)
static bool mi_abandoned_visited_revisit(void)
{
// quick check if the visited list is empty
if (mi_atomic_read_ptr_relaxed(mi_segment_t,&abandoned_visited)==NULL) return false;
@ -974,12 +974,12 @@ static mi_segment_t* mi_abandoned_pop(void) {
segment = mi_tagged_segment_ptr(ts);
if (mi_likely(segment == NULL)) {
if (mi_likely(!mi_abandoned_visited_revisit())) { // try to swap in the visited list on NULL
return NULL;
return NULL;
}
}
// Do a pop. We use a reader count to prevent
// a segment to be decommitted while a read is still pending,
// a segment to be decommitted while a read is still pending,
// and a tagged pointer to prevent A-B-A link corruption.
// (this is called from `memory.c:_mi_mem_free` for example)
mi_atomic_increment(&abandoned_readers); // ensure no segment gets decommitted
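The reader count acts as a tiny reference count around the pop: it is raised before the head segment's link is dereferenced, and memory is only returned to the OS once no reader can still be inside that window. A condensed sketch of the two sides (hypothetical names; the real code additionally uses the tagged pointer above to avoid A-B-A corruption):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

typedef struct node_s { struct node_s* next; } node_t;

static _Atomic(node_t*) list_head;   // lock-free list of abandoned nodes
static _Atomic(size_t)  readers;     // threads that may be dereferencing a node

static node_t* pop(void) {
  atomic_fetch_add(&readers, 1);     // from here on, nodes must stay mapped
  node_t* head;
  do {
    head = atomic_load(&list_head);
    if (head == NULL) break;
    // reading head->next below is only safe because readers > 0
  } while (!atomic_compare_exchange_weak(&list_head, &head, head->next));
  atomic_fetch_sub(&readers, 1);
  return head;
}

// the memory-return path only proceeds when no pop() can be mid-read
static bool decommit_allowed(void) {
  return atomic_load(&readers) == 0;
}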
@ -1192,7 +1192,7 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t needed_slice
// free the segment (by forced reclaim) to make it available to other threads.
// note1: we prefer to free a segment as that might lead to reclaiming another
// segment that is still partially used.
// note2: we could in principle optimize this by skipping reclaim and directly
// note2: we could in principle optimize this by skipping reclaim and directly
// freeing but that would violate some invariants temporarily)
mi_segment_reclaim(segment, heap, 0, NULL, tld);
}
@ -1216,7 +1216,7 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t needed_slice
/* -----------------------------------------------------------
Reclaim or allocate
Reclaim or allocate
----------------------------------------------------------- */
static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t needed_slices, size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld)
@ -1293,6 +1293,34 @@ static mi_page_t* mi_segment_huge_page_alloc(size_t size, mi_segments_tld_t* tld
return page;
}
// free huge block from another thread
void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) {
// huge page segments are always abandoned and can be freed immediately by any thread
mi_assert_internal(segment == _mi_page_segment(page));
mi_assert_internal(mi_atomic_read_relaxed(&segment->thread_id)==0);
// claim it and free
mi_heap_t* heap = mi_get_default_heap();
// paranoia: since this is the last reference, the CAS should always succeed
if (mi_atomic_cas_strong(&segment->thread_id, heap->thread_id, 0)) {
mi_block_set_next(page, block, page->free);
page->free = block;
page->used--;
page->is_zero = false;
mi_assert(page->used == 0);
mi_segments_tld_t* tld = &heap->tld->segments;
const size_t bsize = mi_page_usable_block_size(page);
if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
_mi_stat_decrease(&tld->stats->large, bsize);
}
else {
_mi_stat_decrease(&tld->stats->huge, bsize);
}
// mi_segments_track_size((long)segment->segment_size, tld);
_mi_segment_page_free(page, true, tld);
}
}
/* -----------------------------------------------------------
Page allocation and free
----------------------------------------------------------- */