Merge branch 'dev' into bun

Daan 2022-02-02 19:56:27 -08:00 committed by GitHub
commit f4e221917b
160 changed files with 4593 additions and 2473 deletions


@ -14,31 +14,14 @@ terms of the MIT license. A copy of the license can be found in the file
// Aligned Allocation
// ------------------------------------------------------
static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept {
// note: we don't require `size > offset`, we just guarantee that
// the address at offset is aligned regardless of the allocated size.
mi_assert(alignment > 0);
if (mi_unlikely(size > PTRDIFF_MAX)) return NULL; // we don't allocate more than PTRDIFF_MAX (see <https://sourceware.org/ml/libc-announce/2019/msg00001.html>)
if (mi_unlikely(alignment==0 || !_mi_is_power_of_two(alignment))) return NULL; // require power-of-two (see <https://en.cppreference.com/w/c/memory/aligned_alloc>)
// Fallback primitive aligned allocation -- split out for better codegen
static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept
{
mi_assert_internal(size <= PTRDIFF_MAX);
mi_assert_internal(alignment!=0 && _mi_is_power_of_two(alignment) && alignment <= MI_ALIGNMENT_MAX);
const uintptr_t align_mask = alignment-1; // for any x, `(x & align_mask) == (x % alignment)`
// try if there is a small block available with just the right alignment
const size_t padsize = size + MI_PADDING_SIZE;
if (mi_likely(padsize <= MI_SMALL_SIZE_MAX)) {
mi_page_t* page = _mi_heap_get_free_small_page(heap,padsize);
const bool is_aligned = (((uintptr_t)page->free+offset) & align_mask)==0;
if (mi_likely(page->free != NULL && is_aligned))
{
#if MI_STAT>1
mi_heap_stat_increase( heap, malloc, size);
#endif
void* p = _mi_page_malloc(heap,page,padsize); // TODO: inline _mi_page_malloc
mi_assert_internal(p != NULL);
mi_assert_internal(((uintptr_t)p + offset) % alignment == 0);
if (zero) _mi_block_zero_init(page,p,size);
return p;
}
}
// use regular allocation if it is guaranteed to fit the alignment constraints
if (offset==0 && alignment<=padsize && padsize<=MI_MEDIUM_OBJ_SIZE_MAX && (padsize&align_mask)==0) {
@ -46,7 +29,7 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t
mi_assert_internal(p == NULL || ((uintptr_t)p % alignment) == 0);
return p;
}
// otherwise over-allocate
void* p = _mi_heap_malloc_zero(heap, size + alignment - 1, zero);
if (p == NULL) return NULL;
@ -55,21 +38,90 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t
uintptr_t adjust = alignment - (((uintptr_t)p + offset) & align_mask);
mi_assert_internal(adjust <= alignment);
void* aligned_p = (adjust == alignment ? p : (void*)((uintptr_t)p + adjust));
if (aligned_p != p) mi_page_set_has_aligned(_mi_ptr_page(p), true);
mi_assert_internal(((uintptr_t)aligned_p + offset) % alignment == 0);
mi_assert_internal( p == _mi_page_ptr_unalign(_mi_ptr_segment(aligned_p),_mi_ptr_page(aligned_p),aligned_p) );
mi_assert_internal(p == _mi_page_ptr_unalign(_mi_ptr_segment(aligned_p), _mi_ptr_page(aligned_p), aligned_p));
return aligned_p;
}
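
As an aside, a minimal standalone sketch of the pointer-adjustment arithmetic used in the over-allocation path above, written against plain malloc for illustration (the helper name align_at is hypothetical and not part of this diff):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

// Shift an over-allocated pointer forward so that (p + offset) becomes a
// multiple of alignment; alignment must be a power of two.
static void* align_at(void* p, size_t alignment, size_t offset) {
  const uintptr_t align_mask = alignment - 1;
  const uintptr_t adjust = alignment - (((uintptr_t)p + offset) & align_mask);
  return (adjust == alignment ? p : (void*)((uintptr_t)p + adjust));  // adjust==alignment means already aligned
}

int main(void) {
  const size_t size = 100, alignment = 64, offset = 16;
  void* p = malloc(size + alignment - 1);                 // over-allocate, as above
  void* aligned_p = align_at(p, alignment, offset);
  printf("((aligned_p + offset) %% alignment) == %zu\n",
         (size_t)(((uintptr_t)aligned_p + offset) % alignment));  // prints 0
  free(p);
  return 0;
}
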
// Primitive aligned allocation
static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept
{
// note: we don't require `size > offset`, we just guarantee that the address at offset is aligned regardless of the allocated size.
mi_assert(alignment > 0);
if (mi_unlikely(alignment==0 || !_mi_is_power_of_two(alignment))) { // require power-of-two (see <https://en.cppreference.com/w/c/memory/aligned_alloc>)
#if MI_DEBUG > 0
_mi_error_message(EOVERFLOW, "aligned allocation requires the alignment to be a power-of-two (size %zu, alignment %zu)\n", size, alignment);
#endif
return NULL;
}
if (mi_unlikely(alignment > MI_ALIGNMENT_MAX)) { // we cannot align at a boundary larger than this (or otherwise we cannot find segment headers)
#if MI_DEBUG > 0
_mi_error_message(EOVERFLOW, "aligned allocation has a maximum alignment of %zu (size %zu, alignment %zu)\n", MI_ALIGNMENT_MAX, size, alignment);
#endif
return NULL;
}
if (mi_unlikely(size > PTRDIFF_MAX)) { // we don't allocate more than PTRDIFF_MAX (see <https://sourceware.org/ml/libc-announce/2019/msg00001.html>)
#if MI_DEBUG > 0
_mi_error_message(EOVERFLOW, "aligned allocation request is too large (size %zu, alignment %zu)\n", size, alignment);
#endif
return NULL;
}
const uintptr_t align_mask = alignment-1; // for any x, `(x & align_mask) == (x % alignment)`
const size_t padsize = size + MI_PADDING_SIZE; // note: cannot overflow due to earlier size > PTRDIFF_MAX check
// try first if there happens to be a small block available with just the right alignment
if (mi_likely(padsize <= MI_SMALL_SIZE_MAX)) {
mi_page_t* page = _mi_heap_get_free_small_page(heap, padsize);
const bool is_aligned = (((uintptr_t)page->free+offset) & align_mask)==0;
if (mi_likely(page->free != NULL && is_aligned))
{
#if MI_STAT>1
mi_heap_stat_increase(heap, malloc, size);
#endif
void* p = _mi_page_malloc(heap, page, padsize); // TODO: inline _mi_page_malloc
mi_assert_internal(p != NULL);
mi_assert_internal(((uintptr_t)p + offset) % alignment == 0);
if (zero) { _mi_block_zero_init(page, p, size); }
return p;
}
}
// fallback
return mi_heap_malloc_zero_aligned_at_fallback(heap, size, alignment, offset, zero);
}
// ------------------------------------------------------
// Optimized mi_heap_malloc_aligned / mi_malloc_aligned
// ------------------------------------------------------
mi_decl_restrict void* mi_heap_malloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
return mi_heap_malloc_zero_aligned_at(heap, size, alignment, offset, false);
}
mi_decl_restrict void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept {
return mi_heap_malloc_aligned_at(heap, size, alignment, 0);
#if !MI_PADDING
// without padding, any small sized allocation is naturally aligned (see also `_mi_segment_page_start`)
if (!_mi_is_power_of_two(alignment)) return NULL;
if (mi_likely(_mi_is_power_of_two(size) && size >= alignment && size <= MI_SMALL_SIZE_MAX))
#else
// with padding, we can only guarantee this for fixed alignments
if (mi_likely((alignment == sizeof(void*) || (alignment == MI_MAX_ALIGN_SIZE && size > (MI_MAX_ALIGN_SIZE/2)))
&& size <= MI_SMALL_SIZE_MAX))
#endif
{
// fast path for common alignment and size
return mi_heap_malloc_small(heap, size);
}
else {
return mi_heap_malloc_aligned_at(heap, size, alignment, 0);
}
}
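
For orientation, a small caller-side sketch (assuming mimalloc is linked and `mimalloc.h` is on the include path); whether the first call takes the small fast path above depends on the padding configuration, while the larger alignment in the second call goes through mi_heap_malloc_aligned_at:

#include <mimalloc.h>
#include <assert.h>
#include <stdint.h>

int main(void) {
  void* a = mi_malloc_aligned(48, sizeof(void*));  // word alignment, small size: candidate for the fast path
  void* b = mi_malloc_aligned(200, 256);           // larger alignment: aligned allocation path
  assert(((uintptr_t)a % sizeof(void*)) == 0);
  assert(((uintptr_t)b % 256) == 0);
  mi_free(a);
  mi_free(b);
  return 0;
}
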
// ------------------------------------------------------
// Aligned Allocation
// ------------------------------------------------------
mi_decl_restrict void* mi_heap_zalloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
return mi_heap_malloc_zero_aligned_at(heap, size, alignment, offset, true);
}
@ -113,6 +165,10 @@ mi_decl_restrict void* mi_calloc_aligned(size_t count, size_t size, size_t align
}
// ------------------------------------------------------
// Aligned re-allocation
// ------------------------------------------------------
static void* mi_heap_realloc_zero_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset, bool zero) mi_attr_noexcept {
mi_assert(alignment > 0);
if (alignment <= sizeof(uintptr_t)) return _mi_heap_realloc_zero(heap,p,newsize,zero);


@ -1,5 +1,5 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2018-2020, Microsoft Research, Daan Leijen
Copyright (c) 2018-2022, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
@ -17,20 +17,22 @@ terms of the MIT license. A copy of the license can be found in the file
/* ------------------------------------------------------
Override system malloc on macOS
This is done through the malloc zone interface.
It seems we also need to interpose (see `alloc-override.c`)
or otherwise we get zone errors as there are usually
already allocations done by the time we take over the
zone. Unfortunately, that means we need to replace
the `free` with a checked free (`cfree`) impacting
performance.
It seems to be most robust in combination with interposing
though, as otherwise we may get zone errors since there could
be allocations done by the time we take over the
zone.
------------------------------------------------------ */
#include <AvailabilityMacros.h>
#include <malloc/malloc.h>
#include <string.h> // memset
#include <stdlib.h>
#if defined(MAC_OS_X_VERSION_10_6) && \
MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6
#ifdef __cplusplus
extern "C" {
#endif
#if defined(MAC_OS_X_VERSION_10_6) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6)
// only available from OSX 10.6
extern malloc_zone_t* malloc_default_purgeable_zone(void) __attribute__((weak_import));
#endif
@ -40,45 +42,43 @@ extern malloc_zone_t* malloc_default_purgeable_zone(void) __attribute__((weak_im
------------------------------------------------------ */
static size_t zone_size(malloc_zone_t* zone, const void* p) {
UNUSED(zone);
if (!mi_is_in_heap_region(p))
return 0; // not our pointer, bail out
MI_UNUSED(zone);
//if (!mi_is_in_heap_region(p)){ return 0; } // not our pointer, bail out
return mi_usable_size(p);
}
static void* zone_malloc(malloc_zone_t* zone, size_t size) {
UNUSED(zone);
MI_UNUSED(zone);
return mi_malloc(size);
}
static void* zone_calloc(malloc_zone_t* zone, size_t count, size_t size) {
UNUSED(zone);
MI_UNUSED(zone);
return mi_calloc(count, size);
}
static void* zone_valloc(malloc_zone_t* zone, size_t size) {
UNUSED(zone);
MI_UNUSED(zone);
return mi_malloc_aligned(size, _mi_os_page_size());
}
static void zone_free(malloc_zone_t* zone, void* p) {
UNUSED(zone);
MI_UNUSED(zone);
mi_free(p);
}
static void* zone_realloc(malloc_zone_t* zone, void* p, size_t newsize) {
UNUSED(zone);
MI_UNUSED(zone);
return mi_realloc(p, newsize);
}
static void* zone_memalign(malloc_zone_t* zone, size_t alignment, size_t size) {
UNUSED(zone);
MI_UNUSED(zone);
return mi_malloc_aligned(size,alignment);
}
static void zone_destroy(malloc_zone_t* zone) {
UNUSED(zone);
MI_UNUSED(zone);
// todo: ignore for now?
}
@ -99,16 +99,21 @@ static void zone_batch_free(malloc_zone_t* zone, void** ps, unsigned count) {
}
static size_t zone_pressure_relief(malloc_zone_t* zone, size_t size) {
UNUSED(zone); UNUSED(size);
MI_UNUSED(zone); MI_UNUSED(size);
mi_collect(false);
return 0;
}
static void zone_free_definite_size(malloc_zone_t* zone, void* p, size_t size) {
UNUSED(size);
MI_UNUSED(size);
zone_free(zone,p);
}
static boolean_t zone_claimed_address(malloc_zone_t* zone, void* p) {
MI_UNUSED(zone);
return mi_is_in_heap_region(p);
}
/* ------------------------------------------------------
Introspection members
@ -120,43 +125,43 @@ static kern_return_t intro_enumerator(task_t task, void* p,
vm_range_recorder_t recorder)
{
// todo: enumerate all memory
UNUSED(task); UNUSED(p); UNUSED(type_mask); UNUSED(zone_address);
UNUSED(reader); UNUSED(recorder);
MI_UNUSED(task); MI_UNUSED(p); MI_UNUSED(type_mask); MI_UNUSED(zone_address);
MI_UNUSED(reader); MI_UNUSED(recorder);
return KERN_SUCCESS;
}
static size_t intro_good_size(malloc_zone_t* zone, size_t size) {
UNUSED(zone);
MI_UNUSED(zone);
return mi_good_size(size);
}
static boolean_t intro_check(malloc_zone_t* zone) {
UNUSED(zone);
MI_UNUSED(zone);
return true;
}
static void intro_print(malloc_zone_t* zone, boolean_t verbose) {
UNUSED(zone); UNUSED(verbose);
MI_UNUSED(zone); MI_UNUSED(verbose);
mi_stats_print(NULL);
}
static void intro_log(malloc_zone_t* zone, void* p) {
UNUSED(zone); UNUSED(p);
MI_UNUSED(zone); MI_UNUSED(p);
// todo?
}
static void intro_force_lock(malloc_zone_t* zone) {
UNUSED(zone);
MI_UNUSED(zone);
// todo?
}
static void intro_force_unlock(malloc_zone_t* zone) {
UNUSED(zone);
MI_UNUSED(zone);
// todo?
}
static void intro_statistics(malloc_zone_t* zone, malloc_statistics_t* stats) {
UNUSED(zone);
MI_UNUSED(zone);
// todo...
stats->blocks_in_use = 0;
stats->size_in_use = 0;
@ -165,7 +170,7 @@ static void intro_statistics(malloc_zone_t* zone, malloc_statistics_t* stats) {
}
static boolean_t intro_zone_locked(malloc_zone_t* zone) {
UNUSED(zone);
MI_UNUSED(zone);
return false;
}
@ -174,7 +179,228 @@ static boolean_t intro_zone_locked(malloc_zone_t* zone) {
At process start, override the default allocator
------------------------------------------------------ */
static malloc_zone_t* mi_get_default_zone()
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
#endif
#if defined(__clang__)
#pragma clang diagnostic ignored "-Wc99-extensions"
#endif
static malloc_introspection_t mi_introspect = {
.enumerator = &intro_enumerator,
.good_size = &intro_good_size,
.check = &intro_check,
.print = &intro_print,
.log = &intro_log,
.force_lock = &intro_force_lock,
.force_unlock = &intro_force_unlock,
#if defined(MAC_OS_X_VERSION_10_6) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6)
.statistics = &intro_statistics,
.zone_locked = &intro_zone_locked,
#endif
};
static malloc_zone_t mi_malloc_zone = {
// note: even with designators, the order is important for C++ compilation
//.reserved1 = NULL,
//.reserved2 = NULL,
.size = &zone_size,
.malloc = &zone_malloc,
.calloc = &zone_calloc,
.valloc = &zone_valloc,
.free = &zone_free,
.realloc = &zone_realloc,
.destroy = &zone_destroy,
.zone_name = "mimalloc",
.batch_malloc = &zone_batch_malloc,
.batch_free = &zone_batch_free,
.introspect = &mi_introspect,
#if defined(MAC_OS_X_VERSION_10_6) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6)
#if defined(MAC_OS_X_VERSION_10_7) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_7)
.version = 10,
#else
.version = 9,
#endif
// switch to version 9+ on OSX 10.6 to support memalign.
.memalign = &zone_memalign,
.free_definite_size = &zone_free_definite_size,
.pressure_relief = &zone_pressure_relief,
#if defined(MAC_OS_X_VERSION_10_7) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_7)
.claimed_address = &zone_claimed_address,
#endif
#else
.version = 4,
#endif
};
#ifdef __cplusplus
}
#endif
#if defined(MI_OSX_INTERPOSE) && defined(MI_SHARED_LIB_EXPORT)
// ------------------------------------------------------
// Override malloc_xxx and malloc_zone_xxx APIs to use only
// our mimalloc zone. Since even the loader uses malloc
// on macOS, this ensures that all allocations go through
// mimalloc (as all calls are interposed).
// The main `malloc`, `free`, etc. calls are interposed in `alloc-override.c`;
// here we also override macOS-specific APIs like
// `malloc_zone_calloc`; see <https://github.com/aosm/libmalloc/blob/master/man/malloc_zone_malloc.3>
// ------------------------------------------------------
static inline malloc_zone_t* mi_get_default_zone(void)
{
static bool init;
if (mi_unlikely(!init)) {
init = true;
malloc_zone_register(&mi_malloc_zone); // by calling register we avoid a zone error on free (see <http://eatmyrandom.blogspot.com/2010/03/mallocfree-interception-on-mac-os-x.html>)
}
return &mi_malloc_zone;
}
mi_decl_externc int malloc_jumpstart(uintptr_t cookie);
mi_decl_externc void _malloc_fork_prepare(void);
mi_decl_externc void _malloc_fork_parent(void);
mi_decl_externc void _malloc_fork_child(void);
static malloc_zone_t* mi_malloc_create_zone(vm_size_t size, unsigned flags) {
MI_UNUSED(size); MI_UNUSED(flags);
return mi_get_default_zone();
}
static malloc_zone_t* mi_malloc_default_zone (void) {
return mi_get_default_zone();
}
static malloc_zone_t* mi_malloc_default_purgeable_zone(void) {
return mi_get_default_zone();
}
static void mi_malloc_destroy_zone(malloc_zone_t* zone) {
MI_UNUSED(zone);
// nothing.
}
static kern_return_t mi_malloc_get_all_zones (task_t task, memory_reader_t mr, vm_address_t** addresses, unsigned* count) {
MI_UNUSED(task); MI_UNUSED(mr);
if (addresses != NULL) *addresses = NULL;
if (count != NULL) *count = 0;
return KERN_SUCCESS;
}
static const char* mi_malloc_get_zone_name(malloc_zone_t* zone) {
return (zone == NULL ? mi_malloc_zone.zone_name : zone->zone_name);
}
static void mi_malloc_set_zone_name(malloc_zone_t* zone, const char* name) {
MI_UNUSED(zone); MI_UNUSED(name);
}
static int mi_malloc_jumpstart(uintptr_t cookie) {
MI_UNUSED(cookie);
return 1; // or 0 for no error?
}
static void mi__malloc_fork_prepare(void) {
// nothing
}
static void mi__malloc_fork_parent(void) {
// nothing
}
static void mi__malloc_fork_child(void) {
// nothing
}
static void mi_malloc_printf(const char* fmt, ...) {
MI_UNUSED(fmt);
}
static bool zone_check(malloc_zone_t* zone) {
MI_UNUSED(zone);
return true;
}
static malloc_zone_t* zone_from_ptr(const void* p) {
MI_UNUSED(p);
return mi_get_default_zone();
}
static void zone_log(malloc_zone_t* zone, void* p) {
MI_UNUSED(zone); MI_UNUSED(p);
}
static void zone_print(malloc_zone_t* zone, bool b) {
MI_UNUSED(zone); MI_UNUSED(b);
}
static void zone_print_ptr_info(void* p) {
MI_UNUSED(p);
}
static void zone_register(malloc_zone_t* zone) {
MI_UNUSED(zone);
}
static void zone_unregister(malloc_zone_t* zone) {
MI_UNUSED(zone);
}
// use interposing so `DYLD_INSERT_LIBRARIES` works without `DYLD_FORCE_FLAT_NAMESPACE=1`
// See: <https://books.google.com/books?id=K8vUkpOXhN4C&pg=PA73>
struct mi_interpose_s {
const void* replacement;
const void* target;
};
#define MI_INTERPOSE_FUN(oldfun,newfun) { (const void*)&newfun, (const void*)&oldfun }
#define MI_INTERPOSE_MI(fun) MI_INTERPOSE_FUN(fun,mi_##fun)
#define MI_INTERPOSE_ZONE(fun) MI_INTERPOSE_FUN(malloc_##fun,fun)
__attribute__((used)) static const struct mi_interpose_s _mi_zone_interposes[] __attribute__((section("__DATA, __interpose"))) =
{
MI_INTERPOSE_MI(malloc_create_zone),
MI_INTERPOSE_MI(malloc_default_purgeable_zone),
MI_INTERPOSE_MI(malloc_default_zone),
MI_INTERPOSE_MI(malloc_destroy_zone),
MI_INTERPOSE_MI(malloc_get_all_zones),
MI_INTERPOSE_MI(malloc_get_zone_name),
MI_INTERPOSE_MI(malloc_jumpstart),
MI_INTERPOSE_MI(malloc_printf),
MI_INTERPOSE_MI(malloc_set_zone_name),
MI_INTERPOSE_MI(_malloc_fork_child),
MI_INTERPOSE_MI(_malloc_fork_parent),
MI_INTERPOSE_MI(_malloc_fork_prepare),
MI_INTERPOSE_ZONE(zone_batch_free),
MI_INTERPOSE_ZONE(zone_batch_malloc),
MI_INTERPOSE_ZONE(zone_calloc),
MI_INTERPOSE_ZONE(zone_check),
MI_INTERPOSE_ZONE(zone_free),
MI_INTERPOSE_ZONE(zone_from_ptr),
MI_INTERPOSE_ZONE(zone_log),
MI_INTERPOSE_ZONE(zone_malloc),
MI_INTERPOSE_ZONE(zone_memalign),
MI_INTERPOSE_ZONE(zone_print),
MI_INTERPOSE_ZONE(zone_print_ptr_info),
MI_INTERPOSE_ZONE(zone_realloc),
MI_INTERPOSE_ZONE(zone_register),
MI_INTERPOSE_ZONE(zone_unregister),
MI_INTERPOSE_ZONE(zone_valloc)
};
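
For reference, MI_INTERPOSE_MI(malloc_default_zone) expands via MI_INTERPOSE_FUN to the pair { (const void*)&mi_malloc_default_zone, (const void*)&malloc_default_zone }, i.e. replacement first, target second. A minimal standalone dylib using the same __DATA,__interpose mechanism might look as follows (macOS-only sketch; my_malloc and the file name are hypothetical and not part of this diff):

// build:  clang -dynamiclib interpose_demo.c -o libinterpose_demo.dylib
// use:    DYLD_INSERT_LIBRARIES=./libinterpose_demo.dylib ./some_program
#include <stdio.h>
#include <stdlib.h>

static void* my_malloc(size_t size) {
  fprintf(stderr, "malloc(%zu) intercepted\n", size);
  return malloc(size);   // calls inside the interposing image are not rebound, so this reaches the real malloc
}

// each entry is { replacement, target }; dyld rebinds calls to the target in other images
__attribute__((used)) static const struct { const void* replacement; const void* target; }
  interposers[] __attribute__((section("__DATA,__interpose"))) = {
  { (const void*)&my_malloc, (const void*)&malloc },
};
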
#else
// ------------------------------------------------------
// hook into the zone APIs without interposing
// This is the official way of adding an allocator but
// it seems less robust than using interposing.
// ------------------------------------------------------
static inline malloc_zone_t* mi_get_default_zone(void)
{
// The first returned zone is the real default
malloc_zone_t** zones = NULL;
@ -189,70 +415,20 @@ static malloc_zone_t* mi_get_default_zone()
}
}
static malloc_introspection_t mi_introspect = {
.enumerator = &intro_enumerator,
.good_size = &intro_good_size,
.check = &intro_check,
.print = &intro_print,
.log = &intro_log,
.force_lock = &intro_force_lock,
.force_unlock = &intro_force_unlock,
#if defined(MAC_OS_X_VERSION_10_6) && \
MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6
.zone_locked = &intro_zone_locked,
.statistics = &intro_statistics,
#endif
};
static malloc_zone_t mi_malloc_zone = {
.size = &zone_size,
.zone_name = "mimalloc",
.introspect = &mi_introspect,
.malloc = &zone_malloc,
.calloc = &zone_calloc,
.valloc = &zone_valloc,
.free = &zone_free,
.realloc = &zone_realloc,
.destroy = &zone_destroy,
.batch_malloc = &zone_batch_malloc,
.batch_free = &zone_batch_free,
#if defined(MAC_OS_X_VERSION_10_6) && \
MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6
// switch to version 9 on OSX 10.6 to support memalign.
.version = 9,
.memalign = &zone_memalign,
.free_definite_size = &zone_free_definite_size,
.pressure_relief = &zone_pressure_relief,
#if defined(__clang__)
__attribute__((constructor(0)))
#else
.version = 4,
__attribute__((constructor)) // seems not supported by g++-11 on the M1
#endif
};
#if defined(MI_SHARED_LIB_EXPORT) && defined(MI_INTERPOSE)
static malloc_zone_t *mi_malloc_default_zone(void) {
return &mi_malloc_zone;
}
// TODO: should use the macros in alloc-override but they aren't available here.
__attribute__((used)) static struct {
const void *replacement;
const void *target;
} replace_malloc_default_zone[] __attribute__((section("__DATA, __interpose"))) = {
{ (const void*)mi_malloc_default_zone, (const void*)malloc_default_zone },
};
#endif
static void __attribute__((constructor(0))) _mi_macos_override_malloc() {
static void _mi_macos_override_malloc() {
malloc_zone_t* purgeable_zone = NULL;
#if defined(MAC_OS_X_VERSION_10_6) && \
MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6
#if defined(MAC_OS_X_VERSION_10_6) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6)
// force the purgeable zone to exist to avoid strange bugs
if (malloc_default_purgeable_zone) {
purgeable_zone = malloc_default_purgeable_zone();
}
#endif
#endif
// Register our zone.
// thomcc: I think this is still needed to put us in the zone list.
@ -277,5 +453,6 @@ static void __attribute__((constructor(0))) _mi_macos_override_malloc() {
}
}
#endif // MI_OSX_INTERPOSE
#endif // MI_MALLOC_OVERRIDE


@ -13,15 +13,25 @@ terms of the MIT license. A copy of the license can be found in the file
#error "It is only possible to override "malloc" on Windows when building as a DLL (and linking the C runtime as a DLL)"
#endif
#if defined(MI_MALLOC_OVERRIDE) && !(defined(_WIN32)) // || (defined(__APPLE__) && !defined(MI_INTERPOSE)))
#if defined(MI_MALLOC_OVERRIDE) && !(defined(_WIN32))
#if defined(__APPLE__)
mi_decl_externc void vfree(void* p);
mi_decl_externc size_t malloc_size(const void* p);
mi_decl_externc size_t malloc_good_size(size_t size);
#endif
// helper definition for C override of C++ new
typedef struct mi_nothrow_s { int _tag; } mi_nothrow_t;
// ------------------------------------------------------
// Override system malloc
// ------------------------------------------------------
#if (defined(__GNUC__) || defined(__clang__)) && !defined(__APPLE__)
// use aliasing to alias the exported function to one of our `mi_` functions
#if (defined(__GNUC__) || defined(__clang__)) && !defined(__APPLE__) && !defined(MI_VALGRIND)
// gcc, clang: use aliasing to alias the exported function to one of our `mi_` functions
#if (defined(__GNUC__) && __GNUC__ >= 9)
#pragma GCC diagnostic ignored "-Wattributes" // or we get warnings that nodiscard is ignored on a forward
#define MI_FORWARD(fun) __attribute__((alias(#fun), used, visibility("default"), copy(fun)));
#else
#define MI_FORWARD(fun) __attribute__((alias(#fun), used, visibility("default")));
@ -32,7 +42,7 @@ terms of the MIT license. A copy of the license can be found in the file
#define MI_FORWARD0(fun,x) MI_FORWARD(fun)
#define MI_FORWARD02(fun,x,y) MI_FORWARD(fun)
#else
// use forwarding by calling our `mi_` function
// otherwise use forwarding by calling our `mi_` function
#define MI_FORWARD1(fun,x) { return fun(x); }
#define MI_FORWARD2(fun,x,y) { return fun(x,y); }
#define MI_FORWARD3(fun,x,y,z) { return fun(x,y,z); }
@ -40,7 +50,11 @@ terms of the MIT license. A copy of the license can be found in the file
#define MI_FORWARD02(fun,x,y) { fun(x,y); }
#endif
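
To make the two forwarding modes above concrete: with aliasing, `void* malloc(size_t size) MI_FORWARD1(mi_malloc, size)` turns `malloc` into an alias of `mi_malloc` with no wrapper call, while the fallback expands to `{ return mi_malloc(size); }`. A tiny standalone demo of the alias attribute itself (gcc/clang on ELF targets; the names here are hypothetical):

#include <stdio.h>

static int sum_impl(int a, int b) { return a + b; }

// `fast_sum` is emitted as an alias of `sum_impl`: callers bind directly to the
// implementation, the same trick MI_FORWARD uses to bind malloc to mi_malloc.
int fast_sum(int a, int b) __attribute__((alias("sum_impl")));

int main(void) {
  printf("%d\n", fast_sum(2, 3));  // prints 5
  return 0;
}
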
#if defined(__APPLE__) && defined(MI_SHARED_LIB_EXPORT) && defined(MI_INTERPOSE)
#if defined(__APPLE__) && defined(MI_SHARED_LIB_EXPORT) && defined(MI_OSX_INTERPOSE)
// define MI_OSX_IS_INTERPOSED as we should not provide forwarding definitions for
// functions that are interposed (or the interposing does not work)
#define MI_OSX_IS_INTERPOSED
// use interposing so `DYLD_INSERT_LIBRARIES` works without `DYLD_FORCE_FLAT_NAMESPACE=1`
// See: <https://books.google.com/books?id=K8vUkpOXhN4C&pg=PA73>
struct mi_interpose_s {
@ -49,6 +63,7 @@ terms of the MIT license. A copy of the license can be found in the file
};
#define MI_INTERPOSE_FUN(oldfun,newfun) { (const void*)&newfun, (const void*)&oldfun }
#define MI_INTERPOSE_MI(fun) MI_INTERPOSE_FUN(fun,mi_##fun)
__attribute__((used)) static struct mi_interpose_s _mi_interposes[] __attribute__((section("__DATA, __interpose"))) =
{
MI_INTERPOSE_MI(malloc),
@ -60,21 +75,49 @@ terms of the MIT license. A copy of the license can be found in the file
MI_INTERPOSE_MI(posix_memalign),
MI_INTERPOSE_MI(reallocf),
MI_INTERPOSE_MI(valloc),
#ifndef MI_OSX_ZONE
// some code allocates from default zone but deallocates using plain free :-( (like NxHashResizeToCapacity <https://github.com/nneonneo/osx-10.9-opensource/blob/master/objc4-551.1/runtime/hashtable2.mm>)
MI_INTERPOSE_FUN(free,mi_cfree), // use safe free that checks if pointers are from us
#else
// We interpose malloc_default_zone in alloc-override-osx.c
MI_INTERPOSE_MI(malloc_size),
MI_INTERPOSE_MI(malloc_good_size),
MI_INTERPOSE_MI(aligned_alloc),
#ifdef MI_OSX_ZONE
// we interpose malloc_default_zone in alloc-override-osx.c so we can use mi_free safely
MI_INTERPOSE_MI(free),
#endif
// some code allocates from a zone but deallocates using plain free :-( (like NxHashResizeToCapacity <https://github.com/nneonneo/osx-10.9-opensource/blob/master/objc4-551.1/runtime/hashtable2.mm>)
MI_INTERPOSE_FUN(vfree,mi_free),
#else
// sometimes code allocates from default zone but deallocates using plain free :-( (like NxHashResizeToCapacity <https://github.com/nneonneo/osx-10.9-opensource/blob/master/objc4-551.1/runtime/hashtable2.mm>)
MI_INTERPOSE_FUN(free,mi_cfree), // use safe free that checks if pointers are from us
MI_INTERPOSE_FUN(vfree,mi_cfree),
#endif
};
#ifdef __cplusplus
extern "C" {
void _ZdlPv(void* p); // delete
void _ZdaPv(void* p); // delete[]
void _ZdlPvm(void* p, size_t n); // delete
void _ZdaPvm(void* p, size_t n); // delete[]
void* _Znwm(size_t n); // new
void* _Znam(size_t n); // new[]
void* _ZnwmRKSt9nothrow_t(size_t n, mi_nothrow_t tag); // new nothrow
void* _ZnamRKSt9nothrow_t(size_t n, mi_nothrow_t tag); // new[] nothrow
}
__attribute__((used)) static struct mi_interpose_s _mi_cxx_interposes[] __attribute__((section("__DATA, __interpose"))) =
{
MI_INTERPOSE_FUN(_ZdlPv,mi_free),
MI_INTERPOSE_FUN(_ZdaPv,mi_free),
MI_INTERPOSE_FUN(_ZdlPvm,mi_free_size),
MI_INTERPOSE_FUN(_ZdaPvm,mi_free_size),
MI_INTERPOSE_FUN(_Znwm,mi_new),
MI_INTERPOSE_FUN(_Znam,mi_new),
MI_INTERPOSE_FUN(_ZnwmRKSt9nothrow_t,mi_new_nothrow),
MI_INTERPOSE_FUN(_ZnamRKSt9nothrow_t,mi_new_nothrow),
};
#endif // __cplusplus
#elif defined(_MSC_VER)
// cannot override malloc unless using a dll.
// we just override new/delete which does work in a static library.
#else
// On all other systems forward to our API
void* malloc(size_t size) MI_FORWARD1(mi_malloc, size)
void* calloc(size_t size, size_t n) MI_FORWARD2(mi_calloc, size, n)
void* realloc(void* p, size_t newsize) MI_FORWARD2(mi_realloc, p, newsize)
@ -96,18 +139,21 @@ terms of the MIT license. A copy of the license can be found in the file
// see <https://en.cppreference.com/w/cpp/memory/new/operator_new>
// ------------------------------------------------------
#include <new>
void operator delete(void* p) noexcept MI_FORWARD0(mi_free,p)
void operator delete[](void* p) noexcept MI_FORWARD0(mi_free,p)
void* operator new(std::size_t n) noexcept(false) MI_FORWARD1(mi_new,n)
void* operator new[](std::size_t n) noexcept(false) MI_FORWARD1(mi_new,n)
#ifndef MI_OSX_IS_INTERPOSED
void operator delete(void* p) noexcept MI_FORWARD0(mi_free,p)
void operator delete[](void* p) noexcept MI_FORWARD0(mi_free,p)
void* operator new (std::size_t n, const std::nothrow_t& tag) noexcept { UNUSED(tag); return mi_new_nothrow(n); }
void* operator new[](std::size_t n, const std::nothrow_t& tag) noexcept { UNUSED(tag); return mi_new_nothrow(n); }
void* operator new(std::size_t n) noexcept(false) MI_FORWARD1(mi_new,n)
void* operator new[](std::size_t n) noexcept(false) MI_FORWARD1(mi_new,n)
#if (__cplusplus >= 201402L || _MSC_VER >= 1916)
void operator delete (void* p, std::size_t n) noexcept MI_FORWARD02(mi_free_size,p,n)
void operator delete[](void* p, std::size_t n) noexcept MI_FORWARD02(mi_free_size,p,n)
void* operator new (std::size_t n, const std::nothrow_t& tag) noexcept { MI_UNUSED(tag); return mi_new_nothrow(n); }
void* operator new[](std::size_t n, const std::nothrow_t& tag) noexcept { MI_UNUSED(tag); return mi_new_nothrow(n); }
#if (__cplusplus >= 201402L || _MSC_VER >= 1916)
void operator delete (void* p, std::size_t n) noexcept MI_FORWARD02(mi_free_size,p,n)
void operator delete[](void* p, std::size_t n) noexcept MI_FORWARD02(mi_free_size,p,n)
#endif
#endif
#if (__cplusplus > 201402L && defined(__cpp_aligned_new)) && (!defined(__GNUC__) || (__GNUC__ > 5))
@ -122,12 +168,13 @@ terms of the MIT license. A copy of the license can be found in the file
void* operator new[](std::size_t n, std::align_val_t al, const std::nothrow_t&) noexcept { return mi_new_aligned_nothrow(n, static_cast<size_t>(al)); }
#endif
#elif (defined(__GNUC__) || defined(__clang__))
#elif (defined(__GNUC__) || defined(__clang__))
// ------------------------------------------------------
// Override by defining the mangled C++ names of the operators (as
// used by GCC and CLang).
// See <https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangling>
// ------------------------------------------------------
void _ZdlPv(void* p) MI_FORWARD0(mi_free,p) // delete
void _ZdaPv(void* p) MI_FORWARD0(mi_free,p) // delete[]
void _ZdlPvm(void* p, size_t n) MI_FORWARD02(mi_free_size,p,n)
@ -136,78 +183,83 @@ terms of the MIT license. A copy of the license can be found in the file
void _ZdaPvSt11align_val_t(void* p, size_t al) { mi_free_aligned(p,al); }
void _ZdlPvmSt11align_val_t(void* p, size_t n, size_t al) { mi_free_size_aligned(p,n,al); }
void _ZdaPvmSt11align_val_t(void* p, size_t n, size_t al) { mi_free_size_aligned(p,n,al); }
typedef struct mi_nothrow_s { int _tag; } mi_nothrow_t;
#if (MI_INTPTR_SIZE==8)
void* _Znwm(size_t n) MI_FORWARD1(mi_new,n) // new 64-bit
void* _Znam(size_t n) MI_FORWARD1(mi_new,n) // new[] 64-bit
void* _ZnwmRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); }
void* _ZnamRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); }
void* _ZnwmSt11align_val_t(size_t n, size_t al) MI_FORWARD2(mi_new_aligned, n, al)
void* _ZnamSt11align_val_t(size_t n, size_t al) MI_FORWARD2(mi_new_aligned, n, al)
void* _ZnwmRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { UNUSED(tag); return mi_new_nothrow(n); }
void* _ZnamRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { UNUSED(tag); return mi_new_nothrow(n); }
void* _ZnwmSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { UNUSED(tag); return mi_new_aligned_nothrow(n,al); }
void* _ZnamSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { UNUSED(tag); return mi_new_aligned_nothrow(n,al); }
void* _ZnwmSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_aligned_nothrow(n,al); }
void* _ZnamSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_aligned_nothrow(n,al); }
#elif (MI_INTPTR_SIZE==4)
void* _Znwj(size_t n) MI_FORWARD1(mi_new,n) // new 32-bit
void* _Znaj(size_t n) MI_FORWARD1(mi_new,n) // new[] 32-bit
void* _ZnwjRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); }
void* _ZnajRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); }
void* _ZnwjSt11align_val_t(size_t n, size_t al) MI_FORWARD2(mi_new_aligned, n, al)
void* _ZnajSt11align_val_t(size_t n, size_t al) MI_FORWARD2(mi_new_aligned, n, al)
void* _ZnwjRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { UNUSED(tag); return mi_new_nothrow(n); }
void* _ZnajRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { UNUSED(tag); return mi_new_nothrow(n); }
void* _ZnwjSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { UNUSED(tag); return mi_new_aligned_nothrow(n,al); }
void* _ZnajSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { UNUSED(tag); return mi_new_aligned_nothrow(n,al); }
void* _ZnwjSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_aligned_nothrow(n,al); }
void* _ZnajSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_aligned_nothrow(n,al); }
#else
#error "define overloads for new/delete for this platform (just for performance, can be skipped)"
#error "define overloads for new/delete for this platform (just for performance, can be skipped)"
#endif
#endif // __cplusplus
// ------------------------------------------------------
// Further Posix & Unix functions definitions
// ------------------------------------------------------
#ifdef __cplusplus
extern "C" {
#endif
// ------------------------------------------------------
// Posix & Unix functions definitions
// ------------------------------------------------------
#ifndef MI_OSX_IS_INTERPOSED
// Forward Posix/Unix calls as well
void* reallocf(void* p, size_t newsize) MI_FORWARD2(mi_reallocf,p,newsize)
size_t malloc_size(const void* p) MI_FORWARD1(mi_usable_size,p)
#if !defined(__ANDROID__) && !defined(__FreeBSD__)
size_t malloc_usable_size(void *p) MI_FORWARD1(mi_usable_size,p)
#else
size_t malloc_usable_size(const void *p) MI_FORWARD1(mi_usable_size,p)
#endif
void cfree(void* p) MI_FORWARD0(mi_free, p)
void* reallocf(void* p, size_t newsize) MI_FORWARD2(mi_reallocf,p,newsize)
size_t malloc_size(const void* p) MI_FORWARD1(mi_usable_size,p)
#if !defined(__ANDROID__)
size_t malloc_usable_size(void *p) MI_FORWARD1(mi_usable_size,p)
#else
size_t malloc_usable_size(const void *p) MI_FORWARD1(mi_usable_size,p)
// No forwarding here due to aliasing/name mangling issues
void* valloc(size_t size) { return mi_valloc(size); }
void vfree(void* p) { mi_free(p); }
size_t malloc_good_size(size_t size) { return mi_malloc_good_size(size); }
int posix_memalign(void** p, size_t alignment, size_t size) { return mi_posix_memalign(p, alignment, size); }
// `aligned_alloc` is only available when __USE_ISOC11 is defined.
// Note: Conda has a custom glibc where `aligned_alloc` is declared `static inline` and we cannot
// override it, but both _ISOC11_SOURCE and __USE_ISOC11 are undefined in Conda GCC7 or GCC9.
// Fortunately, in the case where `aligned_alloc` is declared as `static inline` it
// uses internally `memalign`, `posix_memalign`, or `_aligned_malloc` so we can avoid overriding it ourselves.
#if __USE_ISOC11
void* aligned_alloc(size_t alignment, size_t size) { return mi_aligned_alloc(alignment, size); }
#endif
#endif
// no forwarding here due to aliasing/name mangling issues
void* valloc(size_t size) { return mi_valloc(size); }
void* pvalloc(size_t size) { return mi_pvalloc(size); }
void* reallocarray(void* p, size_t count, size_t size) { return mi_reallocarray(p, count, size); }
void* memalign(size_t alignment, size_t size) { return mi_memalign(alignment, size); }
int posix_memalign(void** p, size_t alignment, size_t size) { return mi_posix_memalign(p, alignment, size); }
void* _aligned_malloc(size_t alignment, size_t size) { return mi_aligned_alloc(alignment, size); }
// `aligned_alloc` is only available when __USE_ISOC11 is defined.
// Note: Conda has a custom glibc where `aligned_alloc` is declared `static inline` and we cannot
// override it, but both _ISOC11_SOURCE and __USE_ISOC11 are undefined in Conda GCC7 or GCC9.
// Fortunately, in the case where `aligned_alloc` is declared as `static inline` it
// uses internally `memalign`, `posix_memalign`, or `_aligned_malloc` so we can avoid overriding it ourselves.
#if __USE_ISOC11
void* aligned_alloc(size_t alignment, size_t size) { return mi_aligned_alloc(alignment, size); }
#endif
void cfree(void* p) { mi_free(p); }
void* pvalloc(size_t size) { return mi_pvalloc(size); }
void* reallocarray(void* p, size_t count, size_t size) { return mi_reallocarray(p, count, size); }
int reallocarr(void* p, size_t count, size_t size) { return mi_reallocarr(p, count, size); }
void* memalign(size_t alignment, size_t size) { return mi_memalign(alignment, size); }
void* _aligned_malloc(size_t alignment, size_t size) { return mi_aligned_alloc(alignment, size); }
#if defined(__GLIBC__) && defined(__linux__)
// forward __libc interface (needed for glibc-based Linux distributions)
void* __libc_malloc(size_t size) MI_FORWARD1(mi_malloc,size)
void* __libc_calloc(size_t count, size_t size) MI_FORWARD2(mi_calloc,count,size)
void* __libc_realloc(void* p, size_t size) MI_FORWARD2(mi_realloc,p,size)
void __libc_free(void* p) MI_FORWARD0(mi_free,p)
void __libc_cfree(void* p) MI_FORWARD0(mi_free,p)
void* __libc_malloc(size_t size) MI_FORWARD1(mi_malloc,size)
void* __libc_calloc(size_t count, size_t size) MI_FORWARD2(mi_calloc,count,size)
void* __libc_realloc(void* p, size_t size) MI_FORWARD2(mi_realloc,p,size)
void __libc_free(void* p) MI_FORWARD0(mi_free,p)
void __libc_cfree(void* p) MI_FORWARD0(mi_free,p)
void* __libc_valloc(size_t size) { return mi_valloc(size); }
void* __libc_pvalloc(size_t size) { return mi_pvalloc(size); }
void* __libc_memalign(size_t alignment, size_t size) { return mi_memalign(alignment,size); }
void* __libc_valloc(size_t size) { return mi_valloc(size); }
void* __libc_pvalloc(size_t size) { return mi_pvalloc(size); }
void* __libc_memalign(size_t alignment, size_t size) { return mi_memalign(alignment,size); }
int __posix_memalign(void** p, size_t alignment, size_t size) { return mi_posix_memalign(p,alignment,size); }
#endif


@ -33,13 +33,19 @@ terms of the MIT license. A copy of the license can be found in the file
size_t mi_malloc_size(const void* p) mi_attr_noexcept {
//if (!mi_is_in_heap_region(p)) return 0;
return mi_usable_size(p);
}
size_t mi_malloc_usable_size(const void *p) mi_attr_noexcept {
//if (!mi_is_in_heap_region(p)) return 0;
return mi_usable_size(p);
}
size_t mi_malloc_good_size(size_t size) mi_attr_noexcept {
return mi_good_size(size);
}
void mi_cfree(void* p) mi_attr_noexcept {
if (mi_is_in_heap_region(p)) {
mi_free(p);
@ -50,9 +56,9 @@ int mi_posix_memalign(void** p, size_t alignment, size_t size) mi_attr_noexcept
// Note: The spec dictates we should not modify `*p` on an error. (issue#27)
// <http://man7.org/linux/man-pages/man3/posix_memalign.3.html>
if (p == NULL) return EINVAL;
if (alignment % sizeof(void*) != 0) return EINVAL; // natural alignment
if (!_mi_is_power_of_two(alignment)) return EINVAL; // not a power of 2
void* q = (mi_malloc_satisfies_alignment(alignment, size) ? mi_malloc(size) : mi_malloc_aligned(size, alignment));
if (alignment % sizeof(void*) != 0) return EINVAL; // natural alignment
if (alignment==0 || !_mi_is_power_of_two(alignment)) return EINVAL; // not a power of 2
void* q = mi_malloc_aligned(size, alignment);
if (q==NULL && size != 0) return ENOMEM;
mi_assert_internal(((uintptr_t)q % alignment) == 0);
*p = q;
@ -60,7 +66,7 @@ int mi_posix_memalign(void** p, size_t alignment, size_t size) mi_attr_noexcept
}
mi_decl_restrict void* mi_memalign(size_t alignment, size_t size) mi_attr_noexcept {
void* p = (mi_malloc_satisfies_alignment(alignment,size) ? mi_malloc(size) : mi_malloc_aligned(size, alignment));
void* p = mi_malloc_aligned(size, alignment);
mi_assert_internal(((uintptr_t)p % alignment) == 0);
return p;
}
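
A usage sketch of the posix_memalign semantics implemented above (assuming mimalloc is linked); note the spec-mandated behaviour of leaving *p untouched on error:

#include <mimalloc.h>
#include <errno.h>
#include <stdio.h>

int main(void) {
  void* p = NULL;
  int rc = mi_posix_memalign(&p, 64, 1000);  // 64 is a power of two and a multiple of sizeof(void*)
  if (rc == 0) { printf("got %p\n", p); mi_free(p); }

  void* q = (void*)0x1;                      // sentinel: must stay unchanged on error
  rc = mi_posix_memalign(&q, 3, 100);        // invalid alignment -> EINVAL, q untouched
  printf("rc=%d (EINVAL=%d) q=%p\n", rc, EINVAL, q);
  return 0;
}
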
@ -77,22 +83,40 @@ mi_decl_restrict void* mi_pvalloc(size_t size) mi_attr_noexcept {
}
mi_decl_restrict void* mi_aligned_alloc(size_t alignment, size_t size) mi_attr_noexcept {
if (alignment==0 || !_mi_is_power_of_two(alignment)) return NULL;
if ((size&(alignment-1)) != 0) return NULL; // C11 requires integral multiple, see <https://en.cppreference.com/w/c/memory/aligned_alloc>
void* p = (mi_malloc_satisfies_alignment(alignment, size) ? mi_malloc(size) : mi_malloc_aligned(size, alignment));
if (mi_unlikely((size&(alignment-1)) != 0)) { // C11 requires alignment>0 && integral multiple, see <https://en.cppreference.com/w/c/memory/aligned_alloc>
#if MI_DEBUG > 0
_mi_error_message(EOVERFLOW, "(mi_)aligned_alloc requires the size to be an integral multiple of the alignment (size %zu, alignment %zu)\n", size, alignment);
#endif
return NULL;
}
// C11 also requires alignment to be a power-of-two which is checked in mi_malloc_aligned
void* p = mi_malloc_aligned(size, alignment);
mi_assert_internal(((uintptr_t)p % alignment) == 0);
return p;
}
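
And a short sketch of the C11 rule enforced above, that the size must be an integral multiple of the alignment (again assuming mimalloc headers):

#include <mimalloc.h>
#include <stdio.h>

int main(void) {
  void* ok  = mi_aligned_alloc(64, 128);  // 128 is a multiple of 64: allocation succeeds
  void* bad = mi_aligned_alloc(64, 100);  // 100 is not a multiple of 64: returns NULL (EOVERFLOW is reported in debug builds)
  printf("ok=%p bad=%p\n", ok, bad);
  mi_free(ok);
  return 0;
}
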
void* mi_reallocarray( void* p, size_t count, size_t size ) mi_attr_noexcept { // BSD
void* newp = mi_reallocn(p,count,size);
if (newp==NULL) errno = ENOMEM;
if (newp==NULL) { errno = ENOMEM; }
return newp;
}
int mi_reallocarr( void* p, size_t count, size_t size ) mi_attr_noexcept { // NetBSD
mi_assert(p != NULL);
if (p == NULL) {
errno = EINVAL;
return EINVAL;
}
void** op = (void**)p;
void* newp = mi_reallocarray(*op, count, size);
if (mi_unlikely(newp == NULL)) return errno;
*op = newp;
return 0;
}
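
mi_reallocarr follows the NetBSD convention: the caller passes the address of its pointer, which is only updated on success, and the return value is 0 or an errno value. A brief usage sketch (assuming mimalloc headers):

#include <mimalloc.h>
#include <stdio.h>

int main(void) {
  int* buf = (int*)mi_calloc(4, sizeof(int));
  int rc = mi_reallocarr(&buf, 16, sizeof(int));  // on failure buf keeps its old (still valid) value
  printf("rc=%d buf=%p\n", rc, (void*)buf);
  mi_free(buf);
  return 0;
}
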
void* mi__expand(void* p, size_t newsize) mi_attr_noexcept { // Microsoft
void* res = mi_expand(p, newsize);
if (res == NULL) errno = ENOMEM;
if (res == NULL) { errno = ENOMEM; }
return res;
}


@ -1,9 +1,13 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
Copyright (c) 2018-2022, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
#ifndef _DEFAULT_SOURCE
#define _DEFAULT_SOURCE // for realpath() on Linux
#endif
#include "mimalloc.h"
#include "mimalloc-internal.h"
#include "mimalloc-atomic.h"
@ -119,7 +123,7 @@ extern inline mi_decl_restrict void* mi_malloc(size_t size) mi_attr_noexcept {
void _mi_block_zero_init(const mi_page_t* page, void* p, size_t size) {
// note: we need to initialize the whole usable block size to zero, not just the requested size,
// or the recalloc/rezalloc functions cannot safely expand in place (see issue #63)
UNUSED(size);
MI_UNUSED(size);
mi_assert_internal(p != NULL);
mi_assert_internal(mi_usable_size(p) >= size); // size can be zero
mi_assert_internal(_mi_ptr_page(p)==page);
@ -201,8 +205,8 @@ static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block
}
#else
static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) {
UNUSED(page);
UNUSED(block);
MI_UNUSED(page);
MI_UNUSED(block);
return false;
}
#endif
@ -274,19 +278,19 @@ static void mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, co
}
#else
static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) {
UNUSED(page);
UNUSED(block);
MI_UNUSED(page);
MI_UNUSED(block);
}
static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) {
UNUSED(block);
MI_UNUSED(block);
return mi_page_usable_block_size(page);
}
static void mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) {
UNUSED(page);
UNUSED(block);
UNUSED(min_size);
MI_UNUSED(page);
MI_UNUSED(block);
MI_UNUSED(min_size);
}
#endif
@ -294,7 +298,7 @@ static void mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, co
#if (MI_STAT>0)
static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
#if (MI_STAT < 2)
UNUSED(block);
MI_UNUSED(block);
#endif
mi_heap_t* const heap = mi_heap_get_default();
const size_t bsize = mi_page_usable_block_size(page);
@ -311,7 +315,7 @@ static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
}
#else
static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
UNUSED(page); UNUSED(block);
MI_UNUSED(page); MI_UNUSED(block);
}
#endif
@ -329,7 +333,7 @@ static void mi_stat_huge_free(const mi_page_t* page) {
}
#else
static void mi_stat_huge_free(const mi_page_t* page) {
UNUSED(page);
MI_UNUSED(page);
}
#endif
@ -431,7 +435,7 @@ mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* p
}
static void mi_decl_noinline mi_free_generic(const mi_segment_t* segment, bool local, void* p) {
static void mi_decl_noinline mi_free_generic(const mi_segment_t* segment, bool local, void* p) mi_attr_noexcept {
mi_page_t* const page = _mi_segment_page_of(segment, p);
mi_block_t* const block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(segment, page, p) : (mi_block_t*)p);
mi_stat_free(page, block);
@ -443,7 +447,7 @@ static void mi_decl_noinline mi_free_generic(const mi_segment_t* segment, bool l
// (and secure mode) if this was a valid pointer.
static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* msg)
{
UNUSED(msg);
MI_UNUSED(msg);
#if (MI_DEBUG>0)
if (mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0)) {
_mi_error_message(EINVAL, "%s: invalid (unaligned) pointer: %p\n", msg, p);
@ -465,24 +469,23 @@ static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* ms
#endif
#if (MI_DEBUG>0 || MI_SECURE>=4)
if (mi_unlikely(_mi_ptr_cookie(segment) != segment->cookie)) {
_mi_error_message(EINVAL, "%s: pointer does not point to a valid heap space: %p\n", p);
_mi_error_message(EINVAL, "%s: pointer does not point to a valid heap space: %p\n", msg, p);
}
#endif
return segment;
}
// Free a block
void mi_free(void* p) mi_attr_noexcept
{
const mi_segment_t* const segment = mi_checked_ptr_segment(p,"mi_free");
mi_segment_t* const segment = mi_checked_ptr_segment(p,"mi_free");
if (mi_unlikely(segment == NULL)) return;
const uintptr_t tid = _mi_thread_id();
mi_threadid_t tid = _mi_thread_id();
mi_page_t* const page = _mi_segment_page_of(segment, p);
mi_block_t* const block = (mi_block_t*)p;
if (mi_likely(tid == segment->thread_id && page->flags.full_aligned == 0)) { // the thread id matches and it is not a full page, nor has aligned blocks
if (mi_likely(tid == mi_atomic_load_relaxed(&segment->thread_id) && page->flags.full_aligned == 0)) { // the thread id matches and it is not a full page, nor has aligned blocks
// local, and not full or aligned
if (mi_unlikely(mi_check_is_double_free(page,block))) return;
mi_check_padding(page, block);
@ -570,19 +573,19 @@ void* _mi_externs[] = {
// ------------------------------------------------------
void mi_free_size(void* p, size_t size) mi_attr_noexcept {
UNUSED_RELEASE(size);
MI_UNUSED_RELEASE(size);
mi_assert(p == NULL || size <= _mi_usable_size(p,"mi_free_size"));
mi_free(p);
}
void mi_free_size_aligned(void* p, size_t size, size_t alignment) mi_attr_noexcept {
UNUSED_RELEASE(alignment);
MI_UNUSED_RELEASE(alignment);
mi_assert(((uintptr_t)p % alignment) == 0);
mi_free_size(p,size);
}
void mi_free_aligned(void* p, size_t alignment) mi_attr_noexcept {
UNUSED_RELEASE(alignment);
MI_UNUSED_RELEASE(alignment);
mi_assert(((uintptr_t)p % alignment) == 0);
mi_free(p);
}
@ -747,7 +750,7 @@ mi_decl_restrict char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char
}
#else
#include <unistd.h> // pathconf
static size_t mi_path_max() {
static size_t mi_path_max(void) {
static size_t path_max = 0;
if (path_max <= 0) {
long m = pathconf("/",_PC_PATH_MAX);
@ -798,7 +801,10 @@ static bool mi_try_new_handler(bool nothrow) {
std::set_new_handler(h);
#endif
if (h==NULL) {
if (!nothrow) throw std::bad_alloc();
_mi_error_message(ENOMEM, "out of memory in 'new'");
if (!nothrow) {
throw std::bad_alloc();
}
return false;
}
else {
@ -807,13 +813,13 @@ static bool mi_try_new_handler(bool nothrow) {
}
}
#else
typedef void (*std_new_handler_t)();
typedef void (*std_new_handler_t)(void);
#if (defined(__GNUC__) || defined(__clang__))
std_new_handler_t __attribute((weak)) _ZSt15get_new_handlerv() {
std_new_handler_t __attribute((weak)) _ZSt15get_new_handlerv(void) {
return NULL;
}
static std_new_handler_t mi_get_new_handler() {
static std_new_handler_t mi_get_new_handler(void) {
return _ZSt15get_new_handlerv();
}
#else
@ -826,7 +832,10 @@ static std_new_handler_t mi_get_new_handler() {
static bool mi_try_new_handler(bool nothrow) {
std_new_handler_t h = mi_get_new_handler();
if (h==NULL) {
if (!nothrow) exit(ENOMEM); // cannot throw in plain C, use exit as we are out of memory anyway.
_mi_error_message(ENOMEM, "out of memory in 'new'");
if (!nothrow) {
abort(); // cannot throw in plain C, use abort
}
return false;
}
else {


@ -20,7 +20,7 @@ which is sometimes needed for embedded devices or shared memory for example.
The arena allocation needs to be thread safe and we use an atomic
bitmap to allocate. The current implementation of the bitmap can
only do this within a field (`uintptr_t`) so we can allocate at most
only do this within a field (`size_t`) so we can allocate at most
blocks of 2GiB (64*32MiB) and no object can cross the boundary. This
can lead to fragmentation but fortunately most objects will be regions
of 256MiB in practice.
@ -62,18 +62,18 @@ typedef struct mi_arena_s {
size_t field_count; // number of bitmap fields (where `field_count * MI_BITMAP_FIELD_BITS >= block_count`)
int numa_node; // associated NUMA node
bool is_zero_init; // is the arena zero initialized?
bool is_committed; // is the memory fully committed? (if so, block_committed == NULL)
bool allow_decommit; // is decommit allowed? if true, is_large should be false and blocks_committed != NULL
bool is_large; // large- or huge OS pages (always committed)
_Atomic(uintptr_t) search_idx; // optimization to start the search for free blocks
_Atomic(size_t) search_idx; // optimization to start the search for free blocks
mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero?
mi_bitmap_field_t* blocks_committed; // if `!is_committed`, are the blocks committed?
mi_bitmap_field_t* blocks_committed; // are the blocks committed? (can be NULL for memory that cannot be decommitted)
mi_bitmap_field_t blocks_inuse[1]; // in-place bitmap of in-use blocks (of size `field_count`)
} mi_arena_t;
// The available arenas
static mi_decl_cache_align _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS];
static mi_decl_cache_align _Atomic(uintptr_t) mi_arena_count; // = 0
static mi_decl_cache_align _Atomic(size_t) mi_arena_count; // = 0
/* -----------------------------------------------------------
@ -129,8 +129,8 @@ static void* mi_arena_alloc_from(mi_arena_t* arena, size_t arena_index, size_t n
*memid = mi_arena_id_create(arena_index, bitmap_index);
*is_zero = _mi_bitmap_claim_across(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL);
*large = arena->is_large;
*is_pinned = (arena->is_large || arena->is_committed);
if (arena->is_committed) {
*is_pinned = (arena->is_large || !arena->allow_decommit);
if (arena->blocks_committed == NULL) {
// always committed
*commit = true;
}
@ -245,12 +245,13 @@ void _mi_arena_free(void* p, size_t size, size_t memid, bool all_committed, mi_s
return;
}
// potentially decommit
if (arena->is_committed) {
mi_assert_internal(all_committed);
if (!arena->allow_decommit || arena->blocks_committed == NULL) {
mi_assert_internal(all_committed); // note: this may not be true as we may "pretend" to be not committed (in segment.c)
}
else {
mi_assert_internal(arena->blocks_committed != NULL);
_mi_os_decommit(p, blocks * MI_ARENA_BLOCK_SIZE, stats); // ok if this fails
// todo: use reset instead of decommit on windows?
_mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx);
}
// and make it available to others again
@ -271,7 +272,7 @@ static bool mi_arena_add(mi_arena_t* arena) {
mi_assert_internal((uintptr_t)mi_atomic_load_ptr_relaxed(uint8_t,&arena->start) % MI_SEGMENT_ALIGN == 0);
mi_assert_internal(arena->block_count > 0);
uintptr_t i = mi_atomic_increment_acq_rel(&mi_arena_count);
size_t i = mi_atomic_increment_acq_rel(&mi_arena_count);
if (i >= MI_MAX_ARENAS) {
mi_atomic_decrement_acq_rel(&mi_arena_count);
return false;
@ -282,12 +283,14 @@ static bool mi_arena_add(mi_arena_t* arena) {
bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept
{
if (size < MI_ARENA_BLOCK_SIZE) return false;
if (is_large) {
mi_assert_internal(is_committed);
is_committed = true;
}
const size_t bcount = mi_block_count_of_size(size);
const size_t bcount = size / MI_ARENA_BLOCK_SIZE;
const size_t fields = _mi_divide_up(bcount, MI_BITMAP_FIELD_BITS);
const size_t bitmaps = (is_committed ? 2 : 3);
const size_t asize = sizeof(mi_arena_t) + (bitmaps*fields*sizeof(mi_bitmap_field_t));
@ -300,12 +303,16 @@ bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_la
arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1)
arena->is_large = is_large;
arena->is_zero_init = is_zero;
arena->is_committed = is_committed;
arena->allow_decommit = !is_large && !is_committed; // only allow decommit for initially uncommitted memory
arena->search_idx = 0;
arena->blocks_dirty = &arena->blocks_inuse[fields]; // just after inuse bitmap
arena->blocks_committed = (is_committed ? NULL : &arena->blocks_inuse[2*fields]); // just after dirty bitmap
arena->blocks_committed = (!arena->allow_decommit ? NULL : &arena->blocks_inuse[2*fields]); // just after dirty bitmap
// the bitmaps are already zero initialized due to os_alloc
// just claim leftover blocks if needed
// initialize committed bitmap?
if (arena->blocks_committed != NULL && is_committed) {
memset((void*)arena->blocks_committed, 0xFF, fields*sizeof(mi_bitmap_field_t)); // cast to void* to avoid atomic warning
}
// and claim leftover blocks if needed (so we never allocate there)
ptrdiff_t post = (fields * MI_BITMAP_FIELD_BITS) - bcount;
mi_assert_internal(post >= 0);
if (post > 0) {
@ -321,7 +328,7 @@ bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_la
// Reserve a range of regular OS memory
int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept
{
size = _mi_os_good_alloc_size(size);
size = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); // at least one block
bool large = allow_large;
void* start = _mi_os_alloc_aligned(size, MI_SEGMENT_ALIGN, commit, &large, &_mi_stats_main);
if (start==NULL) return ENOMEM;
@ -330,7 +337,7 @@ int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noe
_mi_verbose_message("failed to reserve %zu k memory\n", _mi_divide_up(size,1024));
return ENOMEM;
}
_mi_verbose_message("reserved %zu kb memory%s\n", _mi_divide_up(size,1024), large ? " (in large os pages)" : "");
_mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size,1024), large ? " (in large os pages)" : "");
return 0;
}
@ -347,10 +354,10 @@ int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msec
size_t pages_reserved = 0;
void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, timeout_msecs, &pages_reserved, &hsize);
if (p==NULL || pages_reserved==0) {
_mi_warning_message("failed to reserve %zu gb huge pages\n", pages);
_mi_warning_message("failed to reserve %zu GiB huge pages\n", pages);
return ENOMEM;
}
_mi_verbose_message("numa node %i: reserved %zu gb huge pages (of the %zu gb requested)\n", numa_node, pages_reserved, pages);
_mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages);
if (!mi_manage_os_memory(p, hsize, true, true, true, numa_node)) {
_mi_os_free_huge_pages(p, hsize, &_mi_stats_main);
@ -389,7 +396,7 @@ int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t
}
int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept {
UNUSED(max_secs);
MI_UNUSED(max_secs);
_mi_warning_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n");
if (pages_reserved != NULL) *pages_reserved = 0;
int err = mi_reserve_huge_os_pages_interleave(pages, 0, (size_t)(max_secs * 1000.0));


@ -7,7 +7,7 @@ terms of the MIT license. A copy of the license can be found in the file
/* ----------------------------------------------------------------------------
Concurrent bitmap that can set/reset sequences of bits atomically,
represented as an array of fields where each field is a machine word (`uintptr_t`)
represented as an array of fields where each field is a machine word (`size_t`)
There are two api's; the standard one cannot have sequences that cross
between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS).
@ -26,12 +26,12 @@ between the fields. (This is used in arena allocation)
----------------------------------------------------------- */
// The bit mask for a given number of blocks at a specified bit index.
static inline uintptr_t mi_bitmap_mask_(size_t count, size_t bitidx) {
static inline size_t mi_bitmap_mask_(size_t count, size_t bitidx) {
mi_assert_internal(count + bitidx <= MI_BITMAP_FIELD_BITS);
mi_assert_internal(count > 0);
if (count >= MI_BITMAP_FIELD_BITS) return MI_BITMAP_FIELD_FULL;
if (count == 0) return 0;
return ((((uintptr_t)1 << count) - 1) << bitidx);
return ((((size_t)1 << count) - 1) << bitidx);
}
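A small standalone check of the mask computation above (a sketch, not part of the build; assumes a 64-bit `size_t` so MI_BITMAP_FIELD_BITS is 64):

#include <assert.h>
#include <stddef.h>

static size_t bitmap_mask(size_t count, size_t bitidx) {   // same formula, for count < field bits
  return (((size_t)1 << count) - 1) << bitidx;
}

int main(void) {
  assert(bitmap_mask(3, 4) == 0x70);   // three bits starting at bit index 4
  assert(bitmap_mask(1, 0) == 0x01);   // a single block at the start of a field
  return 0;
}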
@ -46,27 +46,27 @@ bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_
{
mi_assert_internal(bitmap_idx != NULL);
mi_assert_internal(count <= MI_BITMAP_FIELD_BITS);
_Atomic(uintptr_t)* field = &bitmap[idx];
uintptr_t map = mi_atomic_load_relaxed(field);
mi_bitmap_field_t* field = &bitmap[idx];
size_t map = mi_atomic_load_relaxed(field);
if (map==MI_BITMAP_FIELD_FULL) return false; // short cut
// search for 0-bit sequence of length count
const uintptr_t mask = mi_bitmap_mask_(count, 0);
const size_t bitidx_max = MI_BITMAP_FIELD_BITS - count;
const size_t mask = mi_bitmap_mask_(count, 0);
const size_t bitidx_max = MI_BITMAP_FIELD_BITS - count;
#ifdef MI_HAVE_FAST_BITSCAN
size_t bitidx = mi_ctz(~map); // quickly find the first zero bit if possible
#else
size_t bitidx = 0; // otherwise start at 0
#endif
uintptr_t m = (mask << bitidx); // invariant: m == mask shifted by bitidx
size_t m = (mask << bitidx); // invariant: m == mask shifted by bitidx
// scan linearly for a free range of zero bits
while (bitidx <= bitidx_max) {
const uintptr_t mapm = map & m;
const size_t mapm = map & m;
if (mapm == 0) { // are the mask bits free at bitidx?
mi_assert_internal((m >> bitidx) == mask); // no overflow?
const uintptr_t newmap = map | m;
const size_t newmap = map | m;
mi_assert_internal((newmap^map) >> bitidx == mask);
if (!mi_atomic_cas_weak_acq_rel(field, &map, newmap)) { // TODO: use strong cas here?
// no success, another thread claimed concurrently.. keep going (with updated `map`)
@ -121,10 +121,10 @@ bool _mi_bitmap_try_find_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, c
bool mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
const size_t idx = mi_bitmap_index_field(bitmap_idx);
const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
const uintptr_t mask = mi_bitmap_mask_(count, bitidx);
mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields);
const size_t mask = mi_bitmap_mask_(count, bitidx);
mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields);
// mi_assert_internal((bitmap[idx] & mask) == mask);
uintptr_t prev = mi_atomic_and_acq_rel(&bitmap[idx], ~mask);
size_t prev = mi_atomic_and_acq_rel(&bitmap[idx], ~mask);
return ((prev & mask) == mask);
}
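The claim/unclaim pair reduces to an atomic fetch-or / fetch-and on a field; a standalone C11 sketch of that protocol (illustrative names, a single field, no cross-field handling):

#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

static _Atomic(size_t) field = 0;

static bool claim(size_t mask) {              // like _mi_bitmap_claim: true if all bits were 0
  size_t prev = atomic_fetch_or(&field, mask);
  return ((prev & mask) == 0);
}

static bool unclaim(size_t mask) {            // like mi_bitmap_unclaim: true if all bits were 1
  size_t prev = atomic_fetch_and(&field, ~mask);
  return ((prev & mask) == mask);
}

int main(void) {
  size_t mask = ((size_t)1 << 3) - 1;          // three blocks at bit index 0
  assert(claim(mask));                         // first claim succeeds
  assert(!claim(mask));                        // second claim sees the bits already set
  assert(unclaim(mask));                       // release: all three bits were still set
  return 0;
}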
@ -134,10 +134,10 @@ bool mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, m
bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero) {
const size_t idx = mi_bitmap_index_field(bitmap_idx);
const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
const uintptr_t mask = mi_bitmap_mask_(count, bitidx);
mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields);
const size_t mask = mi_bitmap_mask_(count, bitidx);
mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields);
//mi_assert_internal(any_zero != NULL || (bitmap[idx] & mask) == 0);
uintptr_t prev = mi_atomic_or_acq_rel(&bitmap[idx], mask);
size_t prev = mi_atomic_or_acq_rel(&bitmap[idx], mask);
if (any_zero != NULL) *any_zero = ((prev & mask) != mask);
return ((prev & mask) == 0);
}
@ -146,9 +146,9 @@ bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi
static bool mi_bitmap_is_claimedx(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_ones) {
const size_t idx = mi_bitmap_index_field(bitmap_idx);
const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
const uintptr_t mask = mi_bitmap_mask_(count, bitidx);
mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields);
uintptr_t field = mi_atomic_load_relaxed(&bitmap[idx]);
const size_t mask = mi_bitmap_mask_(count, bitidx);
mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields);
size_t field = mi_atomic_load_relaxed(&bitmap[idx]);
if (any_ones != NULL) *any_ones = ((field & mask) != 0);
return ((field & mask) == mask);
}
@ -176,8 +176,8 @@ static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bit
mi_assert_internal(bitmap_idx != NULL);
// check initial trailing zeros
_Atomic(uintptr_t)* field = &bitmap[idx];
uintptr_t map = mi_atomic_load_relaxed(field);
mi_bitmap_field_t* field = &bitmap[idx];
size_t map = mi_atomic_load_relaxed(field);
const size_t initial = mi_clz(map); // count of initial zeros starting at idx
mi_assert_internal(initial <= MI_BITMAP_FIELD_BITS);
if (initial == 0) return false;
@ -186,11 +186,11 @@ static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bit
// scan ahead
size_t found = initial;
uintptr_t mask = 0; // mask bits for the final field
size_t mask = 0; // mask bits for the final field
while(found < count) {
field++;
map = mi_atomic_load_relaxed(field);
const uintptr_t mask_bits = (found + MI_BITMAP_FIELD_BITS <= count ? MI_BITMAP_FIELD_BITS : (count - found));
const size_t mask_bits = (found + MI_BITMAP_FIELD_BITS <= count ? MI_BITMAP_FIELD_BITS : (count - found));
mask = mi_bitmap_mask_(mask_bits, 0);
if ((map & mask) != 0) return false;
found += mask_bits;
@ -199,13 +199,13 @@ static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bit
// found range of zeros up to the final field; mask contains mask in the final field
// now claim it atomically
_Atomic(uintptr_t)* const final_field = field;
const uintptr_t final_mask = mask;
_Atomic(uintptr_t)* const initial_field = &bitmap[idx];
const uintptr_t initial_mask = mi_bitmap_mask_(initial, MI_BITMAP_FIELD_BITS - initial);
mi_bitmap_field_t* const final_field = field;
const size_t final_mask = mask;
mi_bitmap_field_t* const initial_field = &bitmap[idx];
const size_t initial_mask = mi_bitmap_mask_(initial, MI_BITMAP_FIELD_BITS - initial);
// initial field
uintptr_t newmap;
size_t newmap;
field = initial_field;
map = mi_atomic_load_relaxed(field);
do {
@ -280,8 +280,8 @@ bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitm
}
// Helper for masks across fields; returns the mid count, post_mask may be 0
static size_t mi_bitmap_mask_across(mi_bitmap_index_t bitmap_idx, size_t bitmap_fields, size_t count, uintptr_t* pre_mask, uintptr_t* mid_mask, uintptr_t* post_mask) {
UNUSED_RELEASE(bitmap_fields);
static size_t mi_bitmap_mask_across(mi_bitmap_index_t bitmap_idx, size_t bitmap_fields, size_t count, size_t* pre_mask, size_t* mid_mask, size_t* post_mask) {
MI_UNUSED_RELEASE(bitmap_fields);
const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
if (mi_likely(bitidx + count <= MI_BITMAP_FIELD_BITS)) {
*pre_mask = mi_bitmap_mask_(count, bitidx);
@ -308,13 +308,13 @@ static size_t mi_bitmap_mask_across(mi_bitmap_index_t bitmap_idx, size_t bitmap_
// Returns `true` if all `count` bits were 1 previously.
bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
size_t idx = mi_bitmap_index_field(bitmap_idx);
uintptr_t pre_mask;
uintptr_t mid_mask;
uintptr_t post_mask;
size_t pre_mask;
size_t mid_mask;
size_t post_mask;
size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask);
bool all_one = true;
_Atomic(uintptr_t)*field = &bitmap[idx];
uintptr_t prev = mi_atomic_and_acq_rel(field++, ~pre_mask);
mi_bitmap_field_t* field = &bitmap[idx];
size_t prev = mi_atomic_and_acq_rel(field++, ~pre_mask);
if ((prev & pre_mask) != pre_mask) all_one = false;
while(mid_count-- > 0) {
prev = mi_atomic_and_acq_rel(field++, ~mid_mask);
@ -331,14 +331,14 @@ bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t
// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero) {
size_t idx = mi_bitmap_index_field(bitmap_idx);
uintptr_t pre_mask;
uintptr_t mid_mask;
uintptr_t post_mask;
size_t pre_mask;
size_t mid_mask;
size_t post_mask;
size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask);
bool all_zero = true;
bool any_zero = false;
_Atomic(uintptr_t)*field = &bitmap[idx];
uintptr_t prev = mi_atomic_or_acq_rel(field++, pre_mask);
_Atomic(size_t)*field = &bitmap[idx];
size_t prev = mi_atomic_or_acq_rel(field++, pre_mask);
if ((prev & pre_mask) != 0) all_zero = false;
if ((prev & pre_mask) != pre_mask) any_zero = true;
while (mid_count-- > 0) {
@ -360,14 +360,14 @@ bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t co
// `any_ones` is `true` if there was at least one bit set to one.
static bool mi_bitmap_is_claimedx_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_ones) {
size_t idx = mi_bitmap_index_field(bitmap_idx);
uintptr_t pre_mask;
uintptr_t mid_mask;
uintptr_t post_mask;
size_t pre_mask;
size_t mid_mask;
size_t post_mask;
size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask);
bool all_ones = true;
bool any_ones = false;
_Atomic(uintptr_t)* field = &bitmap[idx];
uintptr_t prev = mi_atomic_load_relaxed(field++);
mi_bitmap_field_t* field = &bitmap[idx];
size_t prev = mi_atomic_load_relaxed(field++);
if ((prev & pre_mask) != pre_mask) all_ones = false;
if ((prev & pre_mask) != 0) any_ones = true;
while (mid_count-- > 0) {

View file

@ -7,7 +7,7 @@ terms of the MIT license. A copy of the license can be found in the file
/* ----------------------------------------------------------------------------
Concurrent bitmap that can set/reset sequences of bits atomically,
represented as an array of fields where each field is a machine word (`uintptr_t`)
represented as an array of fields where each field is a machine word (`size_t`)
There are two api's; the standard one cannot have sequences that cross
between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS).
@ -24,11 +24,11 @@ between the fields. (This is used in arena allocation)
Bitmap definition
----------------------------------------------------------- */
#define MI_BITMAP_FIELD_BITS (8*MI_INTPTR_SIZE)
#define MI_BITMAP_FIELD_FULL (~((uintptr_t)0)) // all bits set
#define MI_BITMAP_FIELD_BITS (8*MI_SIZE_SIZE)
#define MI_BITMAP_FIELD_FULL (~((size_t)0)) // all bits set
// An atomic bitmap of `uintptr_t` fields
typedef _Atomic(uintptr_t) mi_bitmap_field_t;
// An atomic bitmap of `size_t` fields
typedef _Atomic(size_t) mi_bitmap_field_t;
typedef mi_bitmap_field_t* mi_bitmap_t;
// A bitmap index is the index of the bit in a bitmap.

View file

@ -50,9 +50,9 @@ static bool mi_heap_visit_pages(mi_heap_t* heap, heap_page_visitor_fun* fn, void
#if MI_DEBUG>=2
static bool mi_heap_page_is_valid(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) {
UNUSED(arg1);
UNUSED(arg2);
UNUSED(pq);
MI_UNUSED(arg1);
MI_UNUSED(arg2);
MI_UNUSED(pq);
mi_assert_internal(mi_page_heap(page) == heap);
mi_segment_t* segment = _mi_page_segment(page);
mi_assert_internal(segment->thread_id == heap->thread_id);
@ -86,8 +86,8 @@ typedef enum mi_collect_e {
static bool mi_heap_page_collect(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg_collect, void* arg2 ) {
UNUSED(arg2);
UNUSED(heap);
MI_UNUSED(arg2);
MI_UNUSED(heap);
mi_assert_internal(mi_heap_page_is_valid(heap, pq, page, NULL, NULL));
mi_collect_t collect = *((mi_collect_t*)arg_collect);
_mi_page_free_collect(page, collect >= MI_FORCE);
@ -104,10 +104,10 @@ static bool mi_heap_page_collect(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t
}
static bool mi_heap_page_never_delayed_free(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) {
UNUSED(arg1);
UNUSED(arg2);
UNUSED(heap);
UNUSED(pq);
MI_UNUSED(arg1);
MI_UNUSED(arg2);
MI_UNUSED(heap);
MI_UNUSED(pq);
_mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false);
return true; // don't break
}
@ -262,10 +262,10 @@ static void mi_heap_free(mi_heap_t* heap) {
----------------------------------------------------------- */
static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) {
UNUSED(arg1);
UNUSED(arg2);
UNUSED(heap);
UNUSED(pq);
MI_UNUSED(arg1);
MI_UNUSED(arg2);
MI_UNUSED(heap);
MI_UNUSED(pq);
// ensure no more thread_delayed_free will be added
_mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false);
@ -333,7 +333,7 @@ void mi_heap_destroy(mi_heap_t* heap) {
Safe Heap delete
----------------------------------------------------------- */
// Tranfer the pages from one heap to the other
// Transfer the pages from one heap to the other
static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) {
mi_assert_internal(heap!=NULL);
if (from==NULL || from->page_count == 0) return;
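A hedged usage sketch of what this page transfer enables: `mi_heap_delete` absorbs live pages into the default heap so outstanding blocks stay valid, unlike `mi_heap_destroy` which frees them outright.

#include <mimalloc.h>

static void heap_delete_example(void) {
  mi_heap_t* h = mi_heap_new();
  void* p = mi_heap_malloc(h, 128);
  mi_heap_delete(h);   // pages (and `p`) are absorbed by the default heap
  mi_free(p);          // still a valid block after the delete
}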
@ -422,8 +422,8 @@ bool mi_heap_contains_block(mi_heap_t* heap, const void* p) {
static bool mi_heap_page_check_owned(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* p, void* vfound) {
UNUSED(heap);
UNUSED(pq);
MI_UNUSED(heap);
MI_UNUSED(pq);
bool* found = (bool*)vfound;
mi_segment_t* segment = _mi_page_segment(page);
void* start = _mi_page_start(segment, page, NULL);
@ -521,8 +521,8 @@ typedef bool (mi_heap_area_visit_fun)(const mi_heap_t* heap, const mi_heap_area_
static bool mi_heap_visit_areas_page(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* vfun, void* arg) {
UNUSED(heap);
UNUSED(pq);
MI_UNUSED(heap);
MI_UNUSED(pq);
mi_heap_area_visit_fun* fun = (mi_heap_area_visit_fun*)vfun;
mi_heap_area_ex_t xarea;
const size_t bsize = mi_page_block_size(page);

View file

@ -1,5 +1,5 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
Copyright (c) 2018-2022, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
@ -102,6 +102,7 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = {
false
};
// the thread-local default heap for allocation
mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty;
@ -141,7 +142,7 @@ mi_stats_t _mi_stats_main = { MI_STATS_NULL };
static void mi_heap_main_init(void) {
if (_mi_heap_main.cookie == 0) {
_mi_heap_main.thread_id = _mi_thread_id();
_mi_heap_main.cookie = _os_random_weak((uintptr_t)&mi_heap_main_init);
_mi_heap_main.cookie = _mi_os_random_weak((uintptr_t)&mi_heap_main_init);
_mi_random_init(&_mi_heap_main.random);
_mi_heap_main.keys[0] = _mi_heap_random_next(&_mi_heap_main);
_mi_heap_main.keys[1] = _mi_heap_random_next(&_mi_heap_main);
@ -274,12 +275,6 @@ static bool _mi_heap_done(mi_heap_t* heap) {
static void _mi_thread_done(mi_heap_t* default_heap);
#ifdef __wasi__
// no pthreads in the WebAssembly Standard Interface
#elif !defined(_WIN32)
#define MI_USE_PTHREADS
#endif
#if defined(_WIN32) && defined(MI_SHARED_LIB)
// nothing to do as it is done in DllMain
#elif defined(_WIN32) && !defined(MI_SHARED_LIB)
@ -299,7 +294,6 @@ static void _mi_thread_done(mi_heap_t* default_heap);
#elif defined(MI_USE_PTHREADS)
// use pthread local storage keys to detect thread ending
// (and used with MI_TLS_PTHREADS for the default heap)
#include <pthread.h>
pthread_key_t _mi_heap_default_key = (pthread_key_t)(-1);
static void mi_pthread_done(void* value) {
if (value!=NULL) _mi_thread_done((mi_heap_t*)value);
@ -331,6 +325,12 @@ bool _mi_is_main_thread(void) {
return (_mi_heap_main.thread_id==0 || _mi_heap_main.thread_id == _mi_thread_id());
}
static _Atomic(size_t) thread_count = ATOMIC_VAR_INIT(1);
size_t _mi_current_thread_count(void) {
return mi_atomic_load_relaxed(&thread_count);
}
// This is called from `mi_malloc_generic`
void mi_thread_init(void) mi_attr_noexcept
{
@ -343,6 +343,7 @@ void mi_thread_init(void) mi_attr_noexcept
if (_mi_heap_init()) return; // returns true if already initialized
_mi_stat_increase(&_mi_stats_main.threads, 1);
mi_atomic_increment_relaxed(&thread_count);
//_mi_verbose_message("thread init: 0x%zx\n", _mi_thread_id());
}
@ -351,6 +352,7 @@ void mi_thread_done(void) mi_attr_noexcept {
}
static void _mi_thread_done(mi_heap_t* heap) {
mi_atomic_decrement_relaxed(&thread_count);
_mi_stat_decrease(&_mi_stats_main.threads, 1);
// check thread-id as on Windows shutdown with FLS the main (exit) thread may call this on thread-local heaps...
@ -441,10 +443,12 @@ static void mi_process_load(void) {
mi_heap_main_init();
#if defined(MI_TLS_RECURSE_GUARD)
volatile mi_heap_t* dummy = _mi_heap_default; // access TLS to allocate it before setting tls_initialized to true;
UNUSED(dummy);
MI_UNUSED(dummy);
#endif
os_preloading = false;
atexit(&mi_process_done);
#if !(defined(_WIN32) && defined(MI_SHARED_LIB)) // use Dll process detach (see below) instead of atexit (issue #521)
atexit(&mi_process_done);
#endif
_mi_options_init();
mi_process_init();
//mi_stats_reset();-
@ -478,10 +482,11 @@ static void mi_detect_cpu_features(void) {
void mi_process_init(void) mi_attr_noexcept {
// ensure we are called once
if (_mi_process_is_initialized) return;
_mi_verbose_message("process init: 0x%zx\n", _mi_thread_id());
_mi_process_is_initialized = true;
mi_process_setup_auto_thread_done();
_mi_verbose_message("process init: 0x%zx\n", _mi_thread_id());
mi_detect_cpu_features();
_mi_os_init();
mi_heap_main_init();
@ -490,15 +495,30 @@ void mi_process_init(void) mi_attr_noexcept {
#endif
_mi_verbose_message("secure level: %d\n", MI_SECURE);
mi_thread_init();
#if defined(_WIN32) && !defined(MI_SHARED_LIB)
  // When building as a static lib the FLS cleanup happens too early for the main thread.
// To avoid this, set the FLS value for the main thread to NULL so the fls cleanup
// will not call _mi_thread_done on the (still executing) main thread. See issue #508.
FlsSetValue(mi_fls_key, NULL);
#endif
mi_stats_reset(); // only call stat reset *after* thread init (or the heap tld == NULL)
if (mi_option_is_enabled(mi_option_reserve_huge_os_pages)) {
size_t pages = mi_option_get(mi_option_reserve_huge_os_pages);
mi_reserve_huge_os_pages_interleave(pages, 0, pages*500);
long reserve_at = mi_option_get(mi_option_reserve_huge_os_pages_at);
if (reserve_at != -1) {
mi_reserve_huge_os_pages_at(pages, reserve_at, pages*500);
} else {
mi_reserve_huge_os_pages_interleave(pages, 0, pages*500);
}
}
if (mi_option_is_enabled(mi_option_reserve_os_memory)) {
long ksize = mi_option_get(mi_option_reserve_os_memory);
if (ksize > 0) mi_reserve_os_memory((size_t)ksize*KiB, true, true);
if (ksize > 0) {
mi_reserve_os_memory((size_t)ksize*MI_KiB, true, true);
}
}
}
@ -512,8 +532,7 @@ static void mi_process_done(void) {
process_done = true;
#if defined(_WIN32) && !defined(MI_SHARED_LIB)
FlsSetValue(mi_fls_key, NULL); // don't call main-thread callback
FlsFree(mi_fls_key); // call thread-done on all threads to prevent dangling callback pointer if statically linked with a DLL; Issue #208
FlsFree(mi_fls_key); // call thread-done on all threads (except the main thread) to prevent dangling callback pointer if statically linked with a DLL; Issue #208
#endif
#ifndef MI_SKIP_COLLECT_ON_EXIT
@ -538,17 +557,40 @@ static void mi_process_done(void) {
#if defined(_WIN32) && defined(MI_SHARED_LIB)
// Windows DLL: easy to hook into process_init and thread_done
__declspec(dllexport) BOOL WINAPI DllMain(HINSTANCE inst, DWORD reason, LPVOID reserved) {
UNUSED(reserved);
UNUSED(inst);
MI_UNUSED(reserved);
MI_UNUSED(inst);
if (reason==DLL_PROCESS_ATTACH) {
mi_process_load();
}
else if (reason==DLL_THREAD_DETACH) {
if (!mi_is_redirected()) mi_thread_done();
else if (reason==DLL_PROCESS_DETACH) {
mi_process_done();
}
else if (reason==DLL_THREAD_DETACH) {
if (!mi_is_redirected()) {
mi_thread_done();
}
}
return TRUE;
}
#elif defined(_MSC_VER)
// MSVC: use data section magic for static libraries
// See <https://www.codeguru.com/cpp/misc/misc/applicationcontrol/article.php/c6945/Running-Code-Before-and-After-Main.htm>
static int _mi_process_init(void) {
mi_process_load();
return 0;
}
typedef int(*_mi_crt_callback_t)(void);
#if defined(_M_X64) || defined(_M_ARM64)
__pragma(comment(linker, "/include:" "_mi_msvc_initu"))
#pragma section(".CRT$XIU", long, read)
#else
__pragma(comment(linker, "/include:" "__mi_msvc_initu"))
#endif
#pragma data_seg(".CRT$XIU")
mi_decl_externc _mi_crt_callback_t _mi_msvc_initu[] = { &_mi_process_init };
#pragma data_seg()
#elif defined(__cplusplus)
// C++: use static initialization to detect process start
static bool _mi_process_init(void) {
@ -563,24 +605,6 @@ static void mi_process_done(void) {
mi_process_load();
}
#elif defined(_MSC_VER)
// MSVC: use data section magic for static libraries
// See <https://www.codeguru.com/cpp/misc/misc/applicationcontrol/article.php/c6945/Running-Code-Before-and-After-Main.htm>
static int _mi_process_init(void) {
mi_process_load();
return 0;
}
typedef int(*_crt_cb)(void);
#ifdef _M_X64
__pragma(comment(linker, "/include:" "_mi_msvc_initu"))
#pragma section(".CRT$XIU", long, read)
#else
__pragma(comment(linker, "/include:" "__mi_msvc_initu"))
#endif
#pragma data_seg(".CRT$XIU")
_crt_cb _mi_msvc_initu[] = { &_mi_process_init };
#pragma data_seg()
#else
#pragma message("define a way to call mi_process_load on your platform")
#endif

View file

@ -19,10 +19,10 @@ terms of the MIT license. A copy of the license can be found in the file
#endif
static uintptr_t mi_max_error_count = 16; // stop outputting errors after this
static uintptr_t mi_max_warning_count = 16; // stop outputting warnings after this
static size_t mi_max_error_count = 16; // stop outputting errors after this
static size_t mi_max_warning_count = 16; // stop outputting warnings after this
static void mi_add_stderr_output();
static void mi_add_stderr_output(void);
int mi_version(void) mi_attr_noexcept {
return MI_MALLOC_VERSION;
@ -76,6 +76,7 @@ static mi_option_desc_t options[_mi_option_last] =
#endif
{ 0, UNINIT, MI_OPTION(large_os_pages) }, // use large OS pages, use only with eager commit to prevent fragmentation of VMA's
{ 0, UNINIT, MI_OPTION(reserve_huge_os_pages) }, // per 1GiB huge pages
{ -1, UNINIT, MI_OPTION(reserve_huge_os_pages_at) }, // reserve huge pages at node N
{ 0, UNINIT, MI_OPTION(reserve_os_memory) },
{ 0, UNINIT, MI_OPTION(segment_cache) }, // cache N segments per thread
{ 1, UNINIT, MI_OPTION(page_reset) }, // reset page memory on free
@ -103,7 +104,7 @@ void _mi_options_init(void) {
  mi_add_stderr_output(); // now it is safe to use stderr for output
for(int i = 0; i < _mi_option_last; i++ ) {
mi_option_t option = (mi_option_t)i;
long l = mi_option_get(option); UNUSED(l); // initialize
long l = mi_option_get(option); MI_UNUSED(l); // initialize
if (option != mi_option_verbose) {
mi_option_desc_t* desc = &options[option];
_mi_verbose_message("option '%s': %ld\n", desc->name, desc->value);
@ -113,7 +114,7 @@ void _mi_options_init(void) {
mi_max_warning_count = mi_option_get(mi_option_max_warnings);
}
long mi_option_get(mi_option_t option) {
mi_decl_nodiscard long mi_option_get(mi_option_t option) {
mi_assert(option >= 0 && option < _mi_option_last);
mi_option_desc_t* desc = &options[option];
mi_assert(desc->option == option); // index should match the option
@ -139,7 +140,7 @@ void mi_option_set_default(mi_option_t option, long value) {
}
}
bool mi_option_is_enabled(mi_option_t option) {
mi_decl_nodiscard bool mi_option_is_enabled(mi_option_t option) {
return (mi_option_get(option) != 0);
}
@ -161,7 +162,7 @@ void mi_option_disable(mi_option_t option) {
static void mi_out_stderr(const char* msg, void* arg) {
UNUSED(arg);
MI_UNUSED(arg);
#ifdef _WIN32
// on windows with redirection, the C runtime cannot handle locale dependent output
// after the main thread closes so we use direct console output.
@ -176,19 +177,19 @@ static void mi_out_stderr(const char* msg, void* arg) {
// an output function is registered it is called immediately with
// the output up to that point.
#ifndef MI_MAX_DELAY_OUTPUT
#define MI_MAX_DELAY_OUTPUT ((uintptr_t)(32*1024))
#define MI_MAX_DELAY_OUTPUT ((size_t)(32*1024))
#endif
static char out_buf[MI_MAX_DELAY_OUTPUT+1];
static _Atomic(uintptr_t) out_len;
static _Atomic(size_t) out_len;
static void mi_out_buf(const char* msg, void* arg) {
UNUSED(arg);
MI_UNUSED(arg);
if (msg==NULL) return;
if (mi_atomic_load_relaxed(&out_len)>=MI_MAX_DELAY_OUTPUT) return;
size_t n = strlen(msg);
if (n==0) return;
// claim space
uintptr_t start = mi_atomic_add_acq_rel(&out_len, n);
size_t start = mi_atomic_add_acq_rel(&out_len, n);
if (start >= MI_MAX_DELAY_OUTPUT) return;
// check bound
if (start+n >= MI_MAX_DELAY_OUTPUT) {
@ -251,27 +252,42 @@ static void mi_add_stderr_output() {
// --------------------------------------------------------
// Messages, all end up calling `_mi_fputs`.
// --------------------------------------------------------
static _Atomic(uintptr_t) error_count; // = 0; // when >= max_error_count stop emitting errors
static _Atomic(uintptr_t) warning_count; // = 0; // when >= max_warning_count stop emitting warnings
static _Atomic(size_t) error_count; // = 0; // when >= max_error_count stop emitting errors
static _Atomic(size_t) warning_count; // = 0; // when >= max_warning_count stop emitting warnings
// When overriding malloc, we may recurse into mi_vfprintf if an allocation
// inside the C runtime causes another message.
// In some cases (like on macOS) the loader already allocates which
// calls into mimalloc; if we then access thread locals (like `recurse`)
// this may crash as the access may call _tlv_bootstrap that tries to
// (recursively) invoke malloc again to allocate space for the thread local
// variables on demand. This is why we use a _mi_preloading test on such
// platforms. However, the C code generator may move the initial thread local address
// load before the `if`, so we split it out into a separate function.
static mi_decl_thread bool recurse = false;
static mi_decl_noinline bool mi_recurse_enter_prim(void) {
if (recurse) return false;
recurse = true;
return true;
}
static mi_decl_noinline void mi_recurse_exit_prim(void) {
recurse = false;
}
static bool mi_recurse_enter(void) {
#if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD)
if (_mi_preloading()) return true;
#endif
if (recurse) return false;
recurse = true;
return true;
return mi_recurse_enter_prim();
}
static void mi_recurse_exit(void) {
#if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD)
if (_mi_preloading()) return;
#endif
recurse = false;
mi_recurse_exit_prim();
}
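A hedged sketch of how the guard is meant to wrap message output (illustrative only, not the actual `_mi_fputs` body; it uses the mi_recurse_enter/mi_recurse_exit defined above):

static void message_example(const char* msg) {
  if (!mi_recurse_enter()) return;   // already printing: drop the message instead of recursing
  // ... write `msg`; any allocation triggered here re-enters and is dropped above ...
  mi_recurse_exit();
}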
void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message) {
@ -353,7 +369,7 @@ static mi_error_fun* volatile mi_error_handler; // = NULL
static _Atomic(void*) mi_error_arg; // = NULL
static void mi_error_default(int err) {
UNUSED(err);
MI_UNUSED(err);
#if (MI_DEBUG>0)
if (err==EFAULT) {
#ifdef _MSC_VER
@ -399,16 +415,35 @@ void _mi_error_message(int err, const char* fmt, ...) {
// --------------------------------------------------------
static void mi_strlcpy(char* dest, const char* src, size_t dest_size) {
dest[0] = 0;
strncpy(dest, src, dest_size - 1);
dest[dest_size - 1] = 0;
if (dest==NULL || src==NULL || dest_size == 0) return;
// copy until end of src, or when dest is (almost) full
while (*src != 0 && dest_size > 1) {
*dest++ = *src++;
dest_size--;
}
// always zero terminate
*dest = 0;
}
static void mi_strlcat(char* dest, const char* src, size_t dest_size) {
strncat(dest, src, dest_size - 1);
dest[dest_size - 1] = 0;
if (dest==NULL || src==NULL || dest_size == 0) return;
// find end of string in the dest buffer
while (*dest != 0 && dest_size > 1) {
dest++;
dest_size--;
}
// and catenate
mi_strlcpy(dest, src, dest_size);
}
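A short hedged usage sketch of the bounded string helpers above (buffer contents shown in the comments):

static void strl_example(void) {
  char buf[8];
  mi_strlcpy(buf, "mi", sizeof(buf));       // buf == "mi"
  mi_strlcat(buf, "malloc!", sizeof(buf));  // buf == "mimallo": truncated, but always 0-terminated
}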
#ifdef MI_NO_GETENV
static bool mi_getenv(const char* name, char* result, size_t result_size) {
MI_UNUSED(name);
MI_UNUSED(result);
MI_UNUSED(result_size);
return false;
}
#else
static inline int mi_strnicmp(const char* s, const char* t, size_t n) {
if (n==0) return 0;
for (; *s != 0 && *t != 0 && n > 0; s++, t++, n--) {
@ -416,7 +451,6 @@ static inline int mi_strnicmp(const char* s, const char* t, size_t n) {
}
return (n==0 ? 0 : *s - *t);
}
#if defined _WIN32
// On Windows use GetEnvironmentVariable instead of getenv to work
// reliably even when this is invoked before the C runtime is initialized.
@ -484,7 +518,8 @@ static bool mi_getenv(const char* name, char* result, size_t result_size) {
return false;
}
}
#endif
#endif // !MI_USE_ENVIRON
#endif // !MI_NO_GETENV
static void mi_option_init(mi_option_desc_t* desc) {
// Read option value from the environment
@ -513,18 +548,29 @@ static void mi_option_init(mi_option_desc_t* desc) {
if (desc->option == mi_option_reserve_os_memory) {
// this option is interpreted in KiB to prevent overflow of `long`
if (*end == 'K') { end++; }
else if (*end == 'M') { value *= KiB; end++; }
else if (*end == 'G') { value *= MiB; end++; }
else { value = (value + KiB - 1) / KiB; }
if (*end == 'B') { end++; }
else if (*end == 'M') { value *= MI_KiB; end++; }
else if (*end == 'G') { value *= MI_MiB; end++; }
else { value = (value + MI_KiB - 1) / MI_KiB; }
if (end[0] == 'I' && end[1] == 'B') { end += 2; }
else if (*end == 'B') { end++; }
}
if (*end == 0) {
desc->value = value;
desc->init = INITIALIZED;
}
else {
_mi_warning_message("environment option mimalloc_%s has an invalid value: %s\n", desc->name, buf);
// set `init` first to avoid recursion through _mi_warning_message on mimalloc_verbose.
desc->init = DEFAULTED;
if (desc->option == mi_option_verbose && desc->value == 0) {
// if the 'mimalloc_verbose' env var has a bogus value we'd never know
// (since the value defaults to 'off') so in that case briefly enable verbose
desc->value = 1;
_mi_warning_message("environment option mimalloc_%s has an invalid value.\n", desc->name );
desc->value = 0;
}
else {
_mi_warning_message("environment option mimalloc_%s has an invalid value.\n", desc->name );
}
}
}
mi_assert_internal(desc->init != UNINIT);
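A hedged worked example of the suffix handling above; per the comment the value is kept in KiB, so these environment settings map as follows (checked with illustrative local defines):

#include <assert.h>
#include <stddef.h>
#define MI_KiB ((size_t)1024)

int main(void) {
  // MIMALLOC_RESERVE_OS_MEMORY=64KiB  -> value stays 64 (already KiB)
  // MIMALLOC_RESERVE_OS_MEMORY=8MiB   -> 8*MI_KiB        == 8192 KiB
  // MIMALLOC_RESERVE_OS_MEMORY=1GiB   -> 1*MI_KiB*MI_KiB == 1048576 KiB
  // MIMALLOC_RESERVE_OS_MEMORY=4096   -> bytes, rounded up to KiB: 4
  assert(8 * MI_KiB == 8192);
  assert(1 * MI_KiB * MI_KiB == 1048576);
  assert((4096 + MI_KiB - 1) / MI_KiB == 4);
  return 0;
}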

src/os.c (525 changed lines)
View file

@ -26,16 +26,20 @@ terms of the MIT license. A copy of the license can be found in the file
#pragma warning(disable:4996) // strerror
#endif
#if defined(__wasi__)
#define MI_USE_SBRK
#endif
#if defined(_WIN32)
#include <windows.h>
#elif defined(__wasi__)
// stdlib.h is all we need, and has already been included in mimalloc.h
#include <unistd.h> // sbrk
#else
#include <sys/mman.h> // mmap
#include <unistd.h> // sysconf
#if defined(__linux__)
#include <features.h>
#include <fcntl.h>
#if defined(__GLIBC__)
#include <linux/mman.h> // linux mmap flags
#else
@ -48,9 +52,13 @@ terms of the MIT license. A copy of the license can be found in the file
#include <mach/vm_statistics.h>
#endif
#endif
#if defined(__HAIKU__)
#define madvise posix_madvise
#define MADV_DONTNEED POSIX_MADV_DONTNEED
#if defined(__FreeBSD__) || defined(__DragonFly__)
#include <sys/param.h>
#if __FreeBSD_version >= 1200000
#include <sys/cpuset.h>
#include <sys/domainset.h>
#endif
#include <sys/sysctl.h>
#endif
#endif
@ -80,6 +88,7 @@ static void* mi_align_down_ptr(void* p, size_t alignment) {
return (void*)_mi_align_down((uintptr_t)p, alignment);
}
// page size (initialized properly in `os_init`)
static size_t os_page_size = 4096;
@ -89,30 +98,40 @@ static size_t os_alloc_granularity = 4096;
// if non-zero, use large page allocation
static size_t large_os_page_size = 0;
// is memory overcommit allowed?
// set dynamically in _mi_os_init (and if true we use MAP_NORESERVE)
static bool os_overcommit = true;
bool _mi_os_has_overcommit(void) {
return os_overcommit;
}
// OS (small) page size
size_t _mi_os_page_size() {
return os_page_size;
}
// if large OS pages are supported (2 or 4MiB), then return the size, otherwise return the small page size (4KiB)
size_t _mi_os_large_page_size() {
size_t _mi_os_large_page_size(void) {
return (large_os_page_size != 0 ? large_os_page_size : _mi_os_page_size());
}
#if !defined(MI_USE_SBRK) && !defined(__wasi__)
static bool use_large_os_page(size_t size, size_t alignment) {
// if we have access, check the size and alignment requirements
if (large_os_page_size == 0 || !mi_option_is_enabled(mi_option_large_os_pages)) return false;
return ((size % large_os_page_size) == 0 && (alignment % large_os_page_size) == 0);
}
#endif
// round to a good OS allocation size (bounded by max 12.5% waste)
size_t _mi_os_good_alloc_size(size_t size) {
size_t align_size;
if (size < 512*KiB) align_size = _mi_os_page_size();
else if (size < 2*MiB) align_size = 64*KiB;
else if (size < 8*MiB) align_size = 256*KiB;
else if (size < 32*MiB) align_size = 1*MiB;
else align_size = 4*MiB;
if (size < 512*MI_KiB) align_size = _mi_os_page_size();
else if (size < 2*MI_MiB) align_size = 64*MI_KiB;
else if (size < 8*MI_MiB) align_size = 256*MI_KiB;
else if (size < 32*MI_MiB) align_size = 1*MI_MiB;
else align_size = 4*MI_MiB;
if (mi_unlikely(size >= (SIZE_MAX - align_size))) return size; // possible overflow?
return _mi_align_up(size, align_size);
}
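A hedged worked example of the size classes above (OS page size assumed 4KiB; the rounding bounds waste to roughly 12.5%):

#include <assert.h>
#include <stddef.h>
#define MI_KiB ((size_t)1024)
#define MI_MiB (MI_KiB*MI_KiB)

static size_t align_up(size_t n, size_t a) { return ((n + a - 1) / a) * a; }

int main(void) {
  assert(align_up(100*MI_KiB, 4*MI_KiB)  == 100*MI_KiB);   // < 512KiB: page granularity
  assert(align_up(700*MI_KiB, 64*MI_KiB) == 704*MI_KiB);   // < 2MiB: 64KiB granularity
  assert(align_up(5*MI_MiB, 256*MI_KiB)  == 5*MI_MiB);     // < 8MiB: 256KiB granularity
  return 0;
}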
@ -175,7 +194,9 @@ static bool mi_win_enable_large_os_pages()
return (ok!=0);
}
void _mi_os_init(void) {
void _mi_os_init(void)
{
os_overcommit = false;
// get the page size
SYSTEM_INFO si;
GetSystemInfo(&si);
@ -210,10 +231,36 @@ void _mi_os_init(void) {
}
#elif defined(__wasi__)
void _mi_os_init() {
os_page_size = 0x10000; // WebAssembly has a fixed page size: 64KB
os_overcommit = false;
os_page_size = 64*MI_KiB; // WebAssembly has a fixed page size: 64KiB
os_alloc_granularity = 16;
}
#else // generic unix
static void os_detect_overcommit(void) {
#if defined(__linux__)
int fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY);
if (fd < 0) return;
char buf[32];
ssize_t nread = read(fd, &buf, sizeof(buf));
close(fd);
// <https://www.kernel.org/doc/Documentation/vm/overcommit-accounting>
// 0: heuristic overcommit, 1: always overcommit, 2: never overcommit (ignore NORESERVE)
if (nread >= 1) {
os_overcommit = (buf[0] == '0' || buf[0] == '1');
}
#elif defined(__FreeBSD__)
int val = 0;
size_t olen = sizeof(val);
if (sysctlbyname("vm.overcommit", &val, &olen, NULL, 0) == 0) {
os_overcommit = (val != 0);
}
#else
// default: overcommit is true
#endif
}
void _mi_os_init() {
// get the page size
long result = sysconf(_SC_PAGESIZE);
@ -221,14 +268,26 @@ void _mi_os_init() {
os_page_size = (size_t)result;
os_alloc_granularity = os_page_size;
}
large_os_page_size = 2*MiB; // TODO: can we query the OS for this?
large_os_page_size = 2*MI_MiB; // TODO: can we query the OS for this?
os_detect_overcommit();
}
#endif
#if defined(MADV_NORMAL)
static int mi_madvise(void* addr, size_t length, int advice) {
#if defined(__sun)
return madvise((caddr_t)addr, length, advice); // Solaris needs cast (issue #520)
#else
return madvise(addr, length, advice);
#endif
}
#endif
/* -----------------------------------------------------------
Raw allocation on Windows (VirtualAlloc) and Unix's (mmap).
----------------------------------------------------------- */
free memory
-------------------------------------------------------------- */
static bool mi_os_mem_free(void* addr, size_t size, bool was_committed, mi_stats_t* stats)
{
@ -236,8 +295,8 @@ static bool mi_os_mem_free(void* addr, size_t size, bool was_committed, mi_stats
bool err = false;
#if defined(_WIN32)
err = (VirtualFree(addr, 0, MEM_RELEASE) == 0);
#elif defined(__wasi__)
err = 0; // WebAssembly's heap cannot be shrunk
#elif defined(MI_USE_SBRK) || defined(__wasi__)
err = 0; // sbrk heap cannot be shrunk
#else
err = (munmap(addr, size) == -1);
#endif
@ -252,33 +311,47 @@ static bool mi_os_mem_free(void* addr, size_t size, bool was_committed, mi_stats
}
}
#if !defined(MI_USE_SBRK) && !defined(__wasi__)
static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size);
#endif
/* -----------------------------------------------------------
Raw allocation on Windows (VirtualAlloc)
-------------------------------------------------------------- */
#ifdef _WIN32
static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment, DWORD flags) {
#if (MI_INTPTR_SIZE >= 8)
// on 64-bit systems, try to use the virtual address area after 4TiB for 4MiB aligned allocations
void* hint;
if (addr == NULL && (hint = mi_os_get_aligned_hint(try_alignment,size)) != NULL) {
void* p = VirtualAlloc(hint, size, flags, PAGE_READWRITE);
if (p != NULL) return p;
DWORD err = GetLastError();
if (err != ERROR_INVALID_ADDRESS && // If linked with multiple instances, we may have tried to allocate at an already allocated area (#210)
err != ERROR_INVALID_PARAMETER) { // Windows7 instability (#230)
return NULL;
// on 64-bit systems, try to use the virtual address area after 2TiB for 4MiB aligned allocations
if (addr == NULL) {
void* hint = mi_os_get_aligned_hint(try_alignment,size);
if (hint != NULL) {
void* p = VirtualAlloc(hint, size, flags, PAGE_READWRITE);
if (p != NULL) return p;
// for robustness always fall through in case of an error
/*
DWORD err = GetLastError();
if (err != ERROR_INVALID_ADDRESS && // If linked with multiple instances, we may have tried to allocate at an already allocated area (#210)
err != ERROR_INVALID_PARAMETER) { // Windows7 instability (#230)
return NULL;
}
*/
_mi_warning_message("unable to allocate hinted aligned OS memory (%zu bytes, error code: %x, address: %p, alignment: %d, flags: %x)\n", size, GetLastError(), hint, try_alignment, flags);
}
// fall through
}
#endif
#if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS)
// on modern Windows try use VirtualAlloc2 for aligned allocation
if (try_alignment > 0 && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) {
if (try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) {
MEM_ADDRESS_REQUIREMENTS reqs = { 0, 0, 0 };
reqs.Alignment = try_alignment;
MEM_EXTENDED_PARAMETER param = { {0, 0}, {0} };
param.Type = MemExtendedParameterAddressRequirements;
param.Pointer = &reqs;
return (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, &param, 1);
void* p = (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, &param, 1);
if (p != NULL) return p;
_mi_warning_message("unable to allocate aligned OS memory (%zu bytes, error code: %x, address: %p, alignment: %d, flags: %x)\n", size, GetLastError(), addr, try_alignment, flags);
// fall through on error
}
#endif
// last resort
@ -287,11 +360,11 @@ static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment
static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment, DWORD flags, bool large_only, bool allow_large, bool* is_large) {
mi_assert_internal(!(large_only && !allow_large));
static _Atomic(uintptr_t) large_page_try_ok; // = 0;
static _Atomic(size_t) large_page_try_ok; // = 0;
void* p = NULL;
if ((large_only || use_large_os_page(size, try_alignment))
&& allow_large && (flags&MEM_COMMIT)!=0 && (flags&MEM_RESERVE)!=0) {
uintptr_t try_ok = mi_atomic_load_acquire(&large_page_try_ok);
size_t try_ok = mi_atomic_load_acquire(&large_page_try_ok);
if (!large_only && try_ok > 0) {
// if a large page allocation fails, it seems the calls to VirtualAlloc get very expensive.
// therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times.
@ -318,39 +391,124 @@ static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment,
return p;
}
/* -----------------------------------------------------------
Raw allocation using `sbrk` or `wasm_memory_grow`
-------------------------------------------------------------- */
#elif defined(MI_USE_SBRK) || defined(__wasi__)
#if defined(MI_USE_SBRK)
static void* mi_memory_grow( size_t size ) {
void* p = sbrk(size);
if (p == (void*)(-1)) return NULL;
#if !defined(__wasi__) // on wasi this is always zero initialized already (?)
memset(p,0,size);
#endif
return p;
}
#elif defined(__wasi__)
static void* mi_wasm_heap_grow(size_t size, size_t try_alignment) {
uintptr_t base = __builtin_wasm_memory_size(0) * _mi_os_page_size();
uintptr_t aligned_base = _mi_align_up(base, (uintptr_t) try_alignment);
size_t alloc_size = _mi_align_up( aligned_base - base + size, _mi_os_page_size());
mi_assert(alloc_size >= size && (alloc_size % _mi_os_page_size()) == 0);
if (alloc_size < size) return NULL;
if (__builtin_wasm_memory_grow(0, alloc_size / _mi_os_page_size()) == SIZE_MAX) {
static void* mi_memory_grow( size_t size ) {
size_t base = (size > 0 ? __builtin_wasm_memory_grow(0,_mi_divide_up(size, _mi_os_page_size()))
: __builtin_wasm_memory_size(0));
if (base == SIZE_MAX) return NULL;
return (void*)(base * _mi_os_page_size());
}
#endif
#if defined(MI_USE_PTHREADS)
static pthread_mutex_t mi_heap_grow_mutex = PTHREAD_MUTEX_INITIALIZER;
#endif
static void* mi_heap_grow(size_t size, size_t try_alignment) {
void* p = NULL;
if (try_alignment <= 1) {
// `sbrk` is not thread safe in general so try to protect it (we could skip this on WASM but leave it in for now)
#if defined(MI_USE_PTHREADS)
pthread_mutex_lock(&mi_heap_grow_mutex);
#endif
p = mi_memory_grow(size);
#if defined(MI_USE_PTHREADS)
pthread_mutex_unlock(&mi_heap_grow_mutex);
#endif
}
else {
void* base = NULL;
size_t alloc_size = 0;
// to allocate aligned use a lock to try to avoid thread interaction
// between getting the current size and actual allocation
// (also, `sbrk` is not thread safe in general)
#if defined(MI_USE_PTHREADS)
pthread_mutex_lock(&mi_heap_grow_mutex);
#endif
{
void* current = mi_memory_grow(0); // get current size
if (current != NULL) {
void* aligned_current = mi_align_up_ptr(current, try_alignment); // and align from there to minimize wasted space
alloc_size = _mi_align_up( ((uint8_t*)aligned_current - (uint8_t*)current) + size, _mi_os_page_size());
base = mi_memory_grow(alloc_size);
}
}
#if defined(MI_USE_PTHREADS)
pthread_mutex_unlock(&mi_heap_grow_mutex);
#endif
if (base != NULL) {
p = mi_align_up_ptr(base, try_alignment);
if ((uint8_t*)p + size > (uint8_t*)base + alloc_size) {
// another thread used wasm_memory_grow/sbrk in-between and we do not have enough
// space after alignment. Give up (and waste the space as we cannot shrink :-( )
// (in `mi_os_mem_alloc_aligned` this will fall back to overallocation to align)
p = NULL;
}
}
}
if (p == NULL) {
_mi_warning_message("unable to allocate sbrk/wasm_memory_grow OS memory (%zu bytes, %zu alignment)\n", size, try_alignment);
errno = ENOMEM;
return NULL;
}
return (void*)aligned_base;
mi_assert_internal( try_alignment == 0 || (uintptr_t)p % try_alignment == 0 );
return p;
}
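A hedged worked example of the aligned-grow arithmetic above (wasm page size of 64KiB; addresses are illustrative):

#include <assert.h>
#include <stdint.h>

static uintptr_t align_up(uintptr_t x, uintptr_t a) { return (x + a - 1) & ~(a - 1); }

int main(void) {
  const uintptr_t page    = 64 * 1024;                        // wasm page size
  const uintptr_t current = 3 * page + 40;                    // pretend current heap end (unaligned)
  const uintptr_t want    = 512 * 1024;                       // block size requested
  const uintptr_t aligned = align_up(current, 1024 * 1024);   // 1MiB alignment requested
  const uintptr_t grow    = align_up((aligned - current) + want, page);
  assert(current + grow >= aligned + want);                   // the grown region covers the aligned block
  return 0;
}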
#else
/* -----------------------------------------------------------
Raw allocation on Unix's (mmap)
-------------------------------------------------------------- */
#else
#define MI_OS_USE_MMAP
static void* mi_unix_mmapx(void* addr, size_t size, size_t try_alignment, int protect_flags, int flags, int fd) {
void* p = NULL;
#if (MI_INTPTR_SIZE >= 8) && !defined(MAP_ALIGNED)
// on 64-bit systems, use the virtual address area after 4TiB for 4MiB aligned allocations
void* hint;
if (addr == NULL && (hint = mi_os_get_aligned_hint(try_alignment, size)) != NULL) {
p = mmap(hint,size,protect_flags,flags,fd,0);
if (p==MAP_FAILED) p = NULL; // fall back to regular mmap
MI_UNUSED(try_alignment);
#if defined(MAP_ALIGNED) // BSD
if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) {
size_t n = mi_bsr(try_alignment);
if (((size_t)1 << n) == try_alignment && n >= 12 && n <= 30) { // alignment is a power of 2 and 4096 <= alignment <= 1GiB
flags |= MAP_ALIGNED(n);
void* p = mmap(addr, size, protect_flags, flags | MAP_ALIGNED(n), fd, 0);
if (p!=MAP_FAILED) return p;
// fall back to regular mmap
}
}
#elif defined(MAP_ALIGN) // Solaris
if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) {
void* p = mmap((void*)try_alignment, size, protect_flags, flags | MAP_ALIGN, fd, 0); // addr parameter is the required alignment
if (p!=MAP_FAILED) return p;
// fall back to regular mmap
}
#else
UNUSED(try_alignment);
UNUSED(mi_os_get_aligned_hint);
#endif
if (p==NULL) {
p = mmap(addr,size,protect_flags,flags,fd,0);
if (p==MAP_FAILED) p = NULL;
#if (MI_INTPTR_SIZE >= 8) && !defined(MAP_ALIGNED)
// on 64-bit systems, use the virtual address area after 2TiB for 4MiB aligned allocations
if (addr == NULL) {
void* hint = mi_os_get_aligned_hint(try_alignment, size);
if (hint != NULL) {
void* p = mmap(hint, size, protect_flags, flags, fd, 0);
if (p!=MAP_FAILED) return p;
// fall back to regular mmap
}
}
return p;
#endif
// regular mmap
void* p = mmap(addr, size, protect_flags, flags, fd, 0);
if (p!=MAP_FAILED) return p;
// failed to allocate
return NULL;
}
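On BSD the alignment is passed as its log2 through MAP_ALIGNED(n); a hedged worked example of that encoding for the 4MiB segment size:

#include <assert.h>
#include <stddef.h>

int main(void) {
  size_t align = (size_t)4 << 20;                      // 4MiB segment alignment
  int n = 0;
  while (((size_t)1 << (n + 1)) <= align) n++;         // poor man's mi_bsr
  assert(n == 22 && ((size_t)1 << n) == align);        // MAP_ALIGNED(22) requests 4MiB alignment
  return 0;
}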
static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int protect_flags, bool large_only, bool allow_large, bool* is_large) {
@ -361,28 +519,24 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro
#if !defined(MAP_NORESERVE)
#define MAP_NORESERVE 0
#endif
int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE;
int flags = MAP_PRIVATE | MAP_ANONYMOUS;
int fd = -1;
#if defined(MAP_ALIGNED) // BSD
if (try_alignment > 0) {
size_t n = mi_bsr(try_alignment);
if (((size_t)1 << n) == try_alignment && n >= 12 && n <= 30) { // alignment is a power of 2 and 4096 <= alignment <= 1GiB
flags |= MAP_ALIGNED(n);
}
}
#endif
if (_mi_os_has_overcommit()) {
flags |= MAP_NORESERVE;
}
#if defined(PROT_MAX)
protect_flags |= PROT_MAX(PROT_READ | PROT_WRITE); // BSD
#endif
#if defined(VM_MAKE_TAG)
  // macOS: tracking anonymous pages with a specific ID. (All up to 98 are taken officially but LLVM sanitizers had taken 99)
int os_tag = (int)mi_option_get(mi_option_os_tag);
if (os_tag < 100 || os_tag > 255) os_tag = 100;
if (os_tag < 100 || os_tag > 255) { os_tag = 100; }
fd = VM_MAKE_TAG(os_tag);
#endif
// huge page allocation
if ((large_only || use_large_os_page(size, try_alignment)) && allow_large) {
static _Atomic(uintptr_t) large_page_try_ok; // = 0;
uintptr_t try_ok = mi_atomic_load_acquire(&large_page_try_ok);
static _Atomic(size_t) large_page_try_ok; // = 0;
size_t try_ok = mi_atomic_load_acquire(&large_page_try_ok);
if (!large_only && try_ok > 0) {
// If the OS is not configured for large OS pages, or the user does not have
// enough permission, the `mmap` will always fail (but it might also fail for other reasons).
@ -401,7 +555,7 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro
#endif
#ifdef MAP_HUGE_1GB
static bool mi_huge_pages_available = true;
if ((size % GiB) == 0 && mi_huge_pages_available) {
if ((size % MI_GiB) == 0 && mi_huge_pages_available) {
lflags |= MAP_HUGE_1GB;
}
else
@ -428,37 +582,39 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro
#endif
if (large_only) return p;
if (p == NULL) {
mi_atomic_store_release(&large_page_try_ok, (uintptr_t)10); // on error, don't try again for the next N allocations
mi_atomic_store_release(&large_page_try_ok, (size_t)8); // on error, don't try again for the next N allocations
}
}
}
}
// regular allocation
if (p == NULL) {
*is_large = false;
p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, flags, fd);
#if defined(MADV_HUGEPAGE)
// Many Linux systems don't allow MAP_HUGETLB but they support instead
// transparent huge pages (THP). It is not required to call `madvise` with MADV_HUGE
// though since properly aligned allocations will already use large pages if available
// in that case -- in particular for our large regions (in `memory.c`).
// However, some systems only allow THP if called with explicit `madvise`, so
// when large OS pages are enabled for mimalloc, we call `madvise` anyways.
if (allow_large && use_large_os_page(size, try_alignment)) {
if (madvise(p, size, MADV_HUGEPAGE) == 0) {
*is_large = true; // possibly
};
}
#endif
#if defined(__sun)
if (allow_large && use_large_os_page(size, try_alignment)) {
struct memcntl_mha cmd = {0};
cmd.mha_pagesize = large_os_page_size;
cmd.mha_cmd = MHA_MAPSIZE_VA;
if (memcntl(p, size, MC_HAT_ADVISE, (caddr_t)&cmd, 0, 0) == 0) {
*is_large = true;
if (p != NULL) {
#if defined(MADV_HUGEPAGE)
// Many Linux systems don't allow MAP_HUGETLB but they support instead
    // transparent huge pages (THP). Generally, it is not required to call `madvise` with MADV_HUGEPAGE
// though since properly aligned allocations will already use large pages if available
// in that case -- in particular for our large regions (in `memory.c`).
// However, some systems only allow THP if called with explicit `madvise`, so
// when large OS pages are enabled for mimalloc, we call `madvise` anyways.
if (allow_large && use_large_os_page(size, try_alignment)) {
if (mi_madvise(p, size, MADV_HUGEPAGE) == 0) {
*is_large = true; // possibly
};
}
#elif defined(__sun)
if (allow_large && use_large_os_page(size, try_alignment)) {
struct memcntl_mha cmd = {0};
cmd.mha_pagesize = large_os_page_size;
cmd.mha_cmd = MHA_MAPSIZE_VA;
if (memcntl((caddr_t)p, size, MC_HAT_ADVISE, (caddr_t)&cmd, 0, 0) == 0) {
*is_large = true;
}
}
#endif
}
#endif
}
if (p == NULL) {
_mi_warning_message("unable to allocate OS memory (%zu bytes, error code: %i, address: %p, large only: %d, allow large: %d)\n", size, errno, addr, large_only, allow_large);
@ -468,8 +624,8 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro
#endif
// On 64-bit systems, we can do efficient aligned allocation by using
// the 4TiB to 30TiB area to allocate them.
#if (MI_INTPTR_SIZE >= 8) && (defined(_WIN32) || (defined(MI_OS_USE_MMAP) && !defined(MAP_ALIGNED)))
// the 2TiB to 30TiB area to allocate them.
#if (MI_INTPTR_SIZE >= 8) && (defined(_WIN32) || defined(MI_OS_USE_MMAP))
static mi_decl_cache_align _Atomic(uintptr_t) aligned_base;
// Return a 4MiB aligned address that is probably available.
@ -479,47 +635,52 @@ static mi_decl_cache_align _Atomic(uintptr_t) aligned_base;
// (otherwise an initial large allocation of say 2TiB has a 50% chance to include (known) addresses
// in the middle of the 2TiB - 6TiB address range (see issue #372))
#define KK_HINT_BASE ((uintptr_t)2 << 40) // 2TiB start
#define KK_HINT_AREA ((uintptr_t)4 << 40) // upto 6TiB (since before win8 there is "only" 8TiB available to processes)
#define KK_HINT_MAX ((uintptr_t)30 << 40) // wrap after 30TiB (area after 32TiB is used for huge OS pages)
#define MI_HINT_BASE ((uintptr_t)2 << 40) // 2TiB start
#define MI_HINT_AREA ((uintptr_t)4 << 40) // up to 6TiB (since before win8 there is "only" 8TiB available to processes)
#define MI_HINT_MAX ((uintptr_t)30 << 40) // wrap after 30TiB (area after 32TiB is used for huge OS pages)
static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size)
{
if (try_alignment == 0 || try_alignment > MI_SEGMENT_SIZE) return NULL;
if ((size%MI_SEGMENT_SIZE) != 0) return NULL;
if (size > 1*GiB) return NULL; // guarantee the chance of fixed valid address is at most 1/(KK_HINT_AREA / 1<<30) = 1/4096.
if (try_alignment <= 1 || try_alignment > MI_SEGMENT_SIZE) return NULL;
size = _mi_align_up(size, MI_SEGMENT_SIZE);
if (size > 1*MI_GiB) return NULL; // guarantee the chance of fixed valid address is at most 1/(MI_HINT_AREA / 1<<30) = 1/4096.
#if (MI_SECURE>0)
  size += MI_SEGMENT_SIZE; // put in `MI_SEGMENT_SIZE` virtual gaps between hinted blocks; this splits VMA's but increases guarded areas.
#endif
uintptr_t hint = mi_atomic_add_acq_rel(&aligned_base, size);
if (hint == 0 || hint > KK_HINT_MAX) { // wrap or initialize
uintptr_t init = KK_HINT_BASE;
if (hint == 0 || hint > MI_HINT_MAX) { // wrap or initialize
uintptr_t init = MI_HINT_BASE;
#if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of aligned allocations unless in debug mode
uintptr_t r = _mi_heap_random_next(mi_get_default_heap());
init = init + ((MI_SEGMENT_SIZE * ((r>>17) & 0xFFFFF)) % KK_HINT_AREA); // (randomly 20 bits)*4MiB == 0 to 4TiB
init = init + ((MI_SEGMENT_SIZE * ((r>>17) & 0xFFFFF)) % MI_HINT_AREA); // (randomly 20 bits)*4MiB == 0 to 4TiB
#endif
uintptr_t expected = hint + size;
mi_atomic_cas_strong_acq_rel(&aligned_base, &expected, init);
hint = mi_atomic_add_acq_rel(&aligned_base, size); // this may still give 0 or > KK_HINT_MAX but that is ok, it is a hint after all
hint = mi_atomic_add_acq_rel(&aligned_base, size); // this may still give 0 or > MI_HINT_MAX but that is ok, it is a hint after all
}
if (hint%try_alignment != 0) return NULL;
return (void*)hint;
}
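A hedged worked example of the hint arithmetic above (MI_SEGMENT_SIZE assumed 4MiB; the randomized start offset is omitted):

#include <assert.h>
#include <stdint.h>

int main(void) {
  const uintptr_t base = (uintptr_t)2 << 40;     // MI_HINT_BASE: 2TiB
  const uintptr_t seg  = (uintptr_t)4 << 20;     // segment size: 4MiB
  uintptr_t hint1 = base;                        // first hint handed out
  uintptr_t hint2 = base + seg;                  // next hint after a segment-sized bump
  assert(hint1 % seg == 0 && hint2 % seg == 0);  // hints stay segment aligned
  assert(hint2 < ((uintptr_t)30 << 40));         // well below the 30TiB wrap point
  return 0;
}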
#elif defined(__wasi__) || defined(MI_USE_SBRK)
// no need for mi_os_get_aligned_hint
#else
static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size) {
UNUSED(try_alignment); UNUSED(size);
MI_UNUSED(try_alignment); MI_UNUSED(size);
return NULL;
}
#endif
/* -----------------------------------------------------------
Primitive allocation from the OS.
-------------------------------------------------------------- */
// Primitive allocation from the OS.
// Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned.
static void* mi_os_mem_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, mi_stats_t* stats) {
mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0);
if (size == 0) return NULL;
if (!commit) allow_large = false;
if (try_alignment == 0) try_alignment = 1; // avoid 0 to ensure there will be no divide by zero when aligning
void* p = NULL;
/*
@ -536,9 +697,10 @@ static void* mi_os_mem_alloc(size_t size, size_t try_alignment, bool commit, boo
int flags = MEM_RESERVE;
if (commit) flags |= MEM_COMMIT;
p = mi_win_virtual_alloc(NULL, size, try_alignment, flags, false, allow_large, is_large);
#elif defined(__wasi__)
#elif defined(MI_USE_SBRK) || defined(__wasi__)
MI_UNUSED(allow_large);
*is_large = false;
p = mi_wasm_heap_grow(size, try_alignment);
p = mi_heap_grow(size, try_alignment);
#else
int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE);
p = mi_unix_mmap(NULL, size, try_alignment, protect_flags, false, allow_large, is_large);
@ -593,6 +755,10 @@ static void* mi_os_mem_alloc_aligned(size_t size, size_t alignment, bool commit,
mi_os_mem_free(p, over_size, commit, stats);
void* aligned_p = mi_align_up_ptr(p, alignment);
p = mi_win_virtual_alloc(aligned_p, size, alignment, flags, false, allow_large, is_large);
if (p != NULL) {
_mi_stat_increase(&stats->reserved, size);
if (commit) { _mi_stat_increase(&stats->committed, size); }
}
if (p == aligned_p) break; // success!
if (p != NULL) { // should not happen?
mi_os_mem_free(p, size, commit, stats);
@ -602,9 +768,9 @@ static void* mi_os_mem_alloc_aligned(size_t size, size_t alignment, bool commit,
}
#else
// overallocate...
p = mi_os_mem_alloc(over_size, alignment, commit, false, is_large, stats);
p = mi_os_mem_alloc(over_size, 1, commit, false, is_large, stats);
if (p == NULL) return NULL;
// and selectively unmap parts around the over-allocated area.
// and selectively unmap parts around the over-allocated area. (noop on sbrk)
void* aligned_p = mi_align_up_ptr(p, alignment);
size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p;
size_t mid_size = _mi_align_up(size, _mi_os_page_size());
@ -612,7 +778,7 @@ static void* mi_os_mem_alloc_aligned(size_t size, size_t alignment, bool commit,
mi_assert_internal(pre_size < over_size && post_size < over_size && mid_size >= size);
if (pre_size > 0) mi_os_mem_free(p, pre_size, commit, stats);
if (post_size > 0) mi_os_mem_free((uint8_t*)aligned_p + mid_size, post_size, commit, stats);
// we can return the aligned pointer on `mmap` systems
// we can return the aligned pointer on `mmap` (and sbrk) systems
p = aligned_p;
#endif
}
@ -626,7 +792,7 @@ static void* mi_os_mem_alloc_aligned(size_t size, size_t alignment, bool commit,
----------------------------------------------------------- */
void* _mi_os_alloc(size_t size, mi_stats_t* tld_stats) {
UNUSED(tld_stats);
MI_UNUSED(tld_stats);
mi_stats_t* stats = &_mi_stats_main;
if (size == 0) return NULL;
size = _mi_os_good_alloc_size(size);
@ -635,7 +801,7 @@ void* _mi_os_alloc(size_t size, mi_stats_t* tld_stats) {
}
void _mi_os_free_ex(void* p, size_t size, bool was_committed, mi_stats_t* tld_stats) {
UNUSED(tld_stats);
MI_UNUSED(tld_stats);
mi_stats_t* stats = &_mi_stats_main;
if (size == 0 || p == NULL) return;
size = _mi_os_good_alloc_size(size);
@ -648,7 +814,7 @@ void _mi_os_free(void* p, size_t size, mi_stats_t* stats) {
void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_stats_t* tld_stats)
{
UNUSED(tld_stats);
MI_UNUSED(tld_stats);
if (size == 0) return NULL;
size = _mi_os_good_alloc_size(size);
alignment = _mi_align_up(alignment, _mi_os_page_size());
@ -699,7 +865,7 @@ static void mi_mprotect_hint(int err) {
" > sudo sysctl -w vm.max_map_count=262144\n");
}
#else
UNUSED(err);
MI_UNUSED(err);
#endif
}
@ -723,8 +889,7 @@ static bool mi_os_commitx(void* addr, size_t size, bool commit, bool conservativ
#if defined(_WIN32)
if (commit) {
// if the memory was already committed, the call succeeds but it is not zero'd
// *is_zero = true;
// *is_zero = true; // note: if the memory was already committed, the call succeeds but the memory is not zero'd
void* p = VirtualAlloc(start, csize, MEM_COMMIT, PAGE_READWRITE);
err = (p == start ? 0 : GetLastError());
}
@ -734,26 +899,43 @@ static bool mi_os_commitx(void* addr, size_t size, bool commit, bool conservativ
}
#elif defined(__wasi__)
// WebAssembly guests can't control memory protection
#elif defined(MAP_FIXED)
if (!commit) {
// use mmap with MAP_FIXED to discard the existing memory (and reduce commit charge)
void* p = mmap(start, csize, PROT_NONE, (MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE), -1, 0);
if (p != start) { err = errno; }
}
else {
// for commit, just change the protection
#elif 0 && defined(MAP_FIXED) && !defined(__APPLE__)
// Linux: disabled for now as mmap fixed seems much more expensive than MADV_DONTNEED (and splits VMA's?)
if (commit) {
// commit: just change the protection
err = mprotect(start, csize, (PROT_READ | PROT_WRITE));
if (err != 0) { err = errno; }
#if defined(MADV_FREE_REUSE)
while ((err = madvise(start, csize, MADV_FREE_REUSE)) != 0 && errno == EAGAIN) { errno = 0; }
#endif
}
else {
// decommit: use mmap with MAP_FIXED to discard the existing memory (and reduce rss)
const int fd = mi_unix_mmap_fd();
void* p = mmap(start, csize, PROT_NONE, (MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE), fd, 0);
if (p != start) { err = errno; }
}
#else
err = mprotect(start, csize, (commit ? (PROT_READ | PROT_WRITE) : PROT_NONE));
if (err != 0) { err = errno; }
// Linux, macOSX and others.
if (commit) {
// commit: ensure we can access the area
err = mprotect(start, csize, (PROT_READ | PROT_WRITE));
if (err != 0) { err = errno; }
}
else {
#if defined(MADV_DONTNEED) && MI_DEBUG == 0 && MI_SECURE == 0
// decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE)
// (on the other hand, MADV_FREE would be good enough.. it is just not reflected in the stats :-( )
err = madvise(start, csize, MADV_DONTNEED);
#else
// decommit: just disable access (also used in debug and secure mode to trap on illegal access)
err = mprotect(start, csize, PROT_NONE);
if (err != 0) { err = errno; }
#endif
//#if defined(MADV_FREE_REUSE)
// while ((err = mi_madvise(start, csize, MADV_FREE_REUSE)) != 0 && errno == EAGAIN) { errno = 0; }
//#endif
}
#endif
if (err != 0) {
_mi_warning_message("%s error: start: %p, csize: 0x%x, err: %i\n", commit ? "commit" : "decommit", start, csize, err);
_mi_warning_message("%s error: start: %p, csize: 0x%zx, err: %i\n", commit ? "commit" : "decommit", start, csize, err);
mi_mprotect_hint(err);
}
mi_assert_internal(err == 0);
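
On mmap-based systems the commit/decommit logic above reduces to a simple pattern; a hedged sketch that ignores the Windows, wasi, and sbrk paths as well as the stat bookkeeping:

#include <sys/mman.h>

static int os_commitx_sketch(void* start, size_t csize, int commit) {
  if (commit) {
    // commit: make the pages accessible again
    return mprotect(start, csize, PROT_READ | PROT_WRITE);
  }
#if defined(MADV_DONTNEED)
  // decommit: let the kernel drop the page contents (reduces rss) but keep the range reserved
  return madvise(start, csize, MADV_DONTNEED);
#else
  // decommit: at least remove access so stray use traps
  return mprotect(start, csize, PROT_NONE);
#endif
}
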
@ -761,13 +943,13 @@ static bool mi_os_commitx(void* addr, size_t size, bool commit, bool conservativ
}
bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats) {
UNUSED(tld_stats);
MI_UNUSED(tld_stats);
mi_stats_t* stats = &_mi_stats_main;
return mi_os_commitx(addr, size, true, false /* liberal */, is_zero, stats);
}
bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* tld_stats) {
UNUSED(tld_stats);
MI_UNUSED(tld_stats);
mi_stats_t* stats = &_mi_stats_main;
bool is_zero;
return mi_os_commitx(addr, size, false, true /* conservative */, &is_zero, stats);
@ -808,27 +990,22 @@ static bool mi_os_resetx(void* addr, size_t size, bool reset, mi_stats_t* stats)
if (p != start) return false;
#else
#if defined(MADV_FREE)
#if defined(MADV_FREE_REUSABLE)
#define KK_MADV_FREE_INITIAL MADV_FREE_REUSABLE
#else
#define KK_MADV_FREE_INITIAL MADV_FREE
#endif
static _Atomic(uintptr_t) advice = ATOMIC_VAR_INIT(KK_MADV_FREE_INITIAL);
static _Atomic(size_t) advice = ATOMIC_VAR_INIT(MADV_FREE);
int oadvice = (int)mi_atomic_load_relaxed(&advice);
int err;
while ((err = madvise(start, csize, oadvice)) != 0 && errno == EAGAIN) { errno = 0; };
if (err != 0 && errno == EINVAL && oadvice == KK_MADV_FREE_INITIAL) {
// if MADV_FREE/MADV_FREE_REUSABLE is not supported, fall back to MADV_DONTNEED from now on
mi_atomic_store_release(&advice, (uintptr_t)MADV_DONTNEED);
err = madvise(start, csize, MADV_DONTNEED);
while ((err = mi_madvise(start, csize, oadvice)) != 0 && errno == EAGAIN) { errno = 0; };
if (err != 0 && errno == EINVAL && oadvice == MADV_FREE) {
// if MADV_FREE is not supported, fall back to MADV_DONTNEED from now on
mi_atomic_store_release(&advice, (size_t)MADV_DONTNEED);
err = mi_madvise(start, csize, MADV_DONTNEED);
}
#elif defined(__wasi__)
int err = 0;
#else
int err = madvise(start, csize, MADV_DONTNEED);
int err = mi_madvise(start, csize, MADV_DONTNEED);
#endif
if (err != 0) {
_mi_warning_message("madvise reset error: start: %p, csize: 0x%x, errno: %i\n", start, csize, errno);
_mi_warning_message("madvise reset error: start: %p, csize: 0x%zx, errno: %i\n", start, csize, errno);
}
//mi_assert(err == 0);
if (err != 0) return false;
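
A sketch of the advice downgrade used above, assuming C11 atomics and a platform that defines MADV_FREE: the first EINVAL permanently switches the process to MADV_DONTNEED so later resets never retry the unsupported flag.

#include <errno.h>
#include <stdatomic.h>
#include <sys/mman.h>

static int os_reset_sketch(void* start, size_t csize) {
  static _Atomic int advice = MADV_FREE;
  int adv = atomic_load(&advice);
  int err = madvise(start, csize, adv);
  if (err != 0 && errno == EINVAL && adv == MADV_FREE) {
    atomic_store(&advice, MADV_DONTNEED);   // MADV_FREE unsupported: downgrade once, process-wide
    err = madvise(start, csize, MADV_DONTNEED);
  }
  return err;
}
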
@ -841,7 +1018,7 @@ static bool mi_os_resetx(void* addr, size_t size, bool reset, mi_stats_t* stats)
// pages and reduce swapping while keeping the memory committed.
// We page align to a conservative area inside the range to reset.
bool _mi_os_reset(void* addr, size_t size, mi_stats_t* tld_stats) {
UNUSED(tld_stats);
MI_UNUSED(tld_stats);
mi_stats_t* stats = &_mi_stats_main;
if (mi_option_is_enabled(mi_option_reset_decommits)) {
return _mi_os_decommit(addr, size, stats);
@ -852,7 +1029,7 @@ bool _mi_os_reset(void* addr, size_t size, mi_stats_t* tld_stats) {
}
bool _mi_os_unreset(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats) {
UNUSED(tld_stats);
MI_UNUSED(tld_stats);
mi_stats_t* stats = &_mi_stats_main;
if (mi_option_is_enabled(mi_option_reset_decommits)) {
return mi_os_commit_unreset(addr, size, is_zero, stats); // re-commit it (conservatively!)
@ -887,7 +1064,7 @@ static bool mi_os_protectx(void* addr, size_t size, bool protect) {
if (err != 0) { err = errno; }
#endif
if (err != 0) {
_mi_warning_message("mprotect error: start: %p, csize: 0x%x, err: %i\n", start, csize, err);
_mi_warning_message("mprotect error: start: %p, csize: 0x%zx, err: %i\n", start, csize, err);
mi_mprotect_hint(err);
}
return (err == 0);
@ -928,12 +1105,12 @@ bool _mi_os_shrink(void* p, size_t oldsize, size_t newsize, mi_stats_t* stats) {
Support for allocating huge OS pages (1Gib) that are reserved up-front
and possibly associated with a specific NUMA node. (use `numa_node>=0`)
-----------------------------------------------------------------------------*/
#define MI_HUGE_OS_PAGE_SIZE (GiB)
#define MI_HUGE_OS_PAGE_SIZE (MI_GiB)
#if defined(_WIN32) && (MI_INTPTR_SIZE >= 8)
static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node)
{
mi_assert_internal(size%GiB == 0);
mi_assert_internal(size%MI_GiB == 0);
mi_assert_internal(addr != NULL);
const DWORD flags = MEM_LARGE_PAGES | MEM_COMMIT | MEM_RESERVE;
@ -964,7 +1141,7 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node)
else {
// fall back to regular large pages
mi_huge_pages_available = false; // don't try further huge pages
_mi_warning_message("unable to allocate using huge (1gb) pages, trying large (2mb) pages instead (status 0x%lx)\n", err);
_mi_warning_message("unable to allocate using huge (1GiB) pages, trying large (2MiB) pages instead (status 0x%lx)\n", err);
}
}
// on modern Windows try use VirtualAlloc2 for numa aware large OS page allocation
@ -974,7 +1151,7 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node)
return (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, params, 1);
}
#else
UNUSED(numa_node);
MI_UNUSED(numa_node);
#endif
// otherwise use regular virtual alloc on older windows
return VirtualAlloc(addr, size, flags, PAGE_READWRITE);
@ -991,30 +1168,30 @@ static long mi_os_mbind(void* start, unsigned long len, unsigned long mode, cons
}
#else
static long mi_os_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) {
UNUSED(start); UNUSED(len); UNUSED(mode); UNUSED(nmask); UNUSED(maxnode); UNUSED(flags);
MI_UNUSED(start); MI_UNUSED(len); MI_UNUSED(mode); MI_UNUSED(nmask); MI_UNUSED(maxnode); MI_UNUSED(flags);
return 0;
}
#endif
static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) {
mi_assert_internal(size%GiB == 0);
mi_assert_internal(size%MI_GiB == 0);
bool is_large = true;
void* p = mi_unix_mmap(addr, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large);
if (p == NULL) return NULL;
if (numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { // at most 64 nodes
uintptr_t numa_mask = (1UL << numa_node);
unsigned long numa_mask = (1UL << numa_node);
// TODO: does `mbind` work correctly for huge OS pages? should we
// use `set_mempolicy` before calling mmap instead?
// see: <https://lkml.org/lkml/2017/2/9/875>
long err = mi_os_mbind(p, size, MPOL_PREFERRED, &numa_mask, 8*MI_INTPTR_SIZE, 0);
if (err != 0) {
_mi_warning_message("failed to bind huge (1gb) pages to numa node %d: %s\n", numa_node, strerror(errno));
_mi_warning_message("failed to bind huge (1GiB) pages to numa node %d: %s\n", numa_node, strerror(errno));
}
}
return p;
}
#else
static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) {
UNUSED(addr); UNUSED(size); UNUSED(numa_node);
MI_UNUSED(addr); MI_UNUSED(size); MI_UNUSED(numa_node);
return NULL;
}
#endif
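
For reference, the NUMA binding step on Linux comes down to a single raw mbind syscall over the freshly mapped huge-page range; a hedged sketch (Linux-only; the MPOL_PREFERRED value and the helper name `bind_to_node` are assumptions here):

#include <sys/syscall.h>
#include <unistd.h>
#ifndef MPOL_PREFERRED
#define MPOL_PREFERRED 1   // Linux mempolicy mode; defined here only for this sketch
#endif

static long bind_to_node(void* start, size_t size, int numa_node) {
  unsigned long numa_mask = 1UL << numa_node;   // one bit per node, at most 64 nodes
  return syscall(SYS_mbind, start, (unsigned long)size, (unsigned long)MPOL_PREFERRED,
                 &numa_mask, sizeof(numa_mask) * 8, 0u /* flags */);
}
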
@ -1050,7 +1227,7 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) {
}
#else
static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) {
UNUSED(pages);
MI_UNUSED(pages);
if (total_size != NULL) *total_size = 0;
return NULL;
}
@ -1104,8 +1281,8 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse
}
}
mi_assert_internal(page*MI_HUGE_OS_PAGE_SIZE <= size);
if (pages_reserved != NULL) *pages_reserved = page;
if (psize != NULL) *psize = page * MI_HUGE_OS_PAGE_SIZE;
if (pages_reserved != NULL) { *pages_reserved = page; }
if (psize != NULL) { *psize = page * MI_HUGE_OS_PAGE_SIZE; }
return (page == 0 ? NULL : start);
}
@ -1117,6 +1294,7 @@ void _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats) {
while (size >= MI_HUGE_OS_PAGE_SIZE) {
_mi_os_free(base, MI_HUGE_OS_PAGE_SIZE, stats);
size -= MI_HUGE_OS_PAGE_SIZE;
base += MI_HUGE_OS_PAGE_SIZE;
}
}
@ -1193,6 +1371,35 @@ static size_t mi_os_numa_node_countx(void) {
}
return (node+1);
}
#elif defined(__FreeBSD__) && __FreeBSD_version >= 1200000
static size_t mi_os_numa_nodex(void) {
domainset_t dom;
size_t node;
int policy;
if (cpuset_getdomain(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, sizeof(dom), &dom, &policy) == -1) return 0ul;
for (node = 0; node < MAXMEMDOM; node++) {
if (DOMAINSET_ISSET(node, &dom)) return node;
}
return 0ul;
}
static size_t mi_os_numa_node_countx(void) {
size_t ndomains = 0;
size_t len = sizeof(ndomains);
if (sysctlbyname("vm.ndomains", &ndomains, &len, NULL, 0) == -1) return 0ul;
return ndomains;
}
#elif defined(__DragonFly__)
static size_t mi_os_numa_nodex(void) {
// TODO: DragonFly does not seem to provide any userland means to get this information.
return 0ul;
}
static size_t mi_os_numa_node_countx(void) {
size_t ncpus = 0, nvirtcoresperphys = 0;
size_t len = sizeof(size_t);
if (sysctlbyname("hw.ncpu", &ncpus, &len, NULL, 0) == -1) return 0ul;
if (sysctlbyname("hw.cpu_topology_ht_ids", &nvirtcoresperphys, &len, NULL, 0) == -1) return 0ul;
return nvirtcoresperphys * ncpus;
}
#else
static size_t mi_os_numa_nodex(void) {
return 0;
@ -1222,7 +1429,7 @@ size_t _mi_os_numa_node_count_get(void) {
}
int _mi_os_numa_node_get(mi_os_tld_t* tld) {
UNUSED(tld);
MI_UNUSED(tld);
size_t numa_count = _mi_os_numa_node_count();
if (numa_count<=1) return 0; // optimize on single numa node systems: always node 0
// never more than the node count and >= 0

View file

@ -53,7 +53,7 @@ static inline bool mi_page_queue_is_special(const mi_page_queue_t* pq) {
// Returns MI_BIN_HUGE if the size is too large.
// We use `wsize` for the size in "machine word sizes",
// i.e. byte size == `wsize*sizeof(void*)`.
extern inline uint8_t _mi_bin(size_t size) {
static inline uint8_t mi_bin(size_t size) {
size_t wsize = _mi_wsize_from_size(size);
uint8_t bin;
if (wsize <= 1) {
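
A quick illustration of the wsize convention the bin calculation relies on, assuming a 64-bit platform where a machine word is 8 bytes (the helper mirrors what `_mi_wsize_from_size` does, but is only a sketch):

#include <stddef.h>

static inline size_t wsize_from_size(size_t size) {
  return (size + sizeof(void*) - 1) / sizeof(void*);   // round up to whole machine words
}
// e.g. a 100-byte request has wsize (100 + 7)/8 == 13, so it is binned by 13 words
// rather than by its byte size.
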
@ -98,6 +98,10 @@ extern inline uint8_t _mi_bin(size_t size) {
Queue of pages with free blocks
----------------------------------------------------------- */
uint8_t _mi_bin(size_t size) {
return mi_bin(size);
}
size_t _mi_bin_size(uint8_t bin) {
return _mi_heap_empty.pages[bin].block_size;
}
@ -105,7 +109,7 @@ size_t _mi_bin_size(uint8_t bin) {
// Good size for allocation
size_t mi_good_size(size_t size) mi_attr_noexcept {
if (size <= MI_LARGE_OBJ_SIZE_MAX) {
return _mi_bin_size(_mi_bin(size));
return _mi_bin_size(mi_bin(size));
}
else {
return _mi_align_up(size,_mi_os_page_size());
@ -134,7 +138,7 @@ static bool mi_heap_contains_queue(const mi_heap_t* heap, const mi_page_queue_t*
#endif
static mi_page_queue_t* mi_page_queue_of(const mi_page_t* page) {
uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : _mi_bin(page->xblock_size));
uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : mi_bin(page->xblock_size));
mi_heap_t* heap = mi_page_heap(page);
mi_assert_internal(heap != NULL && bin <= MI_BIN_FULL);
mi_page_queue_t* pq = &heap->pages[bin];
@ -144,7 +148,7 @@ static mi_page_queue_t* mi_page_queue_of(const mi_page_t* page) {
}
static mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t* page) {
uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : _mi_bin(page->xblock_size));
uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : mi_bin(page->xblock_size));
mi_assert_internal(bin <= MI_BIN_FULL);
mi_page_queue_t* pq = &heap->pages[bin];
mi_assert_internal(mi_page_is_in_full(page) || page->xblock_size == pq->block_size);
@ -177,9 +181,9 @@ static inline void mi_heap_queue_first_update(mi_heap_t* heap, const mi_page_que
}
else {
// find previous size; due to minimal alignment up to 3 previous bins may need to be skipped
uint8_t bin = _mi_bin(size);
uint8_t bin = mi_bin(size);
const mi_page_queue_t* prev = pq - 1;
while( bin == _mi_bin(prev->block_size) && prev > &heap->pages[0]) {
while( bin == mi_bin(prev->block_size) && prev > &heap->pages[0]) {
prev--;
}
start = 1 + _mi_wsize_from_size(prev->block_size);

View file

@ -7,7 +7,7 @@ terms of the MIT license. A copy of the license can be found in the file
/* -----------------------------------------------------------
The core of the allocator. Every segment contains
pages of a {certain block size. The main function
pages of a certain block size. The main function
exported is `mi_malloc_generic`.
----------------------------------------------------------- */
@ -30,7 +30,7 @@ terms of the MIT license. A copy of the license can be found in the file
// Index a block in a page
static inline mi_block_t* mi_page_block_at(const mi_page_t* page, void* page_start, size_t block_size, size_t i) {
UNUSED(page);
MI_UNUSED(page);
mi_assert_internal(page != NULL);
mi_assert_internal(i <= page->reserved);
return (mi_block_t*)((uint8_t*)page_start + (i * block_size));
@ -84,9 +84,10 @@ static bool mi_page_is_valid_init(mi_page_t* page) {
mi_assert_internal(mi_page_list_is_valid(page,page->local_free));
#if MI_DEBUG>3 // generally too expensive to check this
if (page->flags.is_zero) {
for(mi_block_t* block = page->free; block != NULL; mi_block_next(page,block)) {
mi_assert_expensive(mi_mem_is_zero(block + 1, page->block_size - sizeof(mi_block_t)));
if (page->is_zero) {
const size_t ubsize = mi_page_usable_block_size(page);
for(mi_block_t* block = page->free; block != NULL; block = mi_block_next(page,block)) {
mi_assert_expensive(mi_mem_is_zero(block + 1, ubsize - sizeof(mi_block_t)));
}
}
#endif
@ -385,7 +386,7 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) {
// Note: called from `mi_free` and benchmarks often
// trigger this due to freeing everything and then
// allocating again so careful when changing this.
void _mi_page_retire(mi_page_t* page) {
void _mi_page_retire(mi_page_t* page) mi_attr_noexcept {
mi_assert_internal(page != NULL);
mi_assert_expensive(_mi_page_is_valid(page));
mi_assert_internal(mi_page_all_free(page));
@ -458,7 +459,7 @@ void _mi_heap_collect_retired(mi_heap_t* heap, bool force) {
#define MI_MIN_SLICES (2)
static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* const page, const size_t bsize, const size_t extend, mi_stats_t* const stats) {
UNUSED(stats);
MI_UNUSED(stats);
#if (MI_SECURE<=2)
mi_assert_internal(page->free == NULL);
mi_assert_internal(page->local_free == NULL);
@ -516,7 +517,7 @@ static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* co
static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, const size_t bsize, const size_t extend, mi_stats_t* const stats)
{
UNUSED(stats);
MI_UNUSED(stats);
#if (MI_SECURE <= 2)
mi_assert_internal(page->free == NULL);
mi_assert_internal(page->local_free == NULL);
@ -573,13 +574,16 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld)
// calculate the extend count
const size_t bsize = (page->xblock_size < MI_HUGE_BLOCK_SIZE ? page->xblock_size : page_size);
size_t extend = page->reserved - page->capacity;
size_t max_extend = (bsize >= MI_MAX_EXTEND_SIZE ? MI_MIN_EXTEND : MI_MAX_EXTEND_SIZE/(uint32_t)bsize);
if (max_extend < MI_MIN_EXTEND) max_extend = MI_MIN_EXTEND;
mi_assert_internal(extend > 0);
size_t max_extend = (bsize >= MI_MAX_EXTEND_SIZE ? MI_MIN_EXTEND : MI_MAX_EXTEND_SIZE/(uint32_t)bsize);
if (max_extend < MI_MIN_EXTEND) { max_extend = MI_MIN_EXTEND; }
mi_assert_internal(max_extend > 0);
if (extend > max_extend) {
// ensure we don't touch memory beyond the page to reduce page commit.
// the `lean` benchmark tests this. Going from 1 to 8 increases rss by 50%.
extend = (max_extend==0 ? 1 : max_extend);
extend = max_extend;
}
mi_assert_internal(extend > 0 && extend + page->capacity <= page->reserved);
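
To make the extend heuristic concrete: assuming the defaults at the time (MI_MAX_EXTEND_SIZE of 4KiB and MI_MIN_EXTEND of 1), a page with 64-byte blocks grows its free list by at most 4096/64 = 64 blocks per call, so roughly one OS page of fresh memory is touched, while a page whose blocks are 4KiB or larger is extended by just a single block.
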
@ -620,7 +624,11 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
page->keys[0] = _mi_heap_random_next(heap);
page->keys[1] = _mi_heap_random_next(heap);
#endif
#if MI_DEBUG > 0
page->is_zero = false; // ensure in debug mode we initialize with MI_DEBUG_UNINIT, see issue #501
#else
page->is_zero = page->is_zero_init;
#endif
mi_assert_internal(page->capacity == 0);
mi_assert_internal(page->free == NULL);
@ -760,7 +768,7 @@ void mi_register_deferred_free(mi_deferred_free_fun* fn, void* arg) mi_attr_noex
// that frees the block can free the whole page and segment directly.
static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size) {
size_t block_size = _mi_os_good_alloc_size(size);
mi_assert_internal(_mi_bin(block_size) == MI_BIN_HUGE);
mi_assert_internal(mi_bin(block_size) == MI_BIN_HUGE);
mi_page_t* page = mi_page_fresh_alloc(heap,NULL,block_size);
if (page != NULL) {
const size_t bsize = mi_page_block_size(page); // note: not `mi_page_usable_block_size` as `size` includes padding already

View file

@ -4,6 +4,10 @@ This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
#ifndef _DEFAULT_SOURCE
#define _DEFAULT_SOURCE // for syscall() on Linux
#endif
#include "mimalloc.h"
#include "mimalloc-internal.h"
@ -156,7 +160,8 @@ uintptr_t _mi_random_next(mi_random_ctx_t* ctx) {
/* ----------------------------------------------------------------------------
To initialize a fresh random context we rely on the OS:
- Windows : BCryptGenRandom (or RtlGenRandom)
- osX,bsd,wasi: arc4random_buf
- macOS : CCRandomGenerateBytes, arc4random_buf
- bsd,wasi : arc4random_buf
- Linux : getrandom,/dev/urandom
If we cannot get good randomness, we fall back to weak randomness based on a timer and ASLR.
-----------------------------------------------------------------------------*/
@ -164,7 +169,8 @@ If we cannot get good randomness, we fall back to weak randomness based on a tim
#if defined(_WIN32)
#if !defined(MI_USE_RTLGENRANDOM)
// We prefer BCryptGenRandom over RtlGenRandom
// We prefer to use BCryptGenRandom instead of RtlGenRandom but it can lead to a deadlock
// under the VS debugger when using dynamic overriding.
#pragma comment (lib,"bcrypt.lib")
#include <bcrypt.h>
static bool os_random_buf(void* buf, size_t buf_len) {
@ -186,16 +192,35 @@ static bool os_random_buf(void* buf, size_t buf_len) {
}
#endif
#elif defined(ANDROID) || defined(XP_DARWIN) || defined(__APPLE__) || defined(__DragonFly__) || \
#elif defined(__APPLE__)
#include <AvailabilityMacros.h>
#if defined(MAC_OS_X_VERSION_10_10) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_10
#include <CommonCrypto/CommonRandom.h>
#endif
static bool os_random_buf(void* buf, size_t buf_len) {
#if defined(MAC_OS_X_VERSION_10_15) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_15
// We prefer CCRandomGenerateBytes as it returns an error code while arc4random_buf
// may fail silently on macOS. See PR #390, and <https://opensource.apple.com/source/Libc/Libc-1439.40.11/gen/FreeBSD/arc4random.c.auto.html>
return (CCRandomGenerateBytes(buf, buf_len) == kCCSuccess);
#else
// fall back on older macOS
arc4random_buf(buf, buf_len);
return true;
#endif
}
#elif defined(__ANDROID__) || defined(__DragonFly__) || \
defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
defined(__sun) || defined(__wasi__)
defined(__sun) // todo: what to use with __wasi__?
#include <stdlib.h>
static bool os_random_buf(void* buf, size_t buf_len) {
arc4random_buf(buf, buf_len);
return true;
}
#elif defined(__linux__)
#elif defined(__linux__) || defined(__HAIKU__)
#if defined(__linux__)
#include <sys/syscall.h>
#endif
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
@ -214,7 +239,7 @@ static bool os_random_buf(void* buf, size_t buf_len) {
if (mi_atomic_load_acquire(&no_getrandom)==0) {
ssize_t ret = syscall(SYS_getrandom, buf, buf_len, GRND_NONBLOCK);
if (ret >= 0) return (buf_len == (size_t)ret);
if (ret != ENOSYS) return false;
if (errno != ENOSYS) return false;
mi_atomic_store_release(&no_getrandom, 1UL); // don't call again, and fall back to /dev/urandom
}
#endif
@ -251,8 +276,8 @@ static bool os_random_buf(void* buf, size_t buf_len) {
#include <time.h>
#endif
uintptr_t _os_random_weak(uintptr_t extra_seed) {
uintptr_t x = (uintptr_t)&_os_random_weak ^ extra_seed; // ASLR makes the address random
uintptr_t _mi_os_random_weak(uintptr_t extra_seed) {
uintptr_t x = (uintptr_t)&_mi_os_random_weak ^ extra_seed; // ASLR makes the address random
#if defined(_WIN32)
LARGE_INTEGER pcount;
@ -280,8 +305,10 @@ void _mi_random_init(mi_random_ctx_t* ctx) {
if (!os_random_buf(key, sizeof(key))) {
// if we fail to get random data from the OS, we fall back to a
// weak random source based on the current time
#if !defined(__wasi__)
_mi_warning_message("unable to use secure randomness\n");
uintptr_t x = _os_random_weak(0);
#endif
uintptr_t x = _mi_os_random_weak(0);
for (size_t i = 0; i < 8; i++) { // key is eight 32-bit words.
x = _mi_random_shuffle(x);
((uint32_t*)key)[i] = (uint32_t)x;

View file

@ -40,7 +40,7 @@ Possible issues:
#include "bitmap.h"
// Internal raw OS interface
size_t _mi_os_large_page_size();
size_t _mi_os_large_page_size(void);
bool _mi_os_protect(void* addr, size_t size);
bool _mi_os_unprotect(void* addr, size_t size);
bool _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
@ -57,9 +57,9 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, boo
// Constants
#if (MI_INTPTR_SIZE==8)
#define MI_HEAP_REGION_MAX_SIZE (256 * GiB) // 64KiB for the region map
#define MI_HEAP_REGION_MAX_SIZE (256 * MI_GiB) // 64KiB for the region map
#elif (MI_INTPTR_SIZE==4)
#define MI_HEAP_REGION_MAX_SIZE (3 * GiB) // ~ KiB for the region map
#define MI_HEAP_REGION_MAX_SIZE (3 * MI_GiB) // ~ KiB for the region map
#else
#error "define the maximum heap space allowed for regions on this platform"
#endif
@ -74,7 +74,7 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, boo
// Region info
typedef union mi_region_info_u {
uintptr_t value;
size_t value;
struct {
bool valid; // initialized?
bool is_large:1; // allocated in fixed large/huge OS pages
@ -87,21 +87,21 @@ typedef union mi_region_info_u {
// A region owns a chunk of REGION_SIZE (256MiB) (virtual) memory with
// a bit map with one bit per MI_SEGMENT_SIZE (4MiB) block.
typedef struct mem_region_s {
_Atomic(uintptr_t) info; // mi_region_info_t.value
_Atomic(size_t) info; // mi_region_info_t.value
_Atomic(void*) start; // start of the memory area
mi_bitmap_field_t in_use; // bit per in-use block
mi_bitmap_field_t dirty; // track if non-zero per block
mi_bitmap_field_t commit; // track if committed per block
mi_bitmap_field_t reset; // track if reset per block
_Atomic(uintptr_t) arena_memid; // if allocated from a (huge page) arena
uintptr_t padding; // round to 8 fields
_Atomic(size_t) arena_memid; // if allocated from a (huge page) arena
_Atomic(size_t) padding; // round to 8 fields (needs to be atomic for msvc, see issue #508)
} mem_region_t;
// The region map
static mem_region_t regions[MI_REGION_MAX];
// Allocated regions
static _Atomic(uintptr_t) regions_count; // = 0;
static _Atomic(size_t) regions_count; // = 0;
/* ----------------------------------------------------------------------------
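
The sizes quoted in the comments follow from this layout: assuming 64-bit bitmap fields and 4MiB segments, one region covers 64 x 4MiB = 256MiB, so the 256GiB maximum needs 1024 regions, and at 8 fields of 8 bytes each the static `regions` array occupies 1024 x 64 bytes = 64KiB, which is the "64KiB for the region map" noted above.
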
@ -186,21 +186,21 @@ static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large,
mi_assert_internal(!region_large || region_commit);
// claim a fresh slot
const uintptr_t idx = mi_atomic_increment_acq_rel(&regions_count);
const size_t idx = mi_atomic_increment_acq_rel(&regions_count);
if (idx >= MI_REGION_MAX) {
mi_atomic_decrement_acq_rel(&regions_count);
_mi_arena_free(start, MI_REGION_SIZE, arena_memid, region_commit, tld->stats);
_mi_warning_message("maximum regions used: %zu GiB (perhaps recompile with a larger setting for MI_HEAP_REGION_MAX_SIZE)", _mi_divide_up(MI_HEAP_REGION_MAX_SIZE, GiB));
_mi_warning_message("maximum regions used: %zu GiB (perhaps recompile with a larger setting for MI_HEAP_REGION_MAX_SIZE)", _mi_divide_up(MI_HEAP_REGION_MAX_SIZE, MI_GiB));
return false;
}
// allocated, initialize and claim the initial blocks
mem_region_t* r = &regions[idx];
r->arena_memid = arena_memid;
mi_atomic_store_release(&r->in_use, (uintptr_t)0);
mi_atomic_store_release(&r->in_use, (size_t)0);
mi_atomic_store_release(&r->dirty, (is_zero ? 0 : MI_BITMAP_FIELD_FULL));
mi_atomic_store_release(&r->commit, (region_commit ? MI_BITMAP_FIELD_FULL : 0));
mi_atomic_store_release(&r->reset, (uintptr_t)0);
mi_atomic_store_release(&r->reset, (size_t)0);
*bit_idx = 0;
_mi_bitmap_claim(&r->in_use, 1, blocks, *bit_idx, NULL);
mi_atomic_store_ptr_release(void,&r->start, start);
@ -441,7 +441,7 @@ void _mi_mem_free(void* p, size_t size, size_t id, bool full_commit, bool any_re
// and unclaim
bool all_unclaimed = mi_bitmap_unclaim(&region->in_use, 1, blocks, bit_idx);
mi_assert_internal(all_unclaimed); UNUSED(all_unclaimed);
mi_assert_internal(all_unclaimed); MI_UNUSED(all_unclaimed);
}
}
@ -451,21 +451,21 @@ void _mi_mem_free(void* p, size_t size, size_t id, bool full_commit, bool any_re
-----------------------------------------------------------------------------*/
void _mi_mem_collect(mi_os_tld_t* tld) {
// free every region that has no segments in use.
uintptr_t rcount = mi_atomic_load_relaxed(&regions_count);
size_t rcount = mi_atomic_load_relaxed(&regions_count);
for (size_t i = 0; i < rcount; i++) {
mem_region_t* region = &regions[i];
if (mi_atomic_load_relaxed(&region->info) != 0) {
// if no segments used, try to claim the whole region
uintptr_t m = mi_atomic_load_relaxed(&region->in_use);
size_t m = mi_atomic_load_relaxed(&region->in_use);
while (m == 0 && !mi_atomic_cas_weak_release(&region->in_use, &m, MI_BITMAP_FIELD_FULL)) { /* nothing */ };
if (m == 0) {
// on success, free the whole region
uint8_t* start = (uint8_t*)mi_atomic_load_ptr_acquire(uint8_t,&regions[i].start);
size_t arena_memid = mi_atomic_load_relaxed(&regions[i].arena_memid);
uintptr_t commit = mi_atomic_load_relaxed(&regions[i].commit);
memset(&regions[i], 0, sizeof(mem_region_t));
size_t commit = mi_atomic_load_relaxed(&regions[i].commit);
memset((void*)&regions[i], 0, sizeof(mem_region_t)); // cast to void* to avoid atomic warning
// and release the whole region
mi_atomic_store_release(&region->info, (uintptr_t)0);
mi_atomic_store_release(&region->info, (size_t)0);
if (start != NULL) { // && !_mi_os_is_huge_reserved(start)) {
_mi_abandoned_await_readers(); // ensure no pending reads
_mi_arena_free(start, MI_REGION_SIZE, arena_memid, (~commit == 0), tld->stats);

View file

@ -17,14 +17,14 @@ static uint8_t* mi_segment_raw_page_start(const mi_segment_t* segment, const mi_
/* --------------------------------------------------------------------------------
Segment allocation
We allocate pages inside bigger "segments" (4mb on 64-bit). This is to avoid
We allocate pages inside bigger "segments" (4MiB on 64-bit). This is to avoid
splitting VMA's on Linux and reduce fragmentation on other OS's.
Each thread owns its own segments.
Currently we have:
- small pages (64kb), 64 in one segment
- medium pages (512kb), 8 in one segment
- large pages (4mb), 1 in one segment
- small pages (64KiB), 64 in one segment
- medium pages (512KiB), 8 in one segment
- large pages (4MiB), 1 in one segment
- huge blocks > MI_LARGE_OBJ_SIZE_MAX become large segment with 1 page
In any case the memory for a segment is virtual and usually committed on demand.
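
These page sizes are chosen so that each kind fills a segment exactly: 64 x 64KiB = 4MiB, 8 x 512KiB = 4MiB, and 1 x 4MiB = 4MiB, which is why a segment only ever contains pages of a single kind.
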
@ -579,7 +579,10 @@ static mi_segment_t* mi_segment_init(mi_segment_t* segment, size_t required, mi_
mi_assert_internal(segment_size >= required);
// Initialize parameters
const bool eager_delayed = (page_kind <= MI_PAGE_MEDIUM && tld->count < (size_t)mi_option_get(mi_option_eager_commit_delay));
const bool eager_delayed = (page_kind <= MI_PAGE_MEDIUM && // don't delay for large objects
!_mi_os_has_overcommit() && // never delay on overcommit systems
_mi_current_thread_count() > 2 && // do not delay for the first N threads
tld->count < (size_t)mi_option_get(mi_option_eager_commit_delay));
const bool eager = !eager_delayed && mi_option_is_enabled(mi_option_eager_commit);
bool commit = eager; // || (page_kind >= MI_PAGE_LARGE);
bool pages_still_good = false;
@ -655,8 +658,9 @@ static mi_segment_t* mi_segment_init(mi_segment_t* segment, size_t required, mi_
memset((uint8_t*)segment + ofs, 0, info_size - ofs);
// initialize pages info
for (uint8_t i = 0; i < capacity; i++) {
segment->pages[i].segment_idx = i;
for (size_t i = 0; i < capacity; i++) {
mi_assert_internal(i <= 255);
segment->pages[i].segment_idx = (uint8_t)i;
segment->pages[i].is_reset = false;
segment->pages[i].is_committed = commit;
segment->pages[i].is_zero_init = is_zero;
@ -695,7 +699,7 @@ static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind,
}
static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t* tld) {
UNUSED(force);
MI_UNUSED(force);
mi_assert(segment != NULL);
// note: don't reset pages even on abandon as the whole segment is freed? (and ready for reuse)
bool force_reset = (force && mi_option_is_enabled(mi_option_abandoned_page_reset));
@ -896,13 +900,13 @@ static mi_decl_cache_align _Atomic(mi_segment_t*) abandoned_visited; // =
static mi_decl_cache_align _Atomic(mi_tagged_segment_t) abandoned; // = NULL
// Maintain these for debug purposes (these counts may be a bit off)
static mi_decl_cache_align _Atomic(uintptr_t) abandoned_count;
static mi_decl_cache_align _Atomic(uintptr_t) abandoned_visited_count;
static mi_decl_cache_align _Atomic(size_t) abandoned_count;
static mi_decl_cache_align _Atomic(size_t) abandoned_visited_count;
// We also maintain a count of current readers of the abandoned list
// in order to prevent resetting/decommitting segment memory if it might
// still be read.
static mi_decl_cache_align _Atomic(uintptr_t) abandoned_readers; // = 0
static mi_decl_cache_align _Atomic(size_t) abandoned_readers; // = 0
// Push on the visited list
static void mi_abandoned_visited_push(mi_segment_t* segment) {
@ -931,7 +935,7 @@ static bool mi_abandoned_visited_revisit(void)
mi_tagged_segment_t afirst;
mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned);
if (mi_tagged_segment_ptr(ts)==NULL) {
uintptr_t count = mi_atomic_load_relaxed(&abandoned_visited_count);
size_t count = mi_atomic_load_relaxed(&abandoned_visited_count);
afirst = mi_tagged_segment(first, ts);
if (mi_atomic_cas_strong_acq_rel(&abandoned, &ts, afirst)) {
mi_atomic_add_relaxed(&abandoned_count, count);
@ -950,7 +954,7 @@ static bool mi_abandoned_visited_revisit(void)
// and atomically prepend to the abandoned list
// (no need to increase the readers as we don't access the abandoned segments)
mi_tagged_segment_t anext = mi_atomic_load_relaxed(&abandoned);
uintptr_t count;
size_t count;
do {
count = mi_atomic_load_relaxed(&abandoned_visited_count);
mi_atomic_store_ptr_release(mi_segment_t, &last->abandoned_next, mi_tagged_segment_ptr(anext));
@ -978,7 +982,7 @@ static void mi_abandoned_push(mi_segment_t* segment) {
// Wait until there are no more pending reads on segments that used to be in the abandoned list
void _mi_abandoned_await_readers(void) {
uintptr_t n;
size_t n;
do {
n = mi_atomic_load_acquire(&abandoned_readers);
if (n != 0) mi_atomic_yield();
@ -1326,7 +1330,7 @@ void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block
// claim it and free
mi_heap_t* heap = mi_heap_get_default(); // issue #221; don't use the internal get_default_heap as we need to ensure the thread is initialized.
// paranoia: if this it the last reference, the cas should always succeed
uintptr_t expected_tid = 0;
size_t expected_tid = 0;
if (mi_atomic_cas_strong_acq_rel(&segment->thread_id, &expected_tid, heap->thread_id)) {
mi_block_set_next(page, block, page->free);
page->free = block;

View file

@ -133,25 +133,29 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) {
// unit == 0: count as decimal
// unit < 0 : count in binary
static void mi_printf_amount(int64_t n, int64_t unit, mi_output_fun* out, void* arg, const char* fmt) {
char buf[32];
char buf[32]; buf[0] = 0;
int len = 32;
const char* suffix = (unit <= 0 ? " " : "b");
const char* suffix = (unit <= 0 ? " " : "B");
const int64_t base = (unit == 0 ? 1000 : 1024);
if (unit>0) n *= unit;
const int64_t pos = (n < 0 ? -n : n);
if (pos < base) {
snprintf(buf, len, "%d %s ", (int)n, suffix);
if (n!=1 || suffix[0] != 'B') { // skip printing 1 B for the unit column
snprintf(buf, len, "%d %-3s", (int)n, (n==0 ? "" : suffix));
}
}
else {
int64_t divider = base;
const char* magnitude = "k";
if (pos >= divider*base) { divider *= base; magnitude = "m"; }
if (pos >= divider*base) { divider *= base; magnitude = "g"; }
int64_t divider = base;
const char* magnitude = "K";
if (pos >= divider*base) { divider *= base; magnitude = "M"; }
if (pos >= divider*base) { divider *= base; magnitude = "G"; }
const int64_t tens = (n / (divider/10));
const long whole = (long)(tens/10);
const long frac1 = (long)(tens%10);
snprintf(buf, len, "%ld.%ld %s%s", whole, (frac1 < 0 ? -frac1 : frac1), magnitude, suffix);
char unitdesc[8];
snprintf(unitdesc, 8, "%s%s%s", magnitude, (base==1024 ? "i" : ""), suffix);
snprintf(buf, len, "%ld.%ld %-3s", whole, (frac1 < 0 ? -frac1 : frac1), unitdesc);
}
_mi_fprintf(out, arg, (fmt==NULL ? "%11s" : fmt), buf);
}
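
As a worked example of the new formatting (binary unit, so base 1024): 1,572,864 bytes passes the first threshold but not the second, giving divider = 1024*1024 and magnitude "M"; then tens = 1572864/104857 = 15, so whole = 1 and frac1 = 5, and the value prints as "1.5 MiB" because base == 1024 inserts the "i" into the unit description.
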
@ -221,7 +225,7 @@ static void mi_stat_counter_print_avg(const mi_stat_counter_t* stat, const char*
static void mi_print_header(mi_output_fun* out, void* arg ) {
_mi_fprintf(out, arg, "%10s: %10s %10s %10s %10s %10s %10s\n", "heap stats", "peak ", "total ", "freed ", "current ", "unit ", "count ");
_mi_fprintf(out, arg, "%10s: %10s %10s %10s %10s %10s %10s\n", "heap stats", "peak ", "total ", "freed ", "current ", "unit ", "count ");
}
#if MI_STAT>1
@ -323,7 +327,7 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0)
mi_stat_counter_print(&stats->commit_calls, "commits", out, arg);
mi_stat_print(&stats->threads, "threads", -1, out, arg);
mi_stat_counter_print_avg(&stats->searches, "searches", out, arg);
_mi_fprintf(out, arg, "%10s: %7i\n", "numa nodes", _mi_os_numa_node_count());
_mi_fprintf(out, arg, "%10s: %7zu\n", "numa nodes", _mi_os_numa_node_count());
mi_msecs_t elapsed;
mi_msecs_t user_time;
@ -412,10 +416,14 @@ mi_msecs_t _mi_clock_now(void) {
}
#else
#include <time.h>
#ifdef CLOCK_REALTIME
#if defined(CLOCK_REALTIME) || defined(CLOCK_MONOTONIC)
mi_msecs_t _mi_clock_now(void) {
struct timespec t;
#ifdef CLOCK_MONOTONIC
clock_gettime(CLOCK_MONOTONIC, &t);
#else
clock_gettime(CLOCK_REALTIME, &t);
#endif
return ((mi_msecs_t)t.tv_sec * 1000) + ((mi_msecs_t)t.tv_nsec / 1000000);
}
#else
@ -479,7 +487,7 @@ static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msec
*page_faults = (size_t)info.PageFaultCount;
}
#elif defined(__unix__) || defined(__unix) || defined(unix) || defined(__APPLE__) || defined(__HAIKU__)
#elif !defined(__wasi__) && (defined(__unix__) || defined(__unix) || defined(unix) || defined(__APPLE__) || defined(__HAIKU__))
#include <stdio.h>
#include <unistd.h>
#include <sys/resource.h>
@ -520,6 +528,7 @@ static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msec
while (get_next_area_info(tid.team, &c, &mem) == B_OK) {
*peak_rss += mem.ram_size;
}
*page_faults = 0;
#elif defined(__APPLE__)
*peak_rss = rusage.ru_maxrss; // BSD reports in bytes
struct mach_task_basic_info info;