Merge branch 'dev' into dev-steal

2025-08-24 00:04:48 +03:00 · 2024-11-18 11:05:31 -08:00 · 2024-11-18 11:05:31 -08:00 · edc7ddd37c
commit edc7ddd37c
parent a7e7cbac89 9cae0d31cd
36 changed files with 1046 additions and 436 deletions
--- a/src/alloc-aligned.c
+++ b/src/alloc-aligned.c
@ -20,14 +20,36 @@ static bool mi_malloc_is_naturally_aligned( size_t size, size_t alignment ) {
  mi_assert_internal(_mi_is_power_of_two(alignment) && (alignment > 0));
  if (alignment > size) return false;
  if (alignment <= MI_MAX_ALIGN_SIZE) return true;
-  #if MI_DEBUG_GUARDED
-  return false;
-  #else
  const size_t bsize = mi_good_size(size);
  return (bsize <= MI_MAX_ALIGN_GUARANTEE && (bsize & (alignment-1)) == 0);
-  #endif
 }

+#if MI_GUARDED
+static mi_decl_restrict void* mi_heap_malloc_guarded_aligned(mi_heap_t* heap, size_t size, size_t alignment, bool zero) mi_attr_noexcept {
+  // use over allocation for guarded blocksl
+  mi_assert_internal(alignment > 0 && alignment < MI_BLOCK_ALIGNMENT_MAX);
+  const size_t oversize = size + alignment - 1;
+  void* base = _mi_heap_malloc_guarded(heap, oversize, zero);
+  void* p = mi_align_up_ptr(base, alignment);
+  mi_track_align(base, p, (uint8_t*)p - (uint8_t*)base, size);
+  mi_assert_internal(mi_usable_size(p) >= size);
+  mi_assert_internal(_mi_is_aligned(p, alignment));
+  return p;
+}
+
+static void* mi_heap_malloc_zero_no_guarded(mi_heap_t* heap, size_t size, bool zero) {
+  const size_t rate = heap->guarded_sample_rate;
+  heap->guarded_sample_rate = 0;
+  void* p = _mi_heap_malloc_zero(heap, size, zero);
+  heap->guarded_sample_rate = rate;
+  return p;
+}
+#else
+static void* mi_heap_malloc_zero_no_guarded(mi_heap_t* heap, size_t size, bool zero) {
+  return _mi_heap_malloc_zero(heap, size, zero);
+}
+#endif
+
 // Fallback aligned allocation that over-allocates -- split out for better codegen
 static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_overalloc(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept
 {
@ -48,6 +70,7 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_overalloc(mi_heap_t
      return NULL;
    }
    oversize = (size <= MI_SMALL_SIZE_MAX ? MI_SMALL_SIZE_MAX + 1 /* ensure we use generic malloc path */ : size);
+    // note: no guarded as alignment > 0
    p = _mi_heap_malloc_zero_ex(heap, oversize, false, alignment); // the page block size should be large enough to align in the single huge page block
    // zero afterwards as only the area from the aligned_p may be committed!
    if (p == NULL) return NULL;
@ -55,11 +78,11 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_overalloc(mi_heap_t
  else {
    // otherwise over-allocate
    oversize = size + alignment - 1;
-    p = _mi_heap_malloc_zero(heap, oversize, zero);
+    p = mi_heap_malloc_zero_no_guarded(heap, oversize, zero);
    if (p == NULL) return NULL;
  }
  mi_page_t* page = _mi_ptr_page(p);
-  
+
  // .. and align within the allocation
  const uintptr_t align_mask = alignment - 1;  // for any x, `(x & align_mask) == (x % alignment)`
  const uintptr_t poffset = ((uintptr_t)p + offset) & align_mask;
@ -68,6 +91,13 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_overalloc(mi_heap_t
  void* aligned_p = (void*)((uintptr_t)p + adjust);
  if (aligned_p != p) {
    mi_page_set_has_aligned(page, true);
+    #if MI_GUARDED
+    // set tag to aligned so mi_usable_size works with guard pages
+    if (adjust >= sizeof(mi_block_t)) {
+      mi_block_t* const block = (mi_block_t*)p;
+      block->next = MI_BLOCK_TAG_ALIGNED;
+    }
+    #endif
    _mi_padding_shrink(page, (mi_block_t*)p, adjust + size);
  }
  // todo: expand padding if overallocated ?
@ -76,8 +106,10 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_overalloc(mi_heap_t
  mi_assert_internal(((uintptr_t)aligned_p + offset) % alignment == 0);
  mi_assert_internal(mi_usable_size(aligned_p)>=size);
  mi_assert_internal(mi_usable_size(p) == mi_usable_size(aligned_p)+adjust);
-  #if !MI_DEBUG_GUARDED
-  mi_assert_internal(p == _mi_page_ptr_unalign(_mi_ptr_page(aligned_p), aligned_p));
+  #if MI_DEBUG > 1
+  mi_page_t* const apage = _mi_ptr_page(aligned_p);
+  void* unalign_p = _mi_page_ptr_unalign(apage, aligned_p);
+  mi_assert_internal(p == unalign_p);
  #endif

  // now zero the block if needed
@ -91,6 +123,9 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_overalloc(mi_heap_t

  if (p != aligned_p) {
    mi_track_align(p,aligned_p,adjust,mi_usable_size(aligned_p));
+    #if MI_GUARDED
+    mi_track_mem_defined(p, sizeof(mi_block_t));
+    #endif
  }
  return aligned_p;
 }
@ -100,27 +135,27 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_generic(mi_heap_t*
 {
  mi_assert_internal(alignment != 0 && _mi_is_power_of_two(alignment));
  // we don't allocate more than MI_MAX_ALLOC_SIZE (see <https://sourceware.org/ml/libc-announce/2019/msg00001.html>)
-  if mi_unlikely(size > (MI_MAX_ALLOC_SIZE - MI_PADDING_SIZE)) { 
+  if mi_unlikely(size > (MI_MAX_ALLOC_SIZE - MI_PADDING_SIZE)) {
    #if MI_DEBUG > 0
    _mi_error_message(EOVERFLOW, "aligned allocation request is too large (size %zu, alignment %zu)\n", size, alignment);
    #endif
    return NULL;
  }
-  
+
  // use regular allocation if it is guaranteed to fit the alignment constraints.
  // this is important to try as the fast path in `mi_heap_malloc_zero_aligned` only works when there exist
  // a page with the right block size, and if we always use the over-alloc fallback that would never happen.
  if (offset == 0 && mi_malloc_is_naturally_aligned(size,alignment)) {
-    void* p = _mi_heap_malloc_zero(heap, size, zero);
+    void* p = mi_heap_malloc_zero_no_guarded(heap, size, zero);
    mi_assert_internal(p == NULL || ((uintptr_t)p % alignment) == 0);
-    const bool is_aligned_or_null = (((uintptr_t)p) & (alignment-1))==0;  
+    const bool is_aligned_or_null = (((uintptr_t)p) & (alignment-1))==0;
    if mi_likely(is_aligned_or_null) {
      return p;
    }
    else {
      // this should never happen if the `mi_malloc_is_naturally_aligned` check is correct..
      mi_assert(false);
-      mi_free(p); 
+      mi_free(p);
    }
  }

@ -128,6 +163,7 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_generic(mi_heap_t*
  return mi_heap_malloc_zero_aligned_at_overalloc(heap,size,alignment,offset,zero);
 }

+
 // Primitive aligned allocation
 static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept
 {
@ -138,12 +174,17 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t
    #endif
    return NULL;
  }
-  
-  #if !MI_DEBUG_GUARDED
+
+  #if MI_GUARDED
+  if (offset==0 && alignment < MI_BLOCK_ALIGNMENT_MAX && mi_heap_malloc_use_guarded(heap,size)) {
+    return mi_heap_malloc_guarded_aligned(heap, size, alignment, zero);
+  }
+  #endif
+
  // try first if there happens to be a small block available with just the right alignment
  if mi_likely(size <= MI_SMALL_SIZE_MAX && alignment <= size) {
    const uintptr_t align_mask = alignment-1;       // for any x, `(x & align_mask) == (x % alignment)`
-    const size_t padsize = size + MI_PADDING_SIZE;  
+    const size_t padsize = size + MI_PADDING_SIZE;
    mi_page_t* page = _mi_heap_get_free_small_page(heap, padsize);
    if mi_likely(page->free != NULL) {
      const bool is_aligned = (((uintptr_t)page->free + offset) & align_mask)==0;
@ -160,7 +201,6 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t
      }
    }
  }
-  #endif

  // fallback to generic aligned allocation
  return mi_heap_malloc_zero_aligned_at_generic(heap, size, alignment, offset, zero);
@ -313,3 +353,5 @@ mi_decl_nodiscard void* mi_recalloc_aligned_at(void* p, size_t newcount, size_t
 mi_decl_nodiscard void* mi_recalloc_aligned(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept {
  return mi_heap_recalloc_aligned(mi_prim_get_default_heap(), p, newcount, size, alignment);
 }
+
+
--- a/src/alloc-override.c
+++ b/src/alloc-override.c
@ -289,8 +289,8 @@ mi_decl_weak int reallocarr(void* p, size_t count, size_t size)    { return mi_r
  void  __libc_free(void* p)                            MI_FORWARD0(mi_free, p)
  void* __libc_memalign(size_t alignment, size_t size)  { return mi_memalign(alignment, size); }

-#elif defined(__GLIBC__) && defined(__linux__)
-  // forward __libc interface (needed for glibc-based Linux distributions)
+#elif defined(__linux__)
+  // forward __libc interface (needed for glibc-based and musl-based Linux distributions)
  void* __libc_malloc(size_t size)                      MI_FORWARD1(mi_malloc,size)
  void* __libc_calloc(size_t count, size_t size)        MI_FORWARD2(mi_calloc,count,size)
  void* __libc_realloc(void* p, size_t size)            MI_FORWARD2(mi_realloc,p,size)
--- a/src/alloc.c
+++ b/src/alloc.c
@ -31,22 +31,22 @@ terms of the MIT license. A copy of the license can be found in the file
 extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept
 {
  mi_assert_internal(page->block_size == 0 /* empty heap */ || mi_page_block_size(page) >= size);
-  
+
  // check the free list
  mi_block_t* const block = page->free;
  if mi_unlikely(block == NULL) {
    return _mi_malloc_generic(heap, size, zero, 0);
  }
  mi_assert_internal(block != NULL && _mi_ptr_page(block) == page);
-  
+
  // pop from the free list
  page->free = mi_block_next(page, block);
  page->used++;
  mi_assert_internal(page->free == NULL || _mi_ptr_page(page->free) == page);
  mi_assert_internal(page->block_size < MI_MAX_ALIGN_SIZE || _mi_is_aligned(block, MI_MAX_ALIGN_SIZE));
-  
+
  #if MI_DEBUG>3
-  if (page->free_is_zero && size > sizeof(*block)) { 
+  if (page->free_is_zero && size > sizeof(*block)) {
    mi_assert_expensive(mi_mem_is_zero(block+1,size - sizeof(*block)));
  }
  #endif
@ -99,7 +99,7 @@ extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_
    mi_assert_internal(delta >= 0 && mi_page_usable_block_size(page) >= (size - MI_PADDING_SIZE + delta));
    #endif
    mi_track_mem_defined(padding,sizeof(mi_padding_t));  // note: re-enable since mi_page_usable_block_size may set noaccess
-    padding->canary = (uint32_t)(mi_ptr_encode(page,block,page->keys));
+    padding->canary = mi_ptr_encode_canary(page,block,page->keys);
    padding->delta  = (uint32_t)(delta);
    #if MI_PADDING_CHECK
    if (!mi_page_is_huge(page)) {
@ -121,10 +121,8 @@ extern void* _mi_page_malloc_zeroed(mi_heap_t* heap, mi_page_t* page, size_t siz
  return _mi_page_malloc_zero(heap,page,size,true);
 }

-#if MI_DEBUG_GUARDED
-static mi_decl_restrict void* mi_heap_malloc_guarded(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept;
-static inline bool mi_heap_malloc_use_guarded(size_t size, bool has_huge_alignment);
-static inline bool mi_heap_malloc_small_use_guarded(size_t size);
+#if MI_GUARDED
+mi_decl_restrict void* _mi_heap_malloc_guarded(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept;
 #endif

 static inline mi_decl_restrict void* mi_heap_malloc_small_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept {
@ -134,11 +132,13 @@ static inline mi_decl_restrict void* mi_heap_malloc_small_zero(mi_heap_t* heap,
  const uintptr_t tid = _mi_thread_id();
  mi_assert(heap->thread_id == 0 || heap->thread_id == tid); // heaps are thread local
  #endif
-  #if (MI_PADDING || MI_DEBUG_GUARDED)
+  #if (MI_PADDING || MI_GUARDED)
  if (size == 0) { size = sizeof(void*); }
  #endif
-  #if MI_DEBUG_GUARDED
-  if (mi_heap_malloc_small_use_guarded(size)) { return mi_heap_malloc_guarded(heap, size, zero); }
+  #if MI_GUARDED
+  if (mi_heap_malloc_use_guarded(heap,size)) {
+    return _mi_heap_malloc_guarded(heap, size, zero);
+  }
  #endif

  // get page in constant time, and allocate from it
@ -171,13 +171,15 @@ mi_decl_nodiscard extern inline mi_decl_restrict void* mi_malloc_small(size_t si

 // The main allocation function
 extern inline void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept {
-  // fast path for small objects 
+  // fast path for small objects
  if mi_likely(size <= MI_SMALL_SIZE_MAX) {
    mi_assert_internal(huge_alignment == 0);
    return mi_heap_malloc_small_zero(heap, size, zero);
  }
-  #if MI_DEBUG_GUARDED
-  else if (mi_heap_malloc_use_guarded(size,huge_alignment>0)) { return mi_heap_malloc_guarded(heap, size, zero); }
+  #if MI_GUARDED
+  else if (huge_alignment==0 && mi_heap_malloc_use_guarded(heap,size)) {
+    return _mi_heap_malloc_guarded(heap, size, zero);
+  }
  #endif
  else {
    // regular allocation
@ -185,7 +187,7 @@ extern inline void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool z
    mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id());   // heaps are thread local
    void* const p = _mi_malloc_generic(heap, size + MI_PADDING_SIZE, zero, huge_alignment);  // note: size can overflow but it is detected in malloc_generic
    mi_track_malloc(p,size,zero);
-    
+
    #if MI_STAT>1
    if (p != NULL) {
      if (!mi_heap_is_initialized(heap)) { heap = mi_prim_get_default_heap(); }
@ -601,69 +603,73 @@ mi_decl_nodiscard void* mi_new_reallocn(void* p, size_t newcount, size_t size) {
  }
 }

-#if MI_DEBUG_GUARDED
-static inline bool mi_heap_malloc_small_use_guarded(size_t size) {
-  return (size <= (size_t)_mi_option_get_fast(mi_option_debug_guarded_max) 
-          && size >= (size_t)_mi_option_get_fast(mi_option_debug_guarded_min));
+#if MI_GUARDED
+// We always allocate a guarded allocation at an offset (`mi_page_has_aligned` will be true).
+// We then set the first word of the block to `0` for regular offset aligned allocations (in `alloc-aligned.c`)
+// and the first word to `~0` for guarded allocations to have a correct `mi_usable_size`
+
+static void* mi_block_ptr_set_guarded(mi_block_t* block, size_t obj_size) {
+  // TODO: we can still make padding work by moving it out of the guard page area
+  mi_page_t* const page = _mi_ptr_page(block);
+  mi_page_set_has_aligned(page, true);
+  block->next = MI_BLOCK_TAG_GUARDED;
+
+  // set guard page at the end of the block
+  mi_segment_t* const segment = _mi_page_segment(page);
+  const size_t block_size = mi_page_block_size(page);  // must use `block_size` to match `mi_free_local`
+  const size_t os_page_size = _mi_os_page_size();
+  mi_assert_internal(block_size >= obj_size + os_page_size + sizeof(mi_block_t));
+  if (block_size < obj_size + os_page_size + sizeof(mi_block_t)) {
+    // should never happen
+    mi_free(block);
+    return NULL;
+  }
+  uint8_t* guard_page = (uint8_t*)block + block_size - os_page_size;
+  mi_assert_internal(_mi_is_aligned(guard_page, os_page_size));
+  if (segment->allow_decommit && _mi_is_aligned(guard_page, os_page_size)) {
+    _mi_os_protect(guard_page, os_page_size);
+  }
+  else {
+    _mi_warning_message("unable to set a guard page behind an object due to pinned memory (large OS pages?) (object %p of size %zu)\n", block, block_size);
+  }
+
+  // align pointer just in front of the guard page
+  size_t offset = block_size - os_page_size - obj_size;
+  mi_assert_internal(offset > sizeof(mi_block_t));
+  if (offset > MI_BLOCK_ALIGNMENT_MAX) {
+    // give up to place it right in front of the guard page if the offset is too large for unalignment
+    offset = MI_BLOCK_ALIGNMENT_MAX;
+  }
+  void* p = (uint8_t*)block + offset;  
+  mi_track_align(block, p, offset, obj_size);
+  mi_track_mem_defined(block, sizeof(mi_block_t));
+  return p;
 }

-static inline bool mi_heap_malloc_use_guarded(size_t size, bool has_huge_alignment) {
-  return (!has_huge_alignment  // guarded pages do not work with huge aligments at the moment
-          && _mi_option_get_fast(mi_option_debug_guarded_max) > 0  // guarded must be enabled
-          && (mi_heap_malloc_small_use_guarded(size)
-              || ((mi_good_size(size) & (_mi_os_page_size() - 1)) == 0))  // page-size multiple are always guarded so we can have a correct `mi_usable_size`.
-         );
-}
-
-static mi_decl_restrict void* mi_heap_malloc_guarded(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept
+mi_decl_restrict void* _mi_heap_malloc_guarded(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept
 {
  #if defined(MI_PADDING_SIZE)
  mi_assert(MI_PADDING_SIZE==0);
  #endif
  // allocate multiple of page size ending in a guard page
-  const size_t obj_size  = _mi_align_up(size, MI_MAX_ALIGN_SIZE); // ensure minimal alignment requirement
+  // ensure minimal alignment requirement?
  const size_t os_page_size = _mi_os_page_size();
-  const size_t req_size  = _mi_align_up(obj_size + os_page_size, os_page_size);
-  void* const block = _mi_malloc_generic(heap, req_size, zero, 0 /* huge_alignment */);
+  const size_t obj_size = (mi_option_is_enabled(mi_option_guarded_precise) ? size : _mi_align_up(size, MI_MAX_ALIGN_SIZE));
+  const size_t bsize    = _mi_align_up(_mi_align_up(obj_size, MI_MAX_ALIGN_SIZE) + sizeof(mi_block_t), MI_MAX_ALIGN_SIZE);
+  const size_t req_size = _mi_align_up(bsize + os_page_size, os_page_size);
+  mi_block_t* const block = (mi_block_t*)_mi_malloc_generic(heap, req_size, zero, 0 /* huge_alignment */);
  if (block==NULL) return NULL;
-  mi_page_t* page = _mi_ptr_page(block);
-  mi_segment_t* segment = _mi_page_segment(page);
-
-  const size_t block_size = mi_page_block_size(page);  // must use `block_size` to match `mi_free_local`
-  void* const guard_page  = (uint8_t*)block + (block_size - os_page_size);
-  mi_assert_internal(_mi_is_aligned(guard_page, os_page_size));
-
-  // place block in front of the guard page
-  size_t offset = block_size - os_page_size - obj_size;
-  if (offset > MI_BLOCK_ALIGNMENT_MAX) {
-    // give up to place it right in front of the guard page if the offset is too large for unalignment
-    offset = MI_BLOCK_ALIGNMENT_MAX;
-  }
-  void* const p = (uint8_t*)block + offset;
-  mi_assert_internal(p>=block);
-
-  // set page flags
-  if (offset > 0) {
-    mi_page_set_has_aligned(page, true);
-  }
-
-  // set guard page
-  if (segment->allow_decommit) {
-    mi_page_set_has_guarded(page, true);
-    _mi_os_protect(guard_page, os_page_size);
-  }
-  else {
-    _mi_warning_message("unable to set a guard page behind an object due to pinned memory (large OS pages?) (object %p of size %zu)\n", p, size);
-  }
+  void* const p   = mi_block_ptr_set_guarded(block, obj_size);

  // stats
-  mi_track_malloc(p, size, zero);
-  #if MI_STAT>1
+  mi_track_malloc(p, size, zero);  
  if (p != NULL) {
    if (!mi_heap_is_initialized(heap)) { heap = mi_prim_get_default_heap(); }
+    #if MI_STAT>1
    mi_heap_stat_increase(heap, malloc, mi_usable_size(p));
+    #endif
+    _mi_stat_counter_increase(&heap->tld->stats.guarded_alloc_count, 1);
  }
-  #endif
  #if MI_DEBUG>3
  if (p != NULL && zero) {
    mi_assert_expensive(mi_mem_is_zero(p, size));
--- a/src/arena-abandon.c
+++ b/src/arena-abandon.c
@ -148,7 +148,7 @@ static void mi_arena_segment_os_mark_abandoned(mi_segment_t* segment) {
 void _mi_arena_segment_mark_abandoned(mi_segment_t* segment)
 {
  mi_assert_internal(segment->used == segment->abandoned);
-  mi_atomic_store_release(&segment->thread_id, 0);  // mark as abandoned for multi-thread free's
+  mi_atomic_store_release(&segment->thread_id, (uintptr_t)0);  // mark as abandoned for multi-thread free's
  if mi_unlikely(segment->memid.memkind != MI_MEM_ARENA) {
    mi_arena_segment_os_mark_abandoned(segment);
    return;
@ -237,7 +237,7 @@ static mi_segment_t* mi_arena_segment_clear_abandoned_at(mi_arena_t* arena, mi_s
 static mi_segment_t* mi_arena_segment_clear_abandoned_next_field(mi_arena_field_cursor_t* previous) {
  const size_t max_arena = mi_arena_get_count();
  size_t field_idx = mi_bitmap_index_field(previous->bitmap_idx);
-  size_t bit_idx = mi_bitmap_index_bit_in_field(previous->bitmap_idx) + 1;
+  size_t bit_idx = mi_bitmap_index_bit_in_field(previous->bitmap_idx);
  // visit arena's (from the previous cursor)
  for (; previous->start < previous->end; previous->start++, field_idx = 0, bit_idx = 0) {
    // index wraps around
@ -266,11 +266,12 @@ static mi_segment_t* mi_arena_segment_clear_abandoned_next_field(mi_arena_field_
            // pre-check if the bit is set
            size_t mask = ((size_t)1 << bit_idx);
            if mi_unlikely((field & mask) == mask) {
-              previous->bitmap_idx = mi_bitmap_index_create(field_idx, bit_idx);
-              mi_segment_t* const segment = mi_arena_segment_clear_abandoned_at(arena, previous->subproc, previous->bitmap_idx);
+              mi_bitmap_index_t bitmap_idx = mi_bitmap_index_create(field_idx, bit_idx);
+              mi_segment_t* const segment = mi_arena_segment_clear_abandoned_at(arena, previous->subproc, bitmap_idx);
              if (segment != NULL) {
                //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx));
                if (has_lock) { mi_lock_release(&arena->abandoned_visit_lock); }
+                previous->bitmap_idx = mi_bitmap_index_create_ex(field_idx, bit_idx + 1); // start at next one for the next iteration
                return segment;
              }
            }
--- a/src/arena.c
+++ b/src/arena.c
@ -289,7 +289,7 @@ static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_no
                                       bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld )
 {
  MI_UNUSED_RELEASE(alignment);
-  mi_assert_internal(alignment <= MI_SEGMENT_ALIGN);
+  mi_assert(alignment <= MI_SEGMENT_ALIGN);
  const size_t bcount = mi_block_count_of_size(size);
  const size_t arena_index = mi_arena_id_index(arena_id);
  mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count));
--- a/src/bitmap.h
+++ b/src/bitmap.h
@ -35,9 +35,13 @@ typedef mi_bitmap_field_t*  mi_bitmap_t;
 typedef size_t mi_bitmap_index_t;

 // Create a bit index.
+static inline mi_bitmap_index_t mi_bitmap_index_create_ex(size_t idx, size_t bitidx) {
+  mi_assert_internal(bitidx <= MI_BITMAP_FIELD_BITS);
+  return (idx*MI_BITMAP_FIELD_BITS) + bitidx;
+}
 static inline mi_bitmap_index_t mi_bitmap_index_create(size_t idx, size_t bitidx) {
  mi_assert_internal(bitidx < MI_BITMAP_FIELD_BITS);
-  return (idx*MI_BITMAP_FIELD_BITS) + bitidx;
+  return mi_bitmap_index_create_ex(idx,bitidx);
 }

 // Get the field index from a bit index.
--- a/src/free.c
+++ b/src/free.c
@ -33,8 +33,8 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool
  // checks
  if mi_unlikely(mi_check_is_double_free(page, block)) return;
  mi_check_padding(page, block);
-  if (track_stats) { mi_stat_free(page, block); }  
-  #if (MI_DEBUG>0) && !MI_TRACK_ENABLED  && !MI_TSAN && !MI_DEBUG_GUARDED
+  if (track_stats) { mi_stat_free(page, block); }
+  #if (MI_DEBUG>0) && !MI_TRACK_ENABLED  && !MI_TSAN && !MI_GUARDED
  memset(block, MI_DEBUG_FREED, mi_page_block_size(page));
  #endif
  if (track_stats) { mi_track_free_size(block, mi_page_usable_size_of(page, block)); } // faster then mi_usable_size as we already know the page and that p is unaligned
@ -69,21 +69,30 @@ mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p) {
  return (mi_block_t*)((uintptr_t)p - adjust);
 }

-// forward declaration for a MI_DEBUG_GUARDED build
-static void mi_block_unguard(mi_page_t* page, mi_block_t* block);
+// forward declaration for a MI_GUARDED build
+#if MI_GUARDED
+static void mi_block_unguard(mi_page_t* page, mi_block_t* block, void* p); // forward declaration
+static inline void mi_block_check_unguard(mi_page_t* page, mi_block_t* block, void* p) {
+  if (mi_block_ptr_is_guarded(block, p)) { mi_block_unguard(page, block, p); }
+}
+#else
+static inline void mi_block_check_unguard(mi_page_t* page, mi_block_t* block, void* p) {
+  MI_UNUSED(page); MI_UNUSED(block); MI_UNUSED(p);
+}
+#endif

 // free a local pointer  (page parameter comes first for better codegen)
 static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept {
  MI_UNUSED(segment);
  mi_block_t* const block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(page, p) : (mi_block_t*)p);
-  mi_block_unguard(page,block);
+  mi_block_check_unguard(page, block, p);
  mi_free_block_local(page, block, true /* track stats */, true /* check for a full page */);
 }

 // free a pointer owned by another thread (page parameter comes first for better codegen)
 static void mi_decl_noinline mi_free_generic_mt(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept {
  mi_block_t* const block = _mi_page_ptr_unalign(page, p); // don't check `has_aligned` flag to avoid a race (issue #865)
-  mi_block_unguard(page, block);
+  mi_block_check_unguard(page, block, p);
  mi_free_block_mt(page, segment, block);
 }

@ -100,17 +109,17 @@ static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* ms
 {
  MI_UNUSED(msg);

-#if (MI_DEBUG>0)
-  if mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0) {
+  #if (MI_DEBUG>0)
+  if mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0 && !mi_option_is_enabled(mi_option_guarded_precise)) {
    _mi_error_message(EINVAL, "%s: invalid (unaligned) pointer: %p\n", msg, p);
    return NULL;
  }
-#endif
+  #endif

  mi_segment_t* const segment = _mi_ptr_segment(p);
  if mi_unlikely(segment==NULL) return segment;

-#if (MI_DEBUG>0)
+  #if (MI_DEBUG>0)
  if mi_unlikely(!mi_is_in_heap_region(p)) {
    _mi_warning_message("%s: pointer might not point to a valid heap region: %p\n"
      "(this may still be a valid very large allocation (over 64MiB))\n", msg, p);
@ -118,13 +127,13 @@ static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* ms
      _mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p);
    }
  }
-#endif
-#if (MI_DEBUG>0 || MI_SECURE>=4)
+  #endif
+  #if (MI_DEBUG>0 || MI_SECURE>=4)
  if mi_unlikely(_mi_ptr_cookie(segment) != segment->cookie) {
    _mi_error_message(EINVAL, "%s: pointer does not point to a valid heap space: %p\n", msg, p);
    return NULL;
  }
-#endif
+  #endif

  return segment;
 }
@ -236,11 +245,12 @@ static void mi_decl_noinline mi_free_block_delayed_mt( mi_page_t* page, mi_block
 static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_segment_t* segment, mi_block_t* block)
 {
  // first see if the segment was abandoned and if we can reclaim it into our thread
-  if (mi_option_is_enabled(mi_option_abandoned_reclaim_on_free) &&
+  if (_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0 &&
      #if MI_HUGE_PAGE_ABANDON
      segment->page_kind != MI_PAGE_HUGE &&
      #endif
-      mi_atomic_load_relaxed(&segment->thread_id) == 0)
+      mi_atomic_load_relaxed(&segment->thread_id) == 0 &&  // segment is abandoned?
+      mi_prim_get_default_heap() != (mi_heap_t*)&_mi_heap_empty) // and we did not already exit this thread (without this check, a fresh heap will be initalized (issue #944))
  {
    // the segment is abandoned, try to reclaim it into our heap
    if (_mi_segment_attempt_reclaim(mi_heap_get_default(), segment)) {
@ -296,20 +306,19 @@ static size_t mi_decl_noinline mi_page_usable_aligned_size_of(const mi_page_t* p
  const size_t size = mi_page_usable_size_of(page, block);
  const ptrdiff_t adjust = (uint8_t*)p - (uint8_t*)block;
  mi_assert_internal(adjust >= 0 && (size_t)adjust <= size);
-  return (size - adjust);
+  const size_t aligned_size = (size - adjust);
+  #if MI_GUARDED
+  if (mi_block_ptr_is_guarded(block, p)) {
+    return aligned_size - _mi_os_page_size();
+  }
+  #endif
+  return aligned_size;
 }

 static inline size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noexcept {
  const mi_segment_t* const segment = mi_checked_ptr_segment(p, msg);
  if mi_unlikely(segment==NULL) return 0;
  const mi_page_t* const page = _mi_segment_page_of(segment, p);
-  #if MI_DEBUG_GUARDED
-  if (mi_page_has_guarded(page)) {
-    const size_t bsize = mi_page_usable_aligned_size_of(page, p);
-    mi_assert_internal(bsize > _mi_os_page_size());
-    return (bsize > _mi_os_page_size() ? bsize - _mi_os_page_size() : bsize);
-  } else
-  #endif
  if mi_likely(!mi_page_has_aligned(page)) {
    const mi_block_t* block = (const mi_block_t*)p;
    return mi_page_usable_size_of(page, block);
@ -413,7 +422,7 @@ static bool mi_page_decode_padding(const mi_page_t* page, const mi_block_t* bloc
  uintptr_t keys[2];
  keys[0] = page->keys[0];
  keys[1] = page->keys[1];
-  bool ok = ((uint32_t)mi_ptr_encode(page,block,keys) == canary && *delta <= *bsize);
+  bool ok = (mi_ptr_encode_canary(page,block,keys) == canary && *delta <= *bsize);
  mi_track_mem_noaccess(padding,sizeof(mi_padding_t));
  return ok;
 }
@ -532,23 +541,21 @@ static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
 #endif


-// Remove guard page when building with MI_DEBUG_GUARDED
-#if !MI_DEBUG_GUARDED
-static void mi_block_unguard(mi_page_t* page, mi_block_t* block) {
-  MI_UNUSED(page);
-  MI_UNUSED(block);
-  // do nothing
-}
-#else
-static void mi_block_unguard(mi_page_t* page, mi_block_t* block) {
-  if (mi_page_has_guarded(page)) {
-    const size_t bsize = mi_page_block_size(page);
-    const size_t psize = _mi_os_page_size();
-    mi_assert_internal(bsize > psize);
-    mi_assert_internal(_mi_page_segment(page)->allow_decommit);
-    void* gpage = (uint8_t*)block + (bsize - psize);
-    mi_assert_internal(_mi_is_aligned(gpage, psize));
-    _mi_os_unprotect(gpage, psize);
-  }
+// Remove guard page when building with MI_GUARDED
+#if MI_GUARDED
+static void mi_block_unguard(mi_page_t* page, mi_block_t* block, void* p) {
+  MI_UNUSED(p);
+  mi_assert_internal(mi_block_ptr_is_guarded(block, p));
+  mi_assert_internal(mi_page_has_aligned(page));
+  mi_assert_internal((uint8_t*)p - (uint8_t*)block >= (ptrdiff_t)sizeof(mi_block_t));
+  mi_assert_internal(block->next == MI_BLOCK_TAG_GUARDED);
+
+  const size_t bsize = mi_page_block_size(page);
+  const size_t psize = _mi_os_page_size();
+  mi_assert_internal(bsize > psize);
+  mi_assert_internal(_mi_page_segment(page)->allow_decommit);
+  void* gpage = (uint8_t*)block + bsize - psize;
+  mi_assert_internal(_mi_is_aligned(gpage, psize));
+  _mi_os_unprotect(gpage, psize);
 }
 #endif
--- a/src/heap.c
+++ b/src/heap.c
@ -221,6 +221,7 @@ void _mi_heap_init(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id, bool
  heap->cookie  = _mi_heap_random_next(heap) | 1;
  heap->keys[0] = _mi_heap_random_next(heap);
  heap->keys[1] = _mi_heap_random_next(heap);
+  _mi_heap_guarded_init(heap);
  // push on the thread local heaps list
  heap->next = heap->tld->heaps;
  heap->tld->heaps = heap;
@ -369,8 +370,8 @@ void mi_heap_destroy(mi_heap_t* heap) {
  mi_assert(heap->no_reclaim);
  mi_assert_expensive(mi_heap_is_valid(heap));
  if (heap==NULL || !mi_heap_is_initialized(heap)) return;
-  #if MI_DEBUG_GUARDED
-  _mi_warning_message("'mi_heap_destroy' called but ignored as MI_DEBUG_GUARDED is enabled (heap at %p)\n", heap);
+  #if MI_GUARDED
+  // _mi_warning_message("'mi_heap_destroy' called but MI_GUARDED is enabled -- using `mi_heap_delete` instead (heap at %p)\n", heap);
  mi_heap_delete(heap);
  return;
  #else
@ -543,13 +544,14 @@ void _mi_heap_area_init(mi_heap_area_t* area, mi_page_t* page) {

 static void mi_get_fast_divisor(size_t divisor, uint64_t* magic, size_t* shift) {
  mi_assert_internal(divisor > 0 && divisor <= UINT32_MAX);
-  *shift = 64 - mi_clz(divisor - 1);
+  *shift = MI_INTPTR_BITS - mi_clz(divisor - 1);
  *magic = ((((uint64_t)1 << 32) * (((uint64_t)1 << *shift) - divisor)) / divisor + 1);
 }

 static size_t mi_fast_divide(size_t n, uint64_t magic, size_t shift) {
  mi_assert_internal(n <= UINT32_MAX);
-  return ((((uint64_t)n * magic) >> 32) + n) >> shift;
+  const uint64_t hi = ((uint64_t)n * magic) >> 32;
+  return (size_t)((hi + n) >> shift);
 }

 bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_block_visit_fun* visitor, void* arg) {
--- a/src/init.c
+++ b/src/init.c
@ -86,7 +86,8 @@ const mi_page_t _mi_page_empty = {
  MI_STAT_COUNT_NULL(), \
  { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
  { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
-  { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } \
+  { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
+  { 0, 0 } \
  MI_STAT_COUNT_END_NULL()

 // --------------------------------------------------------
@ -111,6 +112,9 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = {
  NULL,             // next
  false,            // can reclaim
  0,                // tag
+  #if MI_GUARDED
+  0, 0, 0, 0, 1,    // count is 1 so we never write to it (see `internal.h:mi_heap_malloc_use_guarded`)
+  #endif
  MI_SMALL_PAGES_EMPTY,
  MI_PAGE_QUEUES_EMPTY
 };
@ -151,6 +155,9 @@ mi_decl_cache_align mi_heap_t _mi_heap_main = {
  NULL,             // next heap
  false,            // can reclaim
  0,                // tag
+  #if MI_GUARDED
+  0, 0, 0, 0, 0,
+  #endif
  MI_SMALL_PAGES_EMPTY,
  MI_PAGE_QUEUES_EMPTY
 };
@ -159,6 +166,45 @@ bool _mi_process_is_initialized = false;  // set to `true` in `mi_process_init`.

 mi_stats_t _mi_stats_main = { MI_STATS_NULL };

+#if MI_GUARDED
+mi_decl_export void mi_heap_guarded_set_sample_rate(mi_heap_t* heap, size_t sample_rate, size_t seed) {
+  heap->guarded_sample_seed = seed;
+  if (heap->guarded_sample_seed == 0) { 
+    heap->guarded_sample_seed = _mi_heap_random_next(heap); 
+  }
+  heap->guarded_sample_rate  = sample_rate;
+  if (heap->guarded_sample_rate >= 1) {
+    heap->guarded_sample_seed = heap->guarded_sample_seed % heap->guarded_sample_rate;
+  }
+  heap->guarded_sample_count = heap->guarded_sample_seed;  // count down samples
+}
+
+mi_decl_export void mi_heap_guarded_set_size_bound(mi_heap_t* heap, size_t min, size_t max) {
+  heap->guarded_size_min = min;
+  heap->guarded_size_max = (min > max ? min : max);
+}
+
+void _mi_heap_guarded_init(mi_heap_t* heap) {
+  mi_heap_guarded_set_sample_rate(heap,
+    (size_t)mi_option_get_clamp(mi_option_guarded_sample_rate, 0, LONG_MAX),
+    (size_t)mi_option_get(mi_option_guarded_sample_seed));
+  mi_heap_guarded_set_size_bound(heap, 
+    (size_t)mi_option_get_clamp(mi_option_guarded_min, 0, LONG_MAX),
+    (size_t)mi_option_get_clamp(mi_option_guarded_max, 0, LONG_MAX) );  
+}
+#else
+mi_decl_export void mi_heap_guarded_set_sample_rate(mi_heap_t* heap, size_t sample_rate, size_t seed) {
+  MI_UNUSED(heap); MI_UNUSED(sample_rate); MI_UNUSED(seed);
+}
+
+mi_decl_export void mi_heap_guarded_set_size_bound(mi_heap_t* heap, size_t min, size_t max) {
+  MI_UNUSED(heap); MI_UNUSED(min); MI_UNUSED(max);
+}
+void _mi_heap_guarded_init(mi_heap_t* heap) {
+  MI_UNUSED(heap);
+}
+#endif
+

 static void mi_heap_main_init(void) {
  if (_mi_heap_main.cookie == 0) {
@ -174,6 +220,7 @@ static void mi_heap_main_init(void) {
    _mi_heap_main.keys[1] = _mi_heap_random_next(&_mi_heap_main);
    mi_lock_init(&mi_subproc_default.abandoned_os_lock);
    mi_lock_init(&mi_subproc_default.abandoned_os_visit_lock);
+    _mi_heap_guarded_init(&_mi_heap_main);
  }
 }

@ -508,54 +555,15 @@ void _mi_heap_set_default_direct(mi_heap_t* heap)  {
 // --------------------------------------------------------
 // Run functions on process init/done, and thread init/done
 // --------------------------------------------------------
-static void mi_cdecl mi_process_done(void);
-
 static bool os_preloading = true;    // true until this module is initialized
-static bool mi_redirected = false;   // true if malloc redirects to mi_malloc

 // Returns true if this module has not been initialized; Don't use C runtime routines until it returns false.
 bool mi_decl_noinline _mi_preloading(void) {
  return os_preloading;
 }

-mi_decl_nodiscard bool mi_is_redirected(void) mi_attr_noexcept {
-  return mi_redirected;
-}
-
-// Communicate with the redirection module on Windows
-#if defined(_WIN32) && defined(MI_SHARED_LIB) && !defined(MI_WIN_NOREDIRECT)
-#ifdef __cplusplus
-extern "C" {
-#endif
-mi_decl_export void _mi_redirect_entry(DWORD reason) {
-  // called on redirection; careful as this may be called before DllMain
-  if (reason == DLL_PROCESS_ATTACH) {
-    mi_redirected = true;
-  }
-  else if (reason == DLL_PROCESS_DETACH) {
-    mi_redirected = false;
-  }
-  else if (reason == DLL_THREAD_DETACH) {
-    mi_thread_done();
-  }
-}
-__declspec(dllimport) bool mi_cdecl mi_allocator_init(const char** message);
-__declspec(dllimport) void mi_cdecl mi_allocator_done(void);
-#ifdef __cplusplus
-}
-#endif
-#else
-static bool mi_allocator_init(const char** message) {
-  if (message != NULL) *message = NULL;
-  return true;
-}
-static void mi_allocator_done(void) {
-  // nothing to do
-}
-#endif
-
-// Called once by the process loader
-static void mi_process_load(void) {
+// Called once by the process loader from `src/prim/prim.c`
+void _mi_process_load(void) {
  mi_heap_main_init();
  #if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD)
  volatile mi_heap_t* dummy = _mi_heap_default; // access TLS to allocate it before setting tls_initialized to true;
@ -563,17 +571,14 @@ static void mi_process_load(void) {
  #endif
  os_preloading = false;
  mi_assert_internal(_mi_is_main_thread());
-  #if !(defined(_WIN32) && defined(MI_SHARED_LIB))  // use Dll process detach (see below) instead of atexit (issue #521)
-  atexit(&mi_process_done);
-  #endif
  _mi_options_init();
  mi_process_setup_auto_thread_done();
  mi_process_init();
-  if (mi_redirected) _mi_verbose_message("malloc is redirected.\n");
+  if (_mi_is_redirected()) _mi_verbose_message("malloc is redirected.\n");

  // show message from the redirector (if present)
  const char* msg = NULL;
-  mi_allocator_init(&msg);
+  _mi_allocator_init(&msg);
  if (msg != NULL && (mi_option_is_enabled(mi_option_verbose) || mi_option_is_enabled(mi_option_show_errors))) {
    _mi_fputs(NULL,NULL,NULL,msg);
  }
@ -594,7 +599,7 @@ static void mi_detect_cpu_features(void) {
 }
 #else
 static void mi_detect_cpu_features(void) {
-  // nothing
+  // nothing 
 }
 #endif

@ -651,7 +656,7 @@ void mi_process_init(void) mi_attr_noexcept {
 }

 // Called when the process is done (through `at_exit`)
-static void mi_cdecl mi_process_done(void) {
+void mi_cdecl _mi_process_done(void) {
  // only shutdown if we were initialized
  if (!_mi_process_is_initialized) return;
  // ensure we are called once
@ -683,64 +688,8 @@ static void mi_cdecl mi_process_done(void) {
  if (mi_option_is_enabled(mi_option_show_stats) || mi_option_is_enabled(mi_option_verbose)) {
    mi_stats_print(NULL);
  }
-  mi_allocator_done();
+  _mi_allocator_done();
  _mi_verbose_message("process done: 0x%zx\n", _mi_heap_main.thread_id);
  os_preloading = true; // don't call the C runtime anymore
 }

-
-
-#if defined(_WIN32) && defined(MI_SHARED_LIB)
-  // Windows DLL: easy to hook into process_init and thread_done
-  __declspec(dllexport) BOOL WINAPI DllMain(HINSTANCE inst, DWORD reason, LPVOID reserved) {
-    MI_UNUSED(reserved);
-    MI_UNUSED(inst);
-    if (reason==DLL_PROCESS_ATTACH) {
-      mi_process_load();
-    }
-    else if (reason==DLL_PROCESS_DETACH) {
-      mi_process_done();
-    }
-    else if (reason==DLL_THREAD_DETACH) {
-      if (!mi_is_redirected()) {
-        mi_thread_done();
-      }
-    }
-    return TRUE;
-  }
-
-#elif defined(_MSC_VER)
-  // MSVC: use data section magic for static libraries
-  // See <https://www.codeguru.com/cpp/misc/misc/applicationcontrol/article.php/c6945/Running-Code-Before-and-After-Main.htm>
-  static int _mi_process_init(void) {
-    mi_process_load();
-    return 0;
-  }
-  typedef int(*_mi_crt_callback_t)(void);
-  #if defined(_M_X64) || defined(_M_ARM64)
-    __pragma(comment(linker, "/include:" "_mi_msvc_initu"))
-    #pragma section(".CRT$XIU", long, read)
-  #else
-    __pragma(comment(linker, "/include:" "__mi_msvc_initu"))
-  #endif
-  #pragma data_seg(".CRT$XIU")
-  mi_decl_externc _mi_crt_callback_t _mi_msvc_initu[] = { &_mi_process_init };
-  #pragma data_seg()
-
-#elif defined(__cplusplus)
-  // C++: use static initialization to detect process start
-  static bool _mi_process_init(void) {
-    mi_process_load();
-    return (_mi_heap_main.thread_id != 0);
-  }
-  static bool mi_initialized = _mi_process_init();
-
-#elif defined(__GNUC__) || defined(__clang__)
-  // GCC,Clang: use the constructor attribute
-  static void __attribute__((constructor)) _mi_process_init(void) {
-    mi_process_load();
-  }
-
-#else
-#pragma message("define a way to call mi_process_load on your platform")
-#endif
--- a/src/libc.c
+++ b/src/libc.c
@ -130,7 +130,7 @@ static void mi_out_alignright(char fill, char* start, size_t len, size_t extra,
 }


-static void mi_out_num(uintptr_t x, size_t base, char prefix, char** out, char* end) 
+static void mi_out_num(uintmax_t x, size_t base, char prefix, char** out, char* end) 
 {
  if (x == 0 || base == 0 || base > 16) {
    if (prefix != 0) { mi_outc(prefix, out, end); }
@ -206,12 +206,13 @@ void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) {
      }
      else if (c == 'p' || c == 'x' || c == 'u') {
        // unsigned
-        uintptr_t x = 0;
+        uintmax_t x = 0;
        if (c == 'x' || c == 'u') {
          if (numtype == 'z')       x = va_arg(args, size_t);
          else if (numtype == 't')  x = va_arg(args, uintptr_t); // unsigned ptrdiff_t
-          else if (numtype == 'L')  x = (uintptr_t)va_arg(args, unsigned long long);
-                               else x = va_arg(args, unsigned long);
+          else if (numtype == 'L')  x = va_arg(args, unsigned long long);
+          else if (numtype == 'l')  x = va_arg(args, unsigned long);
+                               else x = va_arg(args, unsigned int);
        }
        else if (c == 'p') {
          x = va_arg(args, uintptr_t);
@ -228,20 +229,21 @@ void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) {
      }
      else if (c == 'i' || c == 'd') {
        // signed
-        intptr_t x = 0;
+        intmax_t x = 0;
        if (numtype == 'z')       x = va_arg(args, intptr_t );
        else if (numtype == 't')  x = va_arg(args, ptrdiff_t);
-        else if (numtype == 'L')  x = (intptr_t)va_arg(args, long long);
-                             else x = va_arg(args, long);
+        else if (numtype == 'L')  x = va_arg(args, long long);
+        else if (numtype == 'l')  x = va_arg(args, long);
+                             else x = va_arg(args, int);
        char pre = 0;
        if (x < 0) {
          pre = '-';
-          if (x > INTPTR_MIN) { x = -x; }
+          if (x > INTMAX_MIN) { x = -x; }
        }
        else if (numplus != 0) {
          pre = numplus;
        }
-        mi_out_num((uintptr_t)x, 10, pre, &out, end);
+        mi_out_num((uintmax_t)x, 10, pre, &out, end);
      }
      else if (c >= ' ' && c <= '~') {
        // unknown format
--- a/src/options.c
+++ b/src/options.c
@ -47,6 +47,49 @@ typedef struct mi_option_desc_s {
 #define MI_OPTION(opt)                  mi_option_##opt, #opt, NULL
 #define MI_OPTION_LEGACY(opt,legacy)    mi_option_##opt, #opt, #legacy

+// Some options can be set at build time for statically linked libraries
+// (use `-DMI_EXTRA_CPPDEFS="opt1=val1;opt2=val2"`)
+//
+// This is useful if we cannot pass them as environment variables
+// (and setting them programmatically would be too late)
+
+#ifndef MI_DEFAULT_VERBOSE
+#define MI_DEFAULT_VERBOSE 0
+#endif
+
+#ifndef MI_DEFAULT_EAGER_COMMIT
+#define MI_DEFAULT_EAGER_COMMIT 1
+#endif
+
+#ifndef MI_DEFAULT_ARENA_EAGER_COMMIT
+#define MI_DEFAULT_ARENA_EAGER_COMMIT 2
+#endif
+
+#ifndef MI_DEFAULT_ARENA_RESERVE
+ #if (MI_INTPTR_SIZE>4)
+  #define MI_DEFAULT_ARENA_RESERVE 1024L*1024L
+ #else
+  #define MI_DEFAULT_ARENA_RESERVE 128L*1024L
+ #endif
+#endif
+
+#ifndef MI_DEFAULT_DISALLOW_ARENA_ALLOC
+#define MI_DEFAULT_DISALLOW_ARENA_ALLOC 0
+#endif
+
+#ifndef MI_DEFAULT_ALLOW_LARGE_OS_PAGES
+#define MI_DEFAULT_ALLOW_LARGE_OS_PAGES 0
+#endif
+
+#ifndef MI_DEFAULT_RESERVE_HUGE_OS_PAGES
+#define MI_DEFAULT_RESERVE_HUGE_OS_PAGES 0
+#endif
+
+#ifndef MI_DEFAULT_RESERVE_OS_MEMORY
+#define MI_DEFAULT_RESERVE_OS_MEMORY 0
+#endif
+
+
 static mi_option_desc_t options[_mi_option_last] =
 {
  // stable options
@ -56,16 +99,21 @@ static mi_option_desc_t options[_mi_option_last] =
  { 0, UNINIT, MI_OPTION(show_errors) },
 #endif
  { 0, UNINIT, MI_OPTION(show_stats) },
-  { 0, UNINIT, MI_OPTION(verbose) },
+  { MI_DEFAULT_VERBOSE, UNINIT, MI_OPTION(verbose) },

-  // the following options are experimental and not all combinations make sense.
-  { 1, UNINIT, MI_OPTION(eager_commit) },               // commit per segment directly (4MiB)  (but see also `eager_commit_delay`)
-  { 2, UNINIT, MI_OPTION_LEGACY(arena_eager_commit,eager_region_commit) }, // eager commit arena's? 2 is used to enable this only on an OS that has overcommit (i.e. linux)
+  // some of the following options are experimental and not all combinations are allowed.
+  { MI_DEFAULT_EAGER_COMMIT,
+       UNINIT, MI_OPTION(eager_commit) },               // commit per segment directly (4MiB)  (but see also `eager_commit_delay`)
+  { MI_DEFAULT_ARENA_EAGER_COMMIT,
+       UNINIT, MI_OPTION_LEGACY(arena_eager_commit,eager_region_commit) }, // eager commit arena's? 2 is used to enable this only on an OS that has overcommit (i.e. linux)
  { 1, UNINIT, MI_OPTION_LEGACY(purge_decommits,reset_decommits) },        // purge decommits memory (instead of reset) (note: on linux this uses MADV_DONTNEED for decommit)
-  { 0, UNINIT, MI_OPTION_LEGACY(allow_large_os_pages,large_os_pages) },    // use large OS pages, use only with eager commit to prevent fragmentation of VMA's
-  { 0, UNINIT, MI_OPTION(reserve_huge_os_pages) },      // per 1GiB huge pages
+  { MI_DEFAULT_ALLOW_LARGE_OS_PAGES,
+       UNINIT, MI_OPTION_LEGACY(allow_large_os_pages,large_os_pages) },    // use large OS pages, use only with eager commit to prevent fragmentation of VMA's
+  { MI_DEFAULT_RESERVE_HUGE_OS_PAGES,
+       UNINIT, MI_OPTION(reserve_huge_os_pages) },      // per 1GiB huge pages
  {-1, UNINIT, MI_OPTION(reserve_huge_os_pages_at) },   // reserve huge pages at node N
-  { 0, UNINIT, MI_OPTION(reserve_os_memory)     },      // reserve N KiB OS memory in advance (use `option_get_size`)
+  { MI_DEFAULT_RESERVE_OS_MEMORY,
+       UNINIT, MI_OPTION(reserve_os_memory)     },      // reserve N KiB OS memory in advance (use `option_get_size`)
  { 0, UNINIT, MI_OPTION(deprecated_segment_cache) },   // cache N segments per thread
  { 0, UNINIT, MI_OPTION(deprecated_page_reset) },      // reset page memory on free
  { 0, UNINIT, MI_OPTION(abandoned_page_purge) },       // purge free page memory when a thread terminates
@ -83,24 +131,26 @@ static mi_option_desc_t options[_mi_option_last] =
  { 32,  UNINIT, MI_OPTION(max_warnings) },             // maximum warnings that are output
  { 10,  UNINIT, MI_OPTION(max_segment_reclaim)},       // max. percentage of the abandoned segments to be reclaimed per try.
  { 0,   UNINIT, MI_OPTION(destroy_on_exit)},           // release all OS memory on process exit; careful with dangling pointer or after-exit frees!
-  #if (MI_INTPTR_SIZE>4)
-  { 1024L*1024L, UNINIT, MI_OPTION(arena_reserve) },    // reserve memory N KiB at a time (=1GiB) (use `option_get_size`)
-  #else
-  {  128L*1024L, UNINIT, MI_OPTION(arena_reserve) },    // =128MiB on 32-bit
-  #endif
-
+  { MI_DEFAULT_ARENA_RESERVE, UNINIT, MI_OPTION(arena_reserve) }, // reserve memory N KiB at a time (=1GiB) (use `option_get_size`)
  { 10,  UNINIT, MI_OPTION(arena_purge_mult) },         // purge delay multiplier for arena's
  { 1,   UNINIT, MI_OPTION_LEGACY(purge_extend_delay, decommit_extend_delay) },
  { 1,   UNINIT, MI_OPTION(abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free
-  { 0,   UNINIT, MI_OPTION(disallow_arena_alloc) },     // 1 = do not use arena's for allocation (except if using specific arena id's)
+  { MI_DEFAULT_DISALLOW_ARENA_ALLOC,   UNINIT, MI_OPTION(disallow_arena_alloc) }, // 1 = do not use arena's for allocation (except if using specific arena id's)
  { 400, UNINIT, MI_OPTION(retry_on_oom) },             // windows only: retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries.
-#if defined(MI_VISIT_ABANDONED)  
+#if defined(MI_VISIT_ABANDONED)
  { 1,   INITIALIZED, MI_OPTION(visit_abandoned) },     // allow visiting heap blocks in abandonded segments; requires taking locks during reclaim.
 #else
-  { 0,   UNINIT, MI_OPTION(visit_abandoned) },          
+  { 0,   UNINIT, MI_OPTION(visit_abandoned) },
 #endif
-  { 0,   UNINIT, MI_OPTION(debug_guarded_min) },        // only used when building with MI_DEBUG_GUARDED: minimal rounded object size for guarded objects
-  { 0,   UNINIT, MI_OPTION(debug_guarded_max) },        // only used when building with MI_DEBUG_GUARDED: maximal rounded object size for guarded objects
+  { 0,   UNINIT, MI_OPTION(guarded_min) },              // only used when building with MI_GUARDED: minimal rounded object size for guarded objects
+  { MI_GiB, UNINIT, MI_OPTION(guarded_max) },           // only used when building with MI_GUARDED: maximal rounded object size for guarded objects
+  { 0,   UNINIT, MI_OPTION(guarded_precise) },          // disregard minimal alignment requirement to always place guarded blocks exactly in front of a guard page (=0)
+#if MI_GUARDED
+  { 4000,UNINIT, MI_OPTION(guarded_sample_rate)},       // 1 out of N allocations in the min/max range will be guarded(= 1000)
+#else
+  { 0,   UNINIT, MI_OPTION(guarded_sample_rate)},
+#endif
+  { 0,   UNINIT, MI_OPTION(guarded_sample_seed)},
 };

 static void mi_option_init(mi_option_desc_t* desc);
@ -123,25 +173,25 @@ void _mi_options_init(void) {
  }
  mi_max_error_count = mi_option_get(mi_option_max_errors);
  mi_max_warning_count = mi_option_get(mi_option_max_warnings);
-  #if MI_DEBUG_GUARDED
-  if (mi_option_get(mi_option_debug_guarded_max) > 0) {
+  #if MI_GUARDED
+  if (mi_option_get(mi_option_guarded_sample_rate) > 0) {
    if (mi_option_is_enabled(mi_option_allow_large_os_pages)) {
      mi_option_disable(mi_option_allow_large_os_pages);
      _mi_warning_message("option 'allow_large_os_pages' is disabled to allow for guarded objects\n");
    }
  }
-  _mi_verbose_message("guarded build: %s\n", mi_option_get(mi_option_debug_guarded_max) > 0 ? "enabled" : "disabled");
+  _mi_verbose_message("guarded build: %s\n", mi_option_get(mi_option_guarded_max) > 0 ? "enabled" : "disabled");
  #endif
 }

 long _mi_option_get_fast(mi_option_t option) {
  mi_assert(option >= 0 && option < _mi_option_last);
-  mi_option_desc_t* desc = &options[option]; 
+  mi_option_desc_t* desc = &options[option];
  mi_assert(desc->option == option);  // index should match the option
  //mi_assert(desc->init != UNINIT);
  return desc->value;
 }
-  
+

 mi_decl_nodiscard long mi_option_get(mi_option_t option) {
  mi_assert(option >= 0 && option < _mi_option_last);
@ -160,7 +210,6 @@ mi_decl_nodiscard long mi_option_get_clamp(mi_option_t option, long min, long ma
 }

 mi_decl_nodiscard size_t mi_option_get_size(mi_option_t option) {
-  mi_assert_internal(mi_option_has_size_in_kib(option));
  const long x = mi_option_get(option);
  size_t size = (x < 0 ? 0 : (size_t)x);
  if (mi_option_has_size_in_kib(option)) {
@ -177,11 +226,11 @@ void mi_option_set(mi_option_t option, long value) {
  desc->value = value;
  desc->init = INITIALIZED;
  // ensure min/max range; be careful to not recurse.
-  if (desc->option == mi_option_debug_guarded_min && _mi_option_get_fast(mi_option_debug_guarded_max) < value) {
-    mi_option_set(mi_option_debug_guarded_max, value);
+  if (desc->option == mi_option_guarded_min && _mi_option_get_fast(mi_option_guarded_max) < value) {
+    mi_option_set(mi_option_guarded_max, value);
  }
-  else if (desc->option == mi_option_debug_guarded_max && _mi_option_get_fast(mi_option_debug_guarded_min) > value) {
-    mi_option_set(mi_option_debug_guarded_min, value);
+  else if (desc->option == mi_option_guarded_max && _mi_option_get_fast(mi_option_guarded_min) > value) {
+    mi_option_set(mi_option_guarded_min, value);
  }
 }

@ -517,7 +566,7 @@ static void mi_option_init(mi_option_desc_t* desc) {
      char* end = buf;
      long value = strtol(buf, &end, 10);
      if (mi_option_has_size_in_kib(desc->option)) {
-        // this option is interpreted in KiB to prevent overflow of `long` for large allocations 
+        // this option is interpreted in KiB to prevent overflow of `long` for large allocations
        // (long is 32-bit on 64-bit windows, which allows for 4TiB max.)
        size_t size = (value < 0 ? 0 : (size_t)value);
        bool overflow = false;
@ -532,7 +581,7 @@ static void mi_option_init(mi_option_desc_t* desc) {
        value = (size > LONG_MAX ? LONG_MAX : (long)size);
      }
      if (*end == 0) {
-        mi_option_set(desc->option, value);        
+        mi_option_set(desc->option, value);
      }
      else {
        // set `init` first to avoid recursion through _mi_warning_message on mimalloc_verbose.
--- a/src/os.c
+++ b/src/os.c
@ -11,16 +11,33 @@ terms of the MIT license. A copy of the license can be found in the file


 /* -----------------------------------------------------------
-  Initialization. 
+  Initialization.
 ----------------------------------------------------------- */
+#ifndef MI_DEFAULT_VIRTUAL_ADDRESS_BITS
+#if MI_INTPTR_SIZE < 8
+#define MI_DEFAULT_VIRTUAL_ADDRESS_BITS   32
+#else
+#define MI_DEFAULT_VIRTUAL_ADDRESS_BITS   48
+#endif
+#endif
+
+#ifndef MI_DEFAULT_PHYSICAL_MEMORY
+#if MI_INTPTR_SIZE < 8
+#define MI_DEFAULT_PHYSICAL_MEMORY    4*MI_GiB
+#else
+#define MI_DEFAULT_PHYSICAL_MEMORY    32*MI_GiB
+#endif
+#endif

 static mi_os_mem_config_t mi_os_mem_config = {
-  4096,   // page size
-  0,      // large page size (usually 2MiB)
-  4096,   // allocation granularity
-  true,   // has overcommit?  (if true we use MAP_NORESERVE on mmap systems)
-  false,  // can we partially free allocated blocks? (on mmap systems we can free anywhere in a mapped range, but on Windows we must free the entire span)
-  true    // has virtual reserve? (if true we can reserve virtual address space without using commit or physical memory)
+  4096,     // page size
+  0,        // large page size (usually 2MiB)
+  4096,     // allocation granularity
+  MI_DEFAULT_PHYSICAL_MEMORY,
+  MI_DEFAULT_VIRTUAL_ADDRESS_BITS,
+  true,     // has overcommit?  (if true we use MAP_NORESERVE on mmap systems)
+  false,    // can we partially free allocated blocks? (on mmap systems we can free anywhere in a mapped range, but on Windows we must free the entire span)
+  true      // has virtual reserve? (if true we can reserve virtual address space without using commit or physical memory)
 };

 bool _mi_os_has_overcommit(void) {
@ -91,9 +108,10 @@ static void* mi_align_down_ptr(void* p, size_t alignment) {
  aligned hinting
 -------------------------------------------------------------- */

-// On 64-bit systems, we can do efficient aligned allocation by using
-// the 2TiB to 30TiB area to allocate those.
-#if (MI_INTPTR_SIZE >= 8)
+// On systems with enough virtual address bits, we can do efficient aligned allocation by using
+// the 2TiB to 30TiB area to allocate those. If we have at least 46 bits of virtual address
+// space (64TiB) we use this technique. (but see issue #939)
+#if (MI_INTPTR_SIZE >= 8) && !defined(MI_NO_ALIGNED_HINT)
 static mi_decl_cache_align _Atomic(uintptr_t)aligned_base;

 // Return a MI_SEGMENT_SIZE aligned address that is probably available.
@ -110,6 +128,7 @@ static mi_decl_cache_align _Atomic(uintptr_t)aligned_base;
 void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size)
 {
  if (try_alignment <= 1 || try_alignment > MI_SEGMENT_SIZE) return NULL;
+  if (mi_os_mem_config.virtual_address_bits < 46) return NULL;  // < 64TiB virtual address space
  size = _mi_align_up(size, MI_SEGMENT_SIZE);
  if (size > 1*MI_GiB) return NULL;  // guarantee the chance of fixed valid address is at most 1/(MI_HINT_AREA / 1<<30) = 1/4096.
  #if (MI_SECURE>0)
@ -195,7 +214,8 @@ void  _mi_os_free(void* p, size_t size, mi_memid_t memid, mi_stats_t* stats) {
 -------------------------------------------------------------- */

 // Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned.
-static void* mi_os_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, mi_stats_t* tld_stats) {
+// Also `hint_addr` is a hint and may be ignored.
+static void* mi_os_prim_alloc_at(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, mi_stats_t* tld_stats) {
  mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0);
  mi_assert_internal(is_zero != NULL);
  mi_assert_internal(is_large != NULL);
@ -204,9 +224,9 @@ static void* mi_os_prim_alloc(size_t size, size_t try_alignment, bool commit, bo
  if (try_alignment == 0) { try_alignment = 1; } // avoid 0 to ensure there will be no divide by zero when aligning
  *is_zero = false;
  void* p = NULL;
-  int err = _mi_prim_alloc(size, try_alignment, commit, allow_large, is_large, is_zero, &p);
+  int err = _mi_prim_alloc(hint_addr, size, try_alignment, commit, allow_large, is_large, is_zero, &p);
  if (err != 0) {
-    _mi_warning_message("unable to allocate OS memory (error: %d (0x%x), size: 0x%zx bytes, align: 0x%zx, commit: %d, allow large: %d)\n", err, err, size, try_alignment, commit, allow_large);
+    _mi_warning_message("unable to allocate OS memory (error: %d (0x%x), addr: %p, size: 0x%zx bytes, align: 0x%zx, commit: %d, allow large: %d)\n", err, err, hint_addr, size, try_alignment, commit, allow_large);
  }

  MI_UNUSED(tld_stats);
@ -226,6 +246,10 @@ static void* mi_os_prim_alloc(size_t size, size_t try_alignment, bool commit, bo
  return p;
 }

+static void* mi_os_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, mi_stats_t* tld_stats) {
+  return mi_os_prim_alloc_at(NULL, size, try_alignment, commit, allow_large, is_large, is_zero, tld_stats);
+}
+

 // Primitive aligned allocation from the OS.
 // This function guarantees the allocated memory is aligned.
@ -239,7 +263,7 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit
  if (!(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0))) return NULL;
  size = _mi_align_up(size, _mi_os_page_size());

-  // try first with a hint (this will be aligned directly on Win 10+ or BSD)
+  // try first with a requested alignment hint (this will usually be aligned directly on Win 10+ or BSD)
  void* p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero, stats);
  if (p == NULL) return NULL;

@ -249,7 +273,9 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit
  }
  else {
    // if not aligned, free it, overallocate, and unmap around it
+    #if !MI_TRACK_ASAN
    _mi_warning_message("unable to allocate aligned OS memory directly, fall back to over-allocation (size: 0x%zx bytes, address: %p, alignment: 0x%zx, commit: %d)\n", size, p, alignment, commit);
+    #endif
    mi_os_prim_free(p, size, commit, stats);
    if (size >= (SIZE_MAX - alignment)) return NULL; // overflow
    const size_t over_size = size + alignment;
@ -275,7 +301,7 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit
      p = mi_os_prim_alloc(over_size, 1, commit, false, is_large, is_zero, stats);
      if (p == NULL) return NULL;

-      // and selectively unmap parts around the over-allocated area. 
+      // and selectively unmap parts around the over-allocated area.
      void* aligned_p = mi_align_up_ptr(p, alignment);
      size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p;
      size_t mid_size = _mi_align_up(size, _mi_os_page_size());
--- a/src/page.c
+++ b/src/page.c
@ -414,9 +414,6 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) {

  // no more aligned blocks in here
  mi_page_set_has_aligned(page, false);
-  #if MI_DEBUG_GUARDED
-  mi_page_set_has_guarded(page, false);
-  #endif

  // remove from the page list
  // (no need to do _mi_heap_delayed_free first as all blocks are already free)
@ -443,9 +440,6 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept {
  mi_assert_internal(mi_page_all_free(page));

  mi_page_set_has_aligned(page, false);
-  #if MI_DEBUG_GUARDED
-  mi_page_set_has_guarded(page, false);
-  #endif

  // don't retire too often..
  // (or we end up retiring and re-allocating most of the time)
--- a/src/prim/emscripten/prim.c
+++ b/src/prim/emscripten/prim.c
@ -71,8 +71,8 @@ int _mi_prim_free(void* addr, size_t size) {
 extern void* emmalloc_memalign(size_t alignment, size_t size);

 // Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned.
-int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) {
-  MI_UNUSED(try_alignment); MI_UNUSED(allow_large); MI_UNUSED(commit);
+int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) {
+  MI_UNUSED(try_alignment); MI_UNUSED(allow_large); MI_UNUSED(commit); MI_UNUSED(hint_addr);
  *is_large = false;
  // TODO: Track the highest address ever seen; first uses of it are zeroes.
  //       That assumes no one else uses sbrk but us (they could go up,
--- a/src/prim/osx/alloc-override-zone.c
+++ b/src/prim/osx/alloc-override-zone.c
@ -418,9 +418,9 @@ static inline malloc_zone_t* mi_get_default_zone(void)
 }

 #if defined(__clang__)
-__attribute__((constructor(0)))
+__attribute__((constructor(101))) // highest priority
 #else
-__attribute__((constructor))      // seems not supported by g++-11 on the M1
+__attribute__((constructor))      // priority level is not supported by gcc
 #endif
 __attribute__((used))
 static void _mi_macos_override_malloc(void) {
--- a/src/prim/prim.c
+++ b/src/prim/prim.c
@ -25,3 +25,52 @@ terms of the MIT license. A copy of the license can be found in the file
 #include "unix/prim.c"     // mmap() (Linux, macOSX, BSD, Illumnos, Haiku, DragonFly, etc.)

 #endif
+
+// Generic process initialization
+#ifndef MI_PRIM_HAS_PROCESS_ATTACH
+#if defined(__GNUC__) || defined(__clang__)
+  // gcc,clang: use the constructor/destructor attribute
+  // which for both seem to run before regular constructors/destructors
+  #if defined(__clang__)
+    #define mi_attr_constructor __attribute__((constructor(101)))
+    #define mi_attr_destructor  __attribute__((destructor(101)))
+  #else
+    #define mi_attr_constructor __attribute__((constructor))
+    #define mi_attr_destructor  __attribute__((destructor))
+  #endif
+  static void mi_attr_constructor mi_process_attach(void) {
+    _mi_process_load();
+  }
+  static void mi_attr_destructor mi_process_detach(void) {
+    _mi_process_done();
+  }
+#elif defined(__cplusplus)
+  // C++: use static initialization to detect process start/end
+  // This is not guaranteed to be first/last but the best we can generally do?
+  struct mi_init_done_t {
+    mi_init_done_t() {
+      _mi_process_load();
+    }
+    ~mi_init_done_t() {
+      _mi_process_done();
+    }
+  };
+  static mi_init_done_t mi_init_done;
+ #else
+  #pragma message("define a way to call _mi_process_load/done on your platform")
+#endif
+#endif
+
+// Generic allocator init/done callback 
+#ifndef MI_PRIM_HAS_ALLOCATOR_INIT
+bool _mi_is_redirected(void) {
+  return false;
+}
+bool _mi_allocator_init(const char** message) {
+  if (message != NULL) { *message = NULL; }
+  return true;
+}
+void _mi_allocator_done(void) {
+  // nothing to do
+}
+#endif
--- a/src/prim/unix/prim.c
+++ b/src/prim/unix/prim.c
@ -139,6 +139,12 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config )
  if (psize > 0) {
    config->page_size = (size_t)psize;
    config->alloc_granularity = (size_t)psize;
+    #if defined(_SC_PHYS_PAGES)
+    long pphys = sysconf(_SC_PHYS_PAGES);
+    if (pphys > 0 && (size_t)pphys < (SIZE_MAX/(size_t)psize)) {
+      config->physical_memory = (size_t)pphys * (size_t)psize;
+    }
+    #endif
  }
  config->large_page_size = 2*MI_MiB; // TODO: can we query the OS for this?
  config->has_overcommit = unix_detect_overcommit();
@ -351,14 +357,14 @@ static void* unix_mmap(void* addr, size_t size, size_t try_alignment, int protec
 }

 // Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned.
-int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) {
+int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) {
  mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0);
  mi_assert_internal(commit || !allow_large);
  mi_assert_internal(try_alignment > 0);

  *is_zero = true;
  int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE);
-  *addr = unix_mmap(NULL, size, try_alignment, protect_flags, false, allow_large, is_large);
+  *addr = unix_mmap(hint_addr, size, try_alignment, protect_flags, false, allow_large, is_large);
  return (*addr != NULL ? 0 : errno);
 }

--- a/src/prim/wasi/prim.c
+++ b/src/prim/wasi/prim.c
@ -119,8 +119,8 @@ static void* mi_prim_mem_grow(size_t size, size_t try_alignment) {
 }

 // Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned.
-int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) {
-  MI_UNUSED(allow_large); MI_UNUSED(commit);
+int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) {
+  MI_UNUSED(allow_large); MI_UNUSED(commit); MI_UNUSED(hint_addr);
  *is_large = false;
  *is_zero = false;
  *addr = mi_prim_mem_grow(size, try_alignment);
--- a/src/prim/windows/prim.c
+++ b/src/prim/windows/prim.c
@ -118,6 +118,18 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config )
  GetSystemInfo(&si);
  if (si.dwPageSize > 0) { config->page_size = si.dwPageSize; }
  if (si.dwAllocationGranularity > 0) { config->alloc_granularity = si.dwAllocationGranularity; }
+  // get virtual address bits
+  if ((uintptr_t)si.lpMaximumApplicationAddress > 0) {
+    const size_t vbits = MI_INTPTR_BITS - mi_clz((uintptr_t)si.lpMaximumApplicationAddress);
+    config->virtual_address_bits = vbits;
+  }
+  // get physical memory
+  ULONGLONG memInKiB = 0;
+  if (GetPhysicallyInstalledSystemMemory(&memInKiB)) {
+    if (memInKiB > 0 && memInKiB < (SIZE_MAX / MI_KiB)) {
+      config->physical_memory = memInKiB * MI_KiB;
+    }
+  }
  // get the VirtualAlloc2 function
  HINSTANCE  hDll;
  hDll = LoadLibrary(TEXT("kernelbase.dll"));
@ -191,7 +203,7 @@ static void* win_virtual_alloc_prim_once(void* addr, size_t size, size_t try_ali
  }
  #endif
  // on modern Windows try use VirtualAlloc2 for aligned allocation
-  if (try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) {
+  if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) {
    MI_MEM_ADDRESS_REQUIREMENTS reqs = { 0, 0, 0 };
    reqs.Alignment = try_alignment;
    MI_MEM_EXTENDED_PARAMETER param = { {0, 0}, {0} };
@ -279,14 +291,14 @@ static void* win_virtual_alloc(void* addr, size_t size, size_t try_alignment, DW
  return p;
 }

-int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) {
+int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) {
  mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0);
  mi_assert_internal(commit || !allow_large);
  mi_assert_internal(try_alignment > 0);
  *is_zero = true;
  int flags = MEM_RESERVE;
  if (commit) { flags |= MEM_COMMIT; }
-  *addr = win_virtual_alloc(NULL, size, try_alignment, flags, false, allow_large, is_large);
+  *addr = win_virtual_alloc(hint_addr, size, try_alignment, flags, false, allow_large, is_large);
  return (*addr != NULL ? 0 : (int)GetLastError());
 }

@ -499,8 +511,7 @@ void _mi_prim_process_info(mi_process_info_t* pinfo)
  }

  // get process info
-  PROCESS_MEMORY_COUNTERS info;
-  memset(&info, 0, sizeof(info));
+  PROCESS_MEMORY_COUNTERS info; _mi_memzero_var(info);
  if (pGetProcessMemoryInfo != NULL) {
    pGetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info));
  }
@ -602,60 +613,195 @@ bool _mi_prim_random_buf(void* buf, size_t buf_len) {

 #endif  // MI_USE_RTLGENRANDOM

+
+
 //----------------------------------------------------------------
-// Thread init/done
+// Process & Thread Init/Done
 //----------------------------------------------------------------

-#if !defined(MI_SHARED_LIB)
-
-// use thread local storage keys to detect thread ending
-// note: another design could be to use special linker sections (see issue #869)
-#include <fibersapi.h>
-#if (_WIN32_WINNT < 0x600)  // before Windows Vista
-WINBASEAPI DWORD WINAPI FlsAlloc( _In_opt_ PFLS_CALLBACK_FUNCTION lpCallback );
-WINBASEAPI PVOID WINAPI FlsGetValue( _In_ DWORD dwFlsIndex );
-WINBASEAPI BOOL  WINAPI FlsSetValue( _In_ DWORD dwFlsIndex, _In_opt_ PVOID lpFlsData );
-WINBASEAPI BOOL  WINAPI FlsFree(_In_ DWORD dwFlsIndex);
-#endif
-
-static DWORD mi_fls_key = (DWORD)(-1);
-
-static void NTAPI mi_fls_done(PVOID value) {
-  mi_heap_t* heap = (mi_heap_t*)value;
-  if (heap != NULL) {
-    _mi_thread_done(heap);
-    FlsSetValue(mi_fls_key, NULL);  // prevent recursion as _mi_thread_done may set it back to the main heap, issue #672
+static void NTAPI mi_win_main(PVOID module, DWORD reason, LPVOID reserved) {
+  MI_UNUSED(reserved);
+  MI_UNUSED(module);
+  if (reason==DLL_PROCESS_ATTACH) {
+    _mi_process_load();
+  }
+  else if (reason==DLL_PROCESS_DETACH) {
+    _mi_process_done();
+  }
+  else if (reason==DLL_THREAD_DETACH && !_mi_is_redirected()) {
+    _mi_thread_done(NULL);
  }
 }

-void _mi_prim_thread_init_auto_done(void) {
-  mi_fls_key = FlsAlloc(&mi_fls_done);
-}

-void _mi_prim_thread_done_auto_done(void) {
-  // call thread-done on all threads (except the main thread) to prevent
-  // dangling callback pointer if statically linked with a DLL; Issue #208
-  FlsFree(mi_fls_key);
-}
+#if defined(MI_SHARED_LIB)
+  #define MI_PRIM_HAS_PROCESS_ATTACH  1

-void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) {
-  mi_assert_internal(mi_fls_key != (DWORD)(-1));
-  FlsSetValue(mi_fls_key, heap);
-}
+  // Windows DLL: easy to hook into process_init and thread_done
+  __declspec(dllexport) BOOL WINAPI DllMain(HINSTANCE inst, DWORD reason, LPVOID reserved) {
+    mi_win_main((PVOID)inst,reason,reserved);
+    return TRUE;
+  }

-#else
+  // nothing to do since `_mi_thread_done` is handled through the DLL_THREAD_DETACH event.
+  void _mi_prim_thread_init_auto_done(void) { }
+  void _mi_prim_thread_done_auto_done(void) { }
+  void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) {
+    MI_UNUSED(heap);
+  }

-// Dll; nothing to do as in that case thread_done is handled through the DLL_THREAD_DETACH event.
+#elif !defined(MI_WIN_USE_FLS)
+  #define MI_PRIM_HAS_PROCESS_ATTACH  1

-void _mi_prim_thread_init_auto_done(void) {
-}
+  static void NTAPI mi_win_main_attach(PVOID module, DWORD reason, LPVOID reserved) {
+    if (reason == DLL_PROCESS_ATTACH || reason == DLL_THREAD_ATTACH) {
+      mi_win_main(module, reason, reserved);
+    }
+  }
+  static void NTAPI mi_win_main_detach(PVOID module, DWORD reason, LPVOID reserved) {
+    if (reason == DLL_PROCESS_DETACH || reason == DLL_THREAD_DETACH) {
+      mi_win_main(module, reason, reserved);
+    }
+  }

-void _mi_prim_thread_done_auto_done(void) {
-}
+  // Set up TLS callbacks in a statically linked library by using special data sections.
+  // See <https://stackoverflow.com/questions/14538159/tls-callback-in-windows>
+  // We use 2 entries to ensure we call attach events before constructors
+  // are called, and detach events after destructors are called.
+  #if defined(__cplusplus)
+  extern "C" {
+  #endif

-void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) {
-  MI_UNUSED(heap);
-}
+  #if defined(_WIN64)
+    #pragma comment(linker, "/INCLUDE:_tls_used")
+    #pragma comment(linker, "/INCLUDE:_mi_tls_callback_pre")
+    #pragma comment(linker, "/INCLUDE:_mi_tls_callback_post")
+    #pragma const_seg(".CRT$XLB")
+    extern const PIMAGE_TLS_CALLBACK _mi_tls_callback_pre[];
+    const PIMAGE_TLS_CALLBACK _mi_tls_callback_pre[] = { &mi_win_main_attach };
+    #pragma const_seg()
+    #pragma const_seg(".CRT$XLY")
+    extern const PIMAGE_TLS_CALLBACK _mi_tls_callback_post[];
+    const PIMAGE_TLS_CALLBACK _mi_tls_callback_post[] = { &mi_win_main_detach };
+    #pragma const_seg()
+  #else
+    #pragma comment(linker, "/INCLUDE:__tls_used")
+    #pragma comment(linker, "/INCLUDE:__mi_tls_callback_pre")
+    #pragma comment(linker, "/INCLUDE:__mi_tls_callback_post")
+    #pragma data_seg(".CRT$XLB")
+    PIMAGE_TLS_CALLBACK _mi_tls_callback_pre[] = { &mi_win_main_attach };
+    #pragma data_seg()
+    #pragma data_seg(".CRT$XLY")
+    PIMAGE_TLS_CALLBACK _mi_tls_callback_post[] = { &mi_win_main_detach };
+    #pragma data_seg()
+  #endif

+  #if defined(__cplusplus)
+  }
+  #endif
+
+  // nothing to do since `_mi_thread_done` is handled through the DLL_THREAD_DETACH event.
+  void _mi_prim_thread_init_auto_done(void) { }
+  void _mi_prim_thread_done_auto_done(void) { }
+  void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) {
+    MI_UNUSED(heap);
+  }
+
+#else // deprecated: statically linked, use fiber api
+
+  #if defined(_MSC_VER) // on clang/gcc use the constructor attribute (in `src/prim/prim.c`)
+    // MSVC: use data section magic for static libraries
+    // See <https://www.codeguru.com/cpp/misc/misc/applicationcontrol/article.php/c6945/Running-Code-Before-and-After-Main.htm>
+    #define MI_PRIM_HAS_PROCESS_ATTACH 1
+
+    static int mi_process_attach(void) {
+      mi_win_main(NULL,DLL_PROCESS_ATTACH,NULL);
+      atexit(&_mi_process_done);
+      return 0;
+    }
+    typedef int(*mi_crt_callback_t)(void);
+    #if defined(_WIN64)
+      #pragma comment(linker, "/INCLUDE:_mi_tls_callback")
+      #pragma section(".CRT$XIU", long, read)
+    #else
+      #pragma comment(linker, "/INCLUDE:__mi_tls_callback")
+    #endif
+    #pragma data_seg(".CRT$XIU")
+    mi_decl_externc mi_crt_callback_t _mi_tls_callback[] = { &mi_process_attach };
+    #pragma data_seg()
+  #endif
+
+  // use the fiber api for calling `_mi_thread_done`.
+  #include <fibersapi.h>
+  #if (_WIN32_WINNT < 0x600)  // before Windows Vista
+  WINBASEAPI DWORD WINAPI FlsAlloc( _In_opt_ PFLS_CALLBACK_FUNCTION lpCallback );
+  WINBASEAPI PVOID WINAPI FlsGetValue( _In_ DWORD dwFlsIndex );
+  WINBASEAPI BOOL  WINAPI FlsSetValue( _In_ DWORD dwFlsIndex, _In_opt_ PVOID lpFlsData );
+  WINBASEAPI BOOL  WINAPI FlsFree(_In_ DWORD dwFlsIndex);
+  #endif
+
+  static DWORD mi_fls_key = (DWORD)(-1);
+
+  static void NTAPI mi_fls_done(PVOID value) {
+    mi_heap_t* heap = (mi_heap_t*)value;
+    if (heap != NULL) {
+      _mi_thread_done(heap);
+      FlsSetValue(mi_fls_key, NULL);  // prevent recursion as _mi_thread_done may set it back to the main heap, issue #672
+    }
+  }
+
+  void _mi_prim_thread_init_auto_done(void) {
+    mi_fls_key = FlsAlloc(&mi_fls_done);
+  }
+
+  void _mi_prim_thread_done_auto_done(void) {
+    // call thread-done on all threads (except the main thread) to prevent
+    // dangling callback pointer if statically linked with a DLL; Issue #208
+    FlsFree(mi_fls_key);
+  }
+
+  void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) {
+    mi_assert_internal(mi_fls_key != (DWORD)(-1));
+    FlsSetValue(mi_fls_key, heap);
+  }
 #endif

+// ----------------------------------------------------
+// Communicate with the redirection module on Windows
+// ----------------------------------------------------
+#if defined(MI_SHARED_LIB) && !defined(MI_WIN_NOREDIRECT)
+  #define MI_PRIM_HAS_ALLOCATOR_INIT 1
+
+  static bool mi_redirected = false;   // true if malloc redirects to mi_malloc
+
+  bool _mi_is_redirected(void) {
+    return mi_redirected;
+  }
+
+  #ifdef __cplusplus
+  extern "C" {
+  #endif
+  mi_decl_export void _mi_redirect_entry(DWORD reason) {
+    // called on redirection; careful as this may be called before DllMain
+    if (reason == DLL_PROCESS_ATTACH) {
+      mi_redirected = true;
+    }
+    else if (reason == DLL_PROCESS_DETACH) {
+      mi_redirected = false;
+    }
+    else if (reason == DLL_THREAD_DETACH) {
+      _mi_thread_done(NULL);
+    }
+  }
+  __declspec(dllimport) bool mi_cdecl mi_allocator_init(const char** message);
+  __declspec(dllimport) void mi_cdecl mi_allocator_done(void);
+  #ifdef __cplusplus
+  }
+  #endif
+  bool _mi_allocator_init(const char** message) {
+    return mi_allocator_init(message);
+  }
+  void _mi_allocator_done(void) {
+    mi_allocator_done();
+  }
+#endif
--- a/src/segment.c
+++ b/src/segment.c
@ -455,7 +455,7 @@ static size_t mi_segment_calculate_sizes(size_t capacity, size_t required, size_

  if (MI_SECURE == 0) {
    // normally no guard pages
-    #if MI_DEBUG_GUARDED
+    #if MI_GUARDED
    isize = _mi_align_up(minsize, _mi_os_page_size());
    #else
    isize = _mi_align_up(minsize, 16 * MI_MAX_ALIGN_SIZE);
--- a/src/stats.c
+++ b/src/stats.c
@ -118,6 +118,7 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) {
  mi_stat_counter_add(&stats->searches, &src->searches, 1);
  mi_stat_counter_add(&stats->normal_count, &src->normal_count, 1);
  mi_stat_counter_add(&stats->huge_count, &src->huge_count, 1);  
+  mi_stat_counter_add(&stats->guarded_alloc_count, &src->guarded_alloc_count, 1);
 #if MI_STAT>1
  for (size_t i = 0; i <= MI_BIN_HUGE; i++) {
    if (src->normal_bins[i].allocated > 0 || src->normal_bins[i].freed > 0) {
@ -342,6 +343,7 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0)
  mi_stat_counter_print(&stats->commit_calls, "commits", out, arg);
  mi_stat_counter_print(&stats->reset_calls, "resets", out, arg);
  mi_stat_counter_print(&stats->purge_calls, "purges", out, arg);
+  mi_stat_counter_print(&stats->guarded_alloc_count, "guarded", out, arg);
  mi_stat_print(&stats->threads, "threads", -1, out, arg);
  mi_stat_counter_print_avg(&stats->searches, "searches", out, arg);
  _mi_fprintf(out, arg, "%10s: %5zu\n", "numa nodes", _mi_os_numa_node_count());