From db52999d8567c628154fcb002cf636bc1c1697d9 Mon Sep 17 00:00:00 2001 From: Daan Date: Fri, 8 Mar 2024 09:01:26 -0800 Subject: [PATCH 01/28] update used block comment (issue #861) --- include/mimalloc/types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 771059bf..049e68e7 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -295,7 +295,7 @@ typedef struct mi_page_s { uint8_t retire_expire:7; // expiration count for retired blocks mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) - uint32_t used; // number of blocks in use (including blocks in `local_free` and `thread_free`) + uint32_t used; // number of blocks in use (including blocks in `thread_free`) uint32_t xblock_size; // size available in each block (always `>0`) mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) From a42707908fdaa4ff5fd09788d74ad70311e9bf85 Mon Sep 17 00:00:00 2001 From: Daan Date: Sun, 17 Mar 2024 06:33:40 -0700 Subject: [PATCH 02/28] fix typo, issue #866 --- doc/mimalloc-doc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/mimalloc-doc.h b/doc/mimalloc-doc.h index 4c23a5fa..01b13904 100644 --- a/doc/mimalloc-doc.h +++ b/doc/mimalloc-doc.h @@ -168,7 +168,7 @@ void* mi_expand(void* p, size_t newsize); /// @returns A pointer to a block of \a count * \a size bytes, or \a NULL /// if out of memory or if \a count * \a size overflows. /// -/// If there is no overflow, it behaves exactly like `mi_malloc(p,count*size)`. +/// If there is no overflow, it behaves exactly like `mi_malloc(count*size)`. /// @see mi_calloc() /// @see mi_zallocn() void* mi_mallocn(size_t count, size_t size); From 18ebeb8a83386ade978b2d867fafbd3b39641826 Mon Sep 17 00:00:00 2001 From: Daan Date: Sun, 17 Mar 2024 08:33:09 -0700 Subject: [PATCH 03/28] fix (benign) race condition on the page flags has_aligned flag and refactor free-ing code (issue 865) --- src/alloc.c | 323 ++++++++++++++++++++++++++++------------------------ 1 file changed, 175 insertions(+), 148 deletions(-) diff --git a/src/alloc.c b/src/alloc.c index 8a76d3d3..2576206f 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -26,7 +26,9 @@ terms of the MIT license. A copy of the license can be found in the file // Fast allocation in a page: just pop from the free list. // Fall back to generic allocation only if the list is empty. -extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept { +// Note: in release mode the (inlined) routine is about 7 instructions with a single test. 
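// (editorial sketch, not part of the patch) The fast path below is essentially a
// singly-linked-list pop from the page-local `free` list; assuming the page is
// already set up, it boils down to:
//
//   mi_block_t* const block = page->free;
//   if (block == NULL) { return _mi_malloc_generic(heap, size, zero, 0); }  // slow path: refill/extend
//   page->free = mi_block_next(page, block);   // pop the head
//   page->used++;                              // one more block in use
//   return block;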
+extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept +{ mi_assert_internal(page->xblock_size==0||mi_page_block_size(page) >= size); mi_block_t* const block = page->free; if mi_unlikely(block == NULL) { @@ -61,43 +63,43 @@ extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t siz } } -#if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN + #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN if (!zero && !mi_page_is_huge(page)) { memset(block, MI_DEBUG_UNINIT, mi_page_usable_block_size(page)); } -#elif (MI_SECURE!=0) + #elif (MI_SECURE!=0) if (!zero) { block->next = 0; } // don't leak internal data -#endif + #endif -#if (MI_STAT>0) + #if (MI_STAT>0) const size_t bsize = mi_page_usable_block_size(page); if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { mi_heap_stat_increase(heap, normal, bsize); mi_heap_stat_counter_increase(heap, normal_count, 1); -#if (MI_STAT>1) + #if (MI_STAT>1) const size_t bin = _mi_bin(bsize); mi_heap_stat_increase(heap, normal_bins[bin], 1); -#endif + #endif } -#endif + #endif -#if MI_PADDING // && !MI_TRACK_ENABLED + #if MI_PADDING // && !MI_TRACK_ENABLED mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + mi_page_usable_block_size(page)); ptrdiff_t delta = ((uint8_t*)padding - (uint8_t*)block - (size - MI_PADDING_SIZE)); - #if (MI_DEBUG>=2) - mi_assert_internal(delta >= 0 && mi_page_usable_block_size(page) >= (size - MI_PADDING_SIZE + delta)); - #endif + #if (MI_DEBUG>=2) + mi_assert_internal(delta >= 0 && mi_page_usable_block_size(page) >= (size - MI_PADDING_SIZE + delta)); + #endif mi_track_mem_defined(padding,sizeof(mi_padding_t)); // note: re-enable since mi_page_usable_block_size may set noaccess padding->canary = (uint32_t)(mi_ptr_encode(page,block,page->keys)); padding->delta = (uint32_t)(delta); - #if MI_PADDING_CHECK - if (!mi_page_is_huge(page)) { - uint8_t* fill = (uint8_t*)padding - delta; - const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? MI_MAX_ALIGN_SIZE : delta); // set at most N initial padding bytes - for (size_t i = 0; i < maxpad; i++) { fill[i] = MI_DEBUG_PADDING; } - } + #if MI_PADDING_CHECK + if (!mi_page_is_huge(page)) { + uint8_t* fill = (uint8_t*)padding - delta; + const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? 
MI_MAX_ALIGN_SIZE : delta); // set at most N initial padding bytes + for (size_t i = 0; i < maxpad; i++) { fill[i] = MI_DEBUG_PADDING; } + } + #endif #endif -#endif return block; } @@ -112,9 +114,11 @@ static inline mi_decl_restrict void* mi_heap_malloc_small_zero(mi_heap_t* heap, #if (MI_PADDING) if (size == 0) { size = sizeof(void*); } #endif + mi_page_t* page = _mi_heap_get_free_small_page(heap, size + MI_PADDING_SIZE); void* const p = _mi_page_malloc(heap, page, size + MI_PADDING_SIZE, zero); mi_track_malloc(p,size,zero); + #if MI_STAT>1 if (p != NULL) { if (!mi_heap_is_initialized(heap)) { heap = mi_prim_get_default_heap(); } @@ -403,115 +407,31 @@ static void mi_stat_huge_free(const mi_page_t* page) { // Free // ------------------------------------------------------ -// multi-threaded free (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) -static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* block) +// forward declaration of multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) +static mi_decl_noinline void mi_free_block_mt(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); + +// regular free of a (thread local) block pointer +// fast path written carefully to prevent spilling on the stack +static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool check_full) { - // first see if the segment was abandoned and we can reclaim it - mi_segment_t* const segment = _mi_page_segment(page); - if (mi_option_is_enabled(mi_option_abandoned_reclaim_on_free) && - #if MI_HUGE_PAGE_ABANDON - segment->page_kind != MI_PAGE_HUGE && - #endif - mi_atomic_load_relaxed(&segment->thread_id) == 0) - { - // the segment is abandoned, try to reclaim it into our heap - if (_mi_segment_attempt_reclaim(mi_heap_get_default(), segment)) { - mi_assert_internal(_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); - mi_free(block); // recursively free as now it will be a local free in our heap - return; - } - } - - // The padding check may access the non-thread-owned page for the key values. - // that is safe as these are constant and the page won't be freed (as the block is not freed yet). + // owning thread can free a block directly + if mi_unlikely(mi_check_is_double_free(page, block)) return; mi_check_padding(page, block); - _mi_padding_shrink(page, block, sizeof(mi_block_t)); // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection - - if (segment->page_kind == MI_PAGE_HUGE) { - #if MI_HUGE_PAGE_ABANDON - // huge page segments are always abandoned and can be freed immediately - mi_stat_huge_free(page); - _mi_segment_huge_page_free(segment, page, block); - return; - #else - // huge pages are special as they occupy the entire segment - // as these are large we reset the memory occupied by the page so it is available to other threads - // (as the owning thread needs to actually free the memory later). - _mi_segment_huge_page_reset(segment, page, block); - #endif - } - - #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading - memset(block, MI_DEBUG_FREED, mi_usable_size(block)); + mi_stat_free(page, block); + #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN + memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); #endif - - // Try to put the block on either the page-local thread free list, or the heap delayed free list. 
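// (editorial sketch, not part of the patch) Ignoring the delayed-free states, the
// cross-thread free below (refactored into `mi_free_block_delayed_mt` later in this
// patch) is a lock-free stack push onto `xthread_free`, retried with a weak CAS:
//
//   mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free);
//   do {
//     mi_block_set_next(page, block, mi_tf_block(tfree));   // link block to current head
//     tfreex = mi_tf_set_block(tfree, block);                // proposed new head
//   } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex));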
- mi_thread_free_t tfreex; - bool use_delayed; - mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free); - do { - use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE); - if mi_unlikely(use_delayed) { - // unlikely: this only happens on the first concurrent free in a page that is in the full list - tfreex = mi_tf_set_delayed(tfree,MI_DELAYED_FREEING); - } - else { - // usual: directly add to page thread_free list - mi_block_set_next(page, block, mi_tf_block(tfree)); - tfreex = mi_tf_set_block(tfree,block); - } - } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); - - if mi_unlikely(use_delayed) { - // racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`) - mi_heap_t* const heap = (mi_heap_t*)(mi_atomic_load_acquire(&page->xheap)); //mi_page_heap(page); - mi_assert_internal(heap != NULL); - if (heap != NULL) { - // add to the delayed free list of this heap. (do this atomically as the lock only protects heap memory validity) - mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); - do { - mi_block_set_nextx(heap,block,dfree, heap->keys); - } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block)); - } - - // and reset the MI_DELAYED_FREEING flag - tfree = mi_atomic_load_relaxed(&page->xthread_free); - do { - tfreex = tfree; - mi_assert_internal(mi_tf_delayed(tfree) == MI_DELAYED_FREEING); - tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE); - } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); + mi_track_free_size(p, mi_page_usable_size_of(page,block)); // faster then mi_usable_size as we already know the page and that p is unaligned + mi_block_set_next(page, block, page->local_free); + page->local_free = block; + if mi_unlikely(--page->used == 0) { // using this expression generates better code than: page->used--; if (mi_page_all_free(page)) + _mi_page_retire(page); } + else if mi_unlikely(check_full && mi_page_is_in_full(page)) { + _mi_page_unfull(page); + } } -// regular free -static inline void _mi_free_block(mi_page_t* page, bool local, mi_block_t* block) -{ - // and push it on the free list - //const size_t bsize = mi_page_block_size(page); - if mi_likely(local) { - // owning thread can free a block directly - if mi_unlikely(mi_check_is_double_free(page, block)) return; - mi_check_padding(page, block); - #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN - memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); - #endif - mi_block_set_next(page, block, page->local_free); - page->local_free = block; - page->used--; - if mi_unlikely(mi_page_all_free(page)) { - _mi_page_retire(page); - } - else if mi_unlikely(mi_page_is_in_full(page)) { - _mi_page_unfull(page); - } - } - else { - _mi_free_block_mt(page,block); - } -} - - // Adjust a block that was allocated aligned, to the actual start of the block in the page. 
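// (editorial note, not part of the patch) A worked example with hypothetical numbers:
// for a page whose blocks are 64 bytes, an aligned allocation may hand out an interior
// pointer such as p = page_start + 144; then diff = 144, adjust = 144 % 64 = 16, and
// the original block start is recovered as p - 16 = page_start + 128.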
mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p) { mi_assert_internal(page!=NULL && p!=NULL); @@ -520,17 +440,27 @@ mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* p return (mi_block_t*)((uintptr_t)p - adjust); } - -void mi_decl_noinline _mi_free_generic(const mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept { +// free a local pointer +static void mi_decl_noinline mi_free_generic_local(const mi_segment_t* segment, mi_page_t* page, void* p) mi_attr_noexcept { mi_block_t* const block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(segment, page, p) : (mi_block_t*)p); - mi_stat_free(page, block); // stat_free may access the padding - mi_track_free_size(block, mi_page_usable_size_of(page,block)); - _mi_free_block(page, is_local, block); + mi_free_block_local(page, block, true); +} + +// free a pointer owned by another thread +static void mi_decl_noinline mi_free_generic_mt(const mi_segment_t* segment, mi_page_t* page, void* p) mi_attr_noexcept { + mi_block_t* const block = _mi_page_ptr_unalign(segment, page, p); // don't check `has_aligned` flag to avoid a race (issue #865) + mi_free_block_mt(segment, page, block); +} + +// generic free (for runtime integration) +void mi_decl_noinline _mi_free_generic(const mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept { + if (is_local) mi_free_generic_local(segment,page,p); + else mi_free_generic_mt(segment,page,p); } // Get the segment data belonging to a pointer -// This is just a single `and` in assembly but does further checks in debug mode -// (and secure mode) if this was a valid pointer. +// This is just a single `and` in release mode but does further checks in debug mode +// (and secure mode) to see if this was a valid pointer. static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* msg) { MI_UNUSED(msg); @@ -566,7 +496,7 @@ static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* ms } // Free a block -// fast path written carefully to prevent spilling on the stack +// Fast path written carefully to prevent register spilling on the stack void mi_free(void* p) mi_attr_noexcept { if mi_unlikely(p == NULL) return; @@ -574,31 +504,20 @@ void mi_free(void* p) mi_attr_noexcept const bool is_local= (_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); mi_page_t* const page = _mi_segment_page_of(segment, p); - if mi_likely(is_local) { // thread-local free? - if mi_likely(page->flags.full_aligned == 0) // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) - { + if mi_likely(is_local) { // thread-local free? 
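// (editorial note, not part of the patch) `full_aligned` reads the `in_full` and
// `has_aligned` bit flags as a single byte -- assuming the union layout of
// `mi_page_flags_t` in types.h, roughly:
//
//   union mi_page_flags_s {
//     uint8_t full_aligned;
//     struct { uint8_t in_full : 1; uint8_t has_aligned : 1; } x;
//   };
//
// so "not in the full queue and no aligned blocks" is one compare against zero.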
+ if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) + // thread-local, aligned, and not a full page mi_block_t* const block = (mi_block_t*)p; - if mi_unlikely(mi_check_is_double_free(page, block)) return; - mi_check_padding(page, block); - mi_stat_free(page, block); - #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN - memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); - #endif - mi_track_free_size(p, mi_page_usable_size_of(page,block)); // faster then mi_usable_size as we already know the page and that p is unaligned - mi_block_set_next(page, block, page->local_free); - page->local_free = block; - if mi_unlikely(--page->used == 0) { // using this expression generates better code than: page->used--; if (mi_page_all_free(page)) - _mi_page_retire(page); - } + mi_free_block_local(page,block,false /* no need to check if the page is full */); } else { // page is full or contains (inner) aligned blocks; use generic path - _mi_free_generic(segment, page, true, p); + mi_free_generic_local(segment, page, p); } } else { // not thread-local; use generic path - _mi_free_generic(segment, page, false, p); + mi_free_generic_mt(segment, page, p); } } @@ -623,10 +542,118 @@ bool _mi_free_delayed_block(mi_block_t* block) { _mi_page_free_collect(page, false); // and free the block (possibly freeing the page as well since used is updated) - _mi_free_block(page, true, block); + mi_free_block_local(page, block, true); return true; } +// ------------------------------------------------------ +// Multi-threaded Free (`_mt`) +// ------------------------------------------------------ + +// Push a block that is owned by another thread on its page-local thread free +// list or it's heap delayed free list. Such blocks are later collected by +// the owning thread in `_mi_free_delayed_block`. +static void mi_decl_noinline mi_free_block_delayed_mt( mi_page_t* page, mi_block_t* block ) +{ + // Try to put the block on either the page-local thread free list, + // or the heap delayed free list (if this is the first non-local free in that page) + mi_thread_free_t tfreex; + bool use_delayed; + mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free); + do { + use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE); + if mi_unlikely(use_delayed) { + // unlikely: this only happens on the first concurrent free in a page that is in the full list + tfreex = mi_tf_set_delayed(tfree,MI_DELAYED_FREEING); + } + else { + // usual: directly add to page thread_free list + mi_block_set_next(page, block, mi_tf_block(tfree)); + tfreex = mi_tf_set_block(tfree,block); + } + } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); + + // If this was the first non-local free, we need to push it on the heap delayed free list instead + if mi_unlikely(use_delayed) { + // racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`) + mi_heap_t* const heap = (mi_heap_t*)(mi_atomic_load_acquire(&page->xheap)); //mi_page_heap(page); + mi_assert_internal(heap != NULL); + if (heap != NULL) { + // add to the delayed free list of this heap. 
(do this atomically as the lock only protects heap memory validity) + mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); + do { + mi_block_set_nextx(heap,block,dfree, heap->keys); + } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block)); + } + + // and reset the MI_DELAYED_FREEING flag + tfree = mi_atomic_load_relaxed(&page->xthread_free); + do { + tfreex = tfree; + mi_assert_internal(mi_tf_delayed(tfree) == MI_DELAYED_FREEING); + tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE); + } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); + } +} + +// Multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) +static mi_decl_noinline void mi_free_block_mt(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) +{ + // first see if the segment was abandoned and if we can reclaim it into our thread + if (mi_option_is_enabled(mi_option_abandoned_reclaim_on_free) && + #if MI_HUGE_PAGE_ABANDON + segment->page_kind != MI_PAGE_HUGE && + #endif + mi_atomic_load_relaxed(&segment->thread_id) == 0) + { + // the segment is abandoned, try to reclaim it into our heap + if (_mi_segment_attempt_reclaim(mi_heap_get_default(), segment)) { + mi_assert_internal(_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); + mi_free(block); // recursively free as now it will be a local free in our heap + return; + } + } + + // The padding check may access the non-thread-owned page for the key values. + // that is safe as these are constant and the page won't be freed (as the block is not freed yet). + mi_check_padding(page, block); + + // adjust stats (after padding check and potential recursive `mi_free` above) + mi_stat_free(page, block); // stat_free may access the padding + mi_track_free_size(block, mi_page_usable_size_of(page,block)); + + // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection + _mi_padding_shrink(page, block, sizeof(mi_block_t)); + + if (segment->page_kind == MI_PAGE_HUGE) { + #if MI_HUGE_PAGE_ABANDON + // huge page segments are always abandoned and can be freed immediately + mi_stat_huge_free(page); + _mi_segment_huge_page_free(segment, page, block); + return; + #else + // huge pages are special as they occupy the entire segment + // as these are large we reset the memory occupied by the page so it is available to other threads + // (as the owning thread needs to actually free the memory later). 
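// (editorial note, not part of the patch) "reset" here means an OS-level purge through
// the prim layer (for example madvise(MADV_DONTNEED/MADV_FREE) on POSIX or MEM_RESET on
// Windows -- an assumption about the backends, not shown in this patch), so the physical
// memory can be reclaimed while the virtual range stays mapped until the owning thread
// finally frees the segment.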
+ _mi_segment_huge_page_reset(segment, page, block); + #endif + } + else { + #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading + memset(block, MI_DEBUG_FREED, mi_usable_size(block)); + #endif + } + + // and finally free the actual block by pushing it on the owning heap + // thread_delayed free list (or heap delayed free list) + mi_free_block_delayed_mt(page,block); +} + + +// ------------------------------------------------------ +// Usable size +// ------------------------------------------------------ + // Bytes available in a block mi_decl_noinline static size_t mi_page_usable_aligned_size_of(const mi_segment_t* segment, const mi_page_t* page, const void* p) mi_attr_noexcept { const mi_block_t* block = _mi_page_ptr_unalign(segment, page, p); From 355f44f373e765f4eb2bbfc91ade0735525e3a31 Mon Sep 17 00:00:00 2001 From: Daan Date: Sun, 17 Mar 2024 08:44:11 -0700 Subject: [PATCH 04/28] split free routines in a separate file --- src/alloc.c | 511 +-------------------------------------------------- src/free.c | 519 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 520 insertions(+), 510 deletions(-) create mode 100644 src/free.c diff --git a/src/alloc.c b/src/alloc.c index 2576206f..76d68d13 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -18,6 +18,7 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_IN_ALLOC_C #include "alloc-override.c" +#include "free.c" #undef MI_IN_ALLOC_C // ------------------------------------------------------ @@ -194,516 +195,6 @@ mi_decl_nodiscard mi_decl_restrict void* mi_zalloc(size_t size) mi_attr_noexcept } -// ------------------------------------------------------ -// Check for double free in secure and debug mode -// This is somewhat expensive so only enabled for secure mode 4 -// ------------------------------------------------------ - -#if (MI_ENCODE_FREELIST && (MI_SECURE>=4 || MI_DEBUG!=0)) -// linear check if the free list contains a specific element -static bool mi_list_contains(const mi_page_t* page, const mi_block_t* list, const mi_block_t* elem) { - while (list != NULL) { - if (elem==list) return true; - list = mi_block_next(page, list); - } - return false; -} - -static mi_decl_noinline bool mi_check_is_double_freex(const mi_page_t* page, const mi_block_t* block) { - // The decoded value is in the same page (or NULL). - // Walk the free lists to verify positively if it is already freed - if (mi_list_contains(page, page->free, block) || - mi_list_contains(page, page->local_free, block) || - mi_list_contains(page, mi_page_thread_free(page), block)) - { - _mi_error_message(EAGAIN, "double free detected of block %p with size %zu\n", block, mi_page_block_size(page)); - return true; - } - return false; -} - -#define mi_track_page(page,access) { size_t psize; void* pstart = _mi_page_start(_mi_page_segment(page),page,&psize); mi_track_mem_##access( pstart, psize); } - -static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) { - bool is_double_free = false; - mi_block_t* n = mi_block_nextx(page, block, page->keys); // pretend it is freed, and get the decoded first field - if (((uintptr_t)n & (MI_INTPTR_SIZE-1))==0 && // quick check: aligned pointer? - (n==NULL || mi_is_in_same_page(block, n))) // quick check: in same page or NULL? - { - // Suspicous: decoded value a in block is in the same page (or NULL) -- maybe a double free? 
- // (continue in separate function to improve code generation) - is_double_free = mi_check_is_double_freex(page, block); - } - return is_double_free; -} -#else -static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) { - MI_UNUSED(page); - MI_UNUSED(block); - return false; -} -#endif - -// --------------------------------------------------------------------------- -// Check for heap block overflow by setting up padding at the end of the block -// --------------------------------------------------------------------------- - -#if MI_PADDING // && !MI_TRACK_ENABLED -static bool mi_page_decode_padding(const mi_page_t* page, const mi_block_t* block, size_t* delta, size_t* bsize) { - *bsize = mi_page_usable_block_size(page); - const mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + *bsize); - mi_track_mem_defined(padding,sizeof(mi_padding_t)); - *delta = padding->delta; - uint32_t canary = padding->canary; - uintptr_t keys[2]; - keys[0] = page->keys[0]; - keys[1] = page->keys[1]; - bool ok = ((uint32_t)mi_ptr_encode(page,block,keys) == canary && *delta <= *bsize); - mi_track_mem_noaccess(padding,sizeof(mi_padding_t)); - return ok; -} - -// Return the exact usable size of a block. -static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) { - size_t bsize; - size_t delta; - bool ok = mi_page_decode_padding(page, block, &delta, &bsize); - mi_assert_internal(ok); mi_assert_internal(delta <= bsize); - return (ok ? bsize - delta : 0); -} - -// When a non-thread-local block is freed, it becomes part of the thread delayed free -// list that is freed later by the owning heap. If the exact usable size is too small to -// contain the pointer for the delayed list, then shrink the padding (by decreasing delta) -// so it will later not trigger an overflow error in `mi_free_block`. -void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) { - size_t bsize; - size_t delta; - bool ok = mi_page_decode_padding(page, block, &delta, &bsize); - mi_assert_internal(ok); - if (!ok || (bsize - delta) >= min_size) return; // usually already enough space - mi_assert_internal(bsize >= min_size); - if (bsize < min_size) return; // should never happen - size_t new_delta = (bsize - min_size); - mi_assert_internal(new_delta < bsize); - mi_padding_t* padding = (mi_padding_t*)((uint8_t*)block + bsize); - mi_track_mem_defined(padding,sizeof(mi_padding_t)); - padding->delta = (uint32_t)new_delta; - mi_track_mem_noaccess(padding,sizeof(mi_padding_t)); -} -#else -static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) { - MI_UNUSED(block); - return mi_page_usable_block_size(page); -} - -void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) { - MI_UNUSED(page); - MI_UNUSED(block); - MI_UNUSED(min_size); -} -#endif - -#if MI_PADDING && MI_PADDING_CHECK - -static bool mi_verify_padding(const mi_page_t* page, const mi_block_t* block, size_t* size, size_t* wrong) { - size_t bsize; - size_t delta; - bool ok = mi_page_decode_padding(page, block, &delta, &bsize); - *size = *wrong = bsize; - if (!ok) return false; - mi_assert_internal(bsize >= delta); - *size = bsize - delta; - if (!mi_page_is_huge(page)) { - uint8_t* fill = (uint8_t*)block + bsize - delta; - const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? 
MI_MAX_ALIGN_SIZE : delta); // check at most the first N padding bytes - mi_track_mem_defined(fill, maxpad); - for (size_t i = 0; i < maxpad; i++) { - if (fill[i] != MI_DEBUG_PADDING) { - *wrong = bsize - delta + i; - ok = false; - break; - } - } - mi_track_mem_noaccess(fill, maxpad); - } - return ok; -} - -static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { - size_t size; - size_t wrong; - if (!mi_verify_padding(page,block,&size,&wrong)) { - _mi_error_message(EFAULT, "buffer overflow in heap block %p of size %zu: write after %zu bytes\n", block, size, wrong ); - } -} - -#else - -static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { - MI_UNUSED(page); - MI_UNUSED(block); -} - -#endif - -// only maintain stats for smaller objects if requested -#if (MI_STAT>0) -static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { -#if (MI_STAT < 2) - MI_UNUSED(block); -#endif - mi_heap_t* const heap = mi_heap_get_default(); - const size_t bsize = mi_page_usable_block_size(page); -#if (MI_STAT>1) - const size_t usize = mi_page_usable_size_of(page, block); - mi_heap_stat_decrease(heap, malloc, usize); -#endif - if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { - mi_heap_stat_decrease(heap, normal, bsize); -#if (MI_STAT > 1) - mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], 1); -#endif - } -#if !MI_HUGE_PAGE_ABANDON - else { - const size_t bpsize = mi_page_block_size(page); - if (bpsize <= MI_HUGE_OBJ_SIZE_MAX) { - mi_heap_stat_decrease(heap, huge, bpsize); - } - else { - mi_heap_stat_decrease(heap, giant, bpsize); - } - } -#endif -} -#else -static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { - MI_UNUSED(page); MI_UNUSED(block); -} -#endif - -#if MI_HUGE_PAGE_ABANDON -#if (MI_STAT>0) -// maintain stats for huge objects -static void mi_stat_huge_free(const mi_page_t* page) { - mi_heap_t* const heap = mi_heap_get_default(); - const size_t bsize = mi_page_block_size(page); // to match stats in `page.c:mi_page_huge_alloc` - if (bsize <= MI_HUGE_OBJ_SIZE_MAX) { - mi_heap_stat_decrease(heap, huge, bsize); - } - else { - mi_heap_stat_decrease(heap, giant, bsize); - } -} -#else -static void mi_stat_huge_free(const mi_page_t* page) { - MI_UNUSED(page); -} -#endif -#endif - -// ------------------------------------------------------ -// Free -// ------------------------------------------------------ - -// forward declaration of multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) -static mi_decl_noinline void mi_free_block_mt(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); - -// regular free of a (thread local) block pointer -// fast path written carefully to prevent spilling on the stack -static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool check_full) -{ - // owning thread can free a block directly - if mi_unlikely(mi_check_is_double_free(page, block)) return; - mi_check_padding(page, block); - mi_stat_free(page, block); - #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN - memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); - #endif - mi_track_free_size(p, mi_page_usable_size_of(page,block)); // faster then mi_usable_size as we already know the page and that p is unaligned - mi_block_set_next(page, block, page->local_free); - page->local_free = block; - if mi_unlikely(--page->used == 0) { // using this expression generates better code than: page->used--; if (mi_page_all_free(page)) - _mi_page_retire(page); - } - else if mi_unlikely(check_full && 
mi_page_is_in_full(page)) { - _mi_page_unfull(page); - } -} - -// Adjust a block that was allocated aligned, to the actual start of the block in the page. -mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p) { - mi_assert_internal(page!=NULL && p!=NULL); - const size_t diff = (uint8_t*)p - _mi_page_start(segment, page, NULL); - const size_t adjust = (diff % mi_page_block_size(page)); - return (mi_block_t*)((uintptr_t)p - adjust); -} - -// free a local pointer -static void mi_decl_noinline mi_free_generic_local(const mi_segment_t* segment, mi_page_t* page, void* p) mi_attr_noexcept { - mi_block_t* const block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(segment, page, p) : (mi_block_t*)p); - mi_free_block_local(page, block, true); -} - -// free a pointer owned by another thread -static void mi_decl_noinline mi_free_generic_mt(const mi_segment_t* segment, mi_page_t* page, void* p) mi_attr_noexcept { - mi_block_t* const block = _mi_page_ptr_unalign(segment, page, p); // don't check `has_aligned` flag to avoid a race (issue #865) - mi_free_block_mt(segment, page, block); -} - -// generic free (for runtime integration) -void mi_decl_noinline _mi_free_generic(const mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept { - if (is_local) mi_free_generic_local(segment,page,p); - else mi_free_generic_mt(segment,page,p); -} - -// Get the segment data belonging to a pointer -// This is just a single `and` in release mode but does further checks in debug mode -// (and secure mode) to see if this was a valid pointer. -static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* msg) -{ - MI_UNUSED(msg); - mi_assert(p != NULL); - -#if (MI_DEBUG>0) - if mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0) { - _mi_error_message(EINVAL, "%s: invalid (unaligned) pointer: %p\n", msg, p); - return NULL; - } -#endif - - mi_segment_t* const segment = _mi_ptr_segment(p); - mi_assert_internal(segment != NULL); - -#if (MI_DEBUG>0) - if mi_unlikely(!mi_is_in_heap_region(p)) { - _mi_warning_message("%s: pointer might not point to a valid heap region: %p\n" - "(this may still be a valid very large allocation (over 64MiB))\n", msg, p); - if mi_likely(_mi_ptr_cookie(segment) == segment->cookie) { - _mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p); - } - } -#endif -#if (MI_DEBUG>0 || MI_SECURE>=4) - if mi_unlikely(_mi_ptr_cookie(segment) != segment->cookie) { - _mi_error_message(EINVAL, "%s: pointer does not point to a valid heap space: %p\n", msg, p); - return NULL; - } -#endif - - return segment; -} - -// Free a block -// Fast path written carefully to prevent register spilling on the stack -void mi_free(void* p) mi_attr_noexcept -{ - if mi_unlikely(p == NULL) return; - mi_segment_t* const segment = mi_checked_ptr_segment(p,"mi_free"); - const bool is_local= (_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); - mi_page_t* const page = _mi_segment_page_of(segment, p); - - if mi_likely(is_local) { // thread-local free? 
- if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) - // thread-local, aligned, and not a full page - mi_block_t* const block = (mi_block_t*)p; - mi_free_block_local(page,block,false /* no need to check if the page is full */); - } - else { - // page is full or contains (inner) aligned blocks; use generic path - mi_free_generic_local(segment, page, p); - } - } - else { - // not thread-local; use generic path - mi_free_generic_mt(segment, page, p); - } -} - -// return true if successful -bool _mi_free_delayed_block(mi_block_t* block) { - // get segment and page - const mi_segment_t* const segment = _mi_ptr_segment(block); - mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie); - mi_assert_internal(_mi_thread_id() == segment->thread_id); - mi_page_t* const page = _mi_segment_page_of(segment, block); - - // Clear the no-delayed flag so delayed freeing is used again for this page. - // This must be done before collecting the free lists on this page -- otherwise - // some blocks may end up in the page `thread_free` list with no blocks in the - // heap `thread_delayed_free` list which may cause the page to be never freed! - // (it would only be freed if we happen to scan it in `mi_page_queue_find_free_ex`) - if (!_mi_page_try_use_delayed_free(page, MI_USE_DELAYED_FREE, false /* dont overwrite never delayed */)) { - return false; - } - - // collect all other non-local frees to ensure up-to-date `used` count - _mi_page_free_collect(page, false); - - // and free the block (possibly freeing the page as well since used is updated) - mi_free_block_local(page, block, true); - return true; -} - -// ------------------------------------------------------ -// Multi-threaded Free (`_mt`) -// ------------------------------------------------------ - -// Push a block that is owned by another thread on its page-local thread free -// list or it's heap delayed free list. Such blocks are later collected by -// the owning thread in `_mi_free_delayed_block`. -static void mi_decl_noinline mi_free_block_delayed_mt( mi_page_t* page, mi_block_t* block ) -{ - // Try to put the block on either the page-local thread free list, - // or the heap delayed free list (if this is the first non-local free in that page) - mi_thread_free_t tfreex; - bool use_delayed; - mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free); - do { - use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE); - if mi_unlikely(use_delayed) { - // unlikely: this only happens on the first concurrent free in a page that is in the full list - tfreex = mi_tf_set_delayed(tfree,MI_DELAYED_FREEING); - } - else { - // usual: directly add to page thread_free list - mi_block_set_next(page, block, mi_tf_block(tfree)); - tfreex = mi_tf_set_block(tfree,block); - } - } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); - - // If this was the first non-local free, we need to push it on the heap delayed free list instead - if mi_unlikely(use_delayed) { - // racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`) - mi_heap_t* const heap = (mi_heap_t*)(mi_atomic_load_acquire(&page->xheap)); //mi_page_heap(page); - mi_assert_internal(heap != NULL); - if (heap != NULL) { - // add to the delayed free list of this heap. 
(do this atomically as the lock only protects heap memory validity) - mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); - do { - mi_block_set_nextx(heap,block,dfree, heap->keys); - } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block)); - } - - // and reset the MI_DELAYED_FREEING flag - tfree = mi_atomic_load_relaxed(&page->xthread_free); - do { - tfreex = tfree; - mi_assert_internal(mi_tf_delayed(tfree) == MI_DELAYED_FREEING); - tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE); - } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); - } -} - -// Multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) -static mi_decl_noinline void mi_free_block_mt(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) -{ - // first see if the segment was abandoned and if we can reclaim it into our thread - if (mi_option_is_enabled(mi_option_abandoned_reclaim_on_free) && - #if MI_HUGE_PAGE_ABANDON - segment->page_kind != MI_PAGE_HUGE && - #endif - mi_atomic_load_relaxed(&segment->thread_id) == 0) - { - // the segment is abandoned, try to reclaim it into our heap - if (_mi_segment_attempt_reclaim(mi_heap_get_default(), segment)) { - mi_assert_internal(_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); - mi_free(block); // recursively free as now it will be a local free in our heap - return; - } - } - - // The padding check may access the non-thread-owned page for the key values. - // that is safe as these are constant and the page won't be freed (as the block is not freed yet). - mi_check_padding(page, block); - - // adjust stats (after padding check and potential recursive `mi_free` above) - mi_stat_free(page, block); // stat_free may access the padding - mi_track_free_size(block, mi_page_usable_size_of(page,block)); - - // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection - _mi_padding_shrink(page, block, sizeof(mi_block_t)); - - if (segment->page_kind == MI_PAGE_HUGE) { - #if MI_HUGE_PAGE_ABANDON - // huge page segments are always abandoned and can be freed immediately - mi_stat_huge_free(page); - _mi_segment_huge_page_free(segment, page, block); - return; - #else - // huge pages are special as they occupy the entire segment - // as these are large we reset the memory occupied by the page so it is available to other threads - // (as the owning thread needs to actually free the memory later). 
- _mi_segment_huge_page_reset(segment, page, block); - #endif - } - else { - #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading - memset(block, MI_DEBUG_FREED, mi_usable_size(block)); - #endif - } - - // and finally free the actual block by pushing it on the owning heap - // thread_delayed free list (or heap delayed free list) - mi_free_block_delayed_mt(page,block); -} - - -// ------------------------------------------------------ -// Usable size -// ------------------------------------------------------ - -// Bytes available in a block -mi_decl_noinline static size_t mi_page_usable_aligned_size_of(const mi_segment_t* segment, const mi_page_t* page, const void* p) mi_attr_noexcept { - const mi_block_t* block = _mi_page_ptr_unalign(segment, page, p); - const size_t size = mi_page_usable_size_of(page, block); - const ptrdiff_t adjust = (uint8_t*)p - (uint8_t*)block; - mi_assert_internal(adjust >= 0 && (size_t)adjust <= size); - return (size - adjust); -} - -static inline size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noexcept { - if (p == NULL) return 0; - const mi_segment_t* const segment = mi_checked_ptr_segment(p, msg); - const mi_page_t* const page = _mi_segment_page_of(segment, p); - if mi_likely(!mi_page_has_aligned(page)) { - const mi_block_t* block = (const mi_block_t*)p; - return mi_page_usable_size_of(page, block); - } - else { - // split out to separate routine for improved code generation - return mi_page_usable_aligned_size_of(segment, page, p); - } -} - -mi_decl_nodiscard size_t mi_usable_size(const void* p) mi_attr_noexcept { - return _mi_usable_size(p, "mi_usable_size"); -} - - -// ------------------------------------------------------ -// Allocation extensions -// ------------------------------------------------------ - -void mi_free_size(void* p, size_t size) mi_attr_noexcept { - MI_UNUSED_RELEASE(size); - mi_assert(p == NULL || size <= _mi_usable_size(p,"mi_free_size")); - mi_free(p); -} - -void mi_free_size_aligned(void* p, size_t size, size_t alignment) mi_attr_noexcept { - MI_UNUSED_RELEASE(alignment); - mi_assert(((uintptr_t)p % alignment) == 0); - mi_free_size(p,size); -} - -void mi_free_aligned(void* p, size_t alignment) mi_attr_noexcept { - MI_UNUSED_RELEASE(alignment); - mi_assert(((uintptr_t)p % alignment) == 0); - mi_free(p); -} - mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept { size_t total; if (mi_count_size_overflow(count,size,&total)) return NULL; diff --git a/src/free.c b/src/free.c new file mode 100644 index 00000000..7761cb6a --- /dev/null +++ b/src/free.c @@ -0,0 +1,519 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018-2024, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. 
+-----------------------------------------------------------------------------*/ +#if !defined(MI_IN_ALLOC_C) +#error "this file should be included from 'alloc.c' (so aliases can work from alloc-override)" +#endif + +// ------------------------------------------------------ +// Check for double free in secure and debug mode +// This is somewhat expensive so only enabled for secure mode 4 +// ------------------------------------------------------ + +#if (MI_ENCODE_FREELIST && (MI_SECURE>=4 || MI_DEBUG!=0)) +// linear check if the free list contains a specific element +static bool mi_list_contains(const mi_page_t* page, const mi_block_t* list, const mi_block_t* elem) { + while (list != NULL) { + if (elem==list) return true; + list = mi_block_next(page, list); + } + return false; +} + +static mi_decl_noinline bool mi_check_is_double_freex(const mi_page_t* page, const mi_block_t* block) { + // The decoded value is in the same page (or NULL). + // Walk the free lists to verify positively if it is already freed + if (mi_list_contains(page, page->free, block) || + mi_list_contains(page, page->local_free, block) || + mi_list_contains(page, mi_page_thread_free(page), block)) + { + _mi_error_message(EAGAIN, "double free detected of block %p with size %zu\n", block, mi_page_block_size(page)); + return true; + } + return false; +} + +#define mi_track_page(page,access) { size_t psize; void* pstart = _mi_page_start(_mi_page_segment(page),page,&psize); mi_track_mem_##access( pstart, psize); } + +static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) { + bool is_double_free = false; + mi_block_t* n = mi_block_nextx(page, block, page->keys); // pretend it is freed, and get the decoded first field + if (((uintptr_t)n & (MI_INTPTR_SIZE-1))==0 && // quick check: aligned pointer? + (n==NULL || mi_is_in_same_page(block, n))) // quick check: in same page or NULL? + { + // Suspicous: decoded value a in block is in the same page (or NULL) -- maybe a double free? + // (continue in separate function to improve code generation) + is_double_free = mi_check_is_double_freex(page, block); + } + return is_double_free; +} +#else +static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) { + MI_UNUSED(page); + MI_UNUSED(block); + return false; +} +#endif + +// --------------------------------------------------------------------------- +// Check for heap block overflow by setting up padding at the end of the block +// --------------------------------------------------------------------------- + +#if MI_PADDING // && !MI_TRACK_ENABLED +static bool mi_page_decode_padding(const mi_page_t* page, const mi_block_t* block, size_t* delta, size_t* bsize) { + *bsize = mi_page_usable_block_size(page); + const mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + *bsize); + mi_track_mem_defined(padding,sizeof(mi_padding_t)); + *delta = padding->delta; + uint32_t canary = padding->canary; + uintptr_t keys[2]; + keys[0] = page->keys[0]; + keys[1] = page->keys[1]; + bool ok = ((uint32_t)mi_ptr_encode(page,block,keys) == canary && *delta <= *bsize); + mi_track_mem_noaccess(padding,sizeof(mi_padding_t)); + return ok; +} + +// Return the exact usable size of a block. +static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) { + size_t bsize; + size_t delta; + bool ok = mi_page_decode_padding(page, block, &delta, &bsize); + mi_assert_internal(ok); mi_assert_internal(delta <= bsize); + return (ok ? 
bsize - delta : 0); +} + +// When a non-thread-local block is freed, it becomes part of the thread delayed free +// list that is freed later by the owning heap. If the exact usable size is too small to +// contain the pointer for the delayed list, then shrink the padding (by decreasing delta) +// so it will later not trigger an overflow error in `mi_free_block`. +void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) { + size_t bsize; + size_t delta; + bool ok = mi_page_decode_padding(page, block, &delta, &bsize); + mi_assert_internal(ok); + if (!ok || (bsize - delta) >= min_size) return; // usually already enough space + mi_assert_internal(bsize >= min_size); + if (bsize < min_size) return; // should never happen + size_t new_delta = (bsize - min_size); + mi_assert_internal(new_delta < bsize); + mi_padding_t* padding = (mi_padding_t*)((uint8_t*)block + bsize); + mi_track_mem_defined(padding,sizeof(mi_padding_t)); + padding->delta = (uint32_t)new_delta; + mi_track_mem_noaccess(padding,sizeof(mi_padding_t)); +} +#else +static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) { + MI_UNUSED(block); + return mi_page_usable_block_size(page); +} + +void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) { + MI_UNUSED(page); + MI_UNUSED(block); + MI_UNUSED(min_size); +} +#endif + +#if MI_PADDING && MI_PADDING_CHECK + +static bool mi_verify_padding(const mi_page_t* page, const mi_block_t* block, size_t* size, size_t* wrong) { + size_t bsize; + size_t delta; + bool ok = mi_page_decode_padding(page, block, &delta, &bsize); + *size = *wrong = bsize; + if (!ok) return false; + mi_assert_internal(bsize >= delta); + *size = bsize - delta; + if (!mi_page_is_huge(page)) { + uint8_t* fill = (uint8_t*)block + bsize - delta; + const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? 
MI_MAX_ALIGN_SIZE : delta); // check at most the first N padding bytes + mi_track_mem_defined(fill, maxpad); + for (size_t i = 0; i < maxpad; i++) { + if (fill[i] != MI_DEBUG_PADDING) { + *wrong = bsize - delta + i; + ok = false; + break; + } + } + mi_track_mem_noaccess(fill, maxpad); + } + return ok; +} + +static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { + size_t size; + size_t wrong; + if (!mi_verify_padding(page,block,&size,&wrong)) { + _mi_error_message(EFAULT, "buffer overflow in heap block %p of size %zu: write after %zu bytes\n", block, size, wrong ); + } +} + +#else + +static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { + MI_UNUSED(page); + MI_UNUSED(block); +} + +#endif + +// only maintain stats for smaller objects if requested +#if (MI_STAT>0) +static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { +#if (MI_STAT < 2) + MI_UNUSED(block); +#endif + mi_heap_t* const heap = mi_heap_get_default(); + const size_t bsize = mi_page_usable_block_size(page); +#if (MI_STAT>1) + const size_t usize = mi_page_usable_size_of(page, block); + mi_heap_stat_decrease(heap, malloc, usize); +#endif + if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { + mi_heap_stat_decrease(heap, normal, bsize); +#if (MI_STAT > 1) + mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], 1); +#endif + } +#if !MI_HUGE_PAGE_ABANDON + else { + const size_t bpsize = mi_page_block_size(page); + if (bpsize <= MI_HUGE_OBJ_SIZE_MAX) { + mi_heap_stat_decrease(heap, huge, bpsize); + } + else { + mi_heap_stat_decrease(heap, giant, bpsize); + } + } +#endif +} +#else +static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { + MI_UNUSED(page); MI_UNUSED(block); +} +#endif + +#if MI_HUGE_PAGE_ABANDON +#if (MI_STAT>0) +// maintain stats for huge objects +static void mi_stat_huge_free(const mi_page_t* page) { + mi_heap_t* const heap = mi_heap_get_default(); + const size_t bsize = mi_page_block_size(page); // to match stats in `page.c:mi_page_huge_alloc` + if (bsize <= MI_HUGE_OBJ_SIZE_MAX) { + mi_heap_stat_decrease(heap, huge, bsize); + } + else { + mi_heap_stat_decrease(heap, giant, bsize); + } +} +#else +static void mi_stat_huge_free(const mi_page_t* page) { + MI_UNUSED(page); +} +#endif +#endif + +// ------------------------------------------------------ +// Free +// ------------------------------------------------------ + +// forward declaration of multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) +static mi_decl_noinline void mi_free_block_mt(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); + +// regular free of a (thread local) block pointer +// fast path written carefully to prevent spilling on the stack +static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool check_full) +{ + // owning thread can free a block directly + if mi_unlikely(mi_check_is_double_free(page, block)) return; + mi_check_padding(page, block); + mi_stat_free(page, block); + #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN + memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); + #endif + mi_track_free_size(p, mi_page_usable_size_of(page,block)); // faster then mi_usable_size as we already know the page and that p is unaligned + mi_block_set_next(page, block, page->local_free); + page->local_free = block; + if mi_unlikely(--page->used == 0) { // using this expression generates better code than: page->used--; if (mi_page_all_free(page)) + _mi_page_retire(page); + } + else if mi_unlikely(check_full && 
mi_page_is_in_full(page)) { + _mi_page_unfull(page); + } +} + +// Adjust a block that was allocated aligned, to the actual start of the block in the page. +mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p) { + mi_assert_internal(page!=NULL && p!=NULL); + const size_t diff = (uint8_t*)p - _mi_page_start(segment, page, NULL); + const size_t adjust = (diff % mi_page_block_size(page)); + return (mi_block_t*)((uintptr_t)p - adjust); +} + +// free a local pointer +static void mi_decl_noinline mi_free_generic_local(const mi_segment_t* segment, mi_page_t* page, void* p) mi_attr_noexcept { + mi_block_t* const block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(segment, page, p) : (mi_block_t*)p); + mi_free_block_local(page, block, true); +} + +// free a pointer owned by another thread +static void mi_decl_noinline mi_free_generic_mt(const mi_segment_t* segment, mi_page_t* page, void* p) mi_attr_noexcept { + mi_block_t* const block = _mi_page_ptr_unalign(segment, page, p); // don't check `has_aligned` flag to avoid a race (issue #865) + mi_free_block_mt(segment, page, block); +} + +// generic free (for runtime integration) +void mi_decl_noinline _mi_free_generic(const mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept { + if (is_local) mi_free_generic_local(segment,page,p); + else mi_free_generic_mt(segment,page,p); +} + +// Get the segment data belonging to a pointer +// This is just a single `and` in release mode but does further checks in debug mode +// (and secure mode) to see if this was a valid pointer. +static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* msg) +{ + MI_UNUSED(msg); + mi_assert(p != NULL); + +#if (MI_DEBUG>0) + if mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0) { + _mi_error_message(EINVAL, "%s: invalid (unaligned) pointer: %p\n", msg, p); + return NULL; + } +#endif + + mi_segment_t* const segment = _mi_ptr_segment(p); + mi_assert_internal(segment != NULL); + +#if (MI_DEBUG>0) + if mi_unlikely(!mi_is_in_heap_region(p)) { + _mi_warning_message("%s: pointer might not point to a valid heap region: %p\n" + "(this may still be a valid very large allocation (over 64MiB))\n", msg, p); + if mi_likely(_mi_ptr_cookie(segment) == segment->cookie) { + _mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p); + } + } +#endif +#if (MI_DEBUG>0 || MI_SECURE>=4) + if mi_unlikely(_mi_ptr_cookie(segment) != segment->cookie) { + _mi_error_message(EINVAL, "%s: pointer does not point to a valid heap space: %p\n", msg, p); + return NULL; + } +#endif + + return segment; +} + +// Free a block +// Fast path written carefully to prevent register spilling on the stack +void mi_free(void* p) mi_attr_noexcept +{ + if mi_unlikely(p == NULL) return; + mi_segment_t* const segment = mi_checked_ptr_segment(p,"mi_free"); + const bool is_local= (_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); + mi_page_t* const page = _mi_segment_page_of(segment, p); + + if mi_likely(is_local) { // thread-local free? 
+ if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) + // thread-local, aligned, and not a full page + mi_block_t* const block = (mi_block_t*)p; + mi_free_block_local(page,block,false /* no need to check if the page is full */); + } + else { + // page is full or contains (inner) aligned blocks; use generic path + mi_free_generic_local(segment, page, p); + } + } + else { + // not thread-local; use generic path + mi_free_generic_mt(segment, page, p); + } +} + +// return true if successful +bool _mi_free_delayed_block(mi_block_t* block) { + // get segment and page + const mi_segment_t* const segment = _mi_ptr_segment(block); + mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie); + mi_assert_internal(_mi_thread_id() == segment->thread_id); + mi_page_t* const page = _mi_segment_page_of(segment, block); + + // Clear the no-delayed flag so delayed freeing is used again for this page. + // This must be done before collecting the free lists on this page -- otherwise + // some blocks may end up in the page `thread_free` list with no blocks in the + // heap `thread_delayed_free` list which may cause the page to be never freed! + // (it would only be freed if we happen to scan it in `mi_page_queue_find_free_ex`) + if (!_mi_page_try_use_delayed_free(page, MI_USE_DELAYED_FREE, false /* dont overwrite never delayed */)) { + return false; + } + + // collect all other non-local frees to ensure up-to-date `used` count + _mi_page_free_collect(page, false); + + // and free the block (possibly freeing the page as well since used is updated) + mi_free_block_local(page, block, true); + return true; +} + +// ------------------------------------------------------ +// Multi-threaded Free (`_mt`) +// ------------------------------------------------------ + +// Push a block that is owned by another thread on its page-local thread free +// list or it's heap delayed free list. Such blocks are later collected by +// the owning thread in `_mi_free_delayed_block`. +static void mi_decl_noinline mi_free_block_delayed_mt( mi_page_t* page, mi_block_t* block ) +{ + // Try to put the block on either the page-local thread free list, + // or the heap delayed free list (if this is the first non-local free in that page) + mi_thread_free_t tfreex; + bool use_delayed; + mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free); + do { + use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE); + if mi_unlikely(use_delayed) { + // unlikely: this only happens on the first concurrent free in a page that is in the full list + tfreex = mi_tf_set_delayed(tfree,MI_DELAYED_FREEING); + } + else { + // usual: directly add to page thread_free list + mi_block_set_next(page, block, mi_tf_block(tfree)); + tfreex = mi_tf_set_block(tfree,block); + } + } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); + + // If this was the first non-local free, we need to push it on the heap delayed free list instead + if mi_unlikely(use_delayed) { + // racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`) + mi_heap_t* const heap = (mi_heap_t*)(mi_atomic_load_acquire(&page->xheap)); //mi_page_heap(page); + mi_assert_internal(heap != NULL); + if (heap != NULL) { + // add to the delayed free list of this heap. 
(do this atomically as the lock only protects heap memory validity) + mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); + do { + mi_block_set_nextx(heap,block,dfree, heap->keys); + } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block)); + } + + // and reset the MI_DELAYED_FREEING flag + tfree = mi_atomic_load_relaxed(&page->xthread_free); + do { + tfreex = tfree; + mi_assert_internal(mi_tf_delayed(tfree) == MI_DELAYED_FREEING); + tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE); + } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); + } +} + +// Multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) +static mi_decl_noinline void mi_free_block_mt(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) +{ + // first see if the segment was abandoned and if we can reclaim it into our thread + if (mi_option_is_enabled(mi_option_abandoned_reclaim_on_free) && + #if MI_HUGE_PAGE_ABANDON + segment->page_kind != MI_PAGE_HUGE && + #endif + mi_atomic_load_relaxed(&segment->thread_id) == 0) + { + // the segment is abandoned, try to reclaim it into our heap + if (_mi_segment_attempt_reclaim(mi_heap_get_default(), segment)) { + mi_assert_internal(_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); + mi_free(block); // recursively free as now it will be a local free in our heap + return; + } + } + + // The padding check may access the non-thread-owned page for the key values. + // that is safe as these are constant and the page won't be freed (as the block is not freed yet). + mi_check_padding(page, block); + + // adjust stats (after padding check and potential recursive `mi_free` above) + mi_stat_free(page, block); // stat_free may access the padding + mi_track_free_size(block, mi_page_usable_size_of(page,block)); + + // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection + _mi_padding_shrink(page, block, sizeof(mi_block_t)); + + if (segment->page_kind == MI_PAGE_HUGE) { + #if MI_HUGE_PAGE_ABANDON + // huge page segments are always abandoned and can be freed immediately + mi_stat_huge_free(page); + _mi_segment_huge_page_free(segment, page, block); + return; + #else + // huge pages are special as they occupy the entire segment + // as these are large we reset the memory occupied by the page so it is available to other threads + // (as the owning thread needs to actually free the memory later). 
+ _mi_segment_huge_page_reset(segment, page, block); + #endif + } + else { + #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading + memset(block, MI_DEBUG_FREED, mi_usable_size(block)); + #endif + } + + // and finally free the actual block by pushing it on the owning heap + // thread_delayed free list (or heap delayed free list) + mi_free_block_delayed_mt(page,block); +} + + +// ------------------------------------------------------ +// Usable size +// ------------------------------------------------------ + +// Bytes available in a block +mi_decl_noinline static size_t mi_page_usable_aligned_size_of(const mi_segment_t* segment, const mi_page_t* page, const void* p) mi_attr_noexcept { + const mi_block_t* block = _mi_page_ptr_unalign(segment, page, p); + const size_t size = mi_page_usable_size_of(page, block); + const ptrdiff_t adjust = (uint8_t*)p - (uint8_t*)block; + mi_assert_internal(adjust >= 0 && (size_t)adjust <= size); + return (size - adjust); +} + +static inline size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noexcept { + if (p == NULL) return 0; + const mi_segment_t* const segment = mi_checked_ptr_segment(p, msg); + const mi_page_t* const page = _mi_segment_page_of(segment, p); + if mi_likely(!mi_page_has_aligned(page)) { + const mi_block_t* block = (const mi_block_t*)p; + return mi_page_usable_size_of(page, block); + } + else { + // split out to separate routine for improved code generation + return mi_page_usable_aligned_size_of(segment, page, p); + } +} + +mi_decl_nodiscard size_t mi_usable_size(const void* p) mi_attr_noexcept { + return _mi_usable_size(p, "mi_usable_size"); +} + + +// ------------------------------------------------------ +// Allocation extensions +// ------------------------------------------------------ + +void mi_free_size(void* p, size_t size) mi_attr_noexcept { + MI_UNUSED_RELEASE(size); + mi_assert(p == NULL || size <= _mi_usable_size(p,"mi_free_size")); + mi_free(p); +} + +void mi_free_size_aligned(void* p, size_t size, size_t alignment) mi_attr_noexcept { + MI_UNUSED_RELEASE(alignment); + mi_assert(((uintptr_t)p % alignment) == 0); + mi_free_size(p,size); +} + +void mi_free_aligned(void* p, size_t alignment) mi_attr_noexcept { + MI_UNUSED_RELEASE(alignment); + mi_assert(((uintptr_t)p % alignment) == 0); + mi_free(p); +} From cc809b0cd4b99a564b00224cb2e66e4d881f62cd Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 18 Mar 2024 01:40:03 -0700 Subject: [PATCH 05/28] take 16 bits from used field to create a fast unalign path --- include/mimalloc/internal.h | 2 +- include/mimalloc/types.h | 24 +++++++++++++----------- src/alloc.c | 2 +- src/free.c | 17 +++++++++++------ src/init.c | 18 ++++++++++-------- src/page.c | 12 +++++++++++- 6 files changed, 47 insertions(+), 28 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 96f3922e..72544c3d 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -202,7 +202,7 @@ void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, siz void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept; mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p); bool _mi_free_delayed_block(mi_block_t* block); -void _mi_free_generic(const mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept; // for runtime integration +void _mi_free_generic(mi_segment_t* segment, mi_page_t* 
page, bool is_local, void* p) mi_attr_noexcept; // for runtime integration void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size); // "libc.c" diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 049e68e7..c624e5b4 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -273,7 +273,7 @@ typedef uintptr_t mi_thread_free_t; // and 12 are still good for address calculation) // - To limit the structure size, the `xblock_size` is 32-bits only; for // blocks > MI_HUGE_BLOCK_SIZE the size is determined from the segment page size -// - `thread_free` uses the bottom bits as a delayed-free flags to optimize +// - `xthread_free` uses the bottom bits as a delayed-free flags to optimize // concurrent frees where only the first concurrent free adds to the owning // heap `thread_delayed_free` list (see `alloc.c:mi_free_block_mt`). // The invariant is that no-delayed-free is only set if there is @@ -295,19 +295,21 @@ typedef struct mi_page_s { uint8_t retire_expire:7; // expiration count for retired blocks mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) - uint32_t used; // number of blocks in use (including blocks in `thread_free`) - uint32_t xblock_size; // size available in each block (always `>0`) mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) - + uint16_t used; // number of blocks in use (including blocks in `thread_free`) + uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift == block_size)` (used for quick block start finding for aligned pointers) + uint8_t block_offset_adj; // if not zero, then `(page_start - (uint8_t*)page - 8*(block_offset_adj-1)) % block_size == 0)` (used for quick block start finding for aligned pointers) + uint32_t xblock_size; // size available in each block (always `>0`) + #if (MI_ENCODE_FREELIST || MI_PADDING) uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`) or padding canary - #endif + #endif _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads _Atomic(uintptr_t) xheap; - - struct mi_page_s* next; // next page owned by this thread with the same `block_size` - struct mi_page_s* prev; // previous page owned by this thread with the same `block_size` + + struct mi_page_s* next; // next page owned by the heap with the same `block_size` + struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` } mi_page_t; @@ -386,8 +388,8 @@ typedef struct mi_segment_s { uintptr_t cookie; // verify addresses in secure mode: `_mi_ptr_cookie(segment) == segment->cookie` // layout like this to optimize access in `mi_free` - size_t page_shift; // `1 << page_shift` == the page sizes == `page->block_size * page->reserved` (unless the first page, then `-segment_info_size`). _Atomic(mi_threadid_t) thread_id; // unique id of the thread owning this segment + size_t page_shift; // `1 << page_shift` == the page sizes == `page->block_size * page->reserved` (unless the first page, then `-segment_info_size`). mi_page_kind_t page_kind; // kind of pages: small, medium, large, or huge mi_page_t pages[1]; // up to `MI_SMALL_PAGES_PER_SEGMENT` pages } mi_segment_t; @@ -446,8 +448,6 @@ typedef struct mi_padding_s { // A heap owns a set of pages. 
struct mi_heap_s { mi_tld_t* tld; - mi_page_t* pages_free_direct[MI_PAGES_DIRECT]; // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size. - mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin") _Atomic(mi_block_t*) thread_delayed_free; mi_threadid_t thread_id; // thread this heap belongs too mi_arena_id_t arena_id; // arena id if the heap belongs to a specific arena (or 0) @@ -459,6 +459,8 @@ struct mi_heap_s { size_t page_retired_max; // largest retired index into the `pages` array. mi_heap_t* next; // list of heaps per thread bool no_reclaim; // `true` if this heap should not reclaim abandoned pages + mi_page_t* pages_free_direct[MI_PAGES_DIRECT]; // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size. + mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin") }; diff --git a/src/alloc.c b/src/alloc.c index 76d68d13..3a38a226 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -37,8 +37,8 @@ extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t siz } mi_assert_internal(block != NULL && _mi_ptr_page(block) == page); // pop from the free list - page->used++; page->free = mi_block_next(page, block); + page->used++; mi_assert_internal(page->free == NULL || _mi_ptr_page(page->free) == page); #if MI_DEBUG>3 if (page->free_is_zero) { diff --git a/src/free.c b/src/free.c index 7761cb6a..d0fcf133 100644 --- a/src/free.c +++ b/src/free.c @@ -249,25 +249,30 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool // Adjust a block that was allocated aligned, to the actual start of the block in the page. mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p) { mi_assert_internal(page!=NULL && p!=NULL); - const size_t diff = (uint8_t*)p - _mi_page_start(segment, page, NULL); - const size_t adjust = (diff % mi_page_block_size(page)); + const size_t diff = (mi_likely(page->block_offset_adj != 0) + ? (uint8_t*)p - (uint8_t*)page - 8*(page->block_offset_adj-1) + : (uint8_t*)p - _mi_page_start(segment, page, NULL)); + + const size_t adjust = (mi_likely(page->block_size_shift != 0) + ? diff & (((size_t)1 << page->block_size_shift) - 1) + : diff % mi_page_block_size(page)); return (mi_block_t*)((uintptr_t)p - adjust); } // free a local pointer -static void mi_decl_noinline mi_free_generic_local(const mi_segment_t* segment, mi_page_t* page, void* p) mi_attr_noexcept { +static void mi_decl_noinline mi_free_generic_local(mi_segment_t* segment, mi_page_t* page, void* p) mi_attr_noexcept { mi_block_t* const block = (mi_page_has_aligned(page) ? 
_mi_page_ptr_unalign(segment, page, p) : (mi_block_t*)p); mi_free_block_local(page, block, true); } // free a pointer owned by another thread -static void mi_decl_noinline mi_free_generic_mt(const mi_segment_t* segment, mi_page_t* page, void* p) mi_attr_noexcept { +static void mi_decl_noinline mi_free_generic_mt(mi_segment_t* segment, mi_page_t* page, void* p) mi_attr_noexcept { mi_block_t* const block = _mi_page_ptr_unalign(segment, page, p); // don't check `has_aligned` flag to avoid a race (issue #865) mi_free_block_mt(segment, page, block); } // generic free (for runtime integration) -void mi_decl_noinline _mi_free_generic(const mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept { +void mi_decl_noinline _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept { if (is_local) mi_free_generic_local(segment,page,p); else mi_free_generic_mt(segment,page,p); } @@ -469,7 +474,7 @@ static mi_decl_noinline void mi_free_block_mt(mi_segment_t* segment, mi_page_t* // ------------------------------------------------------ // Bytes available in a block -mi_decl_noinline static size_t mi_page_usable_aligned_size_of(const mi_segment_t* segment, const mi_page_t* page, const void* p) mi_attr_noexcept { +static size_t mi_decl_noinline mi_page_usable_aligned_size_of(const mi_segment_t* segment, const mi_page_t* page, const void* p) mi_attr_noexcept { const mi_block_t* block = _mi_page_ptr_unalign(segment, page, p); const size_t size = mi_page_usable_size_of(page, block); const ptrdiff_t adjust = (uint8_t*)p - (uint8_t*)block; diff --git a/src/init.c b/src/init.c index 7ec6e01e..11471760 100644 --- a/src/init.c +++ b/src/init.c @@ -21,9 +21,11 @@ const mi_page_t _mi_page_empty = { false, // is_zero 0, // retire_expire NULL, // free - 0, // used - 0, // xblock_size NULL, // local_free + 0, // used + 0, // block size shift + 0, // block offset adj + 0, // xblock_size #if (MI_PADDING || MI_ENCODE_FREELIST) { 0, 0 }, #endif @@ -93,8 +95,6 @@ const mi_page_t _mi_page_empty = { mi_decl_cache_align const mi_heap_t _mi_heap_empty = { NULL, - MI_SMALL_PAGES_EMPTY, - MI_PAGE_QUEUES_EMPTY, MI_ATOMIC_VAR_INIT(NULL), 0, // tid 0, // cookie @@ -104,7 +104,9 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = { 0, // page count MI_BIN_FULL, 0, // page retired min/max NULL, // next - false + false, + MI_SMALL_PAGES_EMPTY, + MI_PAGE_QUEUES_EMPTY }; @@ -130,8 +132,6 @@ static mi_tld_t tld_main = { mi_heap_t _mi_heap_main = { &tld_main, - MI_SMALL_PAGES_EMPTY, - MI_PAGE_QUEUES_EMPTY, MI_ATOMIC_VAR_INIT(NULL), 0, // thread id 0, // initial cookie @@ -141,7 +141,9 @@ mi_heap_t _mi_heap_main = { 0, // page count MI_BIN_FULL, 0, // page retired min/max NULL, // next heap - false // can reclaim + false, // can reclaim + MI_SMALL_PAGES_EMPTY, + MI_PAGE_QUEUES_EMPTY }; bool _mi_process_is_initialized = false; // set to `true` in `mi_process_init`. diff --git a/src/page.c b/src/page.c index 5fefc3b5..5930a430 100644 --- a/src/page.c +++ b/src/page.c @@ -660,7 +660,6 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi mi_page_set_heap(page, heap); size_t page_size; const void* page_start = _mi_segment_page_start(segment, page, block_size, &page_size, NULL); - MI_UNUSED(page_start); mi_track_mem_noaccess(page_start,page_size); page->xblock_size = (block_size < MI_HUGE_BLOCK_SIZE ? 
(uint32_t)block_size : MI_HUGE_BLOCK_SIZE); mi_assert_internal(page_size / block_size < (1L<<16)); @@ -677,6 +676,15 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi mi_assert_expensive(!page->is_zero_init || mi_mem_is_zero(page_start, page_size)); } #endif + if (_mi_is_power_of_two(block_size) && block_size > 0) { + page->block_size_shift = (uint32_t)(mi_ctz((uintptr_t)block_size)); + } + const ptrdiff_t start_offset = (uint8_t*)page_start - (uint8_t*)page; + const ptrdiff_t start_adjust = start_offset % block_size; + if (start_offset >= 0 && (start_adjust % 8) == 0 && (start_adjust/8) < 255) { + page->block_offset_adj = (uint8_t)((start_adjust/8) + 1); + } + mi_assert_internal(page->capacity == 0); mi_assert_internal(page->free == NULL); @@ -690,6 +698,8 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi mi_assert_internal(page->keys[0] != 0); mi_assert_internal(page->keys[1] != 0); #endif + mi_assert_internal(page->block_size_shift == 0 || (block_size == (1UL << page->block_size_shift))); + mi_assert_internal(page->block_offset_adj == 0 || (((uint8_t*)page_start - (uint8_t*)page - 8*(page->block_offset_adj-1))) % block_size == 0); mi_assert_expensive(mi_page_is_valid_init(page)); // initialize an initial free list From d08b4219e9bdbc83c92cd50e1d54b24a939a2271 Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 18 Mar 2024 03:32:06 -0700 Subject: [PATCH 06/28] improve used decrement code gen --- src/free.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/free.c b/src/free.c index d0fcf133..6cdac123 100644 --- a/src/free.c +++ b/src/free.c @@ -238,7 +238,9 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool mi_track_free_size(p, mi_page_usable_size_of(page,block)); // faster then mi_usable_size as we already know the page and that p is unaligned mi_block_set_next(page, block, page->local_free); page->local_free = block; - if mi_unlikely(--page->used == 0) { // using this expression generates better code than: page->used--; if (mi_page_all_free(page)) + const uint32_t used = page->used - 1; + page->used = used; + if mi_unlikely(used == 0) { // generates better code than: --page->used == 0 _mi_page_retire(page); } else if mi_unlikely(check_full && mi_page_is_in_full(page)) { From 34d37fa2048f3bd65d017bf4f295b3b5b97ae98e Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 18 Mar 2024 03:47:54 -0700 Subject: [PATCH 07/28] nicer organisation of free.c --- src/free.c | 431 +++++++++++++++++++++++++++-------------------------- 1 file changed, 222 insertions(+), 209 deletions(-) diff --git a/src/free.c b/src/free.c index 6cdac123..e7547aa3 100644 --- a/src/free.c +++ b/src/free.c @@ -8,214 +8,12 @@ terms of the MIT license. 
A copy of the license can be found in the file #error "this file should be included from 'alloc.c' (so aliases can work from alloc-override)" #endif -// ------------------------------------------------------ -// Check for double free in secure and debug mode -// This is somewhat expensive so only enabled for secure mode 4 -// ------------------------------------------------------ +// forward declarations +static void mi_check_padding(const mi_page_t* page, const mi_block_t* block); +static bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block); +static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block); +static void mi_stat_free(const mi_page_t* page, const mi_block_t* block); -#if (MI_ENCODE_FREELIST && (MI_SECURE>=4 || MI_DEBUG!=0)) -// linear check if the free list contains a specific element -static bool mi_list_contains(const mi_page_t* page, const mi_block_t* list, const mi_block_t* elem) { - while (list != NULL) { - if (elem==list) return true; - list = mi_block_next(page, list); - } - return false; -} - -static mi_decl_noinline bool mi_check_is_double_freex(const mi_page_t* page, const mi_block_t* block) { - // The decoded value is in the same page (or NULL). - // Walk the free lists to verify positively if it is already freed - if (mi_list_contains(page, page->free, block) || - mi_list_contains(page, page->local_free, block) || - mi_list_contains(page, mi_page_thread_free(page), block)) - { - _mi_error_message(EAGAIN, "double free detected of block %p with size %zu\n", block, mi_page_block_size(page)); - return true; - } - return false; -} - -#define mi_track_page(page,access) { size_t psize; void* pstart = _mi_page_start(_mi_page_segment(page),page,&psize); mi_track_mem_##access( pstart, psize); } - -static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) { - bool is_double_free = false; - mi_block_t* n = mi_block_nextx(page, block, page->keys); // pretend it is freed, and get the decoded first field - if (((uintptr_t)n & (MI_INTPTR_SIZE-1))==0 && // quick check: aligned pointer? - (n==NULL || mi_is_in_same_page(block, n))) // quick check: in same page or NULL? - { - // Suspicous: decoded value a in block is in the same page (or NULL) -- maybe a double free? - // (continue in separate function to improve code generation) - is_double_free = mi_check_is_double_freex(page, block); - } - return is_double_free; -} -#else -static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) { - MI_UNUSED(page); - MI_UNUSED(block); - return false; -} -#endif - -// --------------------------------------------------------------------------- -// Check for heap block overflow by setting up padding at the end of the block -// --------------------------------------------------------------------------- - -#if MI_PADDING // && !MI_TRACK_ENABLED -static bool mi_page_decode_padding(const mi_page_t* page, const mi_block_t* block, size_t* delta, size_t* bsize) { - *bsize = mi_page_usable_block_size(page); - const mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + *bsize); - mi_track_mem_defined(padding,sizeof(mi_padding_t)); - *delta = padding->delta; - uint32_t canary = padding->canary; - uintptr_t keys[2]; - keys[0] = page->keys[0]; - keys[1] = page->keys[1]; - bool ok = ((uint32_t)mi_ptr_encode(page,block,keys) == canary && *delta <= *bsize); - mi_track_mem_noaccess(padding,sizeof(mi_padding_t)); - return ok; -} - -// Return the exact usable size of a block. 
-static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) { - size_t bsize; - size_t delta; - bool ok = mi_page_decode_padding(page, block, &delta, &bsize); - mi_assert_internal(ok); mi_assert_internal(delta <= bsize); - return (ok ? bsize - delta : 0); -} - -// When a non-thread-local block is freed, it becomes part of the thread delayed free -// list that is freed later by the owning heap. If the exact usable size is too small to -// contain the pointer for the delayed list, then shrink the padding (by decreasing delta) -// so it will later not trigger an overflow error in `mi_free_block`. -void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) { - size_t bsize; - size_t delta; - bool ok = mi_page_decode_padding(page, block, &delta, &bsize); - mi_assert_internal(ok); - if (!ok || (bsize - delta) >= min_size) return; // usually already enough space - mi_assert_internal(bsize >= min_size); - if (bsize < min_size) return; // should never happen - size_t new_delta = (bsize - min_size); - mi_assert_internal(new_delta < bsize); - mi_padding_t* padding = (mi_padding_t*)((uint8_t*)block + bsize); - mi_track_mem_defined(padding,sizeof(mi_padding_t)); - padding->delta = (uint32_t)new_delta; - mi_track_mem_noaccess(padding,sizeof(mi_padding_t)); -} -#else -static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) { - MI_UNUSED(block); - return mi_page_usable_block_size(page); -} - -void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) { - MI_UNUSED(page); - MI_UNUSED(block); - MI_UNUSED(min_size); -} -#endif - -#if MI_PADDING && MI_PADDING_CHECK - -static bool mi_verify_padding(const mi_page_t* page, const mi_block_t* block, size_t* size, size_t* wrong) { - size_t bsize; - size_t delta; - bool ok = mi_page_decode_padding(page, block, &delta, &bsize); - *size = *wrong = bsize; - if (!ok) return false; - mi_assert_internal(bsize >= delta); - *size = bsize - delta; - if (!mi_page_is_huge(page)) { - uint8_t* fill = (uint8_t*)block + bsize - delta; - const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? 
MI_MAX_ALIGN_SIZE : delta); // check at most the first N padding bytes - mi_track_mem_defined(fill, maxpad); - for (size_t i = 0; i < maxpad; i++) { - if (fill[i] != MI_DEBUG_PADDING) { - *wrong = bsize - delta + i; - ok = false; - break; - } - } - mi_track_mem_noaccess(fill, maxpad); - } - return ok; -} - -static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { - size_t size; - size_t wrong; - if (!mi_verify_padding(page,block,&size,&wrong)) { - _mi_error_message(EFAULT, "buffer overflow in heap block %p of size %zu: write after %zu bytes\n", block, size, wrong ); - } -} - -#else - -static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { - MI_UNUSED(page); - MI_UNUSED(block); -} - -#endif - -// only maintain stats for smaller objects if requested -#if (MI_STAT>0) -static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { -#if (MI_STAT < 2) - MI_UNUSED(block); -#endif - mi_heap_t* const heap = mi_heap_get_default(); - const size_t bsize = mi_page_usable_block_size(page); -#if (MI_STAT>1) - const size_t usize = mi_page_usable_size_of(page, block); - mi_heap_stat_decrease(heap, malloc, usize); -#endif - if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { - mi_heap_stat_decrease(heap, normal, bsize); -#if (MI_STAT > 1) - mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], 1); -#endif - } -#if !MI_HUGE_PAGE_ABANDON - else { - const size_t bpsize = mi_page_block_size(page); - if (bpsize <= MI_HUGE_OBJ_SIZE_MAX) { - mi_heap_stat_decrease(heap, huge, bpsize); - } - else { - mi_heap_stat_decrease(heap, giant, bpsize); - } - } -#endif -} -#else -static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { - MI_UNUSED(page); MI_UNUSED(block); -} -#endif - -#if MI_HUGE_PAGE_ABANDON -#if (MI_STAT>0) -// maintain stats for huge objects -static void mi_stat_huge_free(const mi_page_t* page) { - mi_heap_t* const heap = mi_heap_get_default(); - const size_t bsize = mi_page_block_size(page); // to match stats in `page.c:mi_page_huge_alloc` - if (bsize <= MI_HUGE_OBJ_SIZE_MAX) { - mi_heap_stat_decrease(heap, huge, bsize); - } - else { - mi_heap_stat_decrease(heap, giant, bsize); - } -} -#else -static void mi_stat_huge_free(const mi_page_t* page) { - MI_UNUSED(page); -} -#endif -#endif // ------------------------------------------------------ // Free @@ -417,8 +215,12 @@ static void mi_decl_noinline mi_free_block_delayed_mt( mi_page_t* page, mi_block } } +#if MI_HUGE_PAGE_ABANDON +static void mi_stat_huge_free(const mi_page_t* page); +#endif + // Multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) -static mi_decl_noinline void mi_free_block_mt(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) +static void mi_decl_noinline mi_free_block_mt(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) { // first see if the segment was abandoned and if we can reclaim it into our thread if (mi_option_is_enabled(mi_option_abandoned_reclaim_on_free) && @@ -504,7 +306,7 @@ mi_decl_nodiscard size_t mi_usable_size(const void* p) mi_attr_noexcept { // ------------------------------------------------------ -// Allocation extensions +// Free variants // ------------------------------------------------------ void mi_free_size(void* p, size_t size) mi_attr_noexcept { @@ -524,3 +326,214 @@ void mi_free_aligned(void* p, size_t alignment) mi_attr_noexcept { mi_assert(((uintptr_t)p % alignment) == 0); mi_free(p); } + + +// ------------------------------------------------------ +// Check for double free in secure and 
debug mode
+// This is somewhat expensive so only enabled for secure mode 4
+// ------------------------------------------------------
+
+#if (MI_ENCODE_FREELIST && (MI_SECURE>=4 || MI_DEBUG!=0))
+// linear check if the free list contains a specific element
+static bool mi_list_contains(const mi_page_t* page, const mi_block_t* list, const mi_block_t* elem) {
+  while (list != NULL) {
+    if (elem==list) return true;
+    list = mi_block_next(page, list);
+  }
+  return false;
+}
+
+static mi_decl_noinline bool mi_check_is_double_freex(const mi_page_t* page, const mi_block_t* block) {
+  // The decoded value is in the same page (or NULL).
+  // Walk the free lists to verify positively if it is already freed
+  if (mi_list_contains(page, page->free, block) ||
+      mi_list_contains(page, page->local_free, block) ||
+      mi_list_contains(page, mi_page_thread_free(page), block))
+  {
+    _mi_error_message(EAGAIN, "double free detected of block %p with size %zu\n", block, mi_page_block_size(page));
+    return true;
+  }
+  return false;
+}
+
+#define mi_track_page(page,access) { size_t psize; void* pstart = _mi_page_start(_mi_page_segment(page),page,&psize); mi_track_mem_##access( pstart, psize); }
+
+static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) {
+  bool is_double_free = false;
+  mi_block_t* n = mi_block_nextx(page, block, page->keys); // pretend it is freed, and get the decoded first field
+  if (((uintptr_t)n & (MI_INTPTR_SIZE-1))==0 &&  // quick check: aligned pointer?
+      (n==NULL || mi_is_in_same_page(block, n))) // quick check: in same page or NULL?
+  {
+    // Suspicious: the decoded value in the block is in the same page (or NULL) -- maybe a double free?
+    // (continue in separate function to improve code generation)
+    is_double_free = mi_check_is_double_freex(page, block);
+  }
+  return is_double_free;
+}
+#else
+static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) {
+  MI_UNUSED(page);
+  MI_UNUSED(block);
+  return false;
+}
+#endif
+
+
+// ---------------------------------------------------------------------------
+// Check for heap block overflow by setting up padding at the end of the block
+// ---------------------------------------------------------------------------
+
+#if MI_PADDING // && !MI_TRACK_ENABLED
+static bool mi_page_decode_padding(const mi_page_t* page, const mi_block_t* block, size_t* delta, size_t* bsize) {
+  *bsize = mi_page_usable_block_size(page);
+  const mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + *bsize);
+  mi_track_mem_defined(padding,sizeof(mi_padding_t));
+  *delta = padding->delta;
+  uint32_t canary = padding->canary;
+  uintptr_t keys[2];
+  keys[0] = page->keys[0];
+  keys[1] = page->keys[1];
+  bool ok = ((uint32_t)mi_ptr_encode(page,block,keys) == canary && *delta <= *bsize);
+  mi_track_mem_noaccess(padding,sizeof(mi_padding_t));
+  return ok;
+}
+
+// Return the exact usable size of a block.
+static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) {
+  size_t bsize;
+  size_t delta;
+  bool ok = mi_page_decode_padding(page, block, &delta, &bsize);
+  mi_assert_internal(ok); mi_assert_internal(delta <= bsize);
+  return (ok ? bsize - delta : 0);
+}
+
+// When a non-thread-local block is freed, it becomes part of the thread delayed free
+// list that is freed later by the owning heap. 
If the exact usable size is too small to +// contain the pointer for the delayed list, then shrink the padding (by decreasing delta) +// so it will later not trigger an overflow error in `mi_free_block`. +void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) { + size_t bsize; + size_t delta; + bool ok = mi_page_decode_padding(page, block, &delta, &bsize); + mi_assert_internal(ok); + if (!ok || (bsize - delta) >= min_size) return; // usually already enough space + mi_assert_internal(bsize >= min_size); + if (bsize < min_size) return; // should never happen + size_t new_delta = (bsize - min_size); + mi_assert_internal(new_delta < bsize); + mi_padding_t* padding = (mi_padding_t*)((uint8_t*)block + bsize); + mi_track_mem_defined(padding,sizeof(mi_padding_t)); + padding->delta = (uint32_t)new_delta; + mi_track_mem_noaccess(padding,sizeof(mi_padding_t)); +} +#else +static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) { + MI_UNUSED(block); + return mi_page_usable_block_size(page); +} + +void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) { + MI_UNUSED(page); + MI_UNUSED(block); + MI_UNUSED(min_size); +} +#endif + +#if MI_PADDING && MI_PADDING_CHECK + +static bool mi_verify_padding(const mi_page_t* page, const mi_block_t* block, size_t* size, size_t* wrong) { + size_t bsize; + size_t delta; + bool ok = mi_page_decode_padding(page, block, &delta, &bsize); + *size = *wrong = bsize; + if (!ok) return false; + mi_assert_internal(bsize >= delta); + *size = bsize - delta; + if (!mi_page_is_huge(page)) { + uint8_t* fill = (uint8_t*)block + bsize - delta; + const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? MI_MAX_ALIGN_SIZE : delta); // check at most the first N padding bytes + mi_track_mem_defined(fill, maxpad); + for (size_t i = 0; i < maxpad; i++) { + if (fill[i] != MI_DEBUG_PADDING) { + *wrong = bsize - delta + i; + ok = false; + break; + } + } + mi_track_mem_noaccess(fill, maxpad); + } + return ok; +} + +static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { + size_t size; + size_t wrong; + if (!mi_verify_padding(page,block,&size,&wrong)) { + _mi_error_message(EFAULT, "buffer overflow in heap block %p of size %zu: write after %zu bytes\n", block, size, wrong ); + } +} + +#else + +static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { + MI_UNUSED(page); + MI_UNUSED(block); +} + +#endif + +// only maintain stats for smaller objects if requested +#if (MI_STAT>0) +static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { +#if (MI_STAT < 2) + MI_UNUSED(block); +#endif + mi_heap_t* const heap = mi_heap_get_default(); + const size_t bsize = mi_page_usable_block_size(page); +#if (MI_STAT>1) + const size_t usize = mi_page_usable_size_of(page, block); + mi_heap_stat_decrease(heap, malloc, usize); +#endif + if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { + mi_heap_stat_decrease(heap, normal, bsize); +#if (MI_STAT > 1) + mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], 1); +#endif + } +#if !MI_HUGE_PAGE_ABANDON + else { + const size_t bpsize = mi_page_block_size(page); + if (bpsize <= MI_HUGE_OBJ_SIZE_MAX) { + mi_heap_stat_decrease(heap, huge, bpsize); + } + else { + mi_heap_stat_decrease(heap, giant, bpsize); + } + } +#endif +} +#else +static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { + MI_UNUSED(page); MI_UNUSED(block); +} +#endif + +#if MI_HUGE_PAGE_ABANDON +#if (MI_STAT>0) +// maintain stats for huge 
objects +static void mi_stat_huge_free(const mi_page_t* page) { + mi_heap_t* const heap = mi_heap_get_default(); + const size_t bsize = mi_page_block_size(page); // to match stats in `page.c:mi_page_huge_alloc` + if (bsize <= MI_HUGE_OBJ_SIZE_MAX) { + mi_heap_stat_decrease(heap, huge, bsize); + } + else { + mi_heap_stat_decrease(heap, giant, bsize); + } +} +#else +static void mi_stat_huge_free(const mi_page_t* page) { + MI_UNUSED(page); +} +#endif +#endif From 9085596eab602d868129ec44b62a8f6ea7c40d16 Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 23 Mar 2024 08:57:29 -0700 Subject: [PATCH 08/28] update comment --- include/mimalloc/types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index c624e5b4..7ab0a325 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -297,8 +297,8 @@ typedef struct mi_page_s { mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) uint16_t used; // number of blocks in use (including blocks in `thread_free`) - uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift == block_size)` (used for quick block start finding for aligned pointers) - uint8_t block_offset_adj; // if not zero, then `(page_start - (uint8_t*)page - 8*(block_offset_adj-1)) % block_size == 0)` (used for quick block start finding for aligned pointers) + uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift == block_size)` (used for fast path in `free.c:_mi_page_ptr_unalign`) + uint8_t block_offset_adj; // if not zero, then `(page_start - (uint8_t*)page - 8*(block_offset_adj-1)) % block_size == 0)` (used for fast path in `free.c:_mi_page_ptr_unalign`) uint32_t xblock_size; // size available in each block (always `>0`) #if (MI_ENCODE_FREELIST || MI_PADDING) From 60c4a0fe5608603c2a7c0d45eaa1c73ca830c275 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sun, 24 Mar 2024 08:10:35 -0700 Subject: [PATCH 09/28] fix compilation warnings for new uint16_t size for used field --- include/mimalloc/internal.h | 10 ++--- include/mimalloc/types.h | 12 +++--- src/free.c | 75 ++++++++++++++++++++++--------------- src/init.c | 16 ++++---- src/page.c | 8 ++-- 5 files changed, 68 insertions(+), 53 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 72544c3d..3aad1ba4 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -30,7 +30,7 @@ terms of the MIT license. A copy of the license can be found in the file #define mi_decl_noinline __declspec(noinline) #define mi_decl_thread __declspec(thread) #define mi_decl_cache_align __declspec(align(MI_CACHE_LINE)) -#define mi_decl_weak +#define mi_decl_weak #elif (defined(__GNUC__) && (__GNUC__ >= 3)) || defined(__clang__) // includes clang and icc #define mi_decl_noinline __attribute__((noinline)) #define mi_decl_thread __thread @@ -40,7 +40,7 @@ terms of the MIT license. 
A copy of the license can be found in the file #define mi_decl_noinline #define mi_decl_thread __thread // hope for the best :-) #define mi_decl_cache_align -#define mi_decl_weak +#define mi_decl_weak #endif #if defined(__EMSCRIPTEN__) && !defined(__wasi__) @@ -91,7 +91,7 @@ void _mi_thread_data_collect(void); // os.c void _mi_os_init(void); // called from process init -void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* stats); +void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* stats); void _mi_os_free(void* p, size_t size, mi_memid_t memid, mi_stats_t* stats); void _mi_os_free_ex(void* p, size_t size, bool still_committed, mi_memid_t memid, mi_stats_t* stats); @@ -132,8 +132,8 @@ void _mi_arena_segment_mark_abandoned(mi_segment_t* segment); size_t _mi_arena_segment_abandoned_count(void); typedef struct mi_arena_field_cursor_s { // abstract - mi_arena_id_t start; - int count; + mi_arena_id_t start; + int count; size_t bitmap_idx; } mi_arena_field_cursor_t; void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_arena_field_cursor_t* current); diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 7ab0a325..ad0aabe9 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -300,14 +300,14 @@ typedef struct mi_page_s { uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift == block_size)` (used for fast path in `free.c:_mi_page_ptr_unalign`) uint8_t block_offset_adj; // if not zero, then `(page_start - (uint8_t*)page - 8*(block_offset_adj-1)) % block_size == 0)` (used for fast path in `free.c:_mi_page_ptr_unalign`) uint32_t xblock_size; // size available in each block (always `>0`) - + #if (MI_ENCODE_FREELIST || MI_PADDING) uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`) or padding canary - #endif + #endif _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads _Atomic(uintptr_t) xheap; - + struct mi_page_s* next; // next page owned by the heap with the same `block_size` struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` } mi_page_t; @@ -373,7 +373,7 @@ typedef struct mi_segment_s { bool allow_decommit; bool allow_purge; size_t segment_size; // for huge pages this may be different from `MI_SEGMENT_SIZE` - + // segment fields struct mi_segment_s* next; // must be the first segment field after abandoned_next -- see `segment.c:segment_init` struct mi_segment_s* prev; @@ -450,7 +450,7 @@ struct mi_heap_s { mi_tld_t* tld; _Atomic(mi_block_t*) thread_delayed_free; mi_threadid_t thread_id; // thread this heap belongs too - mi_arena_id_t arena_id; // arena id if the heap belongs to a specific arena (or 0) + mi_arena_id_t arena_id; // arena id if the heap belongs to a specific arena (or 0) uintptr_t cookie; // random cookie to verify pointers (see `_mi_ptr_cookie`) uintptr_t keys[2]; // two random keys used to encode the `thread_delayed_free` list mi_random_ctx_t random; // random number context used for secure allocation @@ -460,7 +460,7 @@ struct mi_heap_s { mi_heap_t* next; // list of heaps per thread bool no_reclaim; // `true` if this heap should not reclaim abandoned pages mi_page_t* pages_free_direct[MI_PAGES_DIRECT]; // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size. 
- mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin") + mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin") }; diff --git a/src/free.c b/src/free.c index e7547aa3..4e031028 100644 --- a/src/free.c +++ b/src/free.c @@ -6,6 +6,11 @@ terms of the MIT license. A copy of the license can be found in the file -----------------------------------------------------------------------------*/ #if !defined(MI_IN_ALLOC_C) #error "this file should be included from 'alloc.c' (so aliases can work from alloc-override)" +// add includes help an IDE +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "mimalloc/atomic.h" +#include "mimalloc/prim.h" // _mi_prim_thread_id() #endif // forward declarations @@ -26,7 +31,7 @@ static mi_decl_noinline void mi_free_block_mt(mi_segment_t* segment, mi_page_t* // fast path written carefully to prevent spilling on the stack static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool check_full) { - // owning thread can free a block directly + // checks if mi_unlikely(mi_check_is_double_free(page, block)) return; mi_check_padding(page, block); mi_stat_free(page, block); @@ -34,47 +39,57 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); #endif mi_track_free_size(p, mi_page_usable_size_of(page,block)); // faster then mi_usable_size as we already know the page and that p is unaligned + + // actual free: push on the local free list mi_block_set_next(page, block, page->local_free); page->local_free = block; - const uint32_t used = page->used - 1; - page->used = used; - if mi_unlikely(used == 0) { // generates better code than: --page->used == 0 + if mi_unlikely(--page->used == 0) { _mi_page_retire(page); } else if mi_unlikely(check_full && mi_page_is_in_full(page)) { _mi_page_unfull(page); - } + } } // Adjust a block that was allocated aligned, to the actual start of the block in the page. mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p) { mi_assert_internal(page!=NULL && p!=NULL); - const size_t diff = (mi_likely(page->block_offset_adj != 0) - ? (uint8_t*)p - (uint8_t*)page - 8*(page->block_offset_adj-1) - : (uint8_t*)p - _mi_page_start(segment, page, NULL)); - - const size_t adjust = (mi_likely(page->block_size_shift != 0) - ? diff & (((size_t)1 << page->block_size_shift) - 1) - : diff % mi_page_block_size(page)); + + size_t diff; + if mi_likely(page->block_offset_adj != 0) { + diff = (uint8_t*)p - (uint8_t*)page - 8 * (page->block_offset_adj - 1); + } + else { + diff = (uint8_t*)p - _mi_page_start(segment, page, NULL); + } + + size_t adjust; + if mi_likely(page->block_size_shift != 0) { + adjust = diff & (((size_t)1 << page->block_size_shift) - 1); + } + else { + adjust = diff % mi_page_block_size(page); + } + return (mi_block_t*)((uintptr_t)p - adjust); } -// free a local pointer -static void mi_decl_noinline mi_free_generic_local(mi_segment_t* segment, mi_page_t* page, void* p) mi_attr_noexcept { +// free a local pointer (page parameter comes first for better codegen) +static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept { mi_block_t* const block = (mi_page_has_aligned(page) ? 
_mi_page_ptr_unalign(segment, page, p) : (mi_block_t*)p); mi_free_block_local(page, block, true); } -// free a pointer owned by another thread -static void mi_decl_noinline mi_free_generic_mt(mi_segment_t* segment, mi_page_t* page, void* p) mi_attr_noexcept { +// free a pointer owned by another thread (page parameter comes first for better codegen) +static void mi_decl_noinline mi_free_generic_mt(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept { mi_block_t* const block = _mi_page_ptr_unalign(segment, page, p); // don't check `has_aligned` flag to avoid a race (issue #865) mi_free_block_mt(segment, page, block); } // generic free (for runtime integration) void mi_decl_noinline _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept { - if (is_local) mi_free_generic_local(segment,page,p); - else mi_free_generic_mt(segment,page,p); + if (is_local) mi_free_generic_local(page,segment,p); + else mi_free_generic_mt(page,segment,p); } // Get the segment data belonging to a pointer @@ -127,16 +142,16 @@ void mi_free(void* p) mi_attr_noexcept if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) // thread-local, aligned, and not a full page mi_block_t* const block = (mi_block_t*)p; - mi_free_block_local(page,block,false /* no need to check if the page is full */); + mi_free_block_local(page, block, false /* no need to check if the page is full */); } else { // page is full or contains (inner) aligned blocks; use generic path - mi_free_generic_local(segment, page, p); + mi_free_generic_local(page, segment, p); } } else { // not thread-local; use generic path - mi_free_generic_mt(segment, page, p); + mi_free_generic_mt(page, segment, p); } } @@ -174,7 +189,7 @@ bool _mi_free_delayed_block(mi_block_t* block) { // the owning thread in `_mi_free_delayed_block`. static void mi_decl_noinline mi_free_block_delayed_mt( mi_page_t* page, mi_block_t* block ) { - // Try to put the block on either the page-local thread free list, + // Try to put the block on either the page-local thread free list, // or the heap delayed free list (if this is the first non-local free in that page) mi_thread_free_t tfreex; bool use_delayed; @@ -217,17 +232,17 @@ static void mi_decl_noinline mi_free_block_delayed_mt( mi_page_t* page, mi_block #if MI_HUGE_PAGE_ABANDON static void mi_stat_huge_free(const mi_page_t* page); -#endif +#endif // Multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) static void mi_decl_noinline mi_free_block_mt(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) { // first see if the segment was abandoned and if we can reclaim it into our thread - if (mi_option_is_enabled(mi_option_abandoned_reclaim_on_free) && + if (mi_option_is_enabled(mi_option_abandoned_reclaim_on_free) && #if MI_HUGE_PAGE_ABANDON - segment->page_kind != MI_PAGE_HUGE && + segment->page_kind != MI_PAGE_HUGE && #endif - mi_atomic_load_relaxed(&segment->thread_id) == 0) + mi_atomic_load_relaxed(&segment->thread_id) == 0) { // the segment is abandoned, try to reclaim it into our heap if (_mi_segment_attempt_reclaim(mi_heap_get_default(), segment)) { @@ -240,13 +255,13 @@ static void mi_decl_noinline mi_free_block_mt(mi_segment_t* segment, mi_page_t* // The padding check may access the non-thread-owned page for the key values. 
// that is safe as these are constant and the page won't be freed (as the block is not freed yet). mi_check_padding(page, block); - + // adjust stats (after padding check and potential recursive `mi_free` above) mi_stat_free(page, block); // stat_free may access the padding mi_track_free_size(block, mi_page_usable_size_of(page,block)); - + // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection - _mi_padding_shrink(page, block, sizeof(mi_block_t)); + _mi_padding_shrink(page, block, sizeof(mi_block_t)); if (segment->page_kind == MI_PAGE_HUGE) { #if MI_HUGE_PAGE_ABANDON @@ -266,7 +281,7 @@ static void mi_decl_noinline mi_free_block_mt(mi_segment_t* segment, mi_page_t* memset(block, MI_DEBUG_FREED, mi_usable_size(block)); #endif } - + // and finally free the actual block by pushing it on the owning heap // thread_delayed free list (or heap delayed free list) mi_free_block_delayed_mt(page,block); diff --git a/src/init.c b/src/init.c index 11471760..8a20daca 100644 --- a/src/init.c +++ b/src/init.c @@ -224,7 +224,7 @@ static mi_thread_data_t* mi_thread_data_zalloc(void) { is_zero = memid.initially_zero; } } - + if (td != NULL && !is_zero) { _mi_memzero_aligned(td, offsetof(mi_thread_data_t,memid)); } @@ -399,23 +399,23 @@ void mi_thread_done(void) mi_attr_noexcept { _mi_thread_done(NULL); } -void _mi_thread_done(mi_heap_t* heap) +void _mi_thread_done(mi_heap_t* heap) { // calling with NULL implies using the default heap - if (heap == NULL) { - heap = mi_prim_get_default_heap(); + if (heap == NULL) { + heap = mi_prim_get_default_heap(); if (heap == NULL) return; } // prevent re-entrancy through heap_done/heap_set_default_direct (issue #699) if (!mi_heap_is_initialized(heap)) { - return; + return; } // adjust stats mi_atomic_decrement_relaxed(&thread_count); _mi_stat_decrease(&_mi_stats_main.threads, 1); - + // check thread-id as on Windows shutdown with FLS the main (exit) thread may call this on thread-local heaps... if (heap->thread_id != _mi_thread_id()) return; @@ -437,7 +437,7 @@ void _mi_heap_set_default_direct(mi_heap_t* heap) { // ensure the default heap is passed to `_mi_thread_done` // setting to a non-NULL value also ensures `mi_thread_done` is called. - _mi_prim_thread_associate_default_heap(heap); + _mi_prim_thread_associate_default_heap(heap); } @@ -597,7 +597,7 @@ static void mi_cdecl mi_process_done(void) { // release any thread specific resources and ensure _mi_thread_done is called on all but the main thread _mi_prim_thread_done_auto_done(); - + #ifndef MI_SKIP_COLLECT_ON_EXIT #if (MI_DEBUG || !defined(MI_SHARED_LIB)) // free all memory if possible on process exit. 
This is not needed for a stand-alone process diff --git a/src/page.c b/src/page.c index 5930a430..8721a063 100644 --- a/src/page.c +++ b/src/page.c @@ -192,8 +192,8 @@ static void _mi_page_thread_free_collect(mi_page_t* page) if (head == NULL) return; // find the tail -- also to get a proper count (without data races) - uint32_t max_count = page->capacity; // cannot collect more than capacity - uint32_t count = 1; + size_t max_count = page->capacity; // cannot collect more than capacity + size_t count = 1; mi_block_t* tail = head; mi_block_t* next; while ((next = mi_block_next(page,tail)) != NULL && count <= max_count) { @@ -211,7 +211,7 @@ static void _mi_page_thread_free_collect(mi_page_t* page) page->local_free = head; // update counts now - page->used -= count; + page->used -= (uint16_t)count; } void _mi_page_free_collect(mi_page_t* page, bool force) { @@ -677,7 +677,7 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi } #endif if (_mi_is_power_of_two(block_size) && block_size > 0) { - page->block_size_shift = (uint32_t)(mi_ctz((uintptr_t)block_size)); + page->block_size_shift = (uint8_t)(mi_ctz((uintptr_t)block_size)); } const ptrdiff_t start_offset = (uint8_t*)page_start - (uint8_t*)page; const ptrdiff_t start_adjust = start_offset % block_size; From 4f809aadb7663d67758db84c12d2fcb8b877b46b Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sun, 24 Mar 2024 08:29:56 -0700 Subject: [PATCH 10/28] use free field for expiration instead of used --- src/free.c | 2 +- src/page.c | 23 ++++++++++---------- src/segment.c | 59 +++++++++++++++++++++++++++++++-------------------- 3 files changed, 49 insertions(+), 35 deletions(-) diff --git a/src/free.c b/src/free.c index 4e031028..9579eecb 100644 --- a/src/free.c +++ b/src/free.c @@ -57,7 +57,7 @@ mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* p size_t diff; if mi_likely(page->block_offset_adj != 0) { - diff = (uint8_t*)p - (uint8_t*)page - 8 * (page->block_offset_adj - 1); + diff = (uint8_t*)p - (uint8_t*)page - (8*(page->block_offset_adj - 1)); } else { diff = (uint8_t*)p - _mi_page_start(segment, page, NULL); diff --git a/src/page.c b/src/page.c index 8721a063..d9e416b2 100644 --- a/src/page.c +++ b/src/page.c @@ -261,7 +261,7 @@ void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) { #if MI_HUGE_PAGE_ABANDON mi_assert_internal(_mi_page_segment(page)->page_kind != MI_PAGE_HUGE); #endif - + // TODO: push on full queue immediately if it is full? 
mi_page_queue_t* pq = mi_page_queue(heap, mi_page_block_size(page)); mi_page_queue_push(heap, pq, page); @@ -676,16 +676,17 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi mi_assert_expensive(!page->is_zero_init || mi_mem_is_zero(page_start, page_size)); } #endif - if (_mi_is_power_of_two(block_size) && block_size > 0) { + if (block_size > 0 && _mi_is_power_of_two(block_size)) { page->block_size_shift = (uint8_t)(mi_ctz((uintptr_t)block_size)); } - const ptrdiff_t start_offset = (uint8_t*)page_start - (uint8_t*)page; - const ptrdiff_t start_adjust = start_offset % block_size; - if (start_offset >= 0 && (start_adjust % 8) == 0 && (start_adjust/8) < 255) { - page->block_offset_adj = (uint8_t)((start_adjust/8) + 1); + if (block_size > 0) { + const ptrdiff_t start_offset = (uint8_t*)page_start - (uint8_t*)page; + const ptrdiff_t start_adjust = start_offset % block_size; + if (start_offset >= 0 && (start_adjust % 8) == 0 && (start_adjust/8) < 255) { + page->block_offset_adj = (uint8_t)((start_adjust/8) + 1); + } } - - + mi_assert_internal(page->capacity == 0); mi_assert_internal(page->free == NULL); mi_assert_internal(page->used == 0); @@ -723,7 +724,7 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p while (page != NULL) { mi_page_t* next = page->next; // remember next - #if MI_STAT + #if MI_STAT count++; #endif @@ -880,7 +881,7 @@ static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignme else { // otherwise find a page with free blocks in our size segregated queues #if MI_PADDING - mi_assert_internal(size >= MI_PADDING_SIZE); + mi_assert_internal(size >= MI_PADDING_SIZE); #endif return mi_find_free_page(heap, size); } @@ -896,7 +897,7 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al // initialize if necessary if mi_unlikely(!mi_heap_is_initialized(heap)) { - heap = mi_heap_get_default(); // calls mi_thread_init + heap = mi_heap_get_default(); // calls mi_thread_init if mi_unlikely(!mi_heap_is_initialized(heap)) { return NULL; } } mi_assert_internal(mi_heap_is_initialized(heap)); diff --git a/src/segment.c b/src/segment.c index a6522028..7d406a96 100644 --- a/src/segment.c +++ b/src/segment.c @@ -237,12 +237,12 @@ static void mi_page_purge(mi_segment_t* segment, mi_page_t* page, mi_segments_tl mi_assert_internal(!page->segment_in_use); if (!segment->allow_purge) return; mi_assert_internal(page->used == 0); + mi_assert_internal(page->free == NULL); mi_assert_expensive(!mi_pages_purge_contains(page, tld)); size_t psize; void* start = mi_segment_raw_page_start(segment, page, &psize); const bool needs_recommit = _mi_os_purge(start, psize, tld->stats); if (needs_recommit) { page->is_committed = false; } - page->used = 0; } static bool mi_page_ensure_committed(mi_segment_t* segment, mi_page_t* page, mi_segments_tld_t* tld) { @@ -258,6 +258,7 @@ static bool mi_page_ensure_committed(mi_segment_t* segment, mi_page_t* page, mi_ if (!ok) return false; // failed to commit! page->is_committed = true; page->used = 0; + page->free = NULL; page->is_zero_init = is_zero; if (gsize > 0) { mi_segment_protect_range(start + psize, gsize, true); @@ -270,18 +271,30 @@ static bool mi_page_ensure_committed(mi_segment_t* segment, mi_page_t* page, mi_ The free page queue ----------------------------------------------------------- */ -// we re-use the `used` field for the expiration counter. 
Since this is a -// a 32-bit field while the clock is always 64-bit we need to guard -// against overflow, we use substraction to check for expiry which work +// we re-use the `free` field for the expiration counter. Since this is a +// a pointer size field while the clock is always 64-bit we need to guard +// against overflow, we use substraction to check for expiry which works // as long as the reset delay is under (2^30 - 1) milliseconds (~12 days) -static void mi_page_purge_set_expire(mi_page_t* page) { - mi_assert_internal(page->used == 0); - uint32_t expire = (uint32_t)_mi_clock_now() + mi_option_get(mi_option_purge_delay); - page->used = expire; +static uint32_t mi_page_get_expire( mi_page_t* page ) { + return (uint32_t)((uintptr_t)page->free); } +static void mi_page_set_expire( mi_page_t* page, uint32_t expire ) { + page->free = (mi_block_t*)((uintptr_t)expire); +} + +static void mi_page_purge_set_expire(mi_page_t* page) { + mi_assert_internal(mi_page_get_expire(page)==0); + uint32_t expire = (uint32_t)_mi_clock_now() + mi_option_get(mi_option_purge_delay); + mi_page_set_expire(page, expire); +} + +// we re-use the `free` field for the expiration counter. Since this is a +// a pointer size field while the clock is always 64-bit we need to guard +// against overflow, we use substraction to check for expiry which work +// as long as the reset delay is under (2^30 - 1) milliseconds (~12 days) static bool mi_page_purge_is_expired(mi_page_t* page, mi_msecs_t now) { - int32_t expire = (int32_t)(page->used); + int32_t expire = (int32_t)mi_page_get_expire(page); return (((int32_t)now - expire) >= 0); } @@ -320,14 +333,14 @@ static void mi_page_purge_remove(mi_page_t* page, mi_segments_tld_t* tld) { mi_page_queue_t* pq = &tld->pages_purge; mi_assert_internal(pq!=NULL); mi_assert_internal(!page->segment_in_use); - mi_assert_internal(page->used != 0); + mi_assert_internal(mi_page_get_expire(page) != 0); mi_assert_internal(mi_pages_purge_contains(page, tld)); if (page->prev != NULL) page->prev->next = page->next; if (page->next != NULL) page->next->prev = page->prev; if (page == pq->last) pq->last = page->prev; if (page == pq->first) pq->first = page->next; page->next = page->prev = NULL; - page->used = 0; + mi_page_set_expire(page,0); } static void mi_segment_remove_all_purges(mi_segment_t* segment, bool force_purge, mi_segments_tld_t* tld) { @@ -493,7 +506,7 @@ static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_se } MI_UNUSED(fully_committed); mi_assert_internal((fully_committed && committed_size == segment_size) || (!fully_committed && committed_size < segment_size)); - + _mi_abandoned_await_readers(); // prevent ABA issue if concurrent readers try to access our memory (that might be purged) _mi_arena_free(segment, segment_size, committed_size, segment->memid, tld->stats); } @@ -592,7 +605,7 @@ static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind, if (segment == NULL) return NULL; mi_assert_internal(segment != NULL && (uintptr_t)segment % MI_SEGMENT_SIZE == 0); mi_assert_internal(segment->memid.is_pinned ? segment->memid.initially_committed : true); - + // zero the segment info (but not the `mem` fields) ptrdiff_t ofs = offsetof(mi_segment_t, next); _mi_memzero((uint8_t*)segment + ofs, info_size - ofs); @@ -746,21 +759,21 @@ Abandonment When threads terminate, they can leave segments with live blocks (reached through other threads). 
Such segments are "abandoned" and will be reclaimed by other threads to -reuse their pages and/or free them eventually. The +reuse their pages and/or free them eventually. The `thread_id` of such segments is 0. When a block is freed in an abandoned segment, the segment -is reclaimed into that thread. +is reclaimed into that thread. Moreover, if threads are looking for a fresh segment, they will first consider abondoned segments -- these can be found -by scanning the arena memory -(segments outside arena memoryare only reclaimed by a free). +by scanning the arena memory +(segments outside arena memoryare only reclaimed by a free). ----------------------------------------------------------- */ // legacy: Wait until there are no more pending reads on segments that used to be in the abandoned list void _mi_abandoned_await_readers(void) { - // nothing needed + // nothing needed } /* ----------------------------------------------------------- @@ -914,12 +927,12 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, // attempt to reclaim a particular segment (called from multi threaded free `alloc.c:mi_free_block_mt`) bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment) { - if (mi_atomic_load_relaxed(&segment->thread_id) != 0) return false; // it is not abandoned + if (mi_atomic_load_relaxed(&segment->thread_id) != 0) return false; // it is not abandoned // don't reclaim more from a free than half the current segments // this is to prevent a pure free-ing thread to start owning too many segments - if (heap->tld->segments.reclaim_count * 2 > heap->tld->segments.count) return false; + if (heap->tld->segments.reclaim_count * 2 > heap->tld->segments.count) return false; if (_mi_arena_segment_clear_abandoned(segment)) { // atomically unabandon - mi_segment_t* res = mi_segment_reclaim(segment, heap, 0, NULL, &heap->tld->segments); + mi_segment_t* res = mi_segment_reclaim(segment, heap, 0, NULL, &heap->tld->segments); mi_assert_internal(res == segment); return (res != NULL); } @@ -946,11 +959,11 @@ static long mi_segment_get_reclaim_tries(void) { static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size, mi_page_kind_t page_kind, bool* reclaimed, mi_segments_tld_t* tld) { - *reclaimed = false; + *reclaimed = false; mi_segment_t* segment; mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap,¤t); long max_tries = mi_segment_get_reclaim_tries(); - while ((max_tries-- > 0) && ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL)) + while ((max_tries-- > 0) && ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL)) { segment->abandoned_visits++; // todo: an arena exclusive heap will potentially visit many abandoned unsuitable segments From ea6137a5017a407ffedafd2757ee6d4a840668fc Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sun, 24 Mar 2024 09:01:58 -0700 Subject: [PATCH 11/28] use MI_MAX_ALIGN_SIZE to adjust block_offset_adj --- include/mimalloc/types.h | 4 ++-- src/page.c | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index ad0aabe9..5bc49aa0 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -297,8 +297,8 @@ typedef struct mi_page_s { mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) uint16_t used; // number of blocks in use (including blocks in `thread_free`) - uint8_t 
block_size_shift; // if not zero, then `(1 << block_size_shift == block_size)` (used for fast path in `free.c:_mi_page_ptr_unalign`) - uint8_t block_offset_adj; // if not zero, then `(page_start - (uint8_t*)page - 8*(block_offset_adj-1)) % block_size == 0)` (used for fast path in `free.c:_mi_page_ptr_unalign`) + uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`) + uint8_t block_offset_adj; // if not zero, then `(mi_page_start(_,page,_) - (uint8_t*)page - MI_MAX_ALIGN_SIZE*(block_offset_adj-1)) % block_size == 0)` (only used for fast path in `free.c:_mi_page_ptr_unalign`) uint32_t xblock_size; // size available in each block (always `>0`) #if (MI_ENCODE_FREELIST || MI_PADDING) diff --git a/src/page.c b/src/page.c index d9e416b2..912f969a 100644 --- a/src/page.c +++ b/src/page.c @@ -682,8 +682,10 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi if (block_size > 0) { const ptrdiff_t start_offset = (uint8_t*)page_start - (uint8_t*)page; const ptrdiff_t start_adjust = start_offset % block_size; - if (start_offset >= 0 && (start_adjust % 8) == 0 && (start_adjust/8) < 255) { - page->block_offset_adj = (uint8_t)((start_adjust/8) + 1); + if (start_offset >= 0 && (start_adjust % MI_MAX_ALIGN_SIZE) == 0 && (start_adjust / MI_MAX_ALIGN_SIZE) < 255) { + const ptrdiff_t adjust = (start_adjust / MI_MAX_ALIGN_SIZE); + mi_assert_internal(adjust + 1 == (uint8_t)(adjust + 1)); + page->block_offset_adj = (uint8_t)(adjust + 1); } } @@ -700,7 +702,7 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi mi_assert_internal(page->keys[1] != 0); #endif mi_assert_internal(page->block_size_shift == 0 || (block_size == (1UL << page->block_size_shift))); - mi_assert_internal(page->block_offset_adj == 0 || (((uint8_t*)page_start - (uint8_t*)page - 8*(page->block_offset_adj-1))) % block_size == 0); + mi_assert_internal(page->block_offset_adj == 0 || (((uint8_t*)page_start - (uint8_t*)page - MI_MAX_ALIGN_SIZE*(page->block_offset_adj-1))) % block_size == 0); mi_assert_expensive(mi_page_is_valid_init(page)); // initialize an initial free list From 86475a7b9bbd5fdd756cd89a579c3d8368708e2f Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sun, 24 Mar 2024 09:03:01 -0700 Subject: [PATCH 12/28] use MI_MAX_ALIGN_SIZE to adjust block_offset_adj --- src/free.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/free.c b/src/free.c index 9579eecb..7a5a7806 100644 --- a/src/free.c +++ b/src/free.c @@ -57,7 +57,7 @@ mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* p size_t diff; if mi_likely(page->block_offset_adj != 0) { - diff = (uint8_t*)p - (uint8_t*)page - (8*(page->block_offset_adj - 1)); + diff = (uint8_t*)p - (uint8_t*)page - (MI_MAX_ALIGN_SIZE*(page->block_offset_adj - 1)); } else { diff = (uint8_t*)p - _mi_page_start(segment, page, NULL); From 07ae64bd81bc411d740e93f8f98551150e769d1f Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sun, 24 Mar 2024 10:39:22 -0700 Subject: [PATCH 13/28] faster check for NULL in mi_free by combining with masking --- CMakeLists.txt | 21 ++++++++++++++++++--- include/mimalloc/internal.h | 11 +++++++++-- src/free.c | 13 +++++++------ 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0cc7e575..7c0f67af 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -84,6 +84,17 @@ endif() # Process options # 
----------------------------------------------------------------------------- +# put -Wall early so other warnings can be disabled selectively +if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang") + list(APPEND mi_cflags -Wall -Wextra -Wpedantic) +endif() +if(CMAKE_C_COMPILER_ID MATCHES "GNU") + list(APPEND mi_cflags -Wall -Wextra) +endif() +if(CMAKE_C_COMPILER_ID MATCHES "Intel") + list(APPEND mi_cflags -Wall) +endif() + if(CMAKE_C_COMPILER_ID MATCHES "MSVC|Intel") set(MI_USE_CXX "ON") endif() @@ -186,6 +197,10 @@ endif() if(MI_SEE_ASM) message(STATUS "Generate assembly listings (MI_SEE_ASM=ON)") list(APPEND mi_cflags -save-temps) + if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang") + message(STATUS "No GNU Line marker") + list(APPEND mi_cflags -Wno-gnu-line-marker) + endif() endif() if(MI_CHECK_FULL) @@ -279,17 +294,17 @@ endif() # Compiler flags if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU") - list(APPEND mi_cflags -Wall -Wextra -Wno-unknown-pragmas -fvisibility=hidden) + list(APPEND mi_cflags -Wno-unknown-pragmas -fvisibility=hidden) if(NOT MI_USE_CXX) list(APPEND mi_cflags -Wstrict-prototypes) endif() if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang") - list(APPEND mi_cflags -Wpedantic -Wno-static-in-inline) + list(APPEND mi_cflags -Wno-static-in-inline) endif() endif() if(CMAKE_C_COMPILER_ID MATCHES "Intel") - list(APPEND mi_cflags -Wall -fvisibility=hidden) + list(APPEND mi_cflags -fvisibility=hidden) endif() if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM_NAME MATCHES "Haiku") diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 3aad1ba4..02200594 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -416,13 +416,19 @@ static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t si // Large aligned blocks may be aligned at N*MI_SEGMENT_SIZE (inside a huge segment > MI_SEGMENT_SIZE), // and we need align "down" to the segment info which is `MI_SEGMENT_SIZE` bytes before it; // therefore we align one byte before `p`. +// We check for NULL afterwards on 64-bit systems to improve codegen for `mi_free`. static inline mi_segment_t* _mi_ptr_segment(const void* p) { - mi_assert_internal(p != NULL); - return (mi_segment_t*)(((uintptr_t)p - 1) & ~MI_SEGMENT_MASK); + mi_segment_t* const segment = (mi_segment_t*)(((uintptr_t)p - 1) & ~MI_SEGMENT_MASK); + #if MI_INTPTR_SIZE <= 4 + return (p==NULL ? NULL : segment); + #else + return ((intptr_t)segment <= 0 ? 
NULL : segment); + #endif } // Segment belonging to a page static inline mi_segment_t* _mi_page_segment(const mi_page_t* page) { + mi_assert_internal(page!=NULL); mi_segment_t* segment = _mi_ptr_segment(page); mi_assert_internal(segment == NULL || page == &segment->pages[page->segment_idx]); return segment; @@ -454,6 +460,7 @@ static inline uint8_t* _mi_page_start(const mi_segment_t* segment, const mi_page // Get the page containing the pointer static inline mi_page_t* _mi_ptr_page(void* p) { + mi_assert_internal(p!=NULL); return _mi_segment_page_of(_mi_ptr_segment(p), p); } diff --git a/src/free.c b/src/free.c index 7a5a7806..0e560e53 100644 --- a/src/free.c +++ b/src/free.c @@ -98,7 +98,6 @@ void mi_decl_noinline _mi_free_generic(mi_segment_t* segment, mi_page_t* page, b static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* msg) { MI_UNUSED(msg); - mi_assert(p != NULL); #if (MI_DEBUG>0) if mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0) { @@ -108,7 +107,7 @@ static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* ms #endif mi_segment_t* const segment = _mi_ptr_segment(p); - mi_assert_internal(segment != NULL); + if mi_unlikely(segment==NULL) return segment; #if (MI_DEBUG>0) if mi_unlikely(!mi_is_in_heap_region(p)) { @@ -133,10 +132,11 @@ static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* ms // Fast path written carefully to prevent register spilling on the stack void mi_free(void* p) mi_attr_noexcept { - if mi_unlikely(p == NULL) return; mi_segment_t* const segment = mi_checked_ptr_segment(p,"mi_free"); - const bool is_local= (_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); - mi_page_t* const page = _mi_segment_page_of(segment, p); + if mi_unlikely(segment==NULL) return; + + const bool is_local = (_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); + mi_page_t* const page = _mi_segment_page_of(segment, p); if mi_likely(is_local) { // thread-local free? 
if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) @@ -158,6 +158,7 @@ void mi_free(void* p) mi_attr_noexcept // return true if successful bool _mi_free_delayed_block(mi_block_t* block) { // get segment and page + mi_assert_internal(block!=NULL); const mi_segment_t* const segment = _mi_ptr_segment(block); mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie); mi_assert_internal(_mi_thread_id() == segment->thread_id); @@ -302,8 +303,8 @@ static size_t mi_decl_noinline mi_page_usable_aligned_size_of(const mi_segment_t } static inline size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noexcept { - if (p == NULL) return 0; const mi_segment_t* const segment = mi_checked_ptr_segment(p, msg); + if mi_unlikely(segment==NULL) return 0; const mi_page_t* const page = _mi_segment_page_of(segment, p); if mi_likely(!mi_page_has_aligned(page)) { const mi_block_t* block = (const mi_block_t*)p; From f141ca12a49da87b7740a9cbd67ebf14d31d145c Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sun, 24 Mar 2024 10:53:09 -0700 Subject: [PATCH 14/28] add extra runtime check to ensure we never insert large or huge pages in the segment free queue (issue #870) --- src/segment.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/segment.c b/src/segment.c index 7d406a96..e7e7d2cc 100644 --- a/src/segment.c +++ b/src/segment.c @@ -746,8 +746,10 @@ void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld) } else if (segment->used + 1 == segment->capacity) { mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM); // for now we only support small and medium pages - // move back to segments free list - mi_segment_insert_in_free_queue(segment,tld); + if (segment->page_kind <= MI_PAGE_MEDIUM) { + // move back to segments free list + mi_segment_insert_in_free_queue(segment,tld); + } } } } From 6688b45fbdd2b3091a852e6db37454a3a7897061 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sun, 24 Mar 2024 10:57:02 -0700 Subject: [PATCH 15/28] rename MI_ALIGNMENT_MAX to MI_BLOCK_ALIGNMENT_MAX for clarity --- doc/mimalloc-doc.h | 4 ++-- include/mimalloc/types.h | 4 ++-- src/alloc-aligned.c | 10 ++++----- src/os.c | 48 ++++++++++++++++++++-------------------- src/page.c | 2 +- src/segment.c | 2 +- test/test-api.c | 16 +++++++------- 7 files changed, 43 insertions(+), 43 deletions(-) diff --git a/doc/mimalloc-doc.h b/doc/mimalloc-doc.h index 01b13904..47a8a6b9 100644 --- a/doc/mimalloc-doc.h +++ b/doc/mimalloc-doc.h @@ -499,11 +499,11 @@ void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_m /// \{ /// The maximum supported alignment size (currently 1MiB). -#define MI_ALIGNMENT_MAX (1024*1024UL) +#define MI_BLOCK_ALIGNMENT_MAX (1024*1024UL) /// Allocate \a size bytes aligned by \a alignment. /// @param size number of bytes to allocate. -/// @param alignment the minimal alignment of the allocated memory. Must be less than #MI_ALIGNMENT_MAX. +/// @param alignment the minimal alignment of the allocated memory. Must be less than #MI_BLOCK_ALIGNMENT_MAX. /// @returns pointer to the allocated memory or \a NULL if out of memory. /// The returned pointer is aligned by \a alignment, i.e. /// `(uintptr_t)p % alignment == 0`. 
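To illustrate the documented contract of `mi_malloc_aligned` above, here is a minimal sketch (illustrative only, assuming just the public `mi_malloc_aligned`/`mi_free` API; the 1 MiB alignment merely mirrors the "currently 1MiB" note for #MI_BLOCK_ALIGNMENT_MAX in the doc comment):

#include <stdint.h>
#include <stdio.h>
#include <mimalloc.h>

int main(void) {
  // request a 4 KiB-aligned and a 1 MiB-aligned block; per the doc comment the
  // returned pointers satisfy `(uintptr_t)p % alignment == 0`
  void* p1 = mi_malloc_aligned(1024, 4096);
  void* p2 = mi_malloc_aligned(1024, 1024*1024UL);
  printf("p1 aligned: %d, p2 aligned: %d\n",
         (int)(((uintptr_t)p1 % 4096) == 0),
         (int)(((uintptr_t)p2 % (1024*1024UL)) == 0));
  mi_free(p1);
  mi_free(p2);
  return 0;
}

As the `types.h` and `alloc-aligned.c` hunks below show, alignments over MI_BLOCK_ALIGNMENT_MAX are still honored, but via a dedicated huge-page segment rather than over-allocation inside a normal page.
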
diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 5bc49aa0..d088b305 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -193,8 +193,8 @@ typedef int32_t mi_ssize_t; // Used as a special value to encode block sizes in 32 bits. #define MI_HUGE_BLOCK_SIZE ((uint32_t)MI_HUGE_OBJ_SIZE_MAX) -// Alignments over MI_ALIGNMENT_MAX are allocated in dedicated huge page segments -#define MI_ALIGNMENT_MAX (MI_SEGMENT_SIZE >> 1) +// Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in dedicated huge page segments +#define MI_BLOCK_ALIGNMENT_MAX (MI_SEGMENT_SIZE >> 1) // ------------------------------------------------------ diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c index 0907811e..5f60b2fc 100644 --- a/src/alloc-aligned.c +++ b/src/alloc-aligned.c @@ -33,7 +33,7 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t* void* p; size_t oversize; - if mi_unlikely(alignment > MI_ALIGNMENT_MAX) { + if mi_unlikely(alignment > MI_BLOCK_ALIGNMENT_MAX) { // use OS allocation for very large alignment and allocate inside a huge page (dedicated segment with 1 page) // This can support alignments >= MI_SEGMENT_SIZE by ensuring the object can be aligned at a point in the // first (and single) page such that the segment info is `MI_SEGMENT_SIZE` bytes before it (so it can be found by aligning the pointer down) @@ -47,7 +47,7 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t* oversize = (size <= MI_SMALL_SIZE_MAX ? MI_SMALL_SIZE_MAX + 1 /* ensure we use generic malloc path */ : size); p = _mi_heap_malloc_zero_ex(heap, oversize, false, alignment); // the page block size should be large enough to align in the single huge page block // zero afterwards as only the area from the aligned_p may be committed! 
- if (p == NULL) return NULL; + if (p == NULL) return NULL; } else { // otherwise over-allocate @@ -73,9 +73,9 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t* mi_assert_internal(((uintptr_t)aligned_p + offset) % alignment == 0); mi_assert_internal(mi_usable_size(aligned_p)>=size); mi_assert_internal(mi_usable_size(p) == mi_usable_size(aligned_p)+adjust); - + // now zero the block if needed - if (alignment > MI_ALIGNMENT_MAX) { + if (alignment > MI_BLOCK_ALIGNMENT_MAX) { // for the tracker, on huge aligned allocations only from the start of the large block is defined mi_track_mem_undefined(aligned_p, size); if (zero) { @@ -85,7 +85,7 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t* if (p != aligned_p) { mi_track_align(p,aligned_p,adjust,mi_usable_size(aligned_p)); - } + } return aligned_p; } diff --git a/src/os.c b/src/os.c index 21ab9243..09ae367d 100644 --- a/src/os.c +++ b/src/os.c @@ -29,7 +29,7 @@ bool _mi_os_has_overcommit(void) { return mi_os_mem_config.has_overcommit; } -bool _mi_os_has_virtual_reserve(void) { +bool _mi_os_has_virtual_reserve(void) { return mi_os_mem_config.has_virtual_reserve; } @@ -180,7 +180,7 @@ void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t me } } else { - // nothing to do + // nothing to do mi_assert(memid.memkind < MI_MEM_OS); } } @@ -203,25 +203,25 @@ static void* mi_os_prim_alloc(size_t size, size_t try_alignment, bool commit, bo if (!commit) { allow_large = false; } if (try_alignment == 0) { try_alignment = 1; } // avoid 0 to ensure there will be no divide by zero when aligning *is_zero = false; - void* p = NULL; + void* p = NULL; int err = _mi_prim_alloc(size, try_alignment, commit, allow_large, is_large, is_zero, &p); if (err != 0) { _mi_warning_message("unable to allocate OS memory (error: %d (0x%x), size: 0x%zx bytes, align: 0x%zx, commit: %d, allow large: %d)\n", err, err, size, try_alignment, commit, allow_large); } - + MI_UNUSED(tld_stats); mi_stats_t* stats = &_mi_stats_main; mi_stat_counter_increase(stats->mmap_calls, 1); if (p != NULL) { _mi_stat_increase(&stats->reserved, size); - if (commit) { - _mi_stat_increase(&stats->committed, size); + if (commit) { + _mi_stat_increase(&stats->committed, size); // seems needed for asan (or `mimalloc-test-api` fails) #ifdef MI_TRACK_ASAN if (*is_zero) { mi_track_mem_defined(p,size); } else { mi_track_mem_undefined(p,size); } #endif - } + } } return p; } @@ -258,7 +258,7 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit // over-allocate uncommitted (virtual) memory p = mi_os_prim_alloc(over_size, 1 /*alignment*/, false /* commit? */, false /* allow_large */, is_large, is_zero, stats); if (p == NULL) return NULL; - + // set p to the aligned part in the full region // note: this is dangerous on Windows as VirtualFree needs the actual base pointer // this is handled though by having the `base` field in the memid's @@ -274,7 +274,7 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit // overallocate... p = mi_os_prim_alloc(over_size, 1, commit, false, is_large, is_zero, stats); if (p == NULL) return NULL; - + // and selectively unmap parts around the over-allocated area. 
(noop on sbrk) void* aligned_p = mi_align_up_ptr(p, alignment); size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p; @@ -285,7 +285,7 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit if (post_size > 0) { mi_os_prim_free((uint8_t*)aligned_p + mid_size, post_size, commit, stats); } // we can return the aligned pointer on `mmap` (and sbrk) systems p = aligned_p; - *base = aligned_p; // since we freed the pre part, `*base == p`. + *base = aligned_p; // since we freed the pre part, `*base == p`. } } @@ -307,7 +307,7 @@ void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* stats) { void* p = mi_os_prim_alloc(size, 0, true, false, &os_is_large, &os_is_zero, stats); if (p != NULL) { *memid = _mi_memid_create_os(true, os_is_zero, os_is_large); - } + } return p; } @@ -318,7 +318,7 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo if (size == 0) return NULL; size = _mi_os_good_alloc_size(size); alignment = _mi_align_up(alignment, _mi_os_page_size()); - + bool os_is_large = false; bool os_is_zero = false; void* os_base = NULL; @@ -333,7 +333,7 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo /* ----------------------------------------------------------- OS aligned allocation with an offset. This is used - for large alignments > MI_ALIGNMENT_MAX. We use a large mimalloc + for large alignments > MI_BLOCK_ALIGNMENT_MAX. We use a large mimalloc page where the object can be aligned at an offset from the start of the segment. As we may need to overallocate, we need to free such pointers using `mi_free_aligned` to use the actual start of the memory region. @@ -396,7 +396,7 @@ static void* mi_os_page_align_area_conservative(void* addr, size_t size, size_t* bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats) { MI_UNUSED(tld_stats); - mi_stats_t* stats = &_mi_stats_main; + mi_stats_t* stats = &_mi_stats_main; if (is_zero != NULL) { *is_zero = false; } _mi_stat_increase(&stats->committed, size); // use size for precise commit vs. decommit _mi_stat_counter_increase(&stats->commit_calls, 1); @@ -406,21 +406,21 @@ bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats void* start = mi_os_page_align_areax(false /* conservative? 
*/, addr, size, &csize); if (csize == 0) return true; - // commit + // commit bool os_is_zero = false; - int err = _mi_prim_commit(start, csize, &os_is_zero); + int err = _mi_prim_commit(start, csize, &os_is_zero); if (err != 0) { _mi_warning_message("cannot commit OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", err, err, start, csize); return false; } - if (os_is_zero && is_zero != NULL) { + if (os_is_zero && is_zero != NULL) { *is_zero = true; mi_assert_expensive(mi_mem_is_zero(start, csize)); } // note: the following seems required for asan (otherwise `mimalloc-test-stress` fails) #ifdef MI_TRACK_ASAN if (os_is_zero) { mi_track_mem_defined(start,csize); } - else { mi_track_mem_undefined(start,csize); } + else { mi_track_mem_undefined(start,csize); } #endif return true; } @@ -434,11 +434,11 @@ static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit, mi_ // page align size_t csize; void* start = mi_os_page_align_area_conservative(addr, size, &csize); - if (csize == 0) return true; + if (csize == 0) return true; // decommit *needs_recommit = true; - int err = _mi_prim_decommit(start,csize,needs_recommit); + int err = _mi_prim_decommit(start,csize,needs_recommit); if (err != 0) { _mi_warning_message("cannot decommit OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", err, err, start, csize); } @@ -456,7 +456,7 @@ bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* tld_stats) { // but may be used later again. This will release physical memory // pages and reduce swapping while keeping the memory committed. // We page align to a conservative area inside the range to reset. -bool _mi_os_reset(void* addr, size_t size, mi_stats_t* stats) { +bool _mi_os_reset(void* addr, size_t size, mi_stats_t* stats) { // page align conservatively within the range size_t csize; void* start = mi_os_page_align_area_conservative(addr, size, &csize); @@ -476,7 +476,7 @@ bool _mi_os_reset(void* addr, size_t size, mi_stats_t* stats) { } -// either resets or decommits memory, returns true if the memory needs +// either resets or decommits memory, returns true if the memory needs // to be recommitted if it is to be re-used later on. bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, mi_stats_t* stats) { @@ -489,7 +489,7 @@ bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, mi_stats_t* stats) { bool needs_recommit = true; mi_os_decommit_ex(p, size, &needs_recommit, stats); - return needs_recommit; + return needs_recommit; } else { if (allow_reset) { // this can sometimes be not allowed if the range is not fully committed @@ -499,7 +499,7 @@ bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, mi_stats_t* stats) } } -// either resets or decommits memory, returns true if the memory needs +// either resets or decommits memory, returns true if the memory needs // to be recommitted if it is to be re-used later on. bool _mi_os_purge(void* p, size_t size, mi_stats_t * stats) { return _mi_os_purge_ex(p, size, true, stats); diff --git a/src/page.c b/src/page.c index 912f969a..63780d63 100644 --- a/src/page.c +++ b/src/page.c @@ -831,7 +831,7 @@ void mi_register_deferred_free(mi_deferred_free_fun* fn, void* arg) mi_attr_noex // Because huge pages contain just one block, and the segment contains // just that page, we always treat them as abandoned and any thread // that frees the block can free the whole page and segment directly. -// Huge pages are also use if the requested alignment is very large (> MI_ALIGNMENT_MAX). 
+// Huge pages are also use if the requested alignment is very large (> MI_BLOCK_ALIGNMENT_MAX). static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_alignment) { size_t block_size = _mi_os_good_alloc_size(size); mi_assert_internal(mi_bin(block_size) == MI_BIN_HUGE || page_alignment > 0); diff --git a/src/segment.c b/src/segment.c index e7e7d2cc..359815ce 100644 --- a/src/segment.c +++ b/src/segment.c @@ -1189,7 +1189,7 @@ void _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_bloc mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { mi_page_t* page; - if mi_unlikely(page_alignment > MI_ALIGNMENT_MAX) { + if mi_unlikely(page_alignment > MI_BLOCK_ALIGNMENT_MAX) { mi_assert_internal(_mi_is_power_of_two(page_alignment)); mi_assert_internal(page_alignment >= MI_SEGMENT_SIZE); //mi_assert_internal((MI_SEGMENT_SIZE % page_alignment) == 0); diff --git a/test/test-api.c b/test/test-api.c index 8dd24e1b..6dd2bc7f 100644 --- a/test/test-api.c +++ b/test/test-api.c @@ -34,7 +34,7 @@ we therefore test the API over various inputs. Please add more tests :-) #include "mimalloc.h" // #include "mimalloc/internal.h" -#include "mimalloc/types.h" // for MI_DEBUG and MI_ALIGNMENT_MAX +#include "mimalloc/types.h" // for MI_DEBUG and MI_BLOCK_ALIGNMENT_MAX #include "testhelper.h" @@ -59,7 +59,7 @@ bool mem_is_zero(uint8_t* p, size_t size) { // --------------------------------------------------------------------------- int main(void) { mi_option_disable(mi_option_verbose); - + // --------------------------------------------------- // Malloc // --------------------------------------------------- @@ -154,7 +154,7 @@ int main(void) { }; CHECK_BODY("malloc-aligned6") { bool ok = true; - for (size_t align = 1; align <= MI_ALIGNMENT_MAX && ok; align *= 2) { + for (size_t align = 1; align <= MI_BLOCK_ALIGNMENT_MAX && ok; align *= 2) { void* ps[8]; for (int i = 0; i < 8 && ok; i++) { ps[i] = mi_malloc_aligned(align*13 // size @@ -170,16 +170,16 @@ int main(void) { result = ok; }; CHECK_BODY("malloc-aligned7") { - void* p = mi_malloc_aligned(1024,MI_ALIGNMENT_MAX); + void* p = mi_malloc_aligned(1024,MI_BLOCK_ALIGNMENT_MAX); mi_free(p); - result = ((uintptr_t)p % MI_ALIGNMENT_MAX) == 0; + result = ((uintptr_t)p % MI_BLOCK_ALIGNMENT_MAX) == 0; }; CHECK_BODY("malloc-aligned8") { bool ok = true; for (int i = 0; i < 5 && ok; i++) { int n = (1 << i); - void* p = mi_malloc_aligned(1024, n * MI_ALIGNMENT_MAX); - ok = ((uintptr_t)p % (n*MI_ALIGNMENT_MAX)) == 0; + void* p = mi_malloc_aligned(1024, n * MI_BLOCK_ALIGNMENT_MAX); + ok = ((uintptr_t)p % (n*MI_BLOCK_ALIGNMENT_MAX)) == 0; mi_free(p); } result = ok; @@ -187,7 +187,7 @@ int main(void) { CHECK_BODY("malloc-aligned9") { bool ok = true; void* p[8]; - size_t sizes[8] = { 8, 512, 1024 * 1024, MI_ALIGNMENT_MAX, MI_ALIGNMENT_MAX + 1, 2 * MI_ALIGNMENT_MAX, 8 * MI_ALIGNMENT_MAX, 0 }; + size_t sizes[8] = { 8, 512, 1024 * 1024, MI_BLOCK_ALIGNMENT_MAX, MI_BLOCK_ALIGNMENT_MAX + 1, 2 * MI_BLOCK_ALIGNMENT_MAX, 8 * MI_BLOCK_ALIGNMENT_MAX, 0 }; for (int i = 0; i < 28 && ok; i++) { int align = (1 << i); for (int j = 0; j < 8 && ok; j++) { From b5665f0eec417a73a8abcdb00f3a95165b165527 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sun, 24 Mar 2024 14:17:17 -0700 Subject: [PATCH 16/28] add full block_size and page_start to page info --- ide/vs2022/mimalloc.vcxproj | 6 +++ ide/vs2022/mimalloc.vcxproj.filters | 3 ++ include/mimalloc/internal.h | 21 +++----- 
include/mimalloc/types.h | 31 +++++------ src/alloc.c | 10 ++-- src/free.c | 16 +----- src/heap.c | 14 ++--- src/init.c | 9 ++-- src/page-queue.c | 30 ++++++----- src/page.c | 83 ++++++++++++----------------- src/segment.c | 37 +++++++------ src/stats.c | 7 +-- 12 files changed, 119 insertions(+), 148 deletions(-) diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index 11da11c3..3e11d0fe 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -217,6 +217,12 @@ false + + true + true + true + true + diff --git a/ide/vs2022/mimalloc.vcxproj.filters b/ide/vs2022/mimalloc.vcxproj.filters index bb5c8ce9..a387f5a5 100644 --- a/ide/vs2022/mimalloc.vcxproj.filters +++ b/ide/vs2022/mimalloc.vcxproj.filters @@ -58,6 +58,9 @@ Sources + + Sources + diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 02200594..21dc9d62 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -147,7 +147,7 @@ void _mi_segment_map_freed_at(const mi_segment_t* segment); mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld, mi_os_tld_t* os_tld); void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld); void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld); -uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t block_size, size_t* page_size, size_t* pre_size); // page start for any page +uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size, size_t* pre_size); // page start for any page #if MI_HUGE_PAGE_ABANDON void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); @@ -452,10 +452,9 @@ static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const } // Quick page start for initialized pages -static inline uint8_t* _mi_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size) { - const size_t bsize = page->xblock_size; - mi_assert_internal(bsize > 0 && (bsize%sizeof(void*)) == 0); - return _mi_segment_page_start(segment, page, bsize, page_size, NULL); +static inline uint8_t* mi_page_start(const mi_page_t* page) { + mi_assert_internal(page->page_start != NULL); + return page->page_start; } // Get the page containing the pointer @@ -466,16 +465,8 @@ static inline mi_page_t* _mi_ptr_page(void* p) { // Get the block size of a page (special case for huge objects) static inline size_t mi_page_block_size(const mi_page_t* page) { - const size_t bsize = page->xblock_size; - mi_assert_internal(bsize > 0); - if mi_likely(bsize < MI_HUGE_BLOCK_SIZE) { - return bsize; - } - else { - size_t psize; - _mi_segment_page_start(_mi_page_segment(page), page, bsize, &psize, NULL); - return psize; - } + mi_assert_internal(page->block_size > 0); + return page->block_size; } static inline bool mi_page_is_huge(const mi_page_t* page) { diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index d088b305..69d59527 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -181,7 +181,6 @@ typedef int32_t mi_ssize_t; #define MI_MEDIUM_OBJ_SIZE_MAX (MI_MEDIUM_PAGE_SIZE/4) // 128KiB #define MI_LARGE_OBJ_SIZE_MAX (MI_LARGE_PAGE_SIZE/2) // 2MiB #define MI_LARGE_OBJ_WSIZE_MAX (MI_LARGE_OBJ_SIZE_MAX/MI_INTPTR_SIZE) -#define MI_HUGE_OBJ_SIZE_MAX (2*MI_INTPTR_SIZE*MI_SEGMENT_SIZE) // (must match MI_REGION_MAX_ALLOC_SIZE in memory.c) // Maximum number of size classes. 
(spaced exponentially in 12.5% increments) #define MI_BIN_HUGE (73U) @@ -190,9 +189,6 @@ typedef int32_t mi_ssize_t; #error "mimalloc internal: define more bins" #endif -// Used as a special value to encode block sizes in 32 bits. -#define MI_HUGE_BLOCK_SIZE ((uint32_t)MI_HUGE_OBJ_SIZE_MAX) - // Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in dedicated huge page segments #define MI_BLOCK_ALIGNMENT_MAX (MI_SEGMENT_SIZE >> 1) @@ -258,7 +254,6 @@ typedef uintptr_t mi_thread_free_t; // implement a monotonic heartbeat. The `thread_free` list is needed for // avoiding atomic operations in the common case. // -// // `used - |thread_free|` == actual blocks that are in use (alive) // `used - |thread_free| + |free| + |local_free| == capacity` // @@ -266,16 +261,13 @@ typedef uintptr_t mi_thread_free_t; // the number of memory accesses in the `mi_page_all_free` function(s). // // Notes: -// - Access is optimized for `mi_free` and `mi_page_alloc` (in `alloc.c`) +// - Access is optimized for `free.c:mi_free` and `alloc.c:mi_page_alloc` // - Using `uint16_t` does not seem to slow things down -// - The size is 8 words on 64-bit which helps the page index calculations -// (and 10 words on 32-bit, and encoded free lists add 2 words. Sizes 10 -// and 12 are still good for address calculation) -// - To limit the structure size, the `xblock_size` is 32-bits only; for -// blocks > MI_HUGE_BLOCK_SIZE the size is determined from the segment page size +// - The size is 10 words on 64-bit which helps the page index calculations +// (and 14 words on 32-bit, and encoded free lists add 2 words) // - `xthread_free` uses the bottom bits as a delayed-free flags to optimize // concurrent frees where only the first concurrent free adds to the owning -// heap `thread_delayed_free` list (see `alloc.c:mi_free_block_mt`). +// heap `thread_delayed_free` list (see `free.c:mi_free_block_mt`). // The invariant is that no-delayed-free is only set if there is // at least one block that will be added, or as already been added, to // the owning heap `thread_delayed_free` list. 
This guarantees that pages @@ -290,16 +282,16 @@ typedef struct mi_page_s { // layout like this to optimize access in `mi_malloc` and `mi_free` uint16_t capacity; // number of blocks committed, must be the first field, see `segment.c:page_clear` uint16_t reserved; // number of blocks reserved in memory + uint16_t used; // number of blocks in use (including blocks in `thread_free`) mi_page_flags_t flags; // `in_full` and `has_aligned` flags (8 bits) + uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`) uint8_t free_is_zero:1; // `true` if the blocks in the free list are zero initialized uint8_t retire_expire:7; // expiration count for retired blocks - + // padding mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) - uint16_t used; // number of blocks in use (including blocks in `thread_free`) - uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`) - uint8_t block_offset_adj; // if not zero, then `(mi_page_start(_,page,_) - (uint8_t*)page - MI_MAX_ALIGN_SIZE*(block_offset_adj-1)) % block_size == 0)` (only used for fast path in `free.c:_mi_page_ptr_unalign`) - uint32_t xblock_size; // size available in each block (always `>0`) + size_t block_size; // size available in each block (always `>0`) + uint8_t* page_start; // start of the page area containing the blocks #if (MI_ENCODE_FREELIST || MI_PADDING) uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`) or padding canary @@ -310,6 +302,10 @@ typedef struct mi_page_s { struct mi_page_s* next; // next page owned by the heap with the same `block_size` struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` + + #if MI_INTPTR_SIZE==4 // pad to 14 words on 32-bit + void* padding[1]; + #endif } mi_page_t; @@ -548,7 +544,6 @@ typedef struct mi_stats_s { mi_stat_counter_t searches; mi_stat_counter_t normal_count; mi_stat_counter_t huge_count; - mi_stat_counter_t giant_count; mi_stat_counter_t arena_count; mi_stat_counter_t arena_crossover_count; mi_stat_counter_t arena_rollback_count; diff --git a/src/alloc.c b/src/alloc.c index 3a38a226..8b6c4de0 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -30,7 +30,7 @@ terms of the MIT license. A copy of the license can be found in the file // Note: in release mode the (inlined) routine is about 7 instructions with a single test. extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept { - mi_assert_internal(page->xblock_size==0||mi_page_block_size(page) >= size); + mi_assert_internal(page->block_size == 0 /* empty heap */ || mi_page_block_size(page) >= size); mi_block_t* const block = page->free; if mi_unlikely(block == NULL) { return _mi_malloc_generic(heap, size, zero, 0); @@ -53,14 +53,14 @@ extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t siz // zero the block? 
note: we need to zero the full block size (issue #63) if mi_unlikely(zero) { - mi_assert_internal(page->xblock_size != 0); // do not call with zero'ing for huge blocks (see _mi_malloc_generic) - mi_assert_internal(page->xblock_size >= MI_PADDING_SIZE); + mi_assert_internal(page->block_size != 0); // do not call with zero'ing for huge blocks (see _mi_malloc_generic) + mi_assert_internal(page->block_size >= MI_PADDING_SIZE); if (page->free_is_zero) { block->next = 0; - mi_track_mem_defined(block, page->xblock_size - MI_PADDING_SIZE); + mi_track_mem_defined(block, page->block_size - MI_PADDING_SIZE); } else { - _mi_memzero_aligned(block, page->xblock_size - MI_PADDING_SIZE); + _mi_memzero_aligned(block, page->block_size - MI_PADDING_SIZE); } } diff --git a/src/free.c b/src/free.c index 0e560e53..c66de6f6 100644 --- a/src/free.c +++ b/src/free.c @@ -55,14 +55,7 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p) { mi_assert_internal(page!=NULL && p!=NULL); - size_t diff; - if mi_likely(page->block_offset_adj != 0) { - diff = (uint8_t*)p - (uint8_t*)page - (MI_MAX_ALIGN_SIZE*(page->block_offset_adj - 1)); - } - else { - diff = (uint8_t*)p - _mi_page_start(segment, page, NULL); - } - + size_t diff = (uint8_t*)p - page->page_start; size_t adjust; if mi_likely(page->block_size_shift != 0) { adjust = diff & (((size_t)1 << page->block_size_shift) - 1); @@ -519,12 +512,7 @@ static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { #if !MI_HUGE_PAGE_ABANDON else { const size_t bpsize = mi_page_block_size(page); - if (bpsize <= MI_HUGE_OBJ_SIZE_MAX) { - mi_heap_stat_decrease(heap, huge, bpsize); - } - else { - mi_heap_stat_decrease(heap, giant, bpsize); - } + mi_heap_stat_decrease(heap, huge, bpsize); } #endif } diff --git a/src/heap.c b/src/heap.c index 18cfc706..21cdfa46 100644 --- a/src/heap.c +++ b/src/heap.c @@ -289,12 +289,7 @@ static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_ // stats const size_t bsize = mi_page_block_size(page); if (bsize > MI_LARGE_OBJ_SIZE_MAX) { - if (bsize > MI_HUGE_OBJ_SIZE_MAX) { - mi_heap_stat_decrease(heap, giant, bsize); - } - else { - mi_heap_stat_decrease(heap, huge, bsize); - } + mi_heap_stat_decrease(heap, huge, bsize); } #if (MI_STAT) _mi_page_free_collect(page, false); // update used count @@ -467,8 +462,7 @@ static bool mi_heap_page_check_owned(mi_heap_t* heap, mi_page_queue_t* pq, mi_pa MI_UNUSED(heap); MI_UNUSED(pq); bool* found = (bool*)vfound; - mi_segment_t* segment = _mi_page_segment(page); - void* start = _mi_page_start(segment, page, NULL); + void* start = mi_page_start(page); void* end = (uint8_t*)start + (page->capacity * mi_page_block_size(page)); *found = (p >= start && p < end); return (!*found); // continue if not found @@ -514,7 +508,7 @@ static bool mi_heap_area_visit_blocks(const mi_heap_area_ex_t* xarea, mi_block_v const size_t bsize = mi_page_block_size(page); const size_t ubsize = mi_page_usable_block_size(page); // without padding size_t psize; - uint8_t* pstart = _mi_page_start(_mi_page_segment(page), page, &psize); + uint8_t* pstart = _mi_segment_page_start(_mi_page_segment(page), page, &psize, NULL); if (page->capacity == 1) { // optimize page with one block @@ -581,7 +575,7 @@ static bool mi_heap_visit_areas_page(mi_heap_t* heap, mi_page_queue_t* pq, mi_pa xarea.page = page; xarea.area.reserved = page->reserved * bsize; xarea.area.committed = page->capacity * 
bsize; - xarea.area.blocks = _mi_page_start(_mi_page_segment(page), page, NULL); + xarea.area.blocks = mi_page_start(page); xarea.area.used = page->used; // number of blocks in use (#553) xarea.area.block_size = ubsize; xarea.area.full_block_size = bsize; diff --git a/src/init.c b/src/init.c index 8a20daca..79175f81 100644 --- a/src/init.c +++ b/src/init.c @@ -17,15 +17,15 @@ const mi_page_t _mi_page_empty = { 0, false, false, false, 0, // capacity 0, // reserved capacity + 0, // used { 0 }, // flags + 0, // block size shift false, // is_zero 0, // retire_expire NULL, // free NULL, // local_free - 0, // used - 0, // block size shift - 0, // block offset adj - 0, // xblock_size + 0, // block_size + NULL, // page_start #if (MI_PADDING || MI_ENCODE_FREELIST) { 0, 0 }, #endif @@ -78,7 +78,6 @@ const mi_page_t _mi_page_empty = { MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ - MI_STAT_COUNT_NULL(), \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 } \ diff --git a/src/page-queue.c b/src/page-queue.c index fbfe2811..095f9b81 100644 --- a/src/page-queue.c +++ b/src/page-queue.c @@ -11,6 +11,10 @@ terms of the MIT license. A copy of the license can be found in the file #ifndef MI_IN_PAGE_C #error "this file should be included from 'page.c'" +// include to help an IDE +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "mimalloc/atomic.h" #endif /* ----------------------------------------------------------- @@ -138,20 +142,20 @@ static bool mi_heap_contains_queue(const mi_heap_t* heap, const mi_page_queue_t* #endif static mi_page_queue_t* mi_page_queue_of(const mi_page_t* page) { - uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : mi_bin(page->xblock_size)); + uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : mi_bin(mi_page_block_size(page))); mi_heap_t* heap = mi_page_heap(page); mi_assert_internal(heap != NULL && bin <= MI_BIN_FULL); mi_page_queue_t* pq = &heap->pages[bin]; - mi_assert_internal(bin >= MI_BIN_HUGE || page->xblock_size == pq->block_size); + mi_assert_internal(bin >= MI_BIN_HUGE || mi_page_block_size(page) == pq->block_size); mi_assert_expensive(mi_page_queue_contains(pq, page)); return pq; } static mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t* page) { - uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : mi_bin(page->xblock_size)); + uint8_t bin = (mi_page_is_in_full(page) ? 
MI_BIN_FULL : mi_bin(mi_page_block_size(page))); mi_assert_internal(bin <= MI_BIN_FULL); mi_page_queue_t* pq = &heap->pages[bin]; - mi_assert_internal(mi_page_is_in_full(page) || page->xblock_size == pq->block_size); + mi_assert_internal(mi_page_is_in_full(page) || mi_page_block_size(page) == pq->block_size); return pq; } @@ -206,7 +210,7 @@ static bool mi_page_queue_is_empty(mi_page_queue_t* queue) { static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) { mi_assert_internal(page != NULL); mi_assert_expensive(mi_page_queue_contains(queue, page)); - mi_assert_internal(page->xblock_size == queue->block_size || (page->xblock_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); + mi_assert_internal(mi_page_block_size(page) == queue->block_size || (mi_page_block_size(page) > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); mi_heap_t* heap = mi_page_heap(page); if (page->prev != NULL) page->prev->next = page->next; if (page->next != NULL) page->next->prev = page->prev; @@ -231,8 +235,8 @@ static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_ #if MI_HUGE_PAGE_ABANDON mi_assert_internal(_mi_page_segment(page)->page_kind != MI_PAGE_HUGE); #endif - mi_assert_internal(page->xblock_size == queue->block_size || - (page->xblock_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_huge(queue)) || + mi_assert_internal(mi_page_block_size(page) == queue->block_size || + (mi_page_block_size(page) > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); mi_page_set_in_full(page, mi_page_queue_is_full(queue)); @@ -258,11 +262,13 @@ static void mi_page_queue_enqueue_from(mi_page_queue_t* to, mi_page_queue_t* fro mi_assert_internal(page != NULL); mi_assert_expensive(mi_page_queue_contains(from, page)); mi_assert_expensive(!mi_page_queue_contains(to, page)); - mi_assert_internal((page->xblock_size == to->block_size && page->xblock_size == from->block_size) || - (page->xblock_size == to->block_size && mi_page_queue_is_full(from)) || - (page->xblock_size == from->block_size && mi_page_queue_is_full(to)) || - (page->xblock_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_huge(to)) || - (page->xblock_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_full(to))); + const size_t bsize = mi_page_block_size(page); + MI_UNUSED(bsize); + mi_assert_internal((bsize == to->block_size && bsize == from->block_size) || + (bsize == to->block_size && mi_page_queue_is_full(from)) || + (bsize == from->block_size && mi_page_queue_is_full(to)) || + (bsize > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_huge(to)) || + (bsize > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_full(to))); mi_heap_t* heap = mi_page_heap(page); if (page->prev != NULL) page->prev->next = page->next; diff --git a/src/page.c b/src/page.c index 63780d63..ef58d883 100644 --- a/src/page.c +++ b/src/page.c @@ -59,7 +59,7 @@ static inline uint8_t* mi_page_area(const mi_page_t* page) { static bool mi_page_list_is_valid(mi_page_t* page, mi_block_t* p) { size_t psize; - uint8_t* page_area = _mi_page_start(_mi_page_segment(page), page, &psize); + uint8_t* page_area = _mi_segment_page_start(_mi_page_segment(page), page, &psize, NULL); mi_block_t* start = (mi_block_t*)page_area; mi_block_t* end = (mi_block_t*)(page_area + psize); while(p != NULL) { @@ -78,14 +78,14 @@ static bool mi_page_list_is_valid(mi_page_t* page, mi_block_t* p) { } 
static bool mi_page_is_valid_init(mi_page_t* page) { - mi_assert_internal(page->xblock_size > 0); + mi_assert_internal(mi_page_block_size(page) > 0); mi_assert_internal(page->used <= page->capacity); mi_assert_internal(page->capacity <= page->reserved); const size_t bsize = mi_page_block_size(page); mi_segment_t* segment = _mi_page_segment(page); - uint8_t* start = _mi_page_start(segment,page,NULL); - mi_assert_internal(start == _mi_segment_page_start(segment,page,bsize,NULL,NULL)); + uint8_t* start = mi_page_start(page); + mi_assert_internal(start == _mi_segment_page_start(segment,page,NULL,NULL)); //mi_assert_internal(start + page->capacity*page->block_size == page->top); mi_assert_internal(mi_page_list_is_valid(page,page->free)); @@ -283,10 +283,9 @@ static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size #if MI_HUGE_PAGE_ABANDON mi_assert_internal(pq==NULL || _mi_page_segment(page)->page_kind != MI_PAGE_HUGE); #endif - mi_assert_internal(pq!=NULL || page->xblock_size != 0); mi_assert_internal(pq!=NULL || mi_page_block_size(page) >= block_size); // a fresh page was found, initialize it - const size_t full_block_size = ((pq == NULL || mi_page_queue_is_huge(pq)) ? mi_page_block_size(page) : block_size); // see also: mi_segment_huge_page_alloc + const size_t full_block_size = (pq == NULL || mi_page_is_huge(page) ? mi_page_block_size(page) : block_size); // see also: mi_segment_huge_page_alloc mi_assert_internal(full_block_size >= block_size); mi_page_init(heap, page, full_block_size, heap->tld); mi_heap_stat_increase(heap, pages, 1); @@ -425,7 +424,7 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) { _mi_segment_page_free(page, force, segments_tld); } -#define MI_MAX_RETIRE_SIZE MI_LARGE_OBJ_SIZE_MAX +#define MI_MAX_RETIRE_SIZE MI_LARGE_OBJ_SIZE_MAX // should be less than size for MI_BIN_HUGE #define MI_RETIRE_CYCLES (16) // Retire a page with no more used blocks @@ -448,10 +447,12 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept { // how to check this efficiently though... // for now, we don't retire if it is the only page left of this size class. mi_page_queue_t* pq = mi_page_queue_of(page); - if mi_likely(page->xblock_size <= MI_MAX_RETIRE_SIZE && !mi_page_queue_is_special(pq)) { // not too large && not full or huge queue? + const size_t bsize = mi_page_block_size(page); + if mi_likely(bsize < MI_MAX_RETIRE_SIZE) { // not too large && not full or huge queue? + mi_assert_internal(!mi_page_queue_is_special(pq)); if (pq->last==page && pq->first==page) { // the only page in the queue? mi_stat_counter_increase(_mi_stats_main.page_no_retire,1); - page->retire_expire = (page->xblock_size <= MI_SMALL_OBJ_SIZE_MAX ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4); + page->retire_expire = (bsize <= MI_SMALL_OBJ_SIZE_MAX ? 
MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4); mi_heap_t* heap = mi_page_heap(page); mi_assert_internal(pq >= heap->pages); const size_t index = pq - heap->pages; @@ -514,7 +515,7 @@ static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* co #endif mi_assert_internal(page->capacity + extend <= page->reserved); mi_assert_internal(bsize == mi_page_block_size(page)); - void* const page_area = _mi_page_start(_mi_page_segment(page), page, NULL); + void* const page_area = mi_page_start(page); // initialize a randomized free list // set up `slice_count` slices to alternate between @@ -572,7 +573,7 @@ static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, co #endif mi_assert_internal(page->capacity + extend <= page->reserved); mi_assert_internal(bsize == mi_page_block_size(page)); - void* const page_area = _mi_page_start(_mi_page_segment(page), page, NULL ); + void* const page_area = mi_page_start(page); mi_block_t* const start = mi_page_block_at(page, page_area, bsize, page->capacity); @@ -616,15 +617,15 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) size_t page_size; //uint8_t* page_start = - _mi_page_start(_mi_page_segment(page), page, &page_size); + _mi_segment_page_start(_mi_page_segment(page), page, &page_size, NULL); mi_stat_counter_increase(tld->stats.pages_extended, 1); // calculate the extend count - const size_t bsize = (page->xblock_size < MI_HUGE_BLOCK_SIZE ? page->xblock_size : page_size); + const size_t bsize = mi_page_block_size(page); size_t extend = page->reserved - page->capacity; mi_assert_internal(extend > 0); - size_t max_extend = (bsize >= MI_MAX_EXTEND_SIZE ? MI_MIN_EXTEND : MI_MAX_EXTEND_SIZE/(uint32_t)bsize); + size_t max_extend = (bsize >= MI_MAX_EXTEND_SIZE ? MI_MIN_EXTEND : MI_MAX_EXTEND_SIZE/bsize); if (max_extend < MI_MIN_EXTEND) { max_extend = MI_MIN_EXTEND; } mi_assert_internal(max_extend > 0); @@ -658,10 +659,10 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi mi_assert_internal(block_size > 0); // set fields mi_page_set_heap(page, heap); + page->block_size = block_size; size_t page_size; - const void* page_start = _mi_segment_page_start(segment, page, block_size, &page_size, NULL); - mi_track_mem_noaccess(page_start,page_size); - page->xblock_size = (block_size < MI_HUGE_BLOCK_SIZE ? 
(uint32_t)block_size : MI_HUGE_BLOCK_SIZE); + page->page_start = _mi_segment_page_start(segment, page, &page_size, NULL); + mi_track_mem_noaccess(page->page_start,page_size); mi_assert_internal(page_size / block_size < (1L<<16)); page->reserved = (uint16_t)(page_size / block_size); mi_assert_internal(page->reserved > 0); @@ -673,20 +674,14 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi #if MI_DEBUG>2 if (page->is_zero_init) { mi_track_mem_defined(page_start, page_size); - mi_assert_expensive(!page->is_zero_init || mi_mem_is_zero(page_start, page_size)); + mi_assert_expensive(!page->is_zero_init || mi_mem_is_zero(page->page_start, page_size)); } #endif if (block_size > 0 && _mi_is_power_of_two(block_size)) { page->block_size_shift = (uint8_t)(mi_ctz((uintptr_t)block_size)); } - if (block_size > 0) { - const ptrdiff_t start_offset = (uint8_t*)page_start - (uint8_t*)page; - const ptrdiff_t start_adjust = start_offset % block_size; - if (start_offset >= 0 && (start_adjust % MI_MAX_ALIGN_SIZE) == 0 && (start_adjust / MI_MAX_ALIGN_SIZE) < 255) { - const ptrdiff_t adjust = (start_adjust / MI_MAX_ALIGN_SIZE); - mi_assert_internal(adjust + 1 == (uint8_t)(adjust + 1)); - page->block_offset_adj = (uint8_t)(adjust + 1); - } + else { + page->block_size_shift = 0; } mi_assert_internal(page->capacity == 0); @@ -701,8 +696,7 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi mi_assert_internal(page->keys[0] != 0); mi_assert_internal(page->keys[1] != 0); #endif - mi_assert_internal(page->block_size_shift == 0 || (block_size == (1UL << page->block_size_shift))); - mi_assert_internal(page->block_offset_adj == 0 || (((uint8_t*)page_start - (uint8_t*)page - MI_MAX_ALIGN_SIZE*(page->block_offset_adj-1))) % block_size == 0); + mi_assert_internal(page->block_size_shift == 0 || (block_size == ((size_t)1 << page->block_size_shift))); mi_assert_expensive(mi_page_is_valid_init(page)); // initialize an initial free list @@ -827,40 +821,31 @@ void mi_register_deferred_free(mi_deferred_free_fun* fn, void* arg) mi_attr_noex General allocation ----------------------------------------------------------- */ -// A huge page is allocated directly without being in a queue. -// Because huge pages contain just one block, and the segment contains -// just that page, we always treat them as abandoned and any thread -// that frees the block can free the whole page and segment directly. -// Huge pages are also use if the requested alignment is very large (> MI_BLOCK_ALIGNMENT_MAX). +// Huge pages contain just one block, and the segment contains just that page. +// Huge pages are also use if the requested alignment is very large (> MI_BLOCK_ALIGNMENT_MAX) +// so their size is not always `> MI_LARGE_OBJ_SIZE_MAX`. 
static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_alignment) { size_t block_size = _mi_os_good_alloc_size(size); mi_assert_internal(mi_bin(block_size) == MI_BIN_HUGE || page_alignment > 0); #if MI_HUGE_PAGE_ABANDON mi_page_queue_t* pq = NULL; #else - mi_page_queue_t* pq = mi_page_queue(heap, MI_HUGE_OBJ_SIZE_MAX); // not block_size as that can be low if the page_alignment > 0 - mi_assert_internal(mi_page_queue_is_huge(pq)); + mi_page_queue_t* pq = mi_page_queue(heap, block_size); + // mi_assert_internal(mi_page_queue_is_huge(pq)); #endif - mi_page_t* page = mi_page_fresh_alloc(heap, pq, block_size,page_alignment); - if (page != NULL) { - const size_t bsize = mi_page_block_size(page); // note: not `mi_page_usable_block_size` as `size` includes padding already - mi_assert_internal(bsize >= size); + mi_page_t* page = mi_page_fresh_alloc(heap, pq, block_size, page_alignment); + if (page != NULL) { + mi_assert_internal(mi_page_block_size(page) >= size); mi_assert_internal(mi_page_immediate_available(page)); mi_assert_internal(_mi_page_segment(page)->page_kind==MI_PAGE_HUGE); + mi_assert_internal(mi_page_is_huge(page)); mi_assert_internal(_mi_page_segment(page)->used==1); #if MI_HUGE_PAGE_ABANDON mi_assert_internal(_mi_page_segment(page)->thread_id==0); // abandoned, not in the huge queue mi_page_set_heap(page, NULL); #endif - - if (bsize > MI_HUGE_OBJ_SIZE_MAX) { - mi_heap_stat_increase(heap, giant, bsize); - mi_heap_stat_counter_increase(heap, giant_count, 1); - } - else { - mi_heap_stat_increase(heap, huge, bsize); - mi_heap_stat_counter_increase(heap, huge_count, 1); - } + mi_heap_stat_increase(heap, huge, mi_page_block_size(page)); + mi_heap_stat_counter_increase(heap, huge_count, 1); } return page; } @@ -927,7 +912,7 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al mi_assert_internal(mi_page_block_size(page) >= size); // and try again, this time succeeding! (i.e. this should never recurse through _mi_page_malloc) - if mi_unlikely(zero && page->xblock_size == 0) { + if mi_unlikely(zero && page->block_size == 0) { // note: we cannot call _mi_page_malloc with zeroing for huge blocks; we zero it afterwards in that case. 
void* p = _mi_page_malloc(heap, page, size, false); mi_assert_internal(p != NULL); diff --git a/src/segment.c b/src/segment.c index 359815ce..a4fd26e9 100644 --- a/src/segment.c +++ b/src/segment.c @@ -412,13 +412,13 @@ static uint8_t* mi_segment_raw_page_start(const mi_segment_t* segment, const mi_ #endif if (page_size != NULL) *page_size = psize; - mi_assert_internal(page->xblock_size == 0 || _mi_ptr_page(p) == page); + mi_assert_internal(page->block_size == 0 || _mi_ptr_page(p) == page); mi_assert_internal(_mi_ptr_segment(p) == segment); return p; } // Start of the page available memory; can be used on uninitialized pages (only `segment_idx` must be set) -uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t block_size, size_t* page_size, size_t* pre_size) +static uint8_t* mi_segment_page_start_ex(const mi_segment_t* segment, const mi_page_t* page, size_t block_size, size_t* page_size, size_t* pre_size) { size_t psize; uint8_t* p = mi_segment_raw_page_start(segment, page, &psize); @@ -437,11 +437,15 @@ uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* pa } if (page_size != NULL) *page_size = psize; - mi_assert_internal(page->xblock_size==0 || _mi_ptr_page(p) == page); + mi_assert_internal(_mi_ptr_page(p) == page); mi_assert_internal(_mi_ptr_segment(p) == segment); return p; } +uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size, size_t* pre_size) { + return mi_segment_page_start_ex(segment, page, mi_page_block_size(page), page_size, pre_size); +} + static size_t mi_segment_calculate_sizes(size_t capacity, size_t required, size_t* pre_size, size_t* info_size) { const size_t minsize = sizeof(mi_segment_t) + ((capacity - 1) * sizeof(mi_page_t)) + 16 /* padding */; @@ -707,15 +711,19 @@ static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, mi_seg page->is_zero_init = false; page->segment_in_use = false; - // zero the page data, but not the segment fields and capacity, and block_size (for page size calculations) - uint32_t block_size = page->xblock_size; + // zero the page data, but not the segment fields and capacity, page start, and block_size (for page size calculations) + size_t block_size = page->block_size; + uint8_t block_size_shift = page->block_size_shift; + uint8_t* page_start = page->page_start; uint16_t capacity = page->capacity; uint16_t reserved = page->reserved; ptrdiff_t ofs = offsetof(mi_page_t,capacity); _mi_memzero((uint8_t*)page + ofs, sizeof(*page) - ofs); page->capacity = capacity; page->reserved = reserved; - page->xblock_size = block_size; + page->block_size = block_size; + page->block_size_shift = block_size_shift; + page->page_start = page_start; segment->used--; // schedule purge @@ -831,7 +839,6 @@ void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld) { // Possibly clear pages and check if free space is available static bool mi_segment_check_free(mi_segment_t* segment, size_t block_size, bool* all_pages_free) { - mi_assert_internal(block_size < MI_HUGE_BLOCK_SIZE); bool has_page = false; size_t pages_used = 0; size_t pages_used_empty = 0; @@ -847,7 +854,7 @@ static bool mi_segment_check_free(mi_segment_t* segment, size_t block_size, bool pages_used_empty++; has_page = true; } - else if (page->xblock_size == block_size && mi_page_has_any_available(page)) { + else if (mi_page_block_size(page) == block_size && mi_page_has_any_available(page)) { // a page has available free blocks of the right size has_page = true; } 
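The mi_segment_page_clear hunk above uses a partial-clear idiom worth spelling out: everything from `capacity` onward is zeroed in one call, and the few fields that must survive (block size, shift, page start, capacity, reserved) are saved and restored around it. A simplified sketch, assuming `capacity` remains the first clearable field of mi_page_t (memset stands in here for the internal _mi_memzero):

#include <stddef.h>   // offsetof
#include <string.h>   // memset

static void page_clear_sketch(mi_page_t* page) {
  const size_t    block_size       = page->block_size;        // kept for later page-size calculations
  const uint8_t   block_size_shift = page->block_size_shift;
  uint8_t* const  page_start       = page->page_start;
  const uint16_t  capacity         = page->capacity;
  const uint16_t  reserved         = page->reserved;
  const ptrdiff_t ofs = offsetof(mi_page_t, capacity);         // clear from `capacity` to the end of the struct
  memset((uint8_t*)page + ofs, 0, sizeof(*page) - ofs);
  page->capacity         = capacity;
  page->reserved         = reserved;
  page->block_size       = block_size;
  page->block_size_shift = block_size_shift;
  page->page_start       = page_start;
}

This ordering constraint is why the `capacity` field in types.h carries the comment "must be the first field, see `segment.c:page_clear`".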
@@ -901,7 +908,7 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, else { // otherwise reclaim it into the heap _mi_page_reclaim(heap, page); - if (requested_block_size == page->xblock_size && mi_page_has_any_available(page)) { + if (requested_block_size == mi_page_block_size(page) && mi_page_has_any_available(page)) { if (right_page_reclaimed != NULL) { *right_page_reclaimed = true; } } } @@ -1008,7 +1015,7 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size, static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t block_size, mi_page_kind_t page_kind, size_t page_shift, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { mi_assert_internal(page_kind <= MI_PAGE_LARGE); - mi_assert_internal(block_size < MI_HUGE_BLOCK_SIZE); + mi_assert_internal(block_size <= MI_LARGE_OBJ_SIZE_MAX); // 1. try to reclaim an abandoned segment bool reclaimed; @@ -1077,7 +1084,7 @@ static mi_page_t* mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, mi_p mi_assert_internal(page != NULL); #if MI_DEBUG>=2 && !MI_TRACK_ENABLED // && !MI_TSAN // verify it is committed - _mi_segment_page_start(_mi_page_segment(page), page, sizeof(void*), NULL, NULL)[0] = 0; + mi_segment_page_start_ex(_mi_page_segment(page), page, sizeof(void*), NULL, NULL)[0] = 0; #endif return page; } @@ -1100,7 +1107,7 @@ static mi_page_t* mi_segment_large_page_alloc(mi_heap_t* heap, size_t block_size mi_page_t* page = mi_segment_find_free(segment, tld); mi_assert_internal(page != NULL); #if MI_DEBUG>=2 && !MI_TRACK_ENABLED // && !MI_TSAN - _mi_segment_page_start(segment, page, sizeof(void*), NULL, NULL)[0] = 0; + mi_segment_page_start_ex(segment, page, sizeof(void*), NULL, NULL)[0] = 0; #endif return page; } @@ -1117,11 +1124,11 @@ static mi_page_t* mi_segment_huge_page_alloc(size_t size, size_t page_alignment, mi_page_t* page = mi_segment_find_free(segment, tld); mi_assert_internal(page != NULL); - // for huge pages we initialize the xblock_size as we may + // for huge pages we initialize the block_size as we may // overallocate to accommodate large alignments. size_t psize; - uint8_t* start = _mi_segment_page_start(segment, page, 0, &psize, NULL); - page->xblock_size = (psize > MI_HUGE_BLOCK_SIZE ? MI_HUGE_BLOCK_SIZE : (uint32_t)psize); + uint8_t* start = mi_segment_page_start_ex(segment, page, 0, &psize, NULL); + page->block_size = psize; // reset the part of the page that will not be used; this can be quite large (close to MI_SEGMENT_SIZE) if (page_alignment > 0 && segment->allow_decommit && page->is_committed) { diff --git a/src/stats.c b/src/stats.c index 8fbdfc45..5dfd713b 100644 --- a/src/stats.c +++ b/src/stats.c @@ -117,8 +117,7 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) { mi_stat_counter_add(&stats->page_no_retire, &src->page_no_retire, 1); mi_stat_counter_add(&stats->searches, &src->searches, 1); mi_stat_counter_add(&stats->normal_count, &src->normal_count, 1); - mi_stat_counter_add(&stats->huge_count, &src->huge_count, 1); - mi_stat_counter_add(&stats->giant_count, &src->giant_count, 1); + mi_stat_counter_add(&stats->huge_count, &src->huge_count, 1); #if MI_STAT>1 for (size_t i = 0; i <= MI_BIN_HUGE; i++) { if (src->normal_bins[i].allocated > 0 || src->normal_bins[i].freed > 0) { @@ -316,12 +315,10 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) #endif #if MI_STAT mi_stat_print(&stats->normal, "normal", (stats->normal_count.count == 0 ? 
1 : -(stats->normal.allocated / stats->normal_count.count)), out, arg); - mi_stat_print(&stats->huge, "huge", (stats->huge_count.count == 0 ? 1 : -(stats->huge.allocated / stats->huge_count.count)), out, arg); - mi_stat_print(&stats->giant, "giant", (stats->giant_count.count == 0 ? 1 : -(stats->giant.allocated / stats->giant_count.count)), out, arg); + mi_stat_print(&stats->huge, "huge", (stats->huge_count.count == 0 ? 1 : -(stats->huge.allocated / stats->huge_count.count)), out, arg); mi_stat_count_t total = { 0,0,0,0 }; mi_stat_add(&total, &stats->normal, 1); mi_stat_add(&total, &stats->huge, 1); - mi_stat_add(&total, &stats->giant, 1); mi_stat_print(&total, "total", 1, out, arg); #endif #if MI_STAT>1 From a8a53e3e85fbe8c8f997078399ee089880614ebf Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sun, 24 Mar 2024 14:50:15 -0700 Subject: [PATCH 17/28] fix double counting of free-ing for non-thread-local free calls --- include/mimalloc/internal.h | 2 +- src/alloc-aligned.c | 2 +- src/free.c | 27 ++++++++++++++------------- src/options.c | 2 +- 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 21dc9d62..29943357 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -200,7 +200,7 @@ void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size, bool void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept; void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept; // called from `_mi_heap_malloc_aligned` void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept; -mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p); +mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p); bool _mi_free_delayed_block(mi_block_t* block); void _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept; // for runtime integration void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size); diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c index 5f60b2fc..b63c5e43 100644 --- a/src/alloc-aligned.c +++ b/src/alloc-aligned.c @@ -69,7 +69,7 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t* // todo: expand padding if overallocated ? 
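The dropped `segment` parameter in the `_mi_page_ptr_unalign` prototype above works because the page now carries its own `page_start`, so adjusting an interior pointer back to its block start needs nothing from the segment. A hedged sketch of that computation (simplified from `_mi_page_ptr_unalign` as it appears later in this series; not the literal patched code):

static mi_block_t* page_ptr_unalign_sketch(const mi_page_t* page, const void* p) {
  const size_t diff = (size_t)((uint8_t*)p - page->page_start);
  size_t adjust;
  if (page->block_size_shift != 0) {
    adjust = diff & (((size_t)1 << page->block_size_shift) - 1);   // power-of-two block size: mask
  }
  else {
    adjust = diff % mi_page_block_size(page);                      // general case: modulo
  }
  return (mi_block_t*)((uintptr_t)p - adjust);
}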
mi_assert_internal(mi_page_usable_block_size(_mi_ptr_page(p)) >= adjust + size); - mi_assert_internal(p == _mi_page_ptr_unalign(_mi_ptr_segment(aligned_p), _mi_ptr_page(aligned_p), aligned_p)); + mi_assert_internal(p == _mi_page_ptr_unalign(_mi_ptr_page(aligned_p), aligned_p)); mi_assert_internal(((uintptr_t)aligned_p + offset) % alignment == 0); mi_assert_internal(mi_usable_size(aligned_p)>=size); mi_assert_internal(mi_usable_size(p) == mi_usable_size(aligned_p)+adjust); diff --git a/src/free.c b/src/free.c index c66de6f6..39443ccf 100644 --- a/src/free.c +++ b/src/free.c @@ -29,17 +29,17 @@ static mi_decl_noinline void mi_free_block_mt(mi_segment_t* segment, mi_page_t* // regular free of a (thread local) block pointer // fast path written carefully to prevent spilling on the stack -static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool check_full) +static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool track_stats, bool check_full) { // checks if mi_unlikely(mi_check_is_double_free(page, block)) return; mi_check_padding(page, block); - mi_stat_free(page, block); + if (track_stats) { mi_stat_free(page, block); } #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); #endif - mi_track_free_size(p, mi_page_usable_size_of(page,block)); // faster then mi_usable_size as we already know the page and that p is unaligned - + if (track_stats) { mi_track_free_size(p, mi_page_usable_size_of(page, block)); } // faster then mi_usable_size as we already know the page and that p is unaligned + // actual free: push on the local free list mi_block_set_next(page, block, page->local_free); page->local_free = block; @@ -52,7 +52,7 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool } // Adjust a block that was allocated aligned, to the actual start of the block in the page. -mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p) { +mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p) { mi_assert_internal(page!=NULL && p!=NULL); size_t diff = (uint8_t*)p - page->page_start; @@ -69,13 +69,14 @@ mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* p // free a local pointer (page parameter comes first for better codegen) static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept { - mi_block_t* const block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(segment, page, p) : (mi_block_t*)p); - mi_free_block_local(page, block, true); + MI_UNUSED(segment); + mi_block_t* const block = (mi_page_has_aligned(page) ? 
_mi_page_ptr_unalign(page, p) : (mi_block_t*)p); + mi_free_block_local(page, block, true, true); } // free a pointer owned by another thread (page parameter comes first for better codegen) static void mi_decl_noinline mi_free_generic_mt(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept { - mi_block_t* const block = _mi_page_ptr_unalign(segment, page, p); // don't check `has_aligned` flag to avoid a race (issue #865) + mi_block_t* const block = _mi_page_ptr_unalign(page, p); // don't check `has_aligned` flag to avoid a race (issue #865) mi_free_block_mt(segment, page, block); } @@ -135,7 +136,7 @@ void mi_free(void* p) mi_attr_noexcept if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) // thread-local, aligned, and not a full page mi_block_t* const block = (mi_block_t*)p; - mi_free_block_local(page, block, false /* no need to check if the page is full */); + mi_free_block_local(page, block, true, false /* no need to check if the page is full */); } else { // page is full or contains (inner) aligned blocks; use generic path @@ -170,7 +171,7 @@ bool _mi_free_delayed_block(mi_block_t* block) { _mi_page_free_collect(page, false); // and free the block (possibly freeing the page as well since used is updated) - mi_free_block_local(page, block, true); + mi_free_block_local(page, block, false /* stats have already been adjusted */, true); return true; } @@ -287,8 +288,8 @@ static void mi_decl_noinline mi_free_block_mt(mi_segment_t* segment, mi_page_t* // ------------------------------------------------------ // Bytes available in a block -static size_t mi_decl_noinline mi_page_usable_aligned_size_of(const mi_segment_t* segment, const mi_page_t* page, const void* p) mi_attr_noexcept { - const mi_block_t* block = _mi_page_ptr_unalign(segment, page, p); +static size_t mi_decl_noinline mi_page_usable_aligned_size_of(const mi_page_t* page, const void* p) mi_attr_noexcept { + const mi_block_t* block = _mi_page_ptr_unalign(page, p); const size_t size = mi_page_usable_size_of(page, block); const ptrdiff_t adjust = (uint8_t*)p - (uint8_t*)block; mi_assert_internal(adjust >= 0 && (size_t)adjust <= size); @@ -305,7 +306,7 @@ static inline size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noe } else { // split out to separate routine for improved code generation - return mi_page_usable_aligned_size_of(segment, page, p); + return mi_page_usable_aligned_size_of(page, p); } } diff --git a/src/options.c b/src/options.c index f8e928d0..8a84d344 100644 --- a/src/options.c +++ b/src/options.c @@ -91,7 +91,7 @@ static mi_option_desc_t options[_mi_option_last] = { 10, UNINIT, MI_OPTION(arena_purge_mult) }, // purge delay multiplier for arena's { 1, UNINIT, MI_OPTION_LEGACY(purge_extend_delay, decommit_extend_delay) }, - { 1, UNINIT, MI_OPTION(abandoned_reclaim_on_free) }, // reclaim an abandoned segment on a free + { 0, UNINIT, MI_OPTION(abandoned_reclaim_on_free) }, // reclaim an abandoned segment on a free }; static void mi_option_init(mi_option_desc_t* desc); From 9c96d05ee435a8931c685f33cc115f58765e530b Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sun, 24 Mar 2024 14:52:50 -0700 Subject: [PATCH 18/28] abandoned reclaim on free is on by default --- src/free.c | 6 +++--- src/options.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/free.c b/src/free.c index 39443ccf..87847f21 100644 --- a/src/free.c +++ b/src/free.c @@ -25,7 +25,7 
@@ static void mi_stat_free(const mi_page_t* page, const mi_block_t* block); // ------------------------------------------------------ // forward declaration of multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) -static mi_decl_noinline void mi_free_block_mt(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); +static mi_decl_noinline void mi_free_block_mt(mi_page_t* page, mi_segment_t* segment, mi_block_t* block); // regular free of a (thread local) block pointer // fast path written carefully to prevent spilling on the stack @@ -77,7 +77,7 @@ static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, mi_segment_t // free a pointer owned by another thread (page parameter comes first for better codegen) static void mi_decl_noinline mi_free_generic_mt(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept { mi_block_t* const block = _mi_page_ptr_unalign(page, p); // don't check `has_aligned` flag to avoid a race (issue #865) - mi_free_block_mt(segment, page, block); + mi_free_block_mt(page, segment, block); } // generic free (for runtime integration) @@ -230,7 +230,7 @@ static void mi_stat_huge_free(const mi_page_t* page); #endif // Multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) -static void mi_decl_noinline mi_free_block_mt(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) +static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_segment_t* segment, mi_block_t* block) { // first see if the segment was abandoned and if we can reclaim it into our thread if (mi_option_is_enabled(mi_option_abandoned_reclaim_on_free) && diff --git a/src/options.c b/src/options.c index 8a84d344..f8e928d0 100644 --- a/src/options.c +++ b/src/options.c @@ -91,7 +91,7 @@ static mi_option_desc_t options[_mi_option_last] = { 10, UNINIT, MI_OPTION(arena_purge_mult) }, // purge delay multiplier for arena's { 1, UNINIT, MI_OPTION_LEGACY(purge_extend_delay, decommit_extend_delay) }, - { 0, UNINIT, MI_OPTION(abandoned_reclaim_on_free) }, // reclaim an abandoned segment on a free + { 1, UNINIT, MI_OPTION(abandoned_reclaim_on_free) }, // reclaim an abandoned segment on a free }; static void mi_option_init(mi_option_desc_t* desc); From 006ae2d055ea6a7d847621963dc85a8c39423fa7 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 24 Mar 2024 17:07:28 -0700 Subject: [PATCH 19/28] add is_huge page flag to ensure the right page queue is returned (see #868) --- include/mimalloc/internal.h | 4 +++- include/mimalloc/types.h | 5 +++-- src/alloc.c | 2 +- src/init.c | 8 +++++--- src/page-queue.c | 34 ++++++++++++++++++---------------- src/page.c | 19 +++++++++---------- src/segment.c | 9 +++++++-- 7 files changed, 46 insertions(+), 35 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 29943357..4df8ca68 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -470,7 +470,9 @@ static inline size_t mi_page_block_size(const mi_page_t* page) { } static inline bool mi_page_is_huge(const mi_page_t* page) { - return (_mi_page_segment(page)->page_kind == MI_PAGE_HUGE); + mi_assert_internal((page->is_huge && _mi_page_segment(page)->page_kind == MI_PAGE_HUGE) || + (!page->is_huge && _mi_page_segment(page)->page_kind != MI_PAGE_HUGE)); + return page->is_huge; } // Get the usable block size of a page without fixed padding. 
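The new `mi_page_is_huge` above records "huge-ness" directly on the page, so later queue selection can test the flag instead of inferring it from the block size (which under-reports for alignment-driven huge pages whose block size is not `> MI_LARGE_OBJ_SIZE_MAX`, issue #868), and without reaching into the segment header. The two forms side by side, as a sketch drawn from the hunk above (the patched version additionally asserts that the flag agrees with the segment kind):

// before: derived from the owning segment's page kind
static bool page_is_huge_via_segment(const mi_page_t* page) {
  return (_mi_page_segment(page)->page_kind == MI_PAGE_HUGE);
}

// after: a bit stored on the page itself, set when the pages of a
// MI_PAGE_HUGE segment are initialized (see the segment.c hunk further below)
static bool page_is_huge_via_flag(const mi_page_t* page) {
  return page->is_huge;
}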
diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 69d59527..6b22c83e 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2023, Microsoft Research, Daan Leijen +Copyright (c) 2018-2024, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -278,6 +278,7 @@ typedef struct mi_page_s { uint8_t segment_in_use:1; // `true` if the segment allocated this page uint8_t is_committed:1; // `true` if the page virtual memory is committed uint8_t is_zero_init:1; // `true` if the page was initially zero initialized + uint8_t is_huge:1; // `true` if the page is in a huge segment // layout like this to optimize access in `mi_malloc` and `mi_free` uint16_t capacity; // number of blocks committed, must be the first field, see `segment.c:page_clear` @@ -285,7 +286,7 @@ typedef struct mi_page_s { uint16_t used; // number of blocks in use (including blocks in `thread_free`) mi_page_flags_t flags; // `in_full` and `has_aligned` flags (8 bits) uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`) - uint8_t free_is_zero:1; // `true` if the blocks in the free list are zero initialized + uint8_t free_is_zero:1; // `true` if the blocks in the free list are zero initialized uint8_t retire_expire:7; // expiration count for retired blocks // padding mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) diff --git a/src/alloc.c b/src/alloc.c index 8b6c4de0..2e03eca0 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2022, Microsoft Research, Daan Leijen +Copyright (c) 2018-2024, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. diff --git a/src/init.c b/src/init.c index 79175f81..604809ad 100644 --- a/src/init.c +++ b/src/init.c @@ -14,7 +14,8 @@ terms of the MIT license. 
A copy of the license can be found in the file // Empty page used to initialize the small free pages array const mi_page_t _mi_page_empty = { - 0, false, false, false, + 0, + false, false, false, false, 0, // capacity 0, // reserved capacity 0, // used @@ -78,9 +79,10 @@ const mi_page_t _mi_page_empty = { MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ + MI_STAT_COUNT_NULL(), \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ - { 0, 0 }, { 0, 0 }, { 0, 0 } \ + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } \ MI_STAT_COUNT_END_NULL() // -------------------------------------------------------- diff --git a/src/page-queue.c b/src/page-queue.c index 095f9b81..e4bfde14 100644 --- a/src/page-queue.c +++ b/src/page-queue.c @@ -1,5 +1,5 @@ /*---------------------------------------------------------------------------- -Copyright (c) 2018-2020, Microsoft Research, Daan Leijen +Copyright (c) 2018-2024, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -141,21 +141,21 @@ static bool mi_heap_contains_queue(const mi_heap_t* heap, const mi_page_queue_t* } #endif -static mi_page_queue_t* mi_page_queue_of(const mi_page_t* page) { - uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : mi_bin(mi_page_block_size(page))); - mi_heap_t* heap = mi_page_heap(page); - mi_assert_internal(heap != NULL && bin <= MI_BIN_FULL); +static mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t* page) { + mi_assert_internal(heap!=NULL); + uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : (mi_page_is_huge(page) ? MI_BIN_HUGE : mi_bin(mi_page_block_size(page)))); + mi_assert_internal(bin <= MI_BIN_FULL); mi_page_queue_t* pq = &heap->pages[bin]; - mi_assert_internal(bin >= MI_BIN_HUGE || mi_page_block_size(page) == pq->block_size); - mi_assert_expensive(mi_page_queue_contains(pq, page)); + mi_assert_internal((mi_page_block_size(page) == pq->block_size) || + (mi_page_is_huge(page) && mi_page_queue_is_huge(pq)) || + (mi_page_is_in_full(page) && mi_page_queue_is_full(pq))); return pq; } -static mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t* page) { - uint8_t bin = (mi_page_is_in_full(page) ? 
MI_BIN_FULL : mi_bin(mi_page_block_size(page))); - mi_assert_internal(bin <= MI_BIN_FULL); - mi_page_queue_t* pq = &heap->pages[bin]; - mi_assert_internal(mi_page_is_in_full(page) || mi_page_block_size(page) == pq->block_size); +static mi_page_queue_t* mi_page_queue_of(const mi_page_t* page) { + mi_heap_t* heap = mi_page_heap(page); + mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page); + mi_assert_expensive(mi_page_queue_contains(pq, page)); return pq; } @@ -210,7 +210,9 @@ static bool mi_page_queue_is_empty(mi_page_queue_t* queue) { static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) { mi_assert_internal(page != NULL); mi_assert_expensive(mi_page_queue_contains(queue, page)); - mi_assert_internal(mi_page_block_size(page) == queue->block_size || (mi_page_block_size(page) > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); + mi_assert_internal(mi_page_block_size(page) == queue->block_size || + (mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) || + (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); mi_heap_t* heap = mi_page_heap(page); if (page->prev != NULL) page->prev->next = page->next; if (page->next != NULL) page->next->prev = page->prev; @@ -236,7 +238,7 @@ static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_ mi_assert_internal(_mi_page_segment(page)->page_kind != MI_PAGE_HUGE); #endif mi_assert_internal(mi_page_block_size(page) == queue->block_size || - (mi_page_block_size(page) > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_huge(queue)) || + (mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); mi_page_set_in_full(page, mi_page_queue_is_full(queue)); @@ -267,8 +269,8 @@ static void mi_page_queue_enqueue_from(mi_page_queue_t* to, mi_page_queue_t* fro mi_assert_internal((bsize == to->block_size && bsize == from->block_size) || (bsize == to->block_size && mi_page_queue_is_full(from)) || (bsize == from->block_size && mi_page_queue_is_full(to)) || - (bsize > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_huge(to)) || - (bsize > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_full(to))); + (mi_page_is_huge(page) && mi_page_queue_is_huge(to)) || + (mi_page_is_huge(page) && mi_page_queue_is_full(to))); mi_heap_t* heap = mi_page_heap(page); if (page->prev != NULL) page->prev->next = page->next; diff --git a/src/page.c b/src/page.c index ef58d883..d36421f4 100644 --- a/src/page.c +++ b/src/page.c @@ -1,5 +1,5 @@ /*---------------------------------------------------------------------------- -Copyright (c) 2018-2020, Microsoft Research, Daan Leijen +Copyright (c) 2018-2024, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -82,7 +82,7 @@ static bool mi_page_is_valid_init(mi_page_t* page) { mi_assert_internal(page->used <= page->capacity); mi_assert_internal(page->capacity <= page->reserved); - const size_t bsize = mi_page_block_size(page); + // const size_t bsize = mi_page_block_size(page); mi_segment_t* segment = _mi_page_segment(page); uint8_t* start = mi_page_start(page); mi_assert_internal(start == _mi_segment_page_start(segment,page,NULL,NULL)); @@ -448,8 +448,7 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept { // for now, we don't retire if it is the only page left of this size class. 
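For the same reason, the rewritten `mi_heap_page_queue_of` above picks the huge bin from the page flag rather than from the size class: `mi_bin(block_size)` for an over-allocated aligned huge page would land in a normal bin. A condensed sketch of the selection rule (illustrative; the real code is the page-queue.c hunk above):

uint8_t bin;
if (mi_page_is_in_full(page))    bin = MI_BIN_FULL;                       // full pages: the full queue
else if (mi_page_is_huge(page))  bin = MI_BIN_HUGE;                       // huge pages: always the huge queue
else                             bin = mi_bin(mi_page_block_size(page));  // otherwise: the size-class bin
mi_page_queue_t* pq = &heap->pages[bin];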
mi_page_queue_t* pq = mi_page_queue_of(page); const size_t bsize = mi_page_block_size(page); - if mi_likely(bsize < MI_MAX_RETIRE_SIZE) { // not too large && not full or huge queue? - mi_assert_internal(!mi_page_queue_is_special(pq)); + if mi_likely( /* bsize < MI_MAX_RETIRE_SIZE && */ !mi_page_queue_is_special(pq)) { // not full or huge queue? if (pq->last==page && pq->first==page) { // the only page in the queue? mi_stat_counter_increase(_mi_stats_main.page_no_retire,1); page->retire_expire = (bsize <= MI_SMALL_OBJ_SIZE_MAX ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4); @@ -662,7 +661,7 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi page->block_size = block_size; size_t page_size; page->page_start = _mi_segment_page_start(segment, page, &page_size, NULL); - mi_track_mem_noaccess(page->page_start,page_size); + mi_track_mem_noaccess(page->page_start,page_size); mi_assert_internal(page_size / block_size < (1L<<16)); page->reserved = (uint16_t)(page_size / block_size); mi_assert_internal(page->reserved > 0); @@ -821,7 +820,7 @@ void mi_register_deferred_free(mi_deferred_free_fun* fn, void* arg) mi_attr_noex General allocation ----------------------------------------------------------- */ -// Huge pages contain just one block, and the segment contains just that page. +// Huge pages contain just one block, and the segment contains just that page. // Huge pages are also use if the requested alignment is very large (> MI_BLOCK_ALIGNMENT_MAX) // so their size is not always `> MI_LARGE_OBJ_SIZE_MAX`. static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_alignment) { @@ -830,15 +829,15 @@ static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_a #if MI_HUGE_PAGE_ABANDON mi_page_queue_t* pq = NULL; #else - mi_page_queue_t* pq = mi_page_queue(heap, block_size); - // mi_assert_internal(mi_page_queue_is_huge(pq)); + mi_page_queue_t* pq = mi_page_queue(heap, MI_LARGE_OBJ_SIZE_MAX+1); // always in the huge queue regardless of the block size + mi_assert_internal(mi_page_queue_is_huge(pq)); #endif mi_page_t* page = mi_page_fresh_alloc(heap, pq, block_size, page_alignment); - if (page != NULL) { + if (page != NULL) { mi_assert_internal(mi_page_block_size(page) >= size); mi_assert_internal(mi_page_immediate_available(page)); - mi_assert_internal(_mi_page_segment(page)->page_kind==MI_PAGE_HUGE); mi_assert_internal(mi_page_is_huge(page)); + mi_assert_internal(_mi_page_segment(page)->page_kind == MI_PAGE_HUGE); mi_assert_internal(_mi_page_segment(page)->used==1); #if MI_HUGE_PAGE_ABANDON mi_assert_internal(_mi_page_segment(page)->thread_id==0); // abandoned, not in the huge queue diff --git a/src/segment.c b/src/segment.c index a4fd26e9..cec3079e 100644 --- a/src/segment.c +++ b/src/segment.c @@ -142,6 +142,7 @@ static bool mi_segment_is_valid(const mi_segment_t* segment, mi_segments_tld_t* mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie); mi_assert_internal(segment->used <= segment->capacity); mi_assert_internal(segment->abandoned <= segment->used); + mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM || segment->capacity == 1); size_t nfree = 0; for (size_t i = 0; i < segment->capacity; i++) { const mi_page_t* const page = &segment->pages[i]; @@ -151,6 +152,7 @@ static bool mi_segment_is_valid(const mi_segment_t* segment, mi_segments_tld_t* if (page->segment_in_use) { mi_assert_expensive(!mi_pages_purge_contains(page, tld)); } + if (segment->page_kind == MI_PAGE_HUGE) mi_assert_internal(page->is_huge); } 
mi_assert_internal(nfree + segment->used == segment->capacity); // mi_assert_internal(segment->thread_id == _mi_thread_id() || (segment->thread_id==0)); // or 0 @@ -615,11 +617,13 @@ static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind, _mi_memzero((uint8_t*)segment + ofs, info_size - ofs); // initialize pages info + const bool is_huge = (page_kind == MI_PAGE_HUGE); for (size_t i = 0; i < capacity; i++) { mi_assert_internal(i <= 255); segment->pages[i].segment_idx = (uint8_t)i; segment->pages[i].is_committed = segment->memid.initially_committed; segment->pages[i].is_zero_init = segment->memid.initially_zero; + segment->pages[i].is_huge = is_huge; } // initialize @@ -753,7 +757,7 @@ void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld) mi_segment_abandon(segment,tld); } else if (segment->used + 1 == segment->capacity) { - mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM); // for now we only support small and medium pages + mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM); // large and huge pages are always the single page in a segment if (segment->page_kind <= MI_PAGE_MEDIUM) { // move back to segments free list mi_segment_insert_in_free_queue(segment,tld); @@ -1123,13 +1127,14 @@ static mi_page_t* mi_segment_huge_page_alloc(size_t size, size_t page_alignment, #endif mi_page_t* page = mi_segment_find_free(segment, tld); mi_assert_internal(page != NULL); + mi_assert_internal(page->is_huge); // for huge pages we initialize the block_size as we may // overallocate to accommodate large alignments. size_t psize; uint8_t* start = mi_segment_page_start_ex(segment, page, 0, &psize, NULL); page->block_size = psize; - + // reset the part of the page that will not be used; this can be quite large (close to MI_SEGMENT_SIZE) if (page_alignment > 0 && segment->allow_decommit && page->is_committed) { uint8_t* aligned_p = (uint8_t*)_mi_align_up((uintptr_t)start, page_alignment); From af3f2f9168fc0f2345c23d8c8b34a73563935834 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sun, 24 Mar 2024 22:32:32 -0700 Subject: [PATCH 20/28] fix assertions and warnings on gcc --- src/page-queue.c | 2 +- src/page.c | 4 ++-- src/segment.c | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/page-queue.c b/src/page-queue.c index 751caf95..470d1b64 100644 --- a/src/page-queue.c +++ b/src/page-queue.c @@ -141,7 +141,7 @@ static bool mi_heap_contains_queue(const mi_heap_t* heap, const mi_page_queue_t* } #endif -static inline bool mi_page_is_large_or_huge(mi_page_t* page) { +static inline bool mi_page_is_large_or_huge(const mi_page_t* page) { return (mi_page_block_size(page) > MI_MEDIUM_OBJ_SIZE_MAX || mi_page_is_huge(page)); } diff --git a/src/page.c b/src/page.c index 808d863c..6bd53296 100644 --- a/src/page.c +++ b/src/page.c @@ -428,7 +428,7 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) { _mi_segment_page_free(page, force, segments_tld); } -#define MI_MAX_RETIRE_SIZE MI_LARGE_OBJ_SIZE_MAX // should be less than size for MI_BIN_HUGE +#define MI_MAX_RETIRE_SIZE MI_MEDIUM_OBJ_SIZE_MAX // should be less than size for MI_BIN_HUGE #define MI_RETIRE_CYCLES (16) // Retire a page with no more used blocks @@ -455,7 +455,7 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept { if mi_likely( /* bsize < MI_MAX_RETIRE_SIZE && */ !mi_page_queue_is_special(pq)) { // not full or huge queue? if (pq->last==page && pq->first==page) { // the only page in the queue? 
mi_stat_counter_increase(_mi_stats_main.page_no_retire,1); - page->retire_expire = (bsize <= MI_SMALL_OBJ_SIZE_MAX ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4); + page->retire_expire = 1+(bsize <= MI_SMALL_OBJ_SIZE_MAX ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4); mi_heap_t* heap = mi_page_heap(page); mi_assert_internal(pq >= heap->pages); const size_t index = pq - heap->pages; diff --git a/src/segment.c b/src/segment.c index 2d29a5f3..e7843f37 100644 --- a/src/segment.c +++ b/src/segment.c @@ -601,7 +601,7 @@ static void mi_segment_try_purge(mi_segment_t* segment, bool force, mi_stats_t* ----------------------------------------------------------- */ static bool mi_segment_is_abandoned(mi_segment_t* segment) { - return (segment->thread_id == 0); + return (mi_atomic_load_relaxed(&segment->thread_id) == 0); } // note: can be called on abandoned segments @@ -654,8 +654,8 @@ static void mi_segment_span_remove_from_queue(mi_slice_t* slice, mi_segments_tld // note: can be called on abandoned segments static mi_slice_t* mi_segment_span_free_coalesce(mi_slice_t* slice, mi_segments_tld_t* tld) { mi_assert_internal(slice != NULL && slice->slice_count > 0 && slice->slice_offset == 0); - mi_segment_t* segment = _mi_ptr_segment(slice); - bool is_abandoned = mi_segment_is_abandoned(segment); + mi_segment_t* const segment = _mi_ptr_segment(slice); + const bool is_abandoned = (segment->thread_id == 0); // mi_segment_is_abandoned(segment); // for huge pages, just mark as free but don't add to the queues if (segment->kind == MI_SEGMENT_HUGE) { From 7c17c3d33ed03a5cc19144cb99e3a8030b1c7cdf Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sun, 24 Mar 2024 22:41:33 -0700 Subject: [PATCH 21/28] optimize page struct layout --- include/mimalloc/types.h | 7 ++++--- src/init.c | 6 +++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index b8e7f97d..a08555ee 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -292,14 +292,15 @@ typedef struct mi_page_s { // layout like this to optimize access in `mi_malloc` and `mi_free` uint16_t capacity; // number of blocks committed, must be the first field, see `segment.c:page_clear` uint16_t reserved; // number of blocks reserved in memory - uint16_t used; // number of blocks in use (including blocks in `thread_free`) mi_page_flags_t flags; // `in_full` and `has_aligned` flags (8 bits) - uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`) uint8_t free_is_zero:1; // `true` if the blocks in the free list are zero initialized uint8_t retire_expire:7; // expiration count for retired blocks - // padding + mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) + uint16_t used; // number of blocks in use (including blocks in `thread_free`) + uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`) + // padding size_t block_size; // size available in each block (always `>0`) uint8_t* page_start; // start of the page area containing the blocks diff --git a/src/init.c b/src/init.c index 1a6a30e5..33161062 100644 --- a/src/init.c +++ b/src/init.c @@ -14,17 +14,17 @@ terms of the MIT license. 
A copy of the license can be found in the file // Empty page used to initialize the small free pages array const mi_page_t _mi_page_empty = { - 0, + 0, false, false, false, false, 0, // capacity 0, // reserved capacity - 0, // used { 0 }, // flags - 0, // block size shift false, // is_zero 0, // retire_expire NULL, // free NULL, // local_free + 0, // used + 0, // block size shift 0, // block_size NULL, // page_start #if (MI_PADDING || MI_ENCODE_FREELIST) From cc8d89a08528500572390a648a874bc705bd91b2 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Mon, 25 Mar 2024 07:35:49 -0700 Subject: [PATCH 22/28] update comments --- include/mimalloc/types.h | 57 ++++++++++++++++++++++++---------------- src/page.c | 2 +- src/segment.c | 20 +++++++------- 3 files changed, 46 insertions(+), 33 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index a08555ee..4e96c5ec 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -13,9 +13,12 @@ terms of the MIT license. A copy of the license can be found in the file // mi_heap_t : all data for a thread-local heap, contains // lists of all managed heap pages. // mi_segment_t : a larger chunk of memory (32GiB) from where pages -// are allocated. -// mi_page_t : a mimalloc page (usually 64KiB or 512KiB) from +// are allocated. A segment is divided in slices (64KiB) from +// which pages are allocated. +// mi_page_t : a "mimalloc" page (usually 64KiB or 512KiB) from // where objects are allocated. +// Note: we always explicitly use "OS page" to refer to OS pages +// and just use "page" to refer to mimalloc pages (`mi_page_t`) // -------------------------------------------------------------------------- @@ -192,15 +195,15 @@ typedef int32_t mi_ssize_t; #error "mimalloc internal: define more bins" #endif -// Maximum slice offset (15) -#define MI_MAX_SLICE_OFFSET ((MI_BLOCK_ALIGNMENT_MAX / MI_SEGMENT_SLICE_SIZE) - 1) - // blocks up to this size are always allocated aligned #define MI_MAX_ALIGN_GUARANTEE (8*MI_MAX_ALIGN_SIZE) // Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in dedicated huge page segments #define MI_BLOCK_ALIGNMENT_MAX (MI_SEGMENT_SIZE >> 1) +// Maximum slice count (255) for which we can find the page for interior pointers +#define MI_MAX_SLICE_OFFSET_COUNT ((MI_BLOCK_ALIGNMENT_MAX / MI_SEGMENT_SLICE_SIZE) - 1) + // ------------------------------------------------------ // Mimalloc pages contain allocated blocks @@ -285,9 +288,9 @@ typedef struct mi_page_s { // "owned" by the segment uint32_t slice_count; // slices in this page (0 if not a page) uint32_t slice_offset; // distance from the actual page data slice (0 if a page) - uint8_t is_committed : 1; // `true` if the page virtual memory is committed - uint8_t is_zero_init : 1; // `true` if the page was initially zero initialized - uint8_t is_huge:1; // `true` if the page is in a huge segment + uint8_t is_committed:1; // `true` if the page virtual memory is committed + uint8_t is_zero_init:1; // `true` if the page was initially zero initialized + uint8_t is_huge:1; // `true` if the page is in a huge segment (`segment->kind == MI_SEGMENT_HUGE`) // padding // layout like this to optimize access in `mi_malloc` and `mi_free` uint16_t capacity; // number of blocks committed, must be the first field, see `segment.c:page_clear` @@ -328,12 +331,13 @@ typedef enum mi_page_kind_e { MI_PAGE_SMALL, // small blocks go into 64KiB pages inside a segment MI_PAGE_MEDIUM, // medium blocks go into medium pages inside a segment MI_PAGE_LARGE, // larger blocks go into a 
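The `MI_MAX_SLICE_OFFSET_COUNT` comment above ("for which we can find the page for interior pointers") refers to the back-offset scheme of the slice map: each subsequent slice of a page's span stores the byte distance back to the span's first slice, which holds the page info, and these back offsets are only written for the first MI_MAX_SLICE_OFFSET_COUNT entries. A minimal sketch of the lookup this enables (simplified; the full walk lives in `_mi_segment_page_of`):

// from any slice within the covered range, step back to the span's first slice in O(1)
static mi_slice_t* slice_first_of_span(mi_slice_t* slice) {
  return (mi_slice_t*)((uint8_t*)slice - slice->slice_offset);  // slice_offset == 0 for the first slice
}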
page of just one block - MI_PAGE_HUGE, // huge blocks (> 16 MiB) are put into a single page in a single segment. + MI_PAGE_HUGE, // huge blocks (> `MI_LARGE_OBJ_SIZE_MAX) or with alignment `> MI_BLOCK_ALIGNMENT_MAX` + // are put into a single page in a single `MI_SEGMENT_HUGE` segment. } mi_page_kind_t; typedef enum mi_segment_kind_e { MI_SEGMENT_NORMAL, // MI_SEGMENT_SIZE size with pages inside. - MI_SEGMENT_HUGE, // > MI_LARGE_SIZE_MAX segment with just one huge page inside. + MI_SEGMENT_HUGE, // segment with just one huge page inside. } mi_segment_kind_t; // ------------------------------------------------------ @@ -404,39 +408,48 @@ typedef struct mi_memid_s { } mi_memid_t; -// Segments are large allocated memory blocks (8mb on 64 bit) from -// the OS. Inside segments we allocated fixed size _pages_ that -// contain blocks. +// Segments are large allocated memory blocks (8mb on 64 bit) from arenas or the OS. +// +// Inside segments we allocated fixed size mimalloc pages (`mi_page_t`) that contain blocks. +// The start of a segment is this structure with a fixed number of slice entries (`slices`) +// usually followed by a guard OS page and the actual allocation area with pages. +// While a page is not allocated, we view it's data as a `mi_slice_t` (instead of a `mi_page_t`). +// Of any free area, the first slice has the info and `slice_offset == 0`; for any subsequent +// slices part of the area, the `slice_offset` is the byte offset back to the first slice +// (so we can quickly find the page info on a free, `internal.h:_mi_segment_page_of`). +// For slices, the `block_size` field is repurposed to signify if a slice is used (`1`) or not (`0`). +// Small and medium pages use a fixed amount of slices to reduce slice fragmentation, while +// large and huge pages span a variable amount of slices. typedef struct mi_segment_s { // constant fields - mi_memid_t memid; // memory id for arena allocation - bool allow_decommit; - bool allow_purge; + mi_memid_t memid; // memory id for arena/OS allocation + bool allow_decommit; // can we decommmit the memory + bool allow_purge; // can we purge the memory (reset or decommit) size_t segment_size; // segment fields - mi_msecs_t purge_expire; - mi_commit_mask_t purge_mask; - mi_commit_mask_t commit_mask; + mi_msecs_t purge_expire; // purge slices in the `purge_mask` after this time + mi_commit_mask_t purge_mask; // slices that can be purged + mi_commit_mask_t commit_mask; // slices that are currently committed // from here is zero initialized struct mi_segment_s* next; // the list of freed segments in the cache (must be first field, see `segment.c:mi_segment_init`) bool was_reclaimed; // true if it was reclaimed (used to limit on-free reclamation) size_t abandoned; // abandoned pages (i.e. the original owning thread stopped) (`abandoned <= used`) - size_t abandoned_visits; // count how often this segment is visited in the abandoned list (to force reclaim it it is too long) + size_t abandoned_visits; // count how often this segment is visited during abondoned reclamation (to force reclaim if it takes too long) size_t used; // count of pages in use uintptr_t cookie; // verify addresses in debug mode: `mi_ptr_cookie(segment) == segment->cookie` size_t segment_slices; // for huge segments this may be different from `MI_SLICES_PER_SEGMENT` - size_t segment_info_slices; // initial slices we are using segment info and possible guard pages. + size_t segment_info_slices; // initial count of slices that we are using for segment info and possible guard pages. 
// layout like this to optimize access in `mi_free` mi_segment_kind_t kind; size_t slice_entries; // entries in the `slices` array, at most `MI_SLICES_PER_SEGMENT` _Atomic(mi_threadid_t) thread_id; // unique id of the thread owning this segment - mi_slice_t slices[MI_SLICES_PER_SEGMENT+1]; // one more for huge blocks with large alignment + mi_slice_t slices[MI_SLICES_PER_SEGMENT+1]; // one extra final entry for huge blocks with large alignment } mi_segment_t; diff --git a/src/page.c b/src/page.c index 6bd53296..05de541a 100644 --- a/src/page.c +++ b/src/page.c @@ -455,7 +455,7 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept { if mi_likely( /* bsize < MI_MAX_RETIRE_SIZE && */ !mi_page_queue_is_special(pq)) { // not full or huge queue? if (pq->last==page && pq->first==page) { // the only page in the queue? mi_stat_counter_increase(_mi_stats_main.page_no_retire,1); - page->retire_expire = 1+(bsize <= MI_SMALL_OBJ_SIZE_MAX ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4); + page->retire_expire = (bsize <= MI_SMALL_OBJ_SIZE_MAX ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4); mi_heap_t* heap = mi_page_heap(page); mi_assert_internal(pq >= heap->pages); const size_t index = pq - heap->pages; diff --git a/src/segment.c b/src/segment.c index e7843f37..1d2f1e47 100644 --- a/src/segment.c +++ b/src/segment.c @@ -11,7 +11,11 @@ terms of the MIT license. A copy of the license can be found in the file #include // memset #include -#define MI_PAGE_HUGE_ALIGN (256*1024) +// ------------------------------------------------------------------- +// Segments +// mimalloc pages reside in segments. See `mi_segment_valid` for invariants. +// ------------------------------------------------------------------- + static void mi_segment_try_purge(mi_segment_t* segment, bool force, mi_stats_t* stats); @@ -146,10 +150,6 @@ size_t _mi_commit_mask_next_run(const mi_commit_mask_t* cm, size_t* idx) { /* -------------------------------------------------------------------------------- Segment allocation - - If a thread ends, it "abandons" pages with used blocks - and there is an abandoned segment list whose segments can - be reclaimed by still running threads, much like work-stealing. -------------------------------------------------------------------------------- */ @@ -268,10 +268,10 @@ static bool mi_segment_is_valid(mi_segment_t* segment, mi_segments_tld_t* tld) { mi_assert_internal(slice->slice_offset == 0); size_t index = mi_slice_index(slice); size_t maxindex = (index + slice->slice_count >= segment->slice_entries ? 
segment->slice_entries : index + slice->slice_count) - 1; - if (mi_slice_is_used(slice)) { // a page in use, we need at least MAX_SLICE_OFFSET valid back offsets + if (mi_slice_is_used(slice)) { // a page in use, we need at least MAX_SLICE_OFFSET_COUNT valid back offsets used_count++; - if (segment->kind == MI_SEGMENT_HUGE) { mi_assert_internal(slice->is_huge); } - for (size_t i = 0; i <= MI_MAX_SLICE_OFFSET && index + i <= maxindex; i++) { + mi_assert_internal(slice->is_huge == (segment->kind == MI_SEGMENT_HUGE)); + for (size_t i = 0; i <= MI_MAX_SLICE_OFFSET_COUNT && index + i <= maxindex; i++) { mi_assert_internal(segment->slices[index + i].slice_offset == i*sizeof(mi_slice_t)); mi_assert_internal(i==0 || segment->slices[index + i].slice_count == 0); mi_assert_internal(i==0 || segment->slices[index + i].block_size == 1); @@ -720,9 +720,9 @@ static mi_page_t* mi_segment_span_allocate(mi_segment_t* segment, size_t slice_i mi_page_t* page = mi_slice_to_page(slice); mi_assert_internal(mi_page_block_size(page) == bsize); - // set slice back pointers for the first MI_MAX_SLICE_OFFSET entries + // set slice back pointers for the first MI_MAX_SLICE_OFFSET_COUNT entries size_t extra = slice_count-1; - if (extra > MI_MAX_SLICE_OFFSET) extra = MI_MAX_SLICE_OFFSET; + if (extra > MI_MAX_SLICE_OFFSET_COUNT) extra = MI_MAX_SLICE_OFFSET_COUNT; if (slice_index + extra >= segment->slice_entries) extra = segment->slice_entries - slice_index - 1; // huge objects may have more slices than avaiable entries in the segment->slices mi_slice_t* slice_next = slice + 1; From 00228021776780326de4a99e540eadc002909cda Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Mon, 25 Mar 2024 15:25:04 -0700 Subject: [PATCH 23/28] commend and make at least 8 tries for reclaim --- include/mimalloc/internal.h | 5 ++-- include/mimalloc/types.h | 53 +++++++++++++++++++++++-------------- src/heap.c | 2 +- src/init.c | 6 ++--- src/libc.c | 4 +-- src/page.c | 9 ++++--- src/segment.c | 51 ++++++++++++++++++----------------- src/stats.c | 46 +++++++++++++++----------------- 8 files changed, 96 insertions(+), 80 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 4df8ca68..12436ca4 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -147,7 +147,7 @@ void _mi_segment_map_freed_at(const mi_segment_t* segment); mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld, mi_os_tld_t* os_tld); void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld); void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld); -uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size, size_t* pre_size); // page start for any page +uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size); #if MI_HUGE_PAGE_ABANDON void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); @@ -454,6 +454,7 @@ static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const // Quick page start for initialized pages static inline uint8_t* mi_page_start(const mi_page_t* page) { mi_assert_internal(page->page_start != NULL); + mi_assert_expensive(_mi_segment_page_start(_mi_page_segment(page),page,NULL) == page->page_start); return page->page_start; } @@ -466,7 +467,7 @@ static inline mi_page_t* _mi_ptr_page(void* p) { // Get the block size of a page (special case for huge objects) static inline 
size_t mi_page_block_size(const mi_page_t* page) { mi_assert_internal(page->block_size > 0); - return page->block_size; + return page->block_size; } static inline bool mi_page_is_huge(const mi_page_t* page) { diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 6b22c83e..adfd7838 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -16,6 +16,8 @@ terms of the MIT license. A copy of the license can be found in the file // are allocated. // mi_page_t : a mimalloc page (usually 64KiB or 512KiB) from // where objects are allocated. +// Note: we write "OS page" for OS memory pages while +// using plain "page" for mimalloc pages (`mi_page_t`). // -------------------------------------------------------------------------- @@ -89,10 +91,11 @@ terms of the MIT license. A copy of the license can be found in the file #endif -// We used to abandon huge pages but to eagerly deallocate if freed from another thread, -// but that makes it not possible to visit them during a heap walk or include them in a -// `mi_heap_destroy`. We therefore instead reset/decommit the huge blocks if freed from -// another thread so most memory is available until it gets properly freed by the owning thread. +// We used to abandon huge pages in order to eagerly deallocate it if freed from another thread. +// Unfortunately, that makes it not possible to visit them during a heap walk or include them in a +// `mi_heap_destroy`. We therefore instead reset/decommit the huge blocks nowadays if freed from +// another thread so the memory becomes "virtually" available (and eventually gets properly freed by +// the owning thread). // #define MI_HUGE_PAGE_ABANDON 1 @@ -160,7 +163,7 @@ typedef int32_t mi_ssize_t; #define MI_SMALL_PAGE_SHIFT (13 + MI_INTPTR_SHIFT) // 64KiB #define MI_MEDIUM_PAGE_SHIFT ( 3 + MI_SMALL_PAGE_SHIFT) // 512KiB #define MI_LARGE_PAGE_SHIFT ( 3 + MI_MEDIUM_PAGE_SHIFT) // 4MiB -#define MI_SEGMENT_SHIFT ( MI_LARGE_PAGE_SHIFT) // 4MiB +#define MI_SEGMENT_SHIFT ( MI_LARGE_PAGE_SHIFT) // 4MiB -- must be equal to `MI_LARGE_PAGE_SHIFT` // Derived constants #define MI_SEGMENT_SIZE (MI_ZU(1)<0`) uint8_t* page_start; // start of the page area containing the blocks @@ -304,7 +308,7 @@ typedef struct mi_page_s { struct mi_page_s* next; // next page owned by the heap with the same `block_size` struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` - #if MI_INTPTR_SIZE==4 // pad to 14 words on 32-bit + #if MI_INTPTR_SIZE==4 // pad to 12 words on 32-bit void* padding[1]; #endif } mi_page_t; @@ -319,17 +323,22 @@ typedef enum mi_page_kind_e { MI_PAGE_SMALL, // small blocks go into 64KiB pages inside a segment MI_PAGE_MEDIUM, // medium blocks go into 512KiB pages inside a segment MI_PAGE_LARGE, // larger blocks go into a single page spanning a whole segment - MI_PAGE_HUGE // huge blocks (>512KiB) are put into a single page in a segment of the exact size (but still 2MiB aligned) + MI_PAGE_HUGE // a huge page is a single page in a segment of variable size (but still 2MiB aligned) + // used for blocks `> MI_LARGE_OBJ_SIZE_MAX` or an aligment `> MI_BLOCK_ALIGNMENT_MAX`. } mi_page_kind_t; +// --------------------------------------------------------------- +// a memory id tracks the provenance of arena/OS allocated memory +// --------------------------------------------------------------- + // Memory can reside in arena's, direct OS allocated, or statically allocated. The memid keeps track of this. 
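The provenance recorded in a memid is what release paths branch on later: arena memory goes back to its arena, OS-allocated memory is unmapped, and static or external memory is never freed by mimalloc. A hypothetical helper (the name is ours; the enum values are the ones defined just below) showing the kind of test this enables:

static bool memid_is_os_allocated(mi_memid_t memid) {
  return (memid.memkind == MI_MEM_OS || memid.memkind == MI_MEM_OS_HUGE || memid.memkind == MI_MEM_OS_REMAP);
}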
typedef enum mi_memkind_e { MI_MEM_NONE, // not allocated MI_MEM_EXTERNAL, // not owned by mimalloc but provided externally (via `mi_manage_os_memory` for example) MI_MEM_STATIC, // allocated in a static area and should not be freed (for arena meta data for example) MI_MEM_OS, // allocated from the OS - MI_MEM_OS_HUGE, // allocated as huge os pages + MI_MEM_OS_HUGE, // allocated as huge OS pages (usually 1GiB, pinned to physical memory) MI_MEM_OS_REMAP, // allocated in a remapable area (i.e. using `mremap`) MI_MEM_ARENA // allocated from an arena (the usual case) } mi_memkind_t; @@ -346,7 +355,7 @@ typedef struct mi_memid_os_info { typedef struct mi_memid_arena_info { size_t block_index; // index in the arena mi_arena_id_t id; // arena id (>= 1) - bool is_exclusive; // the arena can only be used for specific arena allocations + bool is_exclusive; // this arena can only be used for specific arena allocations } mi_memid_arena_info_t; typedef struct mi_memid_s { @@ -354,19 +363,22 @@ typedef struct mi_memid_s { mi_memid_os_info_t os; // only used for MI_MEM_OS mi_memid_arena_info_t arena; // only used for MI_MEM_ARENA } mem; - bool is_pinned; // `true` if we cannot decommit/reset/protect in this memory (e.g. when allocated using large OS pages) + bool is_pinned; // `true` if we cannot decommit/reset/protect in this memory (e.g. when allocated using large (2Mib) or huge (1GiB) OS pages) bool initially_committed;// `true` if the memory was originally allocated as committed bool initially_zero; // `true` if the memory was originally zero initialized mi_memkind_t memkind; } mi_memid_t; -// Segments are large allocated memory blocks (2MiB on 64 bit) from -// the OS. Inside segments we allocated fixed size _pages_ that -// contain blocks. +// --------------------------------------------------------------- +// Segments contain mimalloc pages +// --------------------------------------------------------------- + +// Segments are large allocated memory blocks (2MiB on 64 bit) from the OS. +// Inside segments we allocated fixed size _pages_ that contain blocks. typedef struct mi_segment_s { // constant fields - mi_memid_t memid; // id for the os-level memory manager + mi_memid_t memid; // memory id to track provenance bool allow_decommit; bool allow_purge; size_t segment_size; // for huge pages this may be different from `MI_SEGMENT_SIZE` @@ -572,6 +584,7 @@ void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); #define mi_heap_stat_increase(heap,stat,amount) mi_stat_increase( (heap)->tld->stats.stat, amount) #define mi_heap_stat_decrease(heap,stat,amount) mi_stat_decrease( (heap)->tld->stats.stat, amount) + // ------------------------------------------------------ // Thread Local data // ------------------------------------------------------ diff --git a/src/heap.c b/src/heap.c index 21cdfa46..bcb5a41c 100644 --- a/src/heap.c +++ b/src/heap.c @@ -508,7 +508,7 @@ static bool mi_heap_area_visit_blocks(const mi_heap_area_ex_t* xarea, mi_block_v const size_t bsize = mi_page_block_size(page); const size_t ubsize = mi_page_usable_block_size(page); // without padding size_t psize; - uint8_t* pstart = _mi_segment_page_start(_mi_page_segment(page), page, &psize, NULL); + uint8_t* pstart = _mi_segment_page_start(_mi_page_segment(page), page, &psize); if (page->capacity == 1) { // optimize page with one block diff --git a/src/init.c b/src/init.c index 604809ad..f5fa2ad8 100644 --- a/src/init.c +++ b/src/init.c @@ -14,17 +14,17 @@ terms of the MIT license. 
 // Empty page used to initialize the small free pages array
 const mi_page_t _mi_page_empty = {
-  0,
+  0,
   false, false, false, false,
   0, // capacity
   0, // reserved capacity
-  0, // used
   { 0 }, // flags
-  0, // block size shift
   false, // is_zero
   0, // retire_expire
   NULL, // free
   NULL, // local_free
+  0, // used
+  0, // block size shift
   0, // block_size
   NULL, // page_start
   #if (MI_PADDING || MI_ENCODE_FREELIST)
diff --git a/src/libc.c b/src/libc.c
index f1412722..dd6b4007 100644
--- a/src/libc.c
+++ b/src/libc.c
@@ -210,7 +210,7 @@ void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) {
     if (c == 'x' || c == 'u') {
       if (numtype == 'z') x = va_arg(args, size_t);
       else if (numtype == 't') x = va_arg(args, uintptr_t); // unsigned ptrdiff_t
-      else if (numtype == 'L') x = va_arg(args, unsigned long long);
+      else if (numtype == 'L') x = (uintptr_t)va_arg(args, unsigned long long);
       else x = va_arg(args, unsigned long);
     }
     else if (c == 'p') {
@@ -231,7 +231,7 @@ void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) {
       intptr_t x = 0;
       if (numtype == 'z') x = va_arg(args, intptr_t );
       else if (numtype == 't') x = va_arg(args, ptrdiff_t);
-      else if (numtype == 'L') x = va_arg(args, long long);
+      else if (numtype == 'L') x = (intptr_t)va_arg(args, long long);
      else x = va_arg(args, long);
       char pre = 0;
       if (x < 0) {
diff --git a/src/page.c b/src/page.c
index d36421f4..7a333cb4 100644
--- a/src/page.c
+++ b/src/page.c
@@ -59,7 +59,7 @@ static inline uint8_t* mi_page_area(const mi_page_t* page) {
 static bool mi_page_list_is_valid(mi_page_t* page, mi_block_t* p) {
   size_t psize;
-  uint8_t* page_area = _mi_segment_page_start(_mi_page_segment(page), page, &psize, NULL);
+  uint8_t* page_area = _mi_segment_page_start(_mi_page_segment(page), page, &psize);
   mi_block_t* start = (mi_block_t*)page_area;
   mi_block_t* end = (mi_block_t*)(page_area + psize);
   while(p != NULL) {
@@ -85,7 +85,8 @@ static bool mi_page_is_valid_init(mi_page_t* page) {
   // const size_t bsize = mi_page_block_size(page);
   mi_segment_t* segment = _mi_page_segment(page);
   uint8_t* start = mi_page_start(page);
-  mi_assert_internal(start == _mi_segment_page_start(segment,page,NULL,NULL));
+  mi_assert_internal(start == _mi_segment_page_start(segment,page,NULL));
+  mi_assert_internal(page->is_huge == (segment->page_kind == MI_PAGE_HUGE));
   //mi_assert_internal(start + page->capacity*page->block_size == page->top);
 
   mi_assert_internal(mi_page_list_is_valid(page,page->free));
@@ -616,7 +617,7 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld)
   size_t page_size;
   //uint8_t* page_start =
-  _mi_segment_page_start(_mi_page_segment(page), page, &page_size, NULL);
+  _mi_segment_page_start(_mi_page_segment(page), page, &page_size);
   mi_stat_counter_increase(tld->stats.pages_extended, 1);
 
   // calculate the extend count
@@ -660,7 +661,7 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
   mi_page_set_heap(page, heap);
   page->block_size = block_size;
   size_t page_size;
-  page->page_start = _mi_segment_page_start(segment, page, &page_size, NULL);
+  page->page_start = _mi_segment_page_start(segment, page, &page_size);
   mi_track_mem_noaccess(page->page_start,page_size);
   mi_assert_internal(page_size / block_size < (1L<<16));
   page->reserved = (uint16_t)(page_size / block_size);
diff --git a/src/segment.c b/src/segment.c
index cec3079e..ff20b504 100644
--- a/src/segment.c
+++ b/src/segment.c
@@ -1,5 +1,5 @@
 /* ----------------------------------------------------------------------------
-Copyright (c) 2018-2020, Microsoft Research, Daan Leijen
+Copyright (c) 2018-2024, Microsoft Research, Daan Leijen
 This is free software; you can redistribute it and/or modify it under the
 terms of the MIT license. A copy of the license can be found in the file
 "LICENSE" at the root of this distribution.
@@ -25,14 +25,15 @@ static uint8_t* mi_segment_raw_page_start(const mi_segment_t* segment, const mi_
   - small pages (64KiB), 64 in one segment
   - medium pages (512KiB), 8 in one segment
   - large pages (4MiB), 1 in one segment
-  - huge blocks > MI_LARGE_OBJ_SIZE_MAX become large segment with 1 page
+  - huge segments have 1 page in one segment that can be larger than `MI_SEGMENT_SIZE`.
+    It is used for blocks `> MI_LARGE_OBJ_SIZE_MAX` or with alignment `> MI_BLOCK_ALIGNMENT_MAX`.
 
-  In any case the memory for a segment is virtual and usually committed on demand.
+  The memory for a segment is usually committed on demand.
   (i.e. we are careful to not touch the memory until we actually allocate a block there)
 
-  If a thread ends, it "abandons" pages with used blocks
-  and there is an abandoned segment list whose segments can
-  be reclaimed by still running threads, much like work-stealing.
+  If a thread ends, it "abandons" pages that still contain live blocks.
+  Such segments are abandoned and these can be reclaimed by still running threads,
+  (much like work-stealing).
 -------------------------------------------------------------------------------- */
@@ -142,7 +143,7 @@ static bool mi_segment_is_valid(const mi_segment_t* segment, mi_segments_tld_t*
   mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie);
   mi_assert_internal(segment->used <= segment->capacity);
   mi_assert_internal(segment->abandoned <= segment->used);
-  mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM || segment->capacity == 1);
+  mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM || segment->capacity == 1); // one large or huge page per segment
   size_t nfree = 0;
   for (size_t i = 0; i < segment->capacity; i++) {
     const mi_page_t* const page = &segment->pages[i];
@@ -152,7 +153,7 @@ static bool mi_segment_is_valid(const mi_segment_t* segment, mi_segments_tld_t*
     if (page->segment_in_use) { mi_assert_expensive(!mi_pages_purge_contains(page, tld)); }
-    if (segment->page_kind == MI_PAGE_HUGE) mi_assert_internal(page->is_huge);
+    mi_assert_internal(page->is_huge == (segment->page_kind == MI_PAGE_HUGE));
   }
   mi_assert_internal(nfree + segment->used == segment->capacity);
   // mi_assert_internal(segment->thread_id == _mi_thread_id() || (segment->thread_id==0)); // or 0
@@ -420,11 +421,11 @@ static uint8_t* mi_segment_raw_page_start(const mi_segment_t* segment, const mi_
 }
 
 // Start of the page available memory; can be used on uninitialized pages (only `segment_idx` must be set)
-static uint8_t* mi_segment_page_start_ex(const mi_segment_t* segment, const mi_page_t* page, size_t block_size, size_t* page_size, size_t* pre_size)
+uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size)
 {
   size_t psize;
   uint8_t* p = mi_segment_raw_page_start(segment, page, &psize);
-  if (pre_size != NULL) *pre_size = 0;
+  const size_t block_size = mi_page_block_size(page);
   if (page->segment_idx == 0 && block_size > 0 && segment->page_kind <= MI_PAGE_MEDIUM) {
     // for small and medium objects, ensure the page start is aligned with the block size (PR#66 by kickunderscore)
     size_t adjust = block_size - ((uintptr_t)p % block_size);
@@ -432,7 +433,7 @@ static uint8_t* mi_segment_page_start_ex(const mi_segment_t* segment, const mi_p
     if (adjust < block_size) {
       p += adjust;
       psize -= adjust;
-      if (pre_size != NULL) *pre_size = adjust;
+      // if (pre_size != NULL) *pre_size = adjust;
     }
     mi_assert_internal((uintptr_t)p % block_size == 0);
   }
@@ -444,9 +445,6 @@ static uint8_t* mi_segment_page_start_ex(const mi_segment_t* segment, const mi_p
   return p;
 }
 
-uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size, size_t* pre_size) {
-  return mi_segment_page_start_ex(segment, page, mi_page_block_size(page), page_size, pre_size);
-}
 
 static size_t mi_segment_calculate_sizes(size_t capacity, size_t required, size_t* pre_size, size_t* info_size)
 {
@@ -961,26 +959,31 @@ void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld) {
 }
 
 static long mi_segment_get_reclaim_tries(void) {
-  // limit the tries to 10% (default) of the abandoned segments with at least 8 tries, and at most 1024.
+  // limit the tries to 10% (default) of the abandoned segments with at least 8 and at most 1024 tries.
   const size_t perc = (size_t)mi_option_get_clamp(mi_option_max_segment_reclaim, 0, 100);
   if (perc <= 0) return 0;
   const size_t total_count = _mi_arena_segment_abandoned_count();
+  if (total_count == 0) return 0;
   const size_t relative_count = (total_count > 10000 ? (total_count / 100) * perc : (total_count * perc) / 100); // avoid overflow
-  long max_tries = (long)(relative_count < 8 ? 8 : (relative_count > 1024 ? 1024 : relative_count));
+  long max_tries = (long)(relative_count <= 1 ? 1 : (relative_count > 1024 ? 1024 : relative_count));
+  if (max_tries < 8 && total_count > 8) { max_tries = 8; }
   return max_tries;
 }
 
 static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size, mi_page_kind_t page_kind, bool* reclaimed, mi_segments_tld_t* tld)
 {
   *reclaimed = false;
-  mi_segment_t* segment;
-  mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap,&current);
   long max_tries = mi_segment_get_reclaim_tries();
+  if (max_tries <= 0) return NULL;
+
+  mi_segment_t* segment;
+  mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap, &current);
   while ((max_tries-- > 0) && ((segment = _mi_arena_segment_clear_abandoned_next(&current)) != NULL)) {
     segment->abandoned_visits++;
-    // todo: an arena exclusive heap will potentially visit many abandoned unsuitable segments
-    // and push them into the visited list and use many tries. Perhaps we can skip non-suitable ones in a better way?
+    // todo: should we respect numa affinity for abandoned reclaim? perhaps only for the first visit?
+    // todo: an arena exclusive heap will potentially visit many abandoned unsuitable segments and use many tries
+    // Perhaps we can skip non-suitable ones in a better way?
     bool is_suitable = _mi_heap_memid_is_suitable(heap, segment->memid);
     bool all_pages_free;
     bool has_page = mi_segment_check_free(segment,block_size,&all_pages_free); // try to free up pages (due to concurrent frees)
@@ -1088,7 +1091,7 @@ static mi_page_t* mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, mi_p
   mi_assert_internal(page != NULL);
   #if MI_DEBUG>=2 && !MI_TRACK_ENABLED // && !MI_TSAN
   // verify it is committed
-  mi_segment_page_start_ex(_mi_page_segment(page), page, sizeof(void*), NULL, NULL)[0] = 0;
+  mi_segment_raw_page_start(_mi_page_segment(page), page, NULL)[0] = 0;
   #endif
   return page;
 }
@@ -1111,7 +1114,7 @@ static mi_page_t* mi_segment_large_page_alloc(mi_heap_t* heap, size_t block_size
   mi_page_t* page = mi_segment_find_free(segment, tld);
   mi_assert_internal(page != NULL);
   #if MI_DEBUG>=2 && !MI_TRACK_ENABLED // && !MI_TSAN
-  mi_segment_page_start_ex(segment, page, sizeof(void*), NULL, NULL)[0] = 0;
+  mi_segment_raw_page_start(segment, page, NULL)[0] = 0;
   #endif
   return page;
 }
@@ -1132,9 +1135,9 @@ static mi_page_t* mi_segment_huge_page_alloc(size_t size, size_t page_alignment,
   // for huge pages we initialize the block_size as we may
   // overallocate to accommodate large alignments.
   size_t psize;
-  uint8_t* start = mi_segment_page_start_ex(segment, page, 0, &psize, NULL);
+  uint8_t* start = mi_segment_raw_page_start(segment, page, &psize);
   page->block_size = psize;
-
+
   // reset the part of the page that will not be used; this can be quite large (close to MI_SEGMENT_SIZE)
   if (page_alignment > 0 && segment->allow_decommit && page->is_committed) {
     uint8_t* aligned_p = (uint8_t*)_mi_align_up((uintptr_t)start, page_alignment);
diff --git a/src/stats.c b/src/stats.c
index 5dfd713b..99cf89c5 100644
--- a/src/stats.c
+++ b/src/stats.c
@@ -174,13 +174,28 @@ static void mi_print_count(int64_t n, int64_t unit, mi_output_fun* out, void* ar
 static void mi_stat_print_ex(const mi_stat_count_t* stat, const char* msg, int64_t unit, mi_output_fun* out, void* arg, const char* notok ) {
   _mi_fprintf(out, arg,"%10s:", msg);
-  if (unit > 0) {
-    mi_print_amount(stat->peak, unit, out, arg);
-    mi_print_amount(stat->allocated, unit, out, arg);
-    mi_print_amount(stat->freed, unit, out, arg);
-    mi_print_amount(stat->current, unit, out, arg);
-    mi_print_amount(unit, 1, out, arg);
-    mi_print_count(stat->allocated, unit, out, arg);
+  if (unit != 0) {
+    if (unit > 0) {
+      mi_print_amount(stat->peak, unit, out, arg);
+      mi_print_amount(stat->allocated, unit, out, arg);
+      mi_print_amount(stat->freed, unit, out, arg);
+      mi_print_amount(stat->current, unit, out, arg);
+      mi_print_amount(unit, 1, out, arg);
+      mi_print_count(stat->allocated, unit, out, arg);
+    }
+    else {
+      mi_print_amount(stat->peak, -1, out, arg);
+      mi_print_amount(stat->allocated, -1, out, arg);
+      mi_print_amount(stat->freed, -1, out, arg);
+      mi_print_amount(stat->current, -1, out, arg);
+      if (unit == -1) {
+        _mi_fprintf(out, arg, "%24s", "");
+      }
+      else {
+        mi_print_amount(-unit, 1, out, arg);
+        mi_print_count((stat->allocated / -unit), 0, out, arg);
+      }
+    }
     if (stat->allocated > stat->freed) {
       _mi_fprintf(out, arg, " ");
       _mi_fprintf(out, arg, (notok == NULL ? "not all freed" : notok));
@@ -190,23 +205,6 @@ static void mi_stat_print_ex(const mi_stat_count_t* stat, const char* msg, int64
       _mi_fprintf(out, arg, " ok\n");
     }
   }
-  else if (unit<0) {
-    mi_print_amount(stat->peak, -1, out, arg);
-    mi_print_amount(stat->allocated, -1, out, arg);
-    mi_print_amount(stat->freed, -1, out, arg);
-    mi_print_amount(stat->current, -1, out, arg);
-    if (unit==-1) {
-      _mi_fprintf(out, arg, "%24s", "");
-    }
-    else {
-      mi_print_amount(-unit, 1, out, arg);
-      mi_print_count((stat->allocated / -unit), 0, out, arg);
-    }
-    if (stat->allocated > stat->freed)
-      _mi_fprintf(out, arg, " not all freed!\n");
-    else
-      _mi_fprintf(out, arg, " ok\n");
-  }
   else {
     mi_print_amount(stat->peak, 1, out, arg);
     mi_print_amount(stat->allocated, 1, out, arg);

From 460278f1102bf74d49aa7de402fde1053377fbb7 Mon Sep 17 00:00:00 2001
From: Daan Leijen
Date: Mon, 25 Mar 2024 16:02:20 -0700
Subject: [PATCH 24/28] comments

---
 src/free.c    | 41 +++++++----------------------------------
 src/segment.c |  2 +-
 2 files changed, 8 insertions(+), 35 deletions(-)

diff --git a/src/free.c b/src/free.c
index 87847f21..59c20aed 100644
--- a/src/free.c
+++ b/src/free.c
@@ -71,7 +71,7 @@ mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p) {
 static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept {
   MI_UNUSED(segment);
   mi_block_t* const block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(page, p) : (mi_block_t*)p);
-  mi_free_block_local(page, block, true, true);
+  mi_free_block_local(page, block, true /* track stats */, true /* check for a full page */);
 }
 
 // free a pointer owned by another thread (page parameter comes first for better codegen)
@@ -136,7 +136,7 @@ void mi_free(void* p) mi_attr_noexcept
     if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned)
       // thread-local, aligned, and not a full page
       mi_block_t* const block = (mi_block_t*)p;
-      mi_free_block_local(page, block, true, false /* no need to check if the page is full */);
+      mi_free_block_local(page, block, true /* track stats */, false /* no need to check if the page is full */);
     }
     else {
      // page is full or contains (inner) aligned blocks; use generic path
@@ -167,11 +167,11 @@ bool _mi_free_delayed_block(mi_block_t* block) {
     return false;
   }
 
-  // collect all other non-local frees to ensure up-to-date `used` count
+  // collect all other non-local frees (move from `thread_free` to `free`) to ensure up-to-date `used` count
   _mi_page_free_collect(page, false);
 
-  // and free the block (possibly freeing the page as well since used is updated)
-  mi_free_block_local(page, block, false /* stats have already been adjusted */, true);
+  // and free the block (possibly freeing the page as well since `used` is updated)
+  mi_free_block_local(page, block, false /* stats have already been adjusted */, true /* check for a full page */);
   return true;
 }
 
@@ -225,10 +225,6 @@ static void mi_decl_noinline mi_free_block_delayed_mt( mi_page_t* page, mi_block
   }
 }
 
-#if MI_HUGE_PAGE_ABANDON
-static void mi_stat_huge_free(const mi_page_t* page);
-#endif
-
 // Multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON)
 static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_segment_t* segment, mi_block_t* block)
 {
@@ -251,7 +247,7 @@ static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_segment_t* seg
   // that is safe as these are constant and the page won't be freed (as the block is not freed yet).
   mi_check_padding(page, block);
 
-  // adjust stats (after padding check and potential recursive `mi_free` above)
+  // adjust stats (after padding check and potentially recursive `mi_free` above)
   mi_stat_free(page, block);  // stat_free may access the padding
   mi_track_free_size(block, mi_page_usable_size_of(page,block));
@@ -261,7 +257,6 @@ static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_segment_t* seg
   if (segment->page_kind == MI_PAGE_HUGE) {
     #if MI_HUGE_PAGE_ABANDON
     // huge page segments are always abandoned and can be freed immediately
-    mi_stat_huge_free(page);
     _mi_segment_huge_page_free(segment, page, block);
     return;
     #else
@@ -510,35 +505,13 @@ static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
     mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], 1);
     #endif
   }
-#if !MI_HUGE_PAGE_ABANDON
   else {
-    const size_t bpsize = mi_page_block_size(page);
+    const size_t bpsize = mi_page_block_size(page); // match stat in page.c:mi_huge_page_alloc
     mi_heap_stat_decrease(heap, huge, bpsize);
   }
-#endif
 }
 #else
 static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
   MI_UNUSED(page); MI_UNUSED(block);
 }
 #endif
-
-#if MI_HUGE_PAGE_ABANDON
-#if (MI_STAT>0)
-// maintain stats for huge objects
-static void mi_stat_huge_free(const mi_page_t* page) {
-  mi_heap_t* const heap = mi_heap_get_default();
-  const size_t bsize = mi_page_block_size(page); // to match stats in `page.c:mi_page_huge_alloc`
-  if (bsize <= MI_HUGE_OBJ_SIZE_MAX) {
-    mi_heap_stat_decrease(heap, huge, bsize);
-  }
-  else {
-    mi_heap_stat_decrease(heap, giant, bsize);
-  }
-}
-#else
-static void mi_stat_huge_free(const mi_page_t* page) {
-  MI_UNUSED(page);
-}
-#endif
-#endif
diff --git a/src/segment.c b/src/segment.c
index ff20b504..91ff9adb 100644
--- a/src/segment.c
+++ b/src/segment.c
@@ -1167,7 +1167,7 @@ void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block
     mi_block_set_next(page, block, page->free);
     page->free = block;
     page->used--;
-    page->is_zero = false;
+    page->is_zero_init = false;
     mi_assert(page->used == 0);
     mi_tld_t* tld = heap->tld;
     mi_segments_track_size((long)segment->segment_size, &tld->segments);

From 1d8997236cb8e09103436491c8aa34548aa2cb24 Mon Sep 17 00:00:00 2001
From: Daan
Date: Mon, 25 Mar 2024 16:28:15 -0700
Subject: [PATCH 25/28] add comment on concurrent access in ptr_unalign

---
 src/free.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/free.c b/src/free.c
index 59c20aed..bf5498a4 100644
--- a/src/free.c
+++ b/src/free.c
@@ -52,6 +52,9 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool
 }
 
 // Adjust a block that was allocated aligned, to the actual start of the block in the page.
+// note: this can be called from `mi_free_generic_mt` where a non-owning thread accesses the
+// `page_start` and `block_size` fields; however these are constant and the page won't be
+// deallocated (as the block we are freeing keeps it alive) and thus safe to read concurrently.
 mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p) {
   mi_assert_internal(page!=NULL && p!=NULL);

From a5228992366b7ff498635238e09aa6bf6fea642d Mon Sep 17 00:00:00 2001
From: Daan
Date: Mon, 25 Mar 2024 16:32:19 -0700
Subject: [PATCH 26/28] fix compilation with ASAN

---
 src/free.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/free.c b/src/free.c
index bf5498a4..43e1f76f 100644
--- a/src/free.c
+++ b/src/free.c
@@ -38,7 +38,7 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool
   #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN
   memset(block, MI_DEBUG_FREED, mi_page_block_size(page));
   #endif
-  if (track_stats) { mi_track_free_size(p, mi_page_usable_size_of(page, block)); } // faster then mi_usable_size as we already know the page and that p is unaligned
+  if (track_stats) { mi_track_free_size(block, mi_page_usable_size_of(page, block)); } // faster than mi_usable_size as we already know the page and that p is unaligned
 
   // actual free: push on the local free list
   mi_block_set_next(page, block, page->local_free);

From 6b4f3f6223d7bc3ec5f4119ef14ed45582c2033d Mon Sep 17 00:00:00 2001
From: Daan
Date: Mon, 25 Mar 2024 16:37:46 -0700
Subject: [PATCH 27/28] further ASAN fix

---
 src/page.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/page.c b/src/page.c
index 7a333cb4..efcd8d91 100644
--- a/src/page.c
+++ b/src/page.c
@@ -673,8 +673,8 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
   page->free_is_zero = page->is_zero_init;
   #if MI_DEBUG>2
   if (page->is_zero_init) {
-    mi_track_mem_defined(page_start, page_size);
-    mi_assert_expensive(!page->is_zero_init || mi_mem_is_zero(page->page_start, page_size));
+    mi_track_mem_defined(page->page_start, page_size);
+    mi_assert_expensive(mi_mem_is_zero(page->page_start, page_size));
   }
   #endif
   if (block_size > 0 && _mi_is_power_of_two(block_size)) {

From 8f7d1e9a41bb0182166aac6a8d4d8b00f60ed032 Mon Sep 17 00:00:00 2001
From: Daan
Date: Fri, 29 Mar 2024 11:17:21 -0700
Subject: [PATCH 28/28] fix free in realpath when using ASAN

---
 src/alloc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/alloc.c b/src/alloc.c
index 2e03eca0..32175b0c 100644
--- a/src/alloc.c
+++ b/src/alloc.c
@@ -396,7 +396,8 @@ char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name)
   char* rname = realpath(fname, NULL);
   if (rname == NULL) return NULL;
   char* result = mi_heap_strdup(heap, rname);
-  free(rname);  // use regular free! (which may be redirected to our free but that's ok)
+  mi_cfree(rname);  // use checked free (which may be redirected to our free but that's ok)
+  // note: with ASAN realpath is intercepted and mi_cfree may leak the returned pointer :-(
   return result;
 }
 /*