From 1325ee640aa429ff4db080458895c8864e406a95 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Fri, 30 Jun 2023 12:16:38 -0700 Subject: [PATCH 001/119] avoid warning on newer clang --- include/mimalloc/atomic.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index fe418fab..ab407c61 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -23,8 +23,10 @@ terms of the MIT license. A copy of the license can be found in the file #define _Atomic(tp) std::atomic #define mi_atomic(name) std::atomic_##name #define mi_memory_order(name) std::memory_order_##name -#if !defined(ATOMIC_VAR_INIT) || (__cplusplus >= 202002L) // c++20, see issue #571 - #define MI_ATOMIC_VAR_INIT(x) x +#if (__cplusplus >= 202002L) // c++20, see issue #571 +#define MI_ATOMIC_VAR_INIT(x) x +#elif !defined(ATOMIC_VAR_INIT) +#define MI_ATOMIC_VAR_INIT(x) x #else #define MI_ATOMIC_VAR_INIT(x) ATOMIC_VAR_INIT(x) #endif From 36ee5f9024af87172fcb0fb8b20f8062fa3463d2 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Fri, 30 Jun 2023 12:21:01 -0700 Subject: [PATCH 002/119] avoid warning on newer clang --- include/mimalloc/atomic.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index ab407c61..f4bde7f4 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -41,7 +41,9 @@ terms of the MIT license. A copy of the license can be found in the file #include #define mi_atomic(name) atomic_##name #define mi_memory_order(name) memory_order_##name -#if !defined(ATOMIC_VAR_INIT) || (__STDC_VERSION__ >= 201710L) // c17, see issue #735 +#if (__STDC_VERSION__ >= 201710L) // c17, see issue #735 + #define MI_ATOMIC_VAR_INIT(x) x +#elif !defined(ATOMIC_VAR_INIT) #define MI_ATOMIC_VAR_INIT(x) x #else #define MI_ATOMIC_VAR_INIT(x) ATOMIC_VAR_INIT(x) From 388d1aa9bd7d25e1add460b72b4a1535cb0323d4 Mon Sep 17 00:00:00 2001 From: David Carlier Date: Fri, 14 Jul 2023 21:55:30 +0100 Subject: [PATCH 003/119] new Linux/Android option proposal. Allowing to disable transparent huge pages on Linux/Android, so we avoid affecting the whole system (such as /sys/kernel/mm/transparent_hugepage/enabled). --- CMakeLists.txt | 9 +++++++++ src/prim/unix/prim.c | 16 ++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1387e0db..0fe0b9e0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,6 +28,7 @@ option(MI_DEBUG_UBSAN "Build with undefined-behavior sanitizer (needs clan option(MI_SKIP_COLLECT_ON_EXIT "Skip collecting memory on program exit" OFF) option(MI_NO_PADDING "Force no use of padding even in DEBUG mode etc." 
OFF) option(MI_INSTALL_TOPLEVEL "Install directly into $CMAKE_INSTALL_PREFIX instead of PREFIX/lib/mimalloc-version" OFF) +option(MI_NO_THP "Force disable transparent huge pages support on Linux/Android process wise only" OFF) # deprecated options option(MI_CHECK_FULL "Use full internal invariant checking in DEBUG mode (deprecated, use MI_DEBUG_FULL instead)" OFF) @@ -263,6 +264,14 @@ if(MI_USE_CXX) endif() endif() +if(CMAKE_SYSTEM_NAME MATCHES "Linux|Android") + if(MI_NO_THP) + message(STATUS "Disable transparent huge pages support (MI_NO_THP=ON)") + list(APPEND mi_defines MI_NO_THP=1) + endif() +endif() + + if(CMAKE_SYSTEM_NAME MATCHES "Haiku") SET(CMAKE_INSTALL_LIBDIR ~/config/non-packaged/lib) SET(CMAKE_INSTALL_INCLUDEDIR ~/config/non-packaged/headers) diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c index 54bf57b2..14a0dcdb 100644 --- a/src/prim/unix/prim.c +++ b/src/prim/unix/prim.c @@ -31,6 +31,7 @@ terms of the MIT license. A copy of the license can be found in the file #if defined(__linux__) #include #include + #include #if defined(__GLIBC__) #include // linux mmap flags #else @@ -125,6 +126,20 @@ static bool unix_detect_overcommit(void) { return os_overcommit; } +void unix_set_thp(void) { +#if defined(__linux__) || defined(__ANDROID__) +#if MI_NO_THP + int val; + if (prctl(PR_GET_THP_DISABLE, &val, 0, 0, 0) != 0) { + // Most likely since distros often come with always/madvise settings. + val = 1; + // Disabling only for mimalloc process rather than touching system wide settings + (void)prctl(PR_SET_THP_DISABLE, &val, 0, 0, 0); + } +#endif +#endif +} + void _mi_prim_mem_init( mi_os_mem_config_t* config ) { long psize = sysconf(_SC_PAGESIZE); if (psize > 0) { @@ -135,6 +150,7 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) { config->has_overcommit = unix_detect_overcommit(); config->must_free_whole = false; // mmap can free in parts config->has_virtual_reserve = true; // todo: check if this true for NetBSD? (for anonymous mmap with PROT_NONE) + unix_set_thp(); } From 7020ed5e5230b8cdab4dea033dd472841c6e65f7 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 29 Feb 2024 11:26:03 -0800 Subject: [PATCH 004/119] do not purge if purge delay is negative --- src/arena.c | 96 +++++++++++++++++++++++++-------------------------- src/segment.c | 32 ++++++++--------- 2 files changed, 64 insertions(+), 64 deletions(-) diff --git a/src/arena.c b/src/arena.c index 0f71e978..42aac8fc 100644 --- a/src/arena.c +++ b/src/arena.c @@ -13,7 +13,7 @@ threads and need to be accessed using atomic operations. Arenas are used to for huge OS page (1GiB) reservations or for reserving OS memory upfront which can be improve performance or is sometimes needed -on embedded devices. We can also employ this with WASI or `sbrk` systems +on embedded devices. We can also employ this with WASI or `sbrk` systems to reserve large arenas upfront and be able to reuse the memory more effectively. The arena allocation needs to be thread safe and we use an atomic bitmap to allocate. 
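As a rough illustration of the atomic-bitmap allocation mentioned above -- a standalone sketch rather than the mimalloc implementation (the real bitmap spans multiple fields and also tracks dirty, committed, and purgeable blocks) -- claiming a run of blocks inside a single bitmap field can be done with a compare-and-swap loop:

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

// Try to atomically claim `count` contiguous bits starting at `bit_idx` in one
// bitmap field; returns true if this thread now owns those blocks.
// (Assumes 0 < count < 8*sizeof(uintptr_t) and that bit_idx + count fits in the field.)
static bool bitmap_try_claim(_Atomic(uintptr_t)* field, size_t bit_idx, size_t count) {
  const uintptr_t mask = ((((uintptr_t)1 << count) - 1) << bit_idx);
  uintptr_t expected = atomic_load_explicit(field, memory_order_relaxed);
  do {
    if ((expected & mask) != 0) return false;   // some block in the range is already in use
  } while (!atomic_compare_exchange_weak_explicit(field, &expected, expected | mask,
                                                  memory_order_acq_rel, memory_order_relaxed));
  return true;
}

Freeing is the reverse: clearing the same mask with an atomic fetch-and, which is what lets concurrent allocation and deallocation in an arena proceed without a lock.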
@@ -48,13 +48,13 @@ typedef struct mi_arena_s { size_t meta_size; // size of the arena structure itself (including its bitmaps) mi_memid_t meta_memid; // memid of the arena structure itself (OS or static allocation) int numa_node; // associated NUMA node - bool exclusive; // only allow allocations if specifically for this arena + bool exclusive; // only allow allocations if specifically for this arena bool is_large; // memory area consists of large- or huge OS pages (always committed) _Atomic(size_t) search_idx; // optimization to start the search for free blocks - _Atomic(mi_msecs_t) purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`. + _Atomic(mi_msecs_t) purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`. mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero? mi_bitmap_field_t* blocks_committed; // are the blocks committed? (can be NULL for memory that cannot be decommitted) - mi_bitmap_field_t* blocks_purge; // blocks that can be (reset) decommitted. (can be NULL for memory that cannot be (reset) decommitted) + mi_bitmap_field_t* blocks_purge; // blocks that can be (reset) decommitted. (can be NULL for memory that cannot be (reset) decommitted) mi_bitmap_field_t blocks_inuse[1]; // in-place bitmap of in-use blocks (of size `field_count`) } mi_arena_t; @@ -94,13 +94,13 @@ bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_i return mi_arena_id_is_suitable(memid.mem.arena.id, memid.mem.arena.is_exclusive, request_arena_id); } else { - return mi_arena_id_is_suitable(0, false, request_arena_id); + return mi_arena_id_is_suitable(_mi_arena_id_none(), false, request_arena_id); } } /* ----------------------------------------------------------- - Arena allocations get a (currently) 16-bit memory id where the + Arena allocations get a (currently) 16-bit memory id where the lower 8 bits are the arena id, and the upper bits the block index. ----------------------------------------------------------- */ @@ -208,7 +208,7 @@ static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, mi_bitmap_index { size_t idx = 0; // mi_atomic_load_relaxed(&arena->search_idx); // start from last search; ok to be relaxed as the exact start does not matter if (_mi_bitmap_try_find_from_claim_across(arena->blocks_inuse, arena->field_count, idx, blocks, bitmap_idx)) { - mi_atomic_store_relaxed(&arena->search_idx, mi_bitmap_index_field(*bitmap_idx)); // start search from found location next time around + mi_atomic_store_relaxed(&arena->search_idx, mi_bitmap_index_field(*bitmap_idx)); // start search from found location next time around return true; }; return false; @@ -228,7 +228,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar mi_bitmap_index_t bitmap_index; if (!mi_arena_try_claim(arena, needed_bcount, &bitmap_index)) return NULL; - // claimed it! + // claimed it! 
void* p = mi_arena_block_start(arena, bitmap_index); *memid = mi_memid_create_arena(arena->id, arena->exclusive, bitmap_index); memid->is_pinned = arena->memid.is_pinned; @@ -268,21 +268,21 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar // no need to commit, but check if already fully committed memid->initially_committed = _mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index); } - + return p; } // allocate in a speficic arena -static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node, size_t size, size_t alignment, - bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld ) +static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node, size_t size, size_t alignment, + bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld ) { MI_UNUSED_RELEASE(alignment); mi_assert_internal(alignment <= MI_SEGMENT_ALIGN); - const size_t bcount = mi_block_count_of_size(size); + const size_t bcount = mi_block_count_of_size(size); const size_t arena_index = mi_arena_id_index(arena_id); mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count)); mi_assert_internal(size <= mi_arena_block_size(bcount)); - + // Check arena suitability mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]); if (arena == NULL) return NULL; @@ -302,7 +302,7 @@ static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_no // allocate from an arena with fallback to the OS -static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, size_t alignment, +static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, size_t alignment, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld ) { @@ -310,9 +310,9 @@ static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, siz mi_assert_internal(alignment <= MI_SEGMENT_ALIGN); const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); if mi_likely(max_arena == 0) return NULL; - + if (req_arena_id != _mi_arena_id_none()) { - // try a specific arena if requested + // try a specific arena if requested if (mi_arena_id_index(req_arena_id) < max_arena) { void* p = mi_arena_try_alloc_at_id(req_arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); if (p != NULL) return p; @@ -320,7 +320,7 @@ static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, siz } else { // try numa affine allocation - for (size_t i = 0; i < max_arena; i++) { + for (size_t i = 0; i < max_arena; i++) { void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); if (p != NULL) return p; } @@ -348,22 +348,22 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re size_t arena_reserve = mi_option_get_size(mi_option_arena_reserve); if (arena_reserve == 0) return false; - if (!_mi_os_has_virtual_reserve()) { + if (!_mi_os_has_virtual_reserve()) { arena_reserve = arena_reserve/4; // be conservative if virtual reserve is not supported (for some embedded systems for example) } arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_BLOCK_SIZE); if (arena_count >= 8 && arena_count <= 128) { arena_reserve = ((size_t)1<<(arena_count/8)) * arena_reserve; // scale up the arena sizes 
exponentially - } + } if (arena_reserve < req_size) return false; // should be able to at least handle the current allocation size - + // commit eagerly? bool arena_commit = false; if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = _mi_os_has_overcommit(); } else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; } return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive */, arena_id) == 0); -} +} void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, @@ -378,9 +378,9 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN && align_offset == 0) { void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); - if (p != NULL) return p; + if (p != NULL) return p; - // otherwise, try to first eagerly reserve a new arena + // otherwise, try to first eagerly reserve a new arena if (req_arena_id == _mi_arena_id_none()) { mi_arena_id_t arena_id = 0; if (mi_arena_reserve(size, allow_large, req_arena_id, &arena_id)) { @@ -397,14 +397,14 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset errno = ENOMEM; return NULL; } - + // finally, fall back to the OS if (align_offset > 0) { return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid, tld->stats); } else { return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, tld->stats); - } + } } void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) @@ -440,22 +440,22 @@ static void mi_arena_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks, mi_assert_internal(arena->blocks_purge != NULL); mi_assert_internal(!arena->memid.is_pinned); const size_t size = mi_arena_block_size(blocks); - void* const p = mi_arena_block_start(arena, bitmap_idx); + void* const p = mi_arena_block_start(arena, bitmap_idx); bool needs_recommit; if (_mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx)) { // all blocks are committed, we can purge freely needs_recommit = _mi_os_purge(p, size, stats); } else { - // some blocks are not committed -- this can happen when a partially committed block is freed + // some blocks are not committed -- this can happen when a partially committed block is freed // in `_mi_arena_free` and it is conservatively marked as uncommitted but still scheduled for a purge - // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory), + // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory), // and also undo the decommit stats (as it was already adjusted) mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits)); needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? 
*/, stats); _mi_stat_increase(&stats->committed, size); } - + // clear the purged blocks _mi_bitmap_unclaim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx); // update committed bitmap @@ -473,7 +473,7 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t bitmap_idx, size_t if (_mi_preloading() || delay == 0) { // decommit directly - mi_arena_purge(arena, bitmap_idx, blocks, stats); + mi_arena_purge(arena, bitmap_idx, blocks, stats); } else { // schedule decommit @@ -515,7 +515,7 @@ static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startidx, } // returns true if anything was purged -static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi_stats_t* stats) +static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi_stats_t* stats) { if (arena->memid.is_pinned || arena->blocks_purge == NULL) return false; mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); @@ -524,10 +524,10 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi // reset expire (if not already set concurrently) mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire, 0); - + // potential purges scheduled, walk through the bitmap bool any_purged = false; - bool full_purge = true; + bool full_purge = true; for (size_t i = 0; i < arena->field_count; i++) { size_t purge = mi_atomic_load_relaxed(&arena->blocks_purge[i]); if (purge != 0) { @@ -578,7 +578,7 @@ static void mi_arenas_try_purge( bool force, bool visit_all, mi_stats_t* stats ) // allow only one thread to purge at a time static mi_atomic_guard_t purge_guard; - mi_atomic_guard(&purge_guard) + mi_atomic_guard(&purge_guard) { mi_msecs_t now = _mi_clock_now(); size_t max_purge_count = (visit_all ? max_arena : 1); @@ -591,7 +591,7 @@ static void mi_arenas_try_purge( bool force, bool visit_all, mi_stats_t* stats ) } } } - } + } } @@ -605,7 +605,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi if (p==NULL) return; if (size==0) return; const bool all_committed = (committed_size == size); - + if (mi_memkind_is_os(memid.memkind)) { // was a direct OS allocation, pass through if (!all_committed && committed_size > 0) { @@ -623,7 +623,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t,&mi_arenas[arena_idx]); mi_assert_internal(arena != NULL); const size_t blocks = mi_block_count_of_size(size); - + // checks if (arena == NULL) { _mi_error_message(EINVAL, "trying to free from non-existent arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); @@ -645,7 +645,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi else { mi_assert_internal(arena->blocks_committed != NULL); mi_assert_internal(arena->blocks_purge != NULL); - + if (!all_committed) { // mark the entire range as no longer committed (so we recommit the full range when re-using) _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx); @@ -660,9 +660,9 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi // works (as we should never reset decommitted parts). 
} // (delay) purge the entire range - mi_arena_schedule_purge(arena, bitmap_idx, blocks, stats); + mi_arena_schedule_purge(arena, bitmap_idx, blocks, stats); } - + // and make it available to others again bool all_inuse = _mi_bitmap_unclaim_across(arena->blocks_inuse, arena->field_count, blocks, bitmap_idx); if (!all_inuse) { @@ -687,9 +687,9 @@ static void mi_arenas_unsafe_destroy(void) { for (size_t i = 0; i < max_arena; i++) { mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); if (arena != NULL) { - if (arena->start != NULL && mi_memkind_is_os(arena->memid.memkind)) { + if (arena->start != NULL && mi_memkind_is_os(arena->memid.memkind)) { mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); - _mi_os_free(arena->start, mi_arena_size(arena), arena->memid, &_mi_stats_main); + _mi_os_free(arena->start, mi_arena_size(arena), arena->memid, &_mi_stats_main); } else { new_max_arena = i; @@ -712,7 +712,7 @@ void _mi_arena_collect(bool force_purge, mi_stats_t* stats) { // for dynamic libraries that are unloaded and need to release all their allocated memory. void _mi_arena_unsafe_destroy_all(mi_stats_t* stats) { mi_arenas_unsafe_destroy(); - _mi_arena_collect(true /* force purge */, stats); // purge non-owned arenas + _mi_arena_collect(true /* force purge */, stats); // purge non-owned arenas } // Is a pointer inside any of our arenas? @@ -720,8 +720,8 @@ bool _mi_arena_contains(const void* p) { const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); for (size_t i = 0; i < max_arena; i++) { mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); - if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_arena_block_size(arena->block_count) > (const uint8_t*)p) { - return true; + if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_arena_block_size(arena->block_count) > (const uint8_t*)p) { + return true; } } return false; @@ -765,7 +765,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int mi_memid_t meta_memid; mi_arena_t* arena = (mi_arena_t*)mi_arena_meta_zalloc(asize, &meta_memid, &_mi_stats_main); // TODO: can we avoid allocating from the OS? if (arena == NULL) return false; - + // already zero'd due to os_alloc // _mi_memzero(arena, asize); arena->id = _mi_arena_id_none(); @@ -782,12 +782,12 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int arena->search_idx = 0; arena->blocks_dirty = &arena->blocks_inuse[fields]; // just after inuse bitmap arena->blocks_committed = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[2*fields]); // just after dirty bitmap - arena->blocks_purge = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[3*fields]); // just after committed bitmap + arena->blocks_purge = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[3*fields]); // just after committed bitmap // initialize committed bitmap? 
if (arena->blocks_committed != NULL && arena->memid.initially_committed) { memset((void*)arena->blocks_committed, 0xFF, fields*sizeof(mi_bitmap_field_t)); // cast to void* to avoid atomic warning } - + // and claim leftover blocks if needed (so we never allocate there) ptrdiff_t post = (fields * MI_BITMAP_FIELD_BITS) - bcount; mi_assert_internal(post >= 0); diff --git a/src/segment.c b/src/segment.c index 6798bb66..26862899 100644 --- a/src/segment.c +++ b/src/segment.c @@ -239,7 +239,7 @@ static void mi_page_purge(mi_segment_t* segment, mi_page_t* page, mi_segments_tl mi_assert_internal(page->used == 0); mi_assert_expensive(!mi_pages_purge_contains(page, tld)); size_t psize; - void* start = mi_segment_raw_page_start(segment, page, &psize); + void* start = mi_segment_raw_page_start(segment, page, &psize); const bool needs_recommit = _mi_os_purge(start, psize, tld->stats); if (needs_recommit) { page->is_committed = false; } page->used = 0; @@ -249,7 +249,7 @@ static bool mi_page_ensure_committed(mi_segment_t* segment, mi_page_t* page, mi_ if (page->is_committed) return true; mi_assert_internal(segment->allow_decommit); mi_assert_expensive(!mi_pages_purge_contains(page, tld)); - + size_t psize; uint8_t* start = mi_segment_raw_page_start(segment, page, &psize); bool is_zero = false; @@ -259,8 +259,8 @@ static bool mi_page_ensure_committed(mi_segment_t* segment, mi_page_t* page, mi_ page->is_committed = true; page->used = 0; page->is_zero_init = is_zero; - if (gsize > 0) { - mi_segment_protect_range(start + psize, gsize, true); + if (gsize > 0) { + mi_segment_protect_range(start + psize, gsize, true); } return true; } @@ -296,7 +296,7 @@ static void mi_segment_schedule_purge(mi_segment_t* segment, mi_page_t* page, mi // purge immediately? mi_page_purge(segment, page, tld); } - else { + else if (mi_option_get(mi_option_purge_delay) > 0) { // no purging if the delay is negative // otherwise push on the delayed page reset queue mi_page_queue_t* pq = &tld->pages_purge; // push on top @@ -484,11 +484,11 @@ static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_se for (size_t i = 0; i < segment->capacity; i++) { mi_page_t* page = &segment->pages[i]; if (page->is_committed) { committed_size += page_size; } - if (!page->is_committed) { fully_committed = false; } + if (!page->is_committed) { fully_committed = false; } } MI_UNUSED(fully_committed); mi_assert_internal((fully_committed && committed_size == segment_size) || (!fully_committed && committed_size < segment_size)); - + _mi_abandoned_await_readers(); // prevent ABA issue if concurrent readers try to access our memory (that might be purged) _mi_arena_free(segment, segment_size, committed_size, segment->memid, tld->stats); } @@ -536,9 +536,9 @@ static mi_segment_t* mi_segment_os_alloc(bool eager_delayed, size_t page_alignme // commit failed; we cannot touch the memory: free the segment directly and return `NULL` _mi_arena_free(segment, segment_size, 0, memid, tld_os->stats); return NULL; - } + } } - + MI_UNUSED(info_size); segment->memid = memid; segment->allow_decommit = !memid.is_pinned; @@ -581,7 +581,7 @@ static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind, tld->peak_count < (size_t)mi_option_get(mi_option_eager_commit_delay)); const bool eager = !eager_delayed && mi_option_is_enabled(mi_option_eager_commit); const bool init_commit = eager; // || (page_kind >= MI_PAGE_LARGE); - + // Allocate the segment from the OS (segment_size can change due to alignment) mi_segment_t* segment = 
mi_segment_os_alloc(eager_delayed, page_alignment, req_arena_id, pre_size, info_size, init_commit, init_segment_size, tld, os_tld); if (segment == NULL) return NULL; @@ -609,7 +609,7 @@ static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind, segment->segment_info_size = pre_size; segment->thread_id = _mi_thread_id(); segment->cookie = _mi_ptr_cookie(segment); - + // set protection mi_segment_protect(segment, true, tld->os); @@ -628,7 +628,7 @@ static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t // don't purge as we are freeing now mi_segment_remove_all_purges(segment, false /* don't force as we are about to free */, tld); mi_segment_remove_from_free_queue(segment, tld); - + mi_assert_expensive(!mi_segment_queue_contains(&tld->small_free, segment)); mi_assert_expensive(!mi_segment_queue_contains(&tld->medium_free, segment)); mi_assert(segment->next == NULL); @@ -655,10 +655,10 @@ static bool mi_segment_page_claim(mi_segment_t* segment, mi_page_t* page, mi_seg // check commit if (!mi_page_ensure_committed(segment, page, tld)) return false; - + // set in-use before doing unreset to prevent delayed reset page->segment_in_use = true; - segment->used++; + segment->used++; mi_assert_internal(page->segment_in_use && page->is_committed && page->used==0 && !mi_pages_purge_contains(page,tld)); mi_assert_internal(segment->used <= segment->capacity); if (segment->used == segment->capacity && segment->page_kind <= MI_PAGE_MEDIUM) { @@ -1134,7 +1134,7 @@ static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t block_s static mi_page_t* mi_segment_find_free(mi_segment_t* segment, mi_segments_tld_t* tld) { mi_assert_internal(mi_segment_has_free(segment)); - mi_assert_expensive(mi_segment_is_valid(segment, tld)); + mi_assert_expensive(mi_segment_is_valid(segment, tld)); for (size_t i = 0; i < segment->capacity; i++) { // TODO: use a bitmap instead of search? 
mi_page_t* page = &segment->pages[i]; if (!page->segment_in_use) { @@ -1274,7 +1274,7 @@ void _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_bloc mi_assert_internal(page->free == NULL); if (segment->allow_decommit && page->is_committed) { size_t usize = mi_usable_size(block); - if (usize > sizeof(mi_block_t)) { + if (usize > sizeof(mi_block_t)) { usize = usize - sizeof(mi_block_t); uint8_t* p = (uint8_t*)block + sizeof(mi_block_t); _mi_os_reset(p, usize, &_mi_stats_main); From 8f353d8005c919d609e95003527c826c8c4e8310 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 29 Feb 2024 12:03:28 -0800 Subject: [PATCH 005/119] set initially_zero for arena_static_zalloc --- src/arena.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/arena.c b/src/arena.c index 42aac8fc..790b6ac1 100644 --- a/src/arena.c +++ b/src/arena.c @@ -162,6 +162,7 @@ static void* mi_arena_static_zalloc(size_t size, size_t alignment, mi_memid_t* m // success *memid = _mi_memid_create(MI_MEM_STATIC); + memid->initially_zero = true; const size_t start = _mi_align_up(oldtop, alignment); uint8_t* const p = &mi_arena_static[start]; _mi_memzero(p, size); @@ -179,8 +180,10 @@ static void* mi_arena_meta_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* st p = _mi_os_alloc(size, memid, stats); if (p == NULL) return NULL; + // zero the OS memory if needed if (!memid->initially_zero) { _mi_memzero_aligned(p, size); + memid->initially_zero = true; } return p; } From bdda13b8800859558b95b937a747eda8c31a3b4e Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 29 Feb 2024 14:27:28 -0800 Subject: [PATCH 006/119] track abandoned segments in an arena bitmap instead of with a list --- include/mimalloc/internal.h | 4 + include/mimalloc/types.h | 1 - src/arena.c | 96 +++++++++++++++++- src/segment.c | 197 +++++------------------------------- 4 files changed, 122 insertions(+), 176 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 40401736..94ac04f5 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -124,6 +124,10 @@ bool _mi_arena_contains(const void* p); void _mi_arena_collect(bool force_purge, mi_stats_t* stats); void _mi_arena_unsafe_destroy_all(mi_stats_t* stats); +bool _mi_arena_segment_clear_abandoned(mi_memid_t memid); +void _mi_arena_segment_mark_abandoned(mi_memid_t memid); +mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_id_t* current_id, size_t* current_idx); + // "segment-map.c" void _mi_segment_map_allocated_at(const mi_segment_t* segment); void _mi_segment_map_freed_at(const mi_segment_t* segment); diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 06b96587..0a72af52 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -373,7 +373,6 @@ typedef struct mi_segment_s { size_t segment_size; // for huge pages this may be different from `MI_SEGMENT_SIZE` // segment fields - _Atomic(struct mi_segment_s*) abandoned_next; struct mi_segment_s* next; // must be the first segment field after abandoned_next -- see `segment.c:segment_init` struct mi_segment_s* prev; diff --git a/src/arena.c b/src/arena.c index 42aac8fc..72898175 100644 --- a/src/arena.c +++ b/src/arena.c @@ -55,6 +55,7 @@ typedef struct mi_arena_s { mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero? mi_bitmap_field_t* blocks_committed; // are the blocks committed? (can be NULL for memory that cannot be decommitted) mi_bitmap_field_t* blocks_purge; // blocks that can be (reset) decommitted. 
(can be NULL for memory that cannot be (reset) decommitted) + mi_bitmap_field_t* blocks_abandoned; // blocks that start with an abandoned segment. (This crosses API's but it is convenient to have here) mi_bitmap_field_t blocks_inuse[1]; // in-place bitmap of in-use blocks (of size `field_count`) } mi_arena_t; @@ -727,6 +728,89 @@ bool _mi_arena_contains(const void* p) { return false; } +/* ----------------------------------------------------------- + Abandoned blocks/segments. + This is used to atomically abandon/reclaim segments + (and crosses the arena API but it is convenient to have here). + Abandoned segments still have live blocks; they get reclaimed + when a thread frees in it, or when a thread needs a fresh + segment; these threads scan the abandoned segments through + the arena bitmaps. +----------------------------------------------------------- */ + +// reclaim a specific abandoned segment; `true` on success. +bool _mi_arena_segment_clear_abandoned(mi_memid_t memid ) +{ + if (memid.memkind != MI_MEM_ARENA) return true; // not in an arena, consider it un-abandoned + size_t arena_idx; + size_t bitmap_idx; + mi_arena_memid_indices(memid, &arena_idx, &bitmap_idx); + mi_assert_internal(arena_idx < MI_MAX_ARENAS); + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); + mi_assert_internal(arena != NULL); + bool was_abandoned = _mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx); + mi_assert_internal(was_abandoned); + mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); + mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); + return was_abandoned; +} + +// mark a specific segment as abandoned +void _mi_arena_segment_mark_abandoned(mi_memid_t memid) +{ + if (memid.memkind != MI_MEM_ARENA) return; // not in an arena + size_t arena_idx; + size_t bitmap_idx; + mi_arena_memid_indices(memid, &arena_idx, &bitmap_idx); + mi_assert_internal(arena_idx < MI_MAX_ARENAS); + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); + mi_assert_internal(arena != NULL); + const bool was_unset = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL); + mi_assert_internal(was_unset); + mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); +} + +// reclaim abandoned segments +mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_id_t* previous_id, size_t* previous_idx ) +{ + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + int arena_idx = *previous_id; + size_t field_idx = mi_bitmap_index_field(*previous_idx); + size_t bit_idx = mi_bitmap_index_bit_in_field(*previous_idx) + 1; + // visit arena's (from previous) + for( ; arena_idx < max_arena; arena_idx++, field_idx = 0, bit_idx = 0) { + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); + if (arena != NULL) { + // visit the abandoned fields (starting at previous_idx) + for ( ; field_idx < arena->field_count; field_idx++, bit_idx = 0) { + mi_bitmap_field_t field = mi_atomic_load_relaxed(&arena->blocks_abandoned[field_idx]); + if mi_unlikely(field != 0) { // skip zero fields quickly + // visit each set bit in the field (todo: maybe use `ctz` here?) 
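For the `ctz` idea in the todo above, a hypothetical standalone helper (not part of this patch) could skip over zero bits with the GCC/Clang count-trailing-zeros builtin instead of testing every bit position:

#include <stddef.h>
#include <stdint.h>

// Return the index of the first set bit at or after `start_bit` in a 64-bit
// bitmap field, or 64 if there is none. (Assumes start_bit < 64.)
static inline size_t bitmap_next_set_bit(uint64_t field, size_t start_bit) {
  const uint64_t masked = field & ~(((uint64_t)1 << start_bit) - 1);  // clear bits below start_bit
  return (masked == 0 ? 64 : (size_t)__builtin_ctzll(masked));
}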
+ for ( ; bit_idx < MI_BITMAP_FIELD_BITS; bit_idx++) { + // pre-check if the bit is set + mi_bitmap_field_t mask = ((mi_bitmap_field_t)1 << bit_idx); + if mi_unlikely((field & mask) == mask) { + mi_bitmap_index_t bitmap_idx = mi_bitmap_index_create(field_idx, bit_idx); + // try to reclaim it atomically + if (_mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx)) { + *previous_idx = bitmap_idx; + *previous_id = arena_idx; + mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); + //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); + return (mi_segment_t*)mi_arena_block_start(arena, bitmap_idx); + } + } + } + } + } + } + } + // no more found + *previous_idx = 0; + *previous_id = 0; + return NULL; +} + /* ----------------------------------------------------------- Add an arena. @@ -760,13 +844,13 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int const size_t bcount = size / MI_ARENA_BLOCK_SIZE; const size_t fields = _mi_divide_up(bcount, MI_BITMAP_FIELD_BITS); - const size_t bitmaps = (memid.is_pinned ? 2 : 4); + const size_t bitmaps = (memid.is_pinned ? 3 : 5); const size_t asize = sizeof(mi_arena_t) + (bitmaps*fields*sizeof(mi_bitmap_field_t)); mi_memid_t meta_memid; mi_arena_t* arena = (mi_arena_t*)mi_arena_meta_zalloc(asize, &meta_memid, &_mi_stats_main); // TODO: can we avoid allocating from the OS? if (arena == NULL) return false; - // already zero'd due to os_alloc + // already zero'd due to zalloc // _mi_memzero(arena, asize); arena->id = _mi_arena_id_none(); arena->memid = memid; @@ -780,9 +864,11 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int arena->is_large = is_large; arena->purge_expire = 0; arena->search_idx = 0; - arena->blocks_dirty = &arena->blocks_inuse[fields]; // just after inuse bitmap - arena->blocks_committed = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[2*fields]); // just after dirty bitmap - arena->blocks_purge = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[3*fields]); // just after committed bitmap + // consequetive bitmaps + arena->blocks_dirty = &arena->blocks_inuse[fields]; // just after inuse bitmap + arena->blocks_abandoned = &arena->blocks_inuse[2 * fields]; // just after dirty bitmap + arena->blocks_committed = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[3*fields]); // just after abandonde bitmap + arena->blocks_purge = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[4*fields]); // just after committed bitmap // initialize committed bitmap? if (arena->blocks_committed != NULL && arena->memid.initially_committed) { memset((void*)arena->blocks_committed, 0xFF, fields*sizeof(mi_bitmap_field_t)); // cast to void* to avoid atomic warning diff --git a/src/segment.c b/src/segment.c index 26862899..25ab9193 100644 --- a/src/segment.c +++ b/src/segment.c @@ -587,9 +587,7 @@ static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind, if (segment == NULL) return NULL; mi_assert_internal(segment != NULL && (uintptr_t)segment % MI_SEGMENT_SIZE == 0); mi_assert_internal(segment->memid.is_pinned ? 
segment->memid.initially_committed : true); - - mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL); // tsan - + // zero the segment info (but not the `mem` fields) ptrdiff_t ofs = offsetof(mi_segment_t, next); _mi_memzero((uint8_t*)segment + ofs, info_size - ofs); @@ -743,171 +741,25 @@ Abandonment When threads terminate, they can leave segments with live blocks (reached through other threads). Such segments are "abandoned" and will be reclaimed by other threads to -reuse their pages and/or free them eventually +reuse their pages and/or free them eventually. The +`thread_id` of such segments is 0. -We maintain a global list of abandoned segments that are -reclaimed on demand. Since this is shared among threads -the implementation needs to avoid the A-B-A problem on -popping abandoned segments: -We use tagged pointers to avoid accidentally identifying -reused segments, much like stamped references in Java. -Secondly, we maintain a reader counter to avoid resetting -or decommitting segments that have a pending read operation. +When a block is freed in an abandoned segment, the segment +is reclaimed into that thread. -Note: the current implementation is one possible design; -another way might be to keep track of abandoned segments -in the regions. This would have the advantage of keeping -all concurrent code in one place and not needing to deal -with ABA issues. The drawback is that it is unclear how to -scan abandoned segments efficiently in that case as they -would be spread among all other segments in the regions. +Moreover, if threads are looking for a fresh segment, they +will first consider abondoned segments -- these can be found +by scanning the arena memory +(segments outside arena memoryare only reclaimed by a free). ----------------------------------------------------------- */ -// Use the bottom 20-bits (on 64-bit) of the aligned segment pointers -// to put in a tag that increments on update to avoid the A-B-A problem. -#define MI_TAGGED_MASK MI_SEGMENT_MASK -typedef uintptr_t mi_tagged_segment_t; +// Maintain these for debug purposes +static mi_decl_cache_align _Atomic(size_t)abandoned_count; -static mi_segment_t* mi_tagged_segment_ptr(mi_tagged_segment_t ts) { - return (mi_segment_t*)(ts & ~MI_TAGGED_MASK); -} -static mi_tagged_segment_t mi_tagged_segment(mi_segment_t* segment, mi_tagged_segment_t ts) { - mi_assert_internal(((uintptr_t)segment & MI_TAGGED_MASK) == 0); - uintptr_t tag = ((ts & MI_TAGGED_MASK) + 1) & MI_TAGGED_MASK; - return ((uintptr_t)segment | tag); -} - -// This is a list of visited abandoned pages that were full at the time. -// this list migrates to `abandoned` when that becomes NULL. The use of -// this list reduces contention and the rate at which segments are visited. -static mi_decl_cache_align _Atomic(mi_segment_t*) abandoned_visited; // = NULL - -// The abandoned page list (tagged as it supports pop) -static mi_decl_cache_align _Atomic(mi_tagged_segment_t) abandoned; // = NULL - -// Maintain these for debug purposes (these counts may be a bit off) -static mi_decl_cache_align _Atomic(size_t) abandoned_count; -static mi_decl_cache_align _Atomic(size_t) abandoned_visited_count; - -// We also maintain a count of current readers of the abandoned list -// in order to prevent resetting/decommitting segment memory if it might -// still be read. 
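With the bitmap scheme described above, several threads can race to reclaim the same abandoned segment; the race is decided by atomically clearing the segment's abandoned bit, so that exactly one thread observes the set-to-clear transition. A minimal standalone sketch of that idea (hypothetical names, not the patch itself):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

// Atomically clear one "abandoned" bit; returns true only for the single
// thread that actually cleared it, i.e. the thread that now owns the segment.
static bool try_clear_abandoned_bit(_Atomic(uintptr_t)* field, size_t bit_idx) {
  const uintptr_t mask = (uintptr_t)1 << bit_idx;
  const uintptr_t prev = atomic_fetch_and_explicit(field, ~mask, memory_order_acq_rel);
  return (prev & mask) != 0;
}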
-static mi_decl_cache_align _Atomic(size_t) abandoned_readers; // = 0 - -// Push on the visited list -static void mi_abandoned_visited_push(mi_segment_t* segment) { - mi_assert_internal(segment->thread_id == 0); - mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t,&segment->abandoned_next) == NULL); - mi_assert_internal(segment->next == NULL && segment->prev == NULL); - mi_assert_internal(segment->used > 0); - mi_segment_t* anext = mi_atomic_load_ptr_relaxed(mi_segment_t, &abandoned_visited); - do { - mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, anext); - } while (!mi_atomic_cas_ptr_weak_release(mi_segment_t, &abandoned_visited, &anext, segment)); - mi_atomic_increment_relaxed(&abandoned_visited_count); -} - -// Move the visited list to the abandoned list. -static bool mi_abandoned_visited_revisit(void) -{ - // quick check if the visited list is empty - if (mi_atomic_load_ptr_relaxed(mi_segment_t, &abandoned_visited) == NULL) return false; - - // grab the whole visited list - mi_segment_t* first = mi_atomic_exchange_ptr_acq_rel(mi_segment_t, &abandoned_visited, NULL); - if (first == NULL) return false; - - // first try to swap directly if the abandoned list happens to be NULL - mi_tagged_segment_t afirst; - mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned); - if (mi_tagged_segment_ptr(ts)==NULL) { - size_t count = mi_atomic_load_relaxed(&abandoned_visited_count); - afirst = mi_tagged_segment(first, ts); - if (mi_atomic_cas_strong_acq_rel(&abandoned, &ts, afirst)) { - mi_atomic_add_relaxed(&abandoned_count, count); - mi_atomic_sub_relaxed(&abandoned_visited_count, count); - return true; - } - } - - // find the last element of the visited list: O(n) - mi_segment_t* last = first; - mi_segment_t* next; - while ((next = mi_atomic_load_ptr_relaxed(mi_segment_t, &last->abandoned_next)) != NULL) { - last = next; - } - - // and atomically prepend to the abandoned list - // (no need to increase the readers as we don't access the abandoned segments) - mi_tagged_segment_t anext = mi_atomic_load_relaxed(&abandoned); - size_t count; - do { - count = mi_atomic_load_relaxed(&abandoned_visited_count); - mi_atomic_store_ptr_release(mi_segment_t, &last->abandoned_next, mi_tagged_segment_ptr(anext)); - afirst = mi_tagged_segment(first, anext); - } while (!mi_atomic_cas_weak_release(&abandoned, &anext, afirst)); - mi_atomic_add_relaxed(&abandoned_count, count); - mi_atomic_sub_relaxed(&abandoned_visited_count, count); - return true; -} - -// Push on the abandoned list. 
-static void mi_abandoned_push(mi_segment_t* segment) { - mi_assert_internal(segment->thread_id == 0); - mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL); - mi_assert_internal(segment->next == NULL && segment->prev == NULL); - mi_assert_internal(segment->used > 0); - mi_tagged_segment_t next; - mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned); - do { - mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, mi_tagged_segment_ptr(ts)); - next = mi_tagged_segment(segment, ts); - } while (!mi_atomic_cas_weak_release(&abandoned, &ts, next)); - mi_atomic_increment_relaxed(&abandoned_count); -} - -// Wait until there are no more pending reads on segments that used to be in the abandoned list +// legacy: Wait until there are no more pending reads on segments that used to be in the abandoned list void _mi_abandoned_await_readers(void) { - size_t n; - do { - n = mi_atomic_load_acquire(&abandoned_readers); - if (n != 0) mi_atomic_yield(); - } while (n != 0); -} - -// Pop from the abandoned list -static mi_segment_t* mi_abandoned_pop(void) { - mi_segment_t* segment; - // Check efficiently if it is empty (or if the visited list needs to be moved) - mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned); - segment = mi_tagged_segment_ptr(ts); - if mi_likely(segment == NULL) { - if mi_likely(!mi_abandoned_visited_revisit()) { // try to swap in the visited list on NULL - return NULL; - } - } - - // Do a pop. We use a reader count to prevent - // a segment to be decommitted while a read is still pending, - // and a tagged pointer to prevent A-B-A link corruption. - // (this is called from `region.c:_mi_mem_free` for example) - mi_atomic_increment_relaxed(&abandoned_readers); // ensure no segment gets decommitted - mi_tagged_segment_t next = 0; - ts = mi_atomic_load_acquire(&abandoned); - do { - segment = mi_tagged_segment_ptr(ts); - if (segment != NULL) { - mi_segment_t* anext = mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next); - next = mi_tagged_segment(anext, ts); // note: reads the segment's `abandoned_next` field so should not be decommitted - } - } while (segment != NULL && !mi_atomic_cas_weak_acq_rel(&abandoned, &ts, next)); - mi_atomic_decrement_relaxed(&abandoned_readers); // release reader lock - if (segment != NULL) { - mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL); - mi_atomic_decrement_relaxed(&abandoned_count); - } - return segment; + // nothing needed } /* ----------------------------------------------------------- @@ -917,7 +769,6 @@ static mi_segment_t* mi_abandoned_pop(void) { static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) { mi_assert_internal(segment->used == segment->abandoned); mi_assert_internal(segment->used > 0); - mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL); mi_assert_expensive(mi_segment_is_valid(segment, tld)); // remove the segment from the free page queue if needed @@ -931,8 +782,7 @@ static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) { mi_segments_track_size(-((long)segment->segment_size), tld); segment->thread_id = 0; segment->abandoned_visits = 0; - mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL); - mi_abandoned_push(segment); + _mi_arena_segment_mark_abandoned(segment->memid); mi_atomic_increment_relaxed(&abandoned_count); } void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld) { @@ -995,7 +845,6 @@ static 
bool mi_segment_check_free(mi_segment_t* segment, size_t block_size, bool // Reclaim a segment; returns NULL if the segment was freed // set `right_page_reclaimed` to `true` if it reclaimed a page of the right `block_size` that was not full. static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, size_t requested_block_size, bool* right_page_reclaimed, mi_segments_tld_t* tld) { - mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL); if (right_page_reclaimed != NULL) { *right_page_reclaimed = false; } segment->thread_id = _mi_thread_id(); @@ -1056,7 +905,10 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld) { mi_segment_t* segment; - while ((segment = mi_abandoned_pop()) != NULL) { + mi_arena_id_t current_id = 0; + size_t current_idx = 0; + while ((segment = _mi_arena_segment_clear_abandoned_next(¤t_id, ¤t_idx)) != NULL) { + mi_atomic_decrement_relaxed(&abandoned_count); mi_segment_reclaim(segment, heap, 0, NULL, tld); } } @@ -1065,8 +917,12 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size, { *reclaimed = false; mi_segment_t* segment; - long max_tries = mi_option_get_clamp(mi_option_max_segment_reclaim, 8, 1024); // limit the work to bound allocation times - while ((max_tries-- > 0) && ((segment = mi_abandoned_pop()) != NULL)) { + mi_arena_id_t current_id = 0; + size_t current_idx = 0; + long max_tries = mi_option_get_clamp(mi_option_max_segment_reclaim, 0, 1024); // limit the work to bound allocation times + while ((max_tries-- > 0) && ((segment = _mi_arena_segment_clear_abandoned_next(¤t_id, ¤t_idx)) != NULL)) + { + mi_atomic_decrement_relaxed(&abandoned_count); segment->abandoned_visits++; // todo: an arena exclusive heap will potentially visit many abandoned unsuitable segments // and push them into the visited list and use many tries. Perhaps we can skip non-suitable ones in a better way? @@ -1092,9 +948,10 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size, mi_segment_reclaim(segment, heap, 0, NULL, tld); } else { - // otherwise, push on the visited list so it gets not looked at too quickly again + // otherwise, mark it back as abandoned // todo: reset delayed pages in the segment? 
- mi_abandoned_visited_push(segment); + mi_atomic_increment_relaxed(&abandoned_count); + _mi_arena_segment_mark_abandoned(segment->memid); } } return NULL; From 64edbc92dd081a81f9166d688c0ae81c4231a697 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 29 Feb 2024 14:58:59 -0800 Subject: [PATCH 007/119] allow abandoned segment reclaim on a free --- include/mimalloc.h | 1 + include/mimalloc/internal.h | 1 + src/alloc.c | 19 +++++++++++++++---- src/arena.c | 7 ++++--- src/options.c | 5 +++-- src/segment.c | 12 +++++++++++- 6 files changed, 35 insertions(+), 10 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index 368c22cc..7b1b7d5a 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -342,6 +342,7 @@ typedef enum mi_option_e { mi_option_arena_reserve, // initial memory size in KiB for arena reservation (1GiB on 64-bit) mi_option_arena_purge_mult, mi_option_purge_extend_delay, + mi_option_abandoned_reclaim_on_free, // reclaim abandoned segments on a free _mi_option_last, // legacy option names mi_option_large_os_pages = mi_option_allow_large_os_pages, diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 94ac04f5..1143a184 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -147,6 +147,7 @@ void _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, m void _mi_segment_thread_collect(mi_segments_tld_t* tld); void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld); void _mi_abandoned_await_readers(void); +bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment); // "page.c" void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept mi_attr_malloc; diff --git a/src/alloc.c b/src/alloc.c index b17fdbdc..738985aa 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -406,12 +406,24 @@ static void mi_stat_huge_free(const mi_page_t* page) { // multi-threaded free (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* block) { + // first see if the segment was abandoned and we can reclaim it + mi_segment_t* const segment = _mi_page_segment(page); + if (mi_option_is_enabled(mi_option_abandoned_reclaim_on_free) && + mi_atomic_load_relaxed(&segment->thread_id) == 0) + { + // the segment is abandoned, try to reclaim it into our heap + if (_mi_segment_attempt_reclaim(mi_prim_get_default_heap(), segment)) { + mi_assert_internal(_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); + mi_free(block); // recursively free as now it will be a local free in our heap + return; + } + } + // The padding check may access the non-thread-owned page for the key values. // that is safe as these are constant and the page won't be freed (as the block is not freed yet). mi_check_padding(page, block); _mi_padding_shrink(page, block, sizeof(mi_block_t)); // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection - mi_segment_t* const segment = _mi_page_segment(page); if (segment->page_kind == MI_PAGE_HUGE) { #if MI_HUGE_PAGE_ABANDON // huge page segments are always abandoned and can be freed immediately @@ -424,9 +436,8 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc // (as the owning thread needs to actually free the memory later). 
_mi_segment_huge_page_reset(segment, page, block); #endif - } - - + } + #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading memset(block, MI_DEBUG_FREED, mi_usable_size(block)); #endif diff --git a/src/arena.c b/src/arena.c index 72898175..9ce8c3f7 100644 --- a/src/arena.c +++ b/src/arena.c @@ -749,9 +749,9 @@ bool _mi_arena_segment_clear_abandoned(mi_memid_t memid ) mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); mi_assert_internal(arena != NULL); bool was_abandoned = _mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx); - mi_assert_internal(was_abandoned); - mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); - mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); + // mi_assert_internal(was_abandoned); + mi_assert_internal(!was_abandoned || _mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); + //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); return was_abandoned; } @@ -766,6 +766,7 @@ void _mi_arena_segment_mark_abandoned(mi_memid_t memid) mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); mi_assert_internal(arena != NULL); const bool was_unset = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL); + MI_UNUSED_RELEASE(was_unset); mi_assert_internal(was_unset); mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); } diff --git a/src/options.c b/src/options.c index 2a83fcc7..61ed5be7 100644 --- a/src/options.c +++ b/src/options.c @@ -81,7 +81,7 @@ static mi_option_desc_t options[_mi_option_last] = { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose { 16, UNINIT, MI_OPTION(max_errors) }, // maximum errors that are output { 16, UNINIT, MI_OPTION(max_warnings) }, // maximum warnings that are output - { 8, UNINIT, MI_OPTION(max_segment_reclaim)}, // max. number of segment reclaims from the abandoned segments per try. + { 16, UNINIT, MI_OPTION(max_segment_reclaim)}, // max. number of segment reclaims from the abandoned segments per try. { 0, UNINIT, MI_OPTION(destroy_on_exit)}, // release all OS memory on process exit; careful with dangling pointer or after-exit frees! 
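The new `mi_option_abandoned_reclaim_on_free` option above defaults to enabled. As a usage illustration (a sketch assuming the standard mimalloc option API), a program that wants the previous behavior can turn it off before allocating:

#include <mimalloc.h>
#include <stdio.h>

int main(void) {
  // opt out of reclaiming abandoned segments on a cross-thread free,
  // e.g. to compare behavior while benchmarking
  mi_option_disable(mi_option_abandoned_reclaim_on_free);
  void* p = mi_malloc(64);
  printf("allocated %p with reclaim-on-free disabled\n", p);
  mi_free(p);
  return 0;
}

The same option can typically also be set through the environment, since mimalloc reads `MIMALLOC_<OPTION>` variables (here `MIMALLOC_ABANDONED_RECLAIM_ON_FREE=0`).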
#if (MI_INTPTR_SIZE>4) { 1024L * 1024L, UNINIT, MI_OPTION(arena_reserve) }, // reserve memory N KiB at a time @@ -89,8 +89,9 @@ static mi_option_desc_t options[_mi_option_last] = { 128L * 1024L, UNINIT, MI_OPTION(arena_reserve) }, #endif - { 10, UNINIT, MI_OPTION(arena_purge_mult) }, // purge delay multiplier for arena's + { 10, UNINIT, MI_OPTION(arena_purge_mult) }, // purge delay multiplier for arena's { 1, UNINIT, MI_OPTION_LEGACY(purge_extend_delay, decommit_extend_delay) }, + { 1, UNINIT, MI_OPTION(abandoned_reclaim_on_free) }, // reclaim an abandoned segment on a free }; static void mi_option_init(mi_option_desc_t* desc); diff --git a/src/segment.c b/src/segment.c index 25ab9193..3f325dcf 100644 --- a/src/segment.c +++ b/src/segment.c @@ -336,7 +336,7 @@ static void mi_segment_remove_all_purges(mi_segment_t* segment, bool force_purge mi_page_t* page = &segment->pages[i]; if (!page->segment_in_use) { mi_page_purge_remove(page, tld); - if (force_purge) { + if (force_purge && page->is_committed) { mi_page_purge(segment, page, tld); } } @@ -902,6 +902,16 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, } } +// attempt to reclaim a particular segment (called from multi threaded free `alloc.c:mi_free_block_mt`) +bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment) { + if (mi_atomic_load_relaxed(&segment->thread_id) != 0) return false; // it is not abandoned + if (_mi_arena_segment_clear_abandoned(segment->memid)) { // atomically unabandon + mi_segment_t* res = mi_segment_reclaim(segment, heap, 0, NULL, &heap->tld->segments); + mi_assert_internal(res != NULL); + return (res != NULL); + } + return false; +} void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld) { mi_segment_t* segment; From d34d8f0f657bc6bdda5e8bc5ba530719ec410d43 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 29 Feb 2024 15:39:47 -0800 Subject: [PATCH 008/119] fix signed comparison --- src/arena.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/arena.c b/src/arena.c index 9ce8c3f7..cd3a7ef2 100644 --- a/src/arena.c +++ b/src/arena.c @@ -774,7 +774,7 @@ void _mi_arena_segment_mark_abandoned(mi_memid_t memid) // reclaim abandoned segments mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_id_t* previous_id, size_t* previous_idx ) { - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + const int max_arena = (int)mi_atomic_load_relaxed(&mi_arena_count); int arena_idx = *previous_id; size_t field_idx = mi_bitmap_index_field(*previous_idx); size_t bit_idx = mi_bitmap_index_bit_in_field(*previous_idx) + 1; From 9f9305d44bb3713abc9db586b051ce537d5b4d76 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 29 Feb 2024 17:43:52 -0800 Subject: [PATCH 009/119] use non-primitive heap default --- src/alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/alloc.c b/src/alloc.c index 738985aa..7135a857 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -412,7 +412,7 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc mi_atomic_load_relaxed(&segment->thread_id) == 0) { // the segment is abandoned, try to reclaim it into our heap - if (_mi_segment_attempt_reclaim(mi_prim_get_default_heap(), segment)) { + if (_mi_segment_attempt_reclaim(mi_heap_get_default(), segment)) { mi_assert_internal(_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); mi_free(block); // recursively free as now it will be a local free in our heap return; From 
931d523dccc7f276e0a665efd7168ff2c72b579a Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 29 Feb 2024 18:17:58 -0800 Subject: [PATCH 010/119] update mstress to let the main thread participate --- src/alloc.c | 5 ++++- src/segment.c | 4 ++-- test/test-stress.c | 12 +++++++----- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/src/alloc.c b/src/alloc.c index 7135a857..e2273d28 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -408,7 +408,10 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc { // first see if the segment was abandoned and we can reclaim it mi_segment_t* const segment = _mi_page_segment(page); - if (mi_option_is_enabled(mi_option_abandoned_reclaim_on_free) && + if (mi_option_is_enabled(mi_option_abandoned_reclaim_on_free) && + #if MI_HUGE_PAGE_ABANDON + segment->page_kind != MI_PAGE_HUGE && + #endif mi_atomic_load_relaxed(&segment->thread_id) == 0) { // the segment is abandoned, try to reclaim it into our heap diff --git a/src/segment.c b/src/segment.c index 3f325dcf..3dd37429 100644 --- a/src/segment.c +++ b/src/segment.c @@ -867,7 +867,6 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, // set the heap again and allow heap thread delayed free again. mi_page_set_heap(page, heap); _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, true); // override never (after heap is set) - // TODO: should we not collect again given that we just collected in `check_free`? _mi_page_free_collect(page, false); // ensure used count is up to date if (mi_page_all_free(page)) { // if everything free already, clear the page directly @@ -906,8 +905,9 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment) { if (mi_atomic_load_relaxed(&segment->thread_id) != 0) return false; // it is not abandoned if (_mi_arena_segment_clear_abandoned(segment->memid)) { // atomically unabandon + mi_atomic_decrement_relaxed(&abandoned_count); mi_segment_t* res = mi_segment_reclaim(segment, heap, 0, NULL, &heap->tld->segments); - mi_assert_internal(res != NULL); + mi_assert_internal(res == segment); return (res != NULL); } return false; diff --git a/test/test-stress.c b/test/test-stress.c index 3ecb67bd..8c81c009 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -291,13 +291,14 @@ static void run_os_threads(size_t nthreads, void (*fun)(intptr_t)) { thread_entry_fun = fun; DWORD* tids = (DWORD*)custom_calloc(nthreads,sizeof(DWORD)); HANDLE* thandles = (HANDLE*)custom_calloc(nthreads,sizeof(HANDLE)); - for (uintptr_t i = 0; i < nthreads; i++) { + for (uintptr_t i = 1; i < nthreads; i++) { thandles[i] = CreateThread(0, 8*1024, &thread_entry, (void*)(i), 0, &tids[i]); } - for (size_t i = 0; i < nthreads; i++) { + fun(0); // run the main thread as well + for (size_t i = 1; i < nthreads; i++) { WaitForSingleObject(thandles[i], INFINITE); } - for (size_t i = 0; i < nthreads; i++) { + for (size_t i = 1; i < nthreads; i++) { CloseHandle(thandles[i]); } custom_free(tids); @@ -325,10 +326,11 @@ static void run_os_threads(size_t nthreads, void (*fun)(intptr_t)) { pthread_t* threads = (pthread_t*)custom_calloc(nthreads,sizeof(pthread_t)); memset(threads, 0, sizeof(pthread_t) * nthreads); //pthread_setconcurrency(nthreads); - for (size_t i = 0; i < nthreads; i++) { + for (size_t i = 1; i < nthreads; i++) { pthread_create(&threads[i], NULL, &thread_entry, (void*)i); } - for (size_t i = 0; i < nthreads; i++) { + fun(0); // run the main thread as well 
+ for (size_t i = 1; i < nthreads; i++) { pthread_join(threads[i], NULL); } custom_free(threads); From 1b3eb8ef2863b810e99236ec9bf1eb558c67b1f1 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 29 Feb 2024 19:17:24 -0800 Subject: [PATCH 011/119] quick exit from try_reclaim if no abandoned segments --- src/segment.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/segment.c b/src/segment.c index 3dd37429..08909eb1 100644 --- a/src/segment.c +++ b/src/segment.c @@ -926,6 +926,8 @@ void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld) { static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size, mi_page_kind_t page_kind, bool* reclaimed, mi_segments_tld_t* tld) { *reclaimed = false; + if (mi_atomic_load_relaxed(&abandoned_count) == 0) return NULL; + mi_segment_t* segment; mi_arena_id_t current_id = 0; size_t current_idx = 0; From 71bcf1c76bd48f6bd3237944e1fd242466228e3e Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 1 Mar 2024 10:31:58 -0800 Subject: [PATCH 012/119] maintain abandoned_count more robustly --- src/arena.c | 21 ++++++++++++++------- src/segment.c | 13 ++----------- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/src/arena.c b/src/arena.c index cd3a7ef2..59db0b8e 100644 --- a/src/arena.c +++ b/src/arena.c @@ -738,6 +738,9 @@ bool _mi_arena_contains(const void* p) { the arena bitmaps. ----------------------------------------------------------- */ +// Maintain these for debug purposes +static mi_decl_cache_align _Atomic(size_t)abandoned_count; + // reclaim a specific abandoned segment; `true` on success. bool _mi_arena_segment_clear_abandoned(mi_memid_t memid ) { @@ -748,11 +751,12 @@ bool _mi_arena_segment_clear_abandoned(mi_memid_t memid ) mi_assert_internal(arena_idx < MI_MAX_ARENAS); mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); mi_assert_internal(arena != NULL); - bool was_abandoned = _mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx); - // mi_assert_internal(was_abandoned); - mi_assert_internal(!was_abandoned || _mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); + bool was_marked = _mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx); + if (was_marked) { mi_atomic_decrement_relaxed(&abandoned_count); } + // mi_assert_internal(was_marked); + mi_assert_internal(!was_marked || _mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); - return was_abandoned; + return was_marked; } // mark a specific segment as abandoned @@ -765,15 +769,17 @@ void _mi_arena_segment_mark_abandoned(mi_memid_t memid) mi_assert_internal(arena_idx < MI_MAX_ARENAS); mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); mi_assert_internal(arena != NULL); - const bool was_unset = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL); - MI_UNUSED_RELEASE(was_unset); - mi_assert_internal(was_unset); + const bool was_unmarked = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL); + if (was_unmarked) { mi_atomic_increment_relaxed(&abandoned_count); } + mi_assert_internal(was_unmarked); mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); } // reclaim abandoned segments mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_id_t* 
previous_id, size_t* previous_idx ) { + if (mi_atomic_load_relaxed(&abandoned_count) == 0) return false; + const int max_arena = (int)mi_atomic_load_relaxed(&mi_arena_count); int arena_idx = *previous_id; size_t field_idx = mi_bitmap_index_field(*previous_idx); @@ -794,6 +800,7 @@ mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_id_t* previous_id, mi_bitmap_index_t bitmap_idx = mi_bitmap_index_create(field_idx, bit_idx); // try to reclaim it atomically if (_mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx)) { + mi_atomic_decrement_relaxed(&abandoned_count); *previous_idx = bitmap_idx; *previous_id = arena_idx; mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); diff --git a/src/segment.c b/src/segment.c index 08909eb1..36bf2dfc 100644 --- a/src/segment.c +++ b/src/segment.c @@ -753,10 +753,6 @@ by scanning the arena memory (segments outside arena memoryare only reclaimed by a free). ----------------------------------------------------------- */ -// Maintain these for debug purposes -static mi_decl_cache_align _Atomic(size_t)abandoned_count; - - // legacy: Wait until there are no more pending reads on segments that used to be in the abandoned list void _mi_abandoned_await_readers(void) { // nothing needed @@ -782,7 +778,7 @@ static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) { mi_segments_track_size(-((long)segment->segment_size), tld); segment->thread_id = 0; segment->abandoned_visits = 0; - _mi_arena_segment_mark_abandoned(segment->memid); mi_atomic_increment_relaxed(&abandoned_count); + _mi_arena_segment_mark_abandoned(segment->memid); } void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld) { @@ -905,7 +901,6 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment) { if (mi_atomic_load_relaxed(&segment->thread_id) != 0) return false; // it is not abandoned if (_mi_arena_segment_clear_abandoned(segment->memid)) { // atomically unabandon - mi_atomic_decrement_relaxed(&abandoned_count); mi_segment_t* res = mi_segment_reclaim(segment, heap, 0, NULL, &heap->tld->segments); mi_assert_internal(res == segment); return (res != NULL); @@ -918,7 +913,6 @@ void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld) { mi_arena_id_t current_id = 0; size_t current_idx = 0; while ((segment = _mi_arena_segment_clear_abandoned_next(¤t_id, ¤t_idx)) != NULL) { - mi_atomic_decrement_relaxed(&abandoned_count); mi_segment_reclaim(segment, heap, 0, NULL, tld); } } @@ -926,15 +920,13 @@ void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld) { static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size, mi_page_kind_t page_kind, bool* reclaimed, mi_segments_tld_t* tld) { *reclaimed = false; - if (mi_atomic_load_relaxed(&abandoned_count) == 0) return NULL; - + mi_segment_t* segment; mi_arena_id_t current_id = 0; size_t current_idx = 0; long max_tries = mi_option_get_clamp(mi_option_max_segment_reclaim, 0, 1024); // limit the work to bound allocation times while ((max_tries-- > 0) && ((segment = _mi_arena_segment_clear_abandoned_next(¤t_id, ¤t_idx)) != NULL)) { - mi_atomic_decrement_relaxed(&abandoned_count); segment->abandoned_visits++; // todo: an arena exclusive heap will potentially visit many abandoned unsuitable segments // and push them into the visited list and use many tries. Perhaps we can skip non-suitable ones in a better way? 
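For illustration only (this sketch is not part of patch 012): the hunks above move the `abandoned_count` updates out of segment.c and into the arena mark/clear/next functions, so the counter changes exactly when the abandoned bit in the bitmap actually flips. A minimal standalone C model of that invariant is sketched below; the names `abandoned_count`, `mark_abandoned`, and `clear_abandoned` are simplified stand-ins, and a single bitmap word is used instead of mimalloc's multi-field atomic bitmap.

// Standalone sketch (not mimalloc code): tie a global count of abandoned
// segments to the atomic bit transition, so concurrent callers cannot
// double-count or double-claim.
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

static _Atomic(size_t) abandoned_count = 0;
static _Atomic(size_t) abandoned_bits  = 0;   // one bitmap field; mimalloc uses many

static void mark_abandoned(size_t bit_idx) {
  const size_t mask = (size_t)1 << bit_idx;
  const size_t prev = atomic_fetch_or(&abandoned_bits, mask);
  if ((prev & mask) == 0) {                   // count only an actual 0 -> 1 transition
    atomic_fetch_add(&abandoned_count, 1);
  }
}

static bool clear_abandoned(size_t bit_idx) {
  const size_t mask = (size_t)1 << bit_idx;
  const size_t prev = atomic_fetch_and(&abandoned_bits, ~mask);
  const bool was_marked = ((prev & mask) != 0);
  if (was_marked) {                           // only the thread that flipped 1 -> 0 decrements
    atomic_fetch_sub(&abandoned_count, 1);
  }
  return was_marked;                          // true: this caller now owns the segment
}

int main(void) {
  mark_abandoned(3);
  printf("count after mark: %zu\n", (size_t)atomic_load(&abandoned_count));  // 1
  printf("first clear:  %d\n", (int)clear_abandoned(3));                     // 1 (claimed)
  printf("second clear: %d\n", (int)clear_abandoned(3));                     // 0 (already claimed)
  printf("count at end: %zu\n", (size_t)atomic_load(&abandoned_count));      // 0
  return 0;
}

Because the counter is updated only on a successful bit transition, the separate increment/decrement calls in segment.c become redundant, which is why the hunks above and below remove them.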
@@ -962,7 +954,6 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size, else { // otherwise, mark it back as abandoned // todo: reset delayed pages in the segment? - mi_atomic_increment_relaxed(&abandoned_count); _mi_arena_segment_mark_abandoned(segment->memid); } } From cf8f73098e3e76968dd45dffd909f1f36e4c26a8 Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 1 Mar 2024 10:51:18 -0800 Subject: [PATCH 013/119] start abandoned search randomized --- include/mimalloc/internal.h | 13 ++++++++++--- src/arena.c | 34 ++++++++++++++++++++++------------ src/segment.c | 13 +++++-------- 3 files changed, 37 insertions(+), 23 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 1143a184..0976fc4e 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -124,9 +124,16 @@ bool _mi_arena_contains(const void* p); void _mi_arena_collect(bool force_purge, mi_stats_t* stats); void _mi_arena_unsafe_destroy_all(mi_stats_t* stats); -bool _mi_arena_segment_clear_abandoned(mi_memid_t memid); -void _mi_arena_segment_mark_abandoned(mi_memid_t memid); -mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_id_t* current_id, size_t* current_idx); +bool _mi_arena_segment_clear_abandoned(mi_memid_t memid); +void _mi_arena_segment_mark_abandoned(mi_memid_t memid); + +typedef struct mi_arena_field_cursor_s { // abstract + mi_arena_id_t start; + int count; + size_t bitmap_idx; +} mi_arena_field_cursor_t; +void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_arena_field_cursor_t* current); +mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous); // "segment-map.c" void _mi_segment_map_allocated_at(const mi_segment_t* segment); diff --git a/src/arena.c b/src/arena.c index 59db0b8e..a73c5f52 100644 --- a/src/arena.c +++ b/src/arena.c @@ -775,17 +775,27 @@ void _mi_arena_segment_mark_abandoned(mi_memid_t memid) mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); } -// reclaim abandoned segments -mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_id_t* previous_id, size_t* previous_idx ) -{ - if (mi_atomic_load_relaxed(&abandoned_count) == 0) return false; +// start a cursor at a randomized arena +void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_arena_field_cursor_t* current) { + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + current->start = (max_arena == 0 ? 
0 : (mi_arena_id_t)( _mi_heap_random_next(heap) % max_arena)); + current->count = 0; + current->bitmap_idx = 0; +} +// reclaim abandoned segments +mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous ) +{ const int max_arena = (int)mi_atomic_load_relaxed(&mi_arena_count); - int arena_idx = *previous_id; - size_t field_idx = mi_bitmap_index_field(*previous_idx); - size_t bit_idx = mi_bitmap_index_bit_in_field(*previous_idx) + 1; + if (max_arena <= 0 || mi_atomic_load_relaxed(&abandoned_count) == 0) return NULL; + + int count = previous->count; + size_t field_idx = mi_bitmap_index_field(previous->bitmap_idx); + size_t bit_idx = mi_bitmap_index_bit_in_field(previous->bitmap_idx) + 1; // visit arena's (from previous) - for( ; arena_idx < max_arena; arena_idx++, field_idx = 0, bit_idx = 0) { + for (; count < max_arena; count++, field_idx = 0, bit_idx = 0) { + mi_arena_id_t arena_idx = previous->start + count; + if (arena_idx >= max_arena) { arena_idx = arena_idx % max_arena; } // wrap around mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); if (arena != NULL) { // visit the abandoned fields (starting at previous_idx) @@ -801,8 +811,8 @@ mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_id_t* previous_id, // try to reclaim it atomically if (_mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx)) { mi_atomic_decrement_relaxed(&abandoned_count); - *previous_idx = bitmap_idx; - *previous_id = arena_idx; + previous->bitmap_idx = bitmap_idx; + previous->count = count; mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); return (mi_segment_t*)mi_arena_block_start(arena, bitmap_idx); @@ -814,8 +824,8 @@ mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_id_t* previous_id, } } // no more found - *previous_idx = 0; - *previous_id = 0; + previous->bitmap_idx = 0; + previous->count = 0; return NULL; } diff --git a/src/segment.c b/src/segment.c index 36bf2dfc..f7a43abf 100644 --- a/src/segment.c +++ b/src/segment.c @@ -910,22 +910,19 @@ bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment) { void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld) { mi_segment_t* segment; - mi_arena_id_t current_id = 0; - size_t current_idx = 0; - while ((segment = _mi_arena_segment_clear_abandoned_next(¤t_id, ¤t_idx)) != NULL) { + mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap, ¤t); + while ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL) { mi_segment_reclaim(segment, heap, 0, NULL, tld); } } static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size, mi_page_kind_t page_kind, bool* reclaimed, mi_segments_tld_t* tld) { - *reclaimed = false; - + *reclaimed = false; mi_segment_t* segment; - mi_arena_id_t current_id = 0; - size_t current_idx = 0; + mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap,¤t); long max_tries = mi_option_get_clamp(mi_option_max_segment_reclaim, 0, 1024); // limit the work to bound allocation times - while ((max_tries-- > 0) && ((segment = _mi_arena_segment_clear_abandoned_next(¤t_id, ¤t_idx)) != NULL)) + while ((max_tries-- > 0) && ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL)) { segment->abandoned_visits++; // todo: an arena exclusive heap will potentially visit many abandoned 
unsuitable segments From c4f1f2e079945fabd0cc9e56ad7aa81b6aed5bba Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 1 Mar 2024 14:57:46 -0800 Subject: [PATCH 014/119] make reclaim tries a percentage of the abandoned count --- include/mimalloc/internal.h | 1 + src/arena.c | 10 +++++++--- src/options.c | 2 +- src/segment.c | 8 +++++++- 4 files changed, 16 insertions(+), 5 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 0976fc4e..1168d944 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -126,6 +126,7 @@ void _mi_arena_unsafe_destroy_all(mi_stats_t* stats); bool _mi_arena_segment_clear_abandoned(mi_memid_t memid); void _mi_arena_segment_mark_abandoned(mi_memid_t memid); +size_t _mi_arena_segment_abandoned_count(void); typedef struct mi_arena_field_cursor_s { // abstract mi_arena_id_t start; diff --git a/src/arena.c b/src/arena.c index a73c5f52..af68c3bd 100644 --- a/src/arena.c +++ b/src/arena.c @@ -733,14 +733,18 @@ bool _mi_arena_contains(const void* p) { This is used to atomically abandon/reclaim segments (and crosses the arena API but it is convenient to have here). Abandoned segments still have live blocks; they get reclaimed - when a thread frees in it, or when a thread needs a fresh + when a thread frees a block in it, or when a thread needs a fresh segment; these threads scan the abandoned segments through the arena bitmaps. ----------------------------------------------------------- */ -// Maintain these for debug purposes +// Maintain a count of all abandoned segments static mi_decl_cache_align _Atomic(size_t)abandoned_count; +size_t _mi_arena_segment_abandoned_count(void) { + return mi_atomic_load_relaxed(&abandoned_count); +} + // reclaim a specific abandoned segment; `true` on success. bool _mi_arena_segment_clear_abandoned(mi_memid_t memid ) { @@ -885,7 +889,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int // consequetive bitmaps arena->blocks_dirty = &arena->blocks_inuse[fields]; // just after inuse bitmap arena->blocks_abandoned = &arena->blocks_inuse[2 * fields]; // just after dirty bitmap - arena->blocks_committed = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[3*fields]); // just after abandonde bitmap + arena->blocks_committed = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[3*fields]); // just after abandoned bitmap arena->blocks_purge = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[4*fields]); // just after committed bitmap // initialize committed bitmap? if (arena->blocks_committed != NULL && arena->memid.initially_committed) { diff --git a/src/options.c b/src/options.c index 61ed5be7..f8e928d0 100644 --- a/src/options.c +++ b/src/options.c @@ -81,7 +81,7 @@ static mi_option_desc_t options[_mi_option_last] = { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose { 16, UNINIT, MI_OPTION(max_errors) }, // maximum errors that are output { 16, UNINIT, MI_OPTION(max_warnings) }, // maximum warnings that are output - { 16, UNINIT, MI_OPTION(max_segment_reclaim)}, // max. number of segment reclaims from the abandoned segments per try. + { 10, UNINIT, MI_OPTION(max_segment_reclaim)}, // max. percentage of the abandoned segments per try. { 0, UNINIT, MI_OPTION(destroy_on_exit)}, // release all OS memory on process exit; careful with dangling pointer or after-exit frees! 
#if (MI_INTPTR_SIZE>4) { 1024L * 1024L, UNINIT, MI_OPTION(arena_reserve) }, // reserve memory N KiB at a time diff --git a/src/segment.c b/src/segment.c index f7a43abf..cc5c15fa 100644 --- a/src/segment.c +++ b/src/segment.c @@ -921,7 +921,13 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size, *reclaimed = false; mi_segment_t* segment; mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap,¤t); - long max_tries = mi_option_get_clamp(mi_option_max_segment_reclaim, 0, 1024); // limit the work to bound allocation times + + // limit the tries to 10% (default) of the abandoned segments with at least 8 tries, and at most 1024. + const size_t perc = (size_t)mi_option_get_clamp(mi_option_max_segment_reclaim, 0, 100); + if (perc <= 0) return NULL; + const size_t abandoned_count = _mi_arena_segment_abandoned_count(); + const size_t relative_count = (abandoned_count > 10000 ? (abandoned_count / 100) * perc : (abandoned_count * perc) / 100); // avoid overflow + long max_tries = (long)(relative_count < 8 ? 8 : (relative_count > 1024 ? 1024 : relative_count)); while ((max_tries-- > 0) && ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL)) { segment->abandoned_visits++; From f5f61a65f544fc24cd87080d264647714b6c30da Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 1 Mar 2024 15:14:39 -0800 Subject: [PATCH 015/119] bump version to 1.8.4 --- cmake/mimalloc-config-version.cmake | 2 +- include/mimalloc.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/mimalloc-config-version.cmake b/cmake/mimalloc-config-version.cmake index 923c0e14..e9b7d113 100644 --- a/cmake/mimalloc-config-version.cmake +++ b/cmake/mimalloc-config-version.cmake @@ -1,6 +1,6 @@ set(mi_version_major 1) set(mi_version_minor 8) -set(mi_version_patch 2) +set(mi_version_patch 4) set(mi_version ${mi_version_major}.${mi_version_minor}) set(PACKAGE_VERSION ${mi_version}) diff --git a/include/mimalloc.h b/include/mimalloc.h index 7b1b7d5a..205d09bf 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -8,7 +8,7 @@ terms of the MIT license. 
A copy of the license can be found in the file #ifndef MIMALLOC_H #define MIMALLOC_H -#define MI_MALLOC_VERSION 182 // major + 2 digits minor +#define MI_MALLOC_VERSION 184 // major + 2 digits minor // ------------------------------------------------------ // Compiler specific attributes From 16c0948ee568c11eaa0c14909e236e40548f16c9 Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 1 Mar 2024 16:24:28 -0800 Subject: [PATCH 016/119] improve display of arenas and contained blocks --- include/mimalloc.h | 2 ++ src/arena.c | 45 ++++++++++++++++++++++++++++++++++----------- test/test-stress.c | 3 ++- 3 files changed, 38 insertions(+), 12 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index 205d09bf..e6693899 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -274,6 +274,8 @@ mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size mi_decl_export int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept; mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept; +mi_decl_export void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept; + // Experimental: heaps associated with specific memory arena's typedef int mi_arena_id_t; mi_decl_export void* mi_arena_area(mi_arena_id_t arena_id, size_t* size); diff --git a/src/arena.c b/src/arena.c index af68c3bd..9f3e38d2 100644 --- a/src/arena.c +++ b/src/arena.c @@ -949,32 +949,55 @@ int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noe Debugging ----------------------------------------------------------- */ -static size_t mi_debug_show_bitmap(const char* prefix, mi_bitmap_field_t* fields, size_t field_count ) { +static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t block_count, mi_bitmap_field_t* fields, size_t field_count ) { + _mi_verbose_message("%s%s:\n", prefix, header); + size_t bcount = 0; size_t inuse_count = 0; for (size_t i = 0; i < field_count; i++) { char buf[MI_BITMAP_FIELD_BITS + 1]; uintptr_t field = mi_atomic_load_relaxed(&fields[i]); - for (size_t bit = 0; bit < MI_BITMAP_FIELD_BITS; bit++) { - bool inuse = ((((uintptr_t)1 << bit) & field) != 0); - if (inuse) inuse_count++; - buf[MI_BITMAP_FIELD_BITS - 1 - bit] = (inuse ? 'x' : '.'); + for (size_t bit = 0; bit < MI_BITMAP_FIELD_BITS; bit++, bcount++) { + if (bcount < block_count) { + bool inuse = ((((uintptr_t)1 << bit) & field) != 0); + if (inuse) inuse_count++; + buf[bit] = (inuse ? 
'x' : '.'); + } + else { + buf[bit] = ' '; + } } buf[MI_BITMAP_FIELD_BITS] = 0; - _mi_verbose_message("%s%s\n", prefix, buf); + _mi_verbose_message("%s %s\n", prefix, buf); } + _mi_verbose_message("%s total ('x'): %zu\n", prefix, inuse_count); return inuse_count; } -void mi_debug_show_arenas(void) mi_attr_noexcept { +void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept { size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count); + size_t inuse_total = 0; + size_t abandoned_total = 0; + size_t purge_total = 0; for (size_t i = 0; i < max_arenas; i++) { mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); if (arena == NULL) break; - size_t inuse_count = 0; - _mi_verbose_message("arena %zu: %zu blocks with %zu fields\n", i, arena->block_count, arena->field_count); - inuse_count += mi_debug_show_bitmap(" ", arena->blocks_inuse, arena->field_count); - _mi_verbose_message(" blocks in use ('x'): %zu\n", inuse_count); + _mi_verbose_message("arena %zu: %zu blocks of size %zuMiB (in %zu fields) %s\n", i, arena->block_count, MI_ARENA_BLOCK_SIZE / MI_MiB, arena->field_count, (arena->memid.is_pinned ? ", pinned" : "")); + if (show_inuse) { + inuse_total += mi_debug_show_bitmap(" ", "inuse blocks", arena->block_count, arena->blocks_inuse, arena->field_count); + } + if (arena->blocks_committed != NULL) { + mi_debug_show_bitmap(" ", "committed blocks", arena->block_count, arena->blocks_committed, arena->field_count); + } + if (show_abandoned) { + abandoned_total += mi_debug_show_bitmap(" ", "abandoned blocks", arena->block_count, arena->blocks_abandoned, arena->field_count); + } + if (show_purge && arena->blocks_purge != NULL) { + purge_total += mi_debug_show_bitmap(" ", "purgeable blocks", arena->block_count, arena->blocks_purge, arena->field_count); + } } + if (show_inuse) _mi_verbose_message("total inuse blocks : %zu\n", inuse_total); + if (show_abandoned) _mi_verbose_message("total abandoned blocks: %zu\n", abandoned_total); + if (show_purge) _mi_verbose_message("total purgeable blocks: %zu\n", purge_total); } diff --git a/test/test-stress.c b/test/test-stress.c index 8c81c009..7e6e9645 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -267,7 +267,8 @@ int main(int argc, char** argv) { #ifndef USE_STD_MALLOC #ifndef NDEBUG - mi_collect(true); + // mi_collect(true); + mi_debug_show_arenas(true,true,true); #endif mi_stats_print(NULL); #endif From 500d2ad1fde98e1ffe53997790afd6e58cae69a3 Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 1 Mar 2024 18:47:59 -0800 Subject: [PATCH 017/119] ensure non-arena segments are force purged on abandonment; make non-arena reclaim atomic --- include/mimalloc/internal.h | 4 ++-- src/arena.c | 40 +++++++++++++++++++++++++++++-------- src/segment.c | 34 ++++++++++++++++++------------- 3 files changed, 54 insertions(+), 24 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 1168d944..ae19cfb3 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -124,8 +124,8 @@ bool _mi_arena_contains(const void* p); void _mi_arena_collect(bool force_purge, mi_stats_t* stats); void _mi_arena_unsafe_destroy_all(mi_stats_t* stats); -bool _mi_arena_segment_clear_abandoned(mi_memid_t memid); -void _mi_arena_segment_mark_abandoned(mi_memid_t memid); +bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment); +void _mi_arena_segment_mark_abandoned(mi_segment_t* segment); size_t _mi_arena_segment_abandoned_count(void); typedef struct 
mi_arena_field_cursor_s { // abstract diff --git a/src/arena.c b/src/arena.c index 9f3e38d2..0875e49e 100644 --- a/src/arena.c +++ b/src/arena.c @@ -746,17 +746,32 @@ size_t _mi_arena_segment_abandoned_count(void) { } // reclaim a specific abandoned segment; `true` on success. -bool _mi_arena_segment_clear_abandoned(mi_memid_t memid ) +bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment ) { - if (memid.memkind != MI_MEM_ARENA) return true; // not in an arena, consider it un-abandoned + if (segment->memid.memkind != MI_MEM_ARENA) { + // not in an arena, consider it un-abandoned now. + // but we need to still claim it atomically -- we use the thread_id for that. + if (mi_atomic_cas_strong_acq_rel(&segment->thread_id, 0, _mi_thread_id())) { + mi_atomic_decrement_relaxed(&abandoned_count); + return true; + } + else { + return false; + } + } + // arena segment: use the blocks_abandoned bitmap. size_t arena_idx; size_t bitmap_idx; - mi_arena_memid_indices(memid, &arena_idx, &bitmap_idx); + mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx); mi_assert_internal(arena_idx < MI_MAX_ARENAS); mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); mi_assert_internal(arena != NULL); bool was_marked = _mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx); - if (was_marked) { mi_atomic_decrement_relaxed(&abandoned_count); } + if (was_marked) { + mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0); + mi_atomic_decrement_relaxed(&abandoned_count); + mi_atomic_store_release(&segment->thread_id, _mi_thread_id()); + } // mi_assert_internal(was_marked); mi_assert_internal(!was_marked || _mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); @@ -764,12 +779,18 @@ bool _mi_arena_segment_clear_abandoned(mi_memid_t memid ) } // mark a specific segment as abandoned -void _mi_arena_segment_mark_abandoned(mi_memid_t memid) +void _mi_arena_segment_mark_abandoned(mi_segment_t* segment) { - if (memid.memkind != MI_MEM_ARENA) return; // not in an arena + mi_atomic_store_release(&segment->thread_id, 0); + mi_assert_internal(segment->used == segment->abandoned); + if (segment->memid.memkind != MI_MEM_ARENA) { + // not in an arena; count it as abandoned and return + mi_atomic_increment_relaxed(&abandoned_count); + return; + } size_t arena_idx; size_t bitmap_idx; - mi_arena_memid_indices(memid, &arena_idx, &bitmap_idx); + mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx); mi_assert_internal(arena_idx < MI_MAX_ARENAS); mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); mi_assert_internal(arena != NULL); @@ -818,8 +839,11 @@ mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* pr previous->bitmap_idx = bitmap_idx; previous->count = count; mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); + mi_segment_t* segment = (mi_segment_t*)mi_arena_block_start(arena, bitmap_idx); + mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0); + mi_atomic_store_release(&segment->thread_id, _mi_thread_id()); //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); - return (mi_segment_t*)mi_arena_block_start(arena, bitmap_idx); + return segment; } } } diff --git a/src/segment.c 
b/src/segment.c index cc5c15fa..87cbfbb5 100644 --- a/src/segment.c +++ b/src/segment.c @@ -767,18 +767,21 @@ static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) { mi_assert_internal(segment->used > 0); mi_assert_expensive(mi_segment_is_valid(segment, tld)); - // remove the segment from the free page queue if needed + // Potentially force purge. Only abandoned segments in arena memory can be + // reclaimed without a free so if a segment is not from an arena we force purge here to be conservative. mi_pages_try_purge(tld); - mi_segment_remove_all_purges(segment, mi_option_is_enabled(mi_option_abandoned_page_purge), tld); + const bool force_purge = (segment->memid.memkind != MI_MEM_ARENA) || mi_option_is_enabled(mi_option_abandoned_page_purge); + mi_segment_remove_all_purges(segment, force_purge, tld); + + // remove the segment from the free page queue if needed mi_segment_remove_from_free_queue(segment, tld); mi_assert_internal(segment->next == NULL && segment->prev == NULL); // all pages in the segment are abandoned; add it to the abandoned list _mi_stat_increase(&tld->stats->segments_abandoned, 1); mi_segments_track_size(-((long)segment->segment_size), tld); - segment->thread_id = 0; segment->abandoned_visits = 0; - _mi_arena_segment_mark_abandoned(segment->memid); + _mi_arena_segment_mark_abandoned(segment); } void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld) { @@ -842,8 +845,7 @@ static bool mi_segment_check_free(mi_segment_t* segment, size_t block_size, bool // set `right_page_reclaimed` to `true` if it reclaimed a page of the right `block_size` that was not full. static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, size_t requested_block_size, bool* right_page_reclaimed, mi_segments_tld_t* tld) { if (right_page_reclaimed != NULL) { *right_page_reclaimed = false; } - - segment->thread_id = _mi_thread_id(); + mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == _mi_thread_id()); segment->abandoned_visits = 0; mi_segments_track_size((long)segment->segment_size, tld); mi_assert_internal(segment->next == NULL && segment->prev == NULL); @@ -900,7 +902,7 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, // attempt to reclaim a particular segment (called from multi threaded free `alloc.c:mi_free_block_mt`) bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment) { if (mi_atomic_load_relaxed(&segment->thread_id) != 0) return false; // it is not abandoned - if (_mi_arena_segment_clear_abandoned(segment->memid)) { // atomically unabandon + if (_mi_arena_segment_clear_abandoned(segment)) { // atomically unabandon mi_segment_t* res = mi_segment_reclaim(segment, heap, 0, NULL, &heap->tld->segments); mi_assert_internal(res == segment); return (res != NULL); @@ -916,18 +918,22 @@ void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld) { } } -static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size, mi_page_kind_t page_kind, bool* reclaimed, mi_segments_tld_t* tld) -{ - *reclaimed = false; - mi_segment_t* segment; - mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap,¤t); - +static long mi_segment_get_reclaim_tries(void) { // limit the tries to 10% (default) of the abandoned segments with at least 8 tries, and at most 1024. 
const size_t perc = (size_t)mi_option_get_clamp(mi_option_max_segment_reclaim, 0, 100); if (perc <= 0) return NULL; const size_t abandoned_count = _mi_arena_segment_abandoned_count(); const size_t relative_count = (abandoned_count > 10000 ? (abandoned_count / 100) * perc : (abandoned_count * perc) / 100); // avoid overflow long max_tries = (long)(relative_count < 8 ? 8 : (relative_count > 1024 ? 1024 : relative_count)); + return max_tries; +} + +static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size, mi_page_kind_t page_kind, bool* reclaimed, mi_segments_tld_t* tld) +{ + *reclaimed = false; + mi_segment_t* segment; + mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap,¤t); + long max_tries = mi_segment_get_reclaim_tries(); while ((max_tries-- > 0) && ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL)) { segment->abandoned_visits++; @@ -957,7 +963,7 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size, else { // otherwise, mark it back as abandoned // todo: reset delayed pages in the segment? - _mi_arena_segment_mark_abandoned(segment->memid); + _mi_arena_segment_mark_abandoned(segment); } } return NULL; From 5ce1a9bfef782d3df453d8125b71b05499081950 Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 1 Mar 2024 18:58:11 -0800 Subject: [PATCH 018/119] fix cas call --- src/arena.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/arena.c b/src/arena.c index 0875e49e..00132bf8 100644 --- a/src/arena.c +++ b/src/arena.c @@ -751,7 +751,8 @@ bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment ) if (segment->memid.memkind != MI_MEM_ARENA) { // not in an arena, consider it un-abandoned now. // but we need to still claim it atomically -- we use the thread_id for that. - if (mi_atomic_cas_strong_acq_rel(&segment->thread_id, 0, _mi_thread_id())) { + size_t expected = 0; + if (mi_atomic_cas_strong_acq_rel(&segment->thread_id, &expected, _mi_thread_id())) { mi_atomic_decrement_relaxed(&abandoned_count); return true; } From 3090f23c258a80daa92c5c98d2efff4dd95f9c52 Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 1 Mar 2024 18:59:13 -0800 Subject: [PATCH 019/119] fix return value of get_reclaim_tries --- src/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/segment.c b/src/segment.c index 87cbfbb5..3db4e813 100644 --- a/src/segment.c +++ b/src/segment.c @@ -921,7 +921,7 @@ void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld) { static long mi_segment_get_reclaim_tries(void) { // limit the tries to 10% (default) of the abandoned segments with at least 8 tries, and at most 1024. const size_t perc = (size_t)mi_option_get_clamp(mi_option_max_segment_reclaim, 0, 100); - if (perc <= 0) return NULL; + if (perc <= 0) return 0; const size_t abandoned_count = _mi_arena_segment_abandoned_count(); const size_t relative_count = (abandoned_count > 10000 ? (abandoned_count / 100) * perc : (abandoned_count * perc) / 100); // avoid overflow long max_tries = (long)(relative_count < 8 ? 8 : (relative_count > 1024 ? 
1024 : relative_count)); From 8045d5517943b3b3652f2e6a250220135c74a275 Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 1 Mar 2024 19:04:34 -0800 Subject: [PATCH 020/119] fix type warning on clang --- src/arena.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/arena.c b/src/arena.c index 00132bf8..e08ea22a 100644 --- a/src/arena.c +++ b/src/arena.c @@ -826,12 +826,12 @@ mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* pr if (arena != NULL) { // visit the abandoned fields (starting at previous_idx) for ( ; field_idx < arena->field_count; field_idx++, bit_idx = 0) { - mi_bitmap_field_t field = mi_atomic_load_relaxed(&arena->blocks_abandoned[field_idx]); + size_t field = mi_atomic_load_relaxed(&arena->blocks_abandoned[field_idx]); if mi_unlikely(field != 0) { // skip zero fields quickly // visit each set bit in the field (todo: maybe use `ctz` here?) for ( ; bit_idx < MI_BITMAP_FIELD_BITS; bit_idx++) { // pre-check if the bit is set - mi_bitmap_field_t mask = ((mi_bitmap_field_t)1 << bit_idx); + size_t mask = ((size_t)1 << bit_idx); if mi_unlikely((field & mask) == mask) { mi_bitmap_index_t bitmap_idx = mi_bitmap_index_create(field_idx, bit_idx); // try to reclaim it atomically From 3966953b7f0f11d2ec33097c5da4356d5b7db7e8 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sat, 2 Mar 2024 11:50:57 -0800 Subject: [PATCH 021/119] prefer using __builtin_thread_pointer over assembly primitives. Fixes #851 and #852 as well. --- CMakeLists.txt | 4 +-- include/mimalloc/prim.h | 71 ++++++++++++++++++++++++++--------------- 2 files changed, 47 insertions(+), 28 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1387e0db..bd98019b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -128,7 +128,7 @@ endif() if(MI_SECURE) message(STATUS "Set full secure build (MI_SECURE=ON)") - list(APPEND mi_defines MI_SECURE=4) + list(APPEND mi_defines MI_SECURE=4) endif() if(MI_TRACK_VALGRIND) @@ -468,7 +468,7 @@ if (MI_BUILD_OBJECT) set(mimalloc-obj-static "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/mimalloc-obj.dir/src/static.c${CMAKE_C_OUTPUT_EXTENSION}") set(mimalloc-obj-out "${CMAKE_CURRENT_BINARY_DIR}/${mi_basename}${CMAKE_C_OUTPUT_EXTENSION}") add_custom_command(OUTPUT ${mimalloc-obj-out} DEPENDS mimalloc-obj COMMAND "${CMAKE_COMMAND}" -E copy "${mimalloc-obj-static}" "${mimalloc-obj-out}") - add_custom_target(mimalloc-obj-target ALL DEPENDS ${mimalloc-obj-out}) + add_custom_target(mimalloc-obj-target ALL DEPENDS ${mimalloc-obj-out}) endif() # the following seems to lead to cmake warnings/errors on some systems, disable for now :-( diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h index 9e560696..c3844d8b 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -35,10 +35,10 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ); // Free OS memory int _mi_prim_free(void* addr, size_t size ); - + // Allocate OS memory. Return NULL on error. // The `try_alignment` is just a hint and the returned pointer does not have to be aligned. -// If `commit` is false, the virtual memory range only needs to be reserved (with no access) +// If `commit` is false, the virtual memory range only needs to be reserved (with no access) // which will later be committed explicitly using `_mi_prim_commit`. 
// `is_zero` is set to true if the memory was zero initialized (as on most OS's) // pre: !commit => !allow_large @@ -82,11 +82,11 @@ mi_msecs_t _mi_prim_clock_now(void); typedef struct mi_process_info_s { mi_msecs_t elapsed; mi_msecs_t utime; - mi_msecs_t stime; - size_t current_rss; - size_t peak_rss; + mi_msecs_t stime; + size_t current_rss; + size_t peak_rss; size_t current_commit; - size_t peak_commit; + size_t peak_commit; size_t page_faults; } mi_process_info_t; @@ -117,7 +117,7 @@ void _mi_prim_thread_associate_default_heap(mi_heap_t* heap); //------------------------------------------------------------------- // Thread id: `_mi_prim_thread_id()` -// +// // Getting the thread id should be performant as it is called in the // fast path of `_mi_free` and we specialize for various platforms as // inlined definitions. Regular code should call `init.c:_mi_thread_id()`. @@ -125,26 +125,14 @@ void _mi_prim_thread_associate_default_heap(mi_heap_t* heap); // for each thread (unequal to zero). //------------------------------------------------------------------- -// defined in `init.c`; do not use these directly -extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from -extern bool _mi_process_is_initialized; // has mi_process_init been called? - -static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept; - -#if defined(_WIN32) - -#define WIN32_LEAN_AND_MEAN -#include -static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { - // Windows: works on Intel and ARM in both 32- and 64-bit - return (uintptr_t)NtCurrentTeb(); -} - -// We use assembly for a fast thread id on the main platforms. The TLS layout depends on -// both the OS and libc implementation so we use specific tests for each main platform. +// On some libc + platform combinations we can directly access a thread-local storage (TLS) slot. +// The TLS layout depends on both the OS and libc implementation so we use specific tests for each main platform. // If you test on another platform and it works please send a PR :-) // see also https://akkadia.org/drepper/tls.pdf for more info on the TLS register. -#elif defined(__GNUC__) && ( \ +// +// Note: on most platforms this is not actually used anymore as we prefer `__builtin_thread_pointer()` nowadays. +// However, we do still use it with older clang compilers and Apple OS (as we use TLS slot for the default heap there). +#if defined(__GNUC__) && ( \ (defined(__GLIBC__) && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \ || (defined(__APPLE__) && (defined(__x86_64__) || defined(__aarch64__))) \ || (defined(__BIONIC__) && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \ @@ -152,6 +140,8 @@ static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { || (defined(__OpenBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \ ) +#define MI_HAS_TLS_SLOT + static inline void* mi_prim_tls_slot(size_t slot) mi_attr_noexcept { void* res; const size_t ofs = (slot*sizeof(void*)); @@ -205,6 +195,33 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce #endif } +#endif + +// defined in `init.c`; do not use these directly +extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from +extern bool _mi_process_is_initialized; // has mi_process_init been called? 
+ +static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept; + +#if defined(_WIN32) + +#define WIN32_LEAN_AND_MEAN +#include +static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { + // Windows: works on Intel and ARM in both 32- and 64-bit + return (uintptr_t)NtCurrentTeb(); +} + +#elif defined(__has_builtin) && __has_builtin(__builtin_thread_pointer) && \ + (!defined(__clang_major__) || __clang_major__ >= 14) // older clang versions emit bad code; fall back to using the TLS slot () + +static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { + // Works on most Unix based platforms + return (uintptr_t)__builtin_thread_pointer(); +} + +#elif defined(MI_HAS_TLS_SLOT) + static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { #if defined(__BIONIC__) // issue #384, #495: on the Bionic libc (Android), slot 1 is the thread id @@ -251,7 +268,6 @@ static inline mi_heap_t* mi_prim_get_default_heap(void); #if defined(MI_MALLOC_OVERRIDE) #if defined(__APPLE__) // macOS #define MI_TLS_SLOT 89 // seems unused? - // #define MI_TLS_RECURSE_GUARD 1 // other possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89) // see #elif defined(__OpenBSD__) @@ -269,6 +285,9 @@ static inline mi_heap_t* mi_prim_get_default_heap(void); #if defined(MI_TLS_SLOT) +# if !defined(MI_HAS_TLS_SLOT) +# error "trying to use a TLS slot for the default heap, but the mi_prim_tls_slot primitives are not defined +# endif static inline mi_heap_t* mi_prim_get_default_heap(void) { mi_heap_t* heap = (mi_heap_t*)mi_prim_tls_slot(MI_TLS_SLOT); From dfb5cadf33437aba9372e0d580022101deaee1d6 Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 2 Mar 2024 14:06:34 -0800 Subject: [PATCH 022/119] don't use the new __builtin_thread_pointer on macOS --- include/mimalloc/prim.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h index c3844d8b..830b36c4 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -213,11 +213,12 @@ static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { } #elif defined(__has_builtin) && __has_builtin(__builtin_thread_pointer) && \ + (!defined(__APPLE__)) && /* on apple (M1) the wrong register is read (tpidr_el0 instead of tpidrro_el0) so fall back to TLS slot assembly ()*/ \ (!defined(__clang_major__) || __clang_major__ >= 14) // older clang versions emit bad code; fall back to using the TLS slot () static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { // Works on most Unix based platforms - return (uintptr_t)__builtin_thread_pointer(); + return (uintptr_t)__builtin_thread_pointer(); } #elif defined(MI_HAS_TLS_SLOT) From 1f2d799ed0684e94489802502a709822581a537d Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 2 Mar 2024 14:14:59 -0800 Subject: [PATCH 023/119] possible fix for #855 --- src/segment-map.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/segment-map.c b/src/segment-map.c index 4c2104bd..a306ec67 100644 --- a/src/segment-map.c +++ b/src/segment-map.c @@ -29,6 +29,7 @@ terms of the MIT license. A copy of the license can be found in the file static _Atomic(uintptr_t) mi_segment_map[MI_SEGMENT_MAP_WSIZE + 1]; // 2KiB per TB with 64MiB segments static size_t mi_segment_map_index_of(const mi_segment_t* segment, size_t* bitidx) { + // note: segment can be invalid or NULL. 
mi_assert_internal(_mi_ptr_segment(segment + 1) == segment); // is it aligned on MI_SEGMENT_SIZE? if ((uintptr_t)segment >= MI_MAX_ADDRESS) { *bitidx = 0; @@ -70,8 +71,7 @@ void _mi_segment_map_freed_at(const mi_segment_t* segment) { // Determine the segment belonging to a pointer or NULL if it is not in a valid segment. static mi_segment_t* _mi_segment_of(const void* p) { if (p == NULL) return NULL; - mi_segment_t* segment = _mi_ptr_segment(p); - mi_assert_internal(segment != NULL); + mi_segment_t* segment = _mi_ptr_segment(p); // segment can be NULL size_t bitidx; size_t index = mi_segment_map_index_of(segment, &bitidx); // fast path: for any pointer to valid small/medium/large object or first MI_SEGMENT_SIZE in huge From 89afa14045b9bceac4e93cb54ea02799ebc57f45 Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 2 Mar 2024 14:25:16 -0800 Subject: [PATCH 024/119] fix build on illumos; by @dancrossnyc, issue #841 --- src/prim/unix/prim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c index 54bf57b2..d99f6097 100644 --- a/src/prim/unix/prim.c +++ b/src/prim/unix/prim.c @@ -310,7 +310,7 @@ static void* unix_mmap(void* addr, size_t size, size_t try_alignment, int protec #elif defined(__sun) if (allow_large && _mi_os_use_large_page(size, try_alignment)) { struct memcntl_mha cmd = {0}; - cmd.mha_pagesize = large_os_page_size; + cmd.mha_pagesize = _mi_os_large_page_size(); cmd.mha_cmd = MHA_MAPSIZE_VA; if (memcntl((caddr_t)p, size, MC_HAT_ADVISE, (caddr_t)&cmd, 0, 0) == 0) { *is_large = true; From bccf10e1643bb25fce0d83f72e531752a07585e8 Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 2 Mar 2024 14:49:37 -0800 Subject: [PATCH 025/119] allow random fallback on older macOS versions, issue #829 --- src/prim/unix/prim.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c index d99f6097..91fa6508 100644 --- a/src/prim/unix/prim.c +++ b/src/prim/unix/prim.c @@ -37,6 +37,7 @@ terms of the MIT license. A copy of the license can be found in the file #include #endif #elif defined(__APPLE__) + #include #include #if !TARGET_IOS_IPHONE && !TARGET_IOS_SIMULATOR #include @@ -55,12 +56,14 @@ terms of the MIT license. A copy of the license can be found in the file #include #endif + //------------------------------------------------------------------------------------ // Use syscalls for some primitives to allow for libraries that override open/read/close etc. 
// and do allocation themselves; using syscalls prevents recursion when mimalloc is // still initializing (issue #713) //------------------------------------------------------------------------------------ + #if defined(MI_HAS_SYSCALL_H) && defined(SYS_open) && defined(SYS_close) && defined(SYS_read) && defined(SYS_access) static int mi_prim_open(const char* fpath, int open_flags) { @@ -76,7 +79,9 @@ static int mi_prim_access(const char *fpath, int mode) { return syscall(SYS_access,fpath,mode); } -#elif !defined(__APPLE__) // avoid unused warnings +#elif (!defined(__APPLE__) || MAC_OS_X_VERSION_MIN_REQUIRED < 1070) // avoid unused warnings on macOS + +#include static int mi_prim_open(const char* fpath, int open_flags) { return open(fpath,open_flags); @@ -731,28 +736,20 @@ bool _mi_prim_getenv(const char* name, char* result, size_t result_size) { // Random //---------------------------------------------------------------- -#if defined(__APPLE__) - -#include -#if defined(MAC_OS_X_VERSION_10_10) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_10 +#if defined(MAC_OS_X_VERSION_10_15) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_15 && MAC_OS_X_VERSION_MIN_REQUIRED >= 1070 #include #include -#endif + bool _mi_prim_random_buf(void* buf, size_t buf_len) { - #if defined(MAC_OS_X_VERSION_10_15) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_15 - // We prefere CCRandomGenerateBytes as it returns an error code while arc4random_buf - // may fail silently on macOS. See PR #390, and - return (CCRandomGenerateBytes(buf, buf_len) == kCCSuccess); - #else - // fall back on older macOS - arc4random_buf(buf, buf_len); - return true; - #endif + // We prefere CCRandomGenerateBytes as it returns an error code while arc4random_buf + // may fail silently on macOS. See PR #390, and + return (CCRandomGenerateBytes(buf, buf_len) == kCCSuccess); } #elif defined(__ANDROID__) || defined(__DragonFly__) || \ defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ - defined(__sun) + defined(__sun) || \ + (defined(MAC_OS_X_VERSION_10_10) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_10 && MAC_OS_X_VERSION_MIN_REQUIRED >= 1070) #include bool _mi_prim_random_buf(void* buf, size_t buf_len) { @@ -760,7 +757,7 @@ bool _mi_prim_random_buf(void* buf, size_t buf_len) { return true; } -#elif defined(__linux__) || defined(__HAIKU__) +#elif defined(__APPLE__) || defined(__linux__) || defined(__HAIKU__) // for old apple versions < 1070 (issue #829) #include #include From d21f60f71281fdc0dcabb77be01d4d897eb09ca0 Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 2 Mar 2024 15:00:31 -0800 Subject: [PATCH 026/119] add emscripten WASM support; this PR #822 written by Alon Zakai @kripken --- src/prim/emscripten/prim.c | 251 +++++++++++++++++++++++++++++++++++++ src/prim/prim.c | 3 + 2 files changed, 254 insertions(+) create mode 100644 src/prim/emscripten/prim.c diff --git a/src/prim/emscripten/prim.c b/src/prim/emscripten/prim.c new file mode 100644 index 00000000..c0fa0f4a --- /dev/null +++ b/src/prim/emscripten/prim.c @@ -0,0 +1,251 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018-2023, Microsoft Research, Daan Leijen, Alon Zakai +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. 
+-----------------------------------------------------------------------------*/ + +// This file is included in `src/prim/prim.c` + +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "mimalloc/atomic.h" +#include "mimalloc/prim.h" + +// Design +// ====== +// +// mimalloc is built on top of emmalloc. emmalloc is a minimal allocator on top +// of sbrk. The reason for having three layers here is that we want mimalloc to +// be able to allocate and release system memory properly, the same way it would +// when using VirtualAlloc on Windows or mmap on POSIX, and sbrk is too limited. +// Specifically, sbrk can only go up and down, and not "skip" over regions, and +// so we end up either never freeing memory to the system, or we can get stuck +// with holes. +// +// Atm wasm generally does *not* free memory back the system: once grown, we do +// not shrink back down (https://github.com/WebAssembly/design/issues/1397). +// However, that is expected to improve +// (https://github.com/WebAssembly/memory-control/blob/main/proposals/memory-control/Overview.md) +// and so we do not want to bake those limitations in here. +// +// Even without that issue, we want our system allocator to handle holes, that +// is, it should merge freed regions and allow allocating new content there of +// the full size, etc., so that we do not waste space. That means that the +// system allocator really does need to handle the general problem of allocating +// and freeing variable-sized chunks of memory in a random order, like malloc/ +// free do. And so it makes sense to layer mimalloc on top of such an +// implementation. +// +// emmalloc makes sense for the lower level because it is small and simple while +// still fully handling merging of holes etc. It is not the most efficient +// allocator, but our assumption is that mimalloc needs to be fast while the +// system allocator underneath it is called much less frequently. +// + +//--------------------------------------------- +// init +//--------------------------------------------- + +void _mi_prim_mem_init( mi_os_mem_config_t* config) { + config->page_size = 64*MI_KiB; // WebAssembly has a fixed page size: 64KiB + config->alloc_granularity = 16; + config->has_overcommit = false; + config->must_free_whole = true; + config->has_virtual_reserve = false; +} + +extern void emmalloc_free(void*); + +int _mi_prim_free(void* addr, size_t size) { + MI_UNUSED(size); + emmalloc_free(addr); + return 0; +} + + +//--------------------------------------------- +// Allocation +//--------------------------------------------- + +extern void* emmalloc_memalign(size_t, size_t); + +// Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned. +int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) { + MI_UNUSED(try_alignment); MI_UNUSED(allow_large); MI_UNUSED(commit); + *is_large = false; + // TODO: Track the highest address ever seen; first uses of it are zeroes. + // That assumes no one else uses sbrk but us (they could go up, + // scribble, and then down), but we could assert on that perhaps. + *is_zero = false; + // emmalloc has some limitations on alignment size. + // TODO: Why does mimalloc ask for an align of 4MB? that ends up allocating + // 8, which wastes quite a lot for us in wasm. If that is unavoidable, + // we may want to improve emmalloc to support such alignment. 
See also + // https://github.com/emscripten-core/emscripten/issues/20645 + #define MIN_EMMALLOC_ALIGN 8 + #define MAX_EMMALLOC_ALIGN (1024*1024) + if (try_alignment < MIN_EMMALLOC_ALIGN) { + try_alignment = MIN_EMMALLOC_ALIGN; + } else if (try_alignment > MAX_EMMALLOC_ALIGN) { + try_alignment = MAX_EMMALLOC_ALIGN; + } + void* p = emmalloc_memalign(try_alignment, size); + *addr = p; + if (p == 0) { + return ENOMEM; + } + return 0; +} + + +//--------------------------------------------- +// Commit/Reset +//--------------------------------------------- + +int _mi_prim_commit(void* addr, size_t size, bool* is_zero) { + MI_UNUSED(addr); MI_UNUSED(size); + // See TODO above. + *is_zero = false; + return 0; +} + +int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit) { + MI_UNUSED(addr); MI_UNUSED(size); + *needs_recommit = false; + return 0; +} + +int _mi_prim_reset(void* addr, size_t size) { + MI_UNUSED(addr); MI_UNUSED(size); + return 0; +} + +int _mi_prim_protect(void* addr, size_t size, bool protect) { + MI_UNUSED(addr); MI_UNUSED(size); MI_UNUSED(protect); + return 0; +} + + +//--------------------------------------------- +// Huge pages and NUMA nodes +//--------------------------------------------- + +int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr) { + MI_UNUSED(hint_addr); MI_UNUSED(size); MI_UNUSED(numa_node); + *is_zero = true; + *addr = NULL; + return ENOSYS; +} + +size_t _mi_prim_numa_node(void) { + return 0; +} + +size_t _mi_prim_numa_node_count(void) { + return 1; +} + + +//---------------------------------------------------------------- +// Clock +//---------------------------------------------------------------- + +#include + +mi_msecs_t _mi_prim_clock_now(void) { + return emscripten_date_now(); +} + + +//---------------------------------------------------------------- +// Process info +//---------------------------------------------------------------- + +void _mi_prim_process_info(mi_process_info_t* pinfo) +{ + // use defaults + MI_UNUSED(pinfo); +} + + +//---------------------------------------------------------------- +// Output +//---------------------------------------------------------------- + +#include + +void _mi_prim_out_stderr( const char* msg) { + emscripten_console_error(msg); +} + + +//---------------------------------------------------------------- +// Environment +//---------------------------------------------------------------- + +bool _mi_prim_getenv(const char* name, char* result, size_t result_size) { + // For code size reasons, do not support environ customization for now. 
+ MI_UNUSED(name); + MI_UNUSED(result); + MI_UNUSED(result_size); + return false; +} + + +//---------------------------------------------------------------- +// Random +//---------------------------------------------------------------- + +bool _mi_prim_random_buf(void* buf, size_t buf_len) { + int err = getentropy(buf, buf_len); + return !err; +} + + +//---------------------------------------------------------------- +// Thread init/done +//---------------------------------------------------------------- + +#ifdef __EMSCRIPTEN_SHARED_MEMORY__ + +// use pthread local storage keys to detect thread ending +// (and used with MI_TLS_PTHREADS for the default heap) +pthread_key_t _mi_heap_default_key = (pthread_key_t)(-1); + +static void mi_pthread_done(void* value) { + if (value!=NULL) { + _mi_thread_done((mi_heap_t*)value); + } +} + +void _mi_prim_thread_init_auto_done(void) { + mi_assert_internal(_mi_heap_default_key == (pthread_key_t)(-1)); + pthread_key_create(&_mi_heap_default_key, &mi_pthread_done); +} + +void _mi_prim_thread_done_auto_done(void) { + // nothing to do +} + +void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { + if (_mi_heap_default_key != (pthread_key_t)(-1)) { // can happen during recursive invocation on freeBSD + pthread_setspecific(_mi_heap_default_key, heap); + } +} + +#else + +void _mi_prim_thread_init_auto_done(void) { + // nothing +} + +void _mi_prim_thread_done_auto_done(void) { + // nothing +} + +void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { + MI_UNUSED(heap); + +} +#endif diff --git a/src/prim/prim.c b/src/prim/prim.c index 9a597d8e..3b7d3736 100644 --- a/src/prim/prim.c +++ b/src/prim/prim.c @@ -18,6 +18,9 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_USE_SBRK #include "wasi/prim.c" // memory-grow or sbrk (Wasm) +#elif defined(__EMSCRIPTEN__) +#include "emscripten/prim.c" // emmalloc_*, + pthread support + #else #include "unix/prim.c" // mmap() (Linux, macOSX, BSD, Illumnos, Haiku, DragonFly, etc.) From 98abfe042cbb168309832b744bbed982d81bba6b Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 2 Mar 2024 15:08:22 -0800 Subject: [PATCH 027/119] avoid syscall on openBSD, issue #821 by @blackgnezdo --- src/prim/unix/prim.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c index 91fa6508..5a1088e0 100644 --- a/src/prim/unix/prim.c +++ b/src/prim/unix/prim.c @@ -27,10 +27,10 @@ terms of the MIT license. A copy of the license can be found in the file #include // mmap #include // sysconf - +#include // open, close, read, access + #if defined(__linux__) #include - #include #if defined(__GLIBC__) #include // linux mmap flags #else @@ -51,7 +51,7 @@ terms of the MIT license. 
A copy of the license can be found in the file #include #endif -#if !defined(__HAIKU__) && !defined(__APPLE__) && !defined(__CYGWIN__) +#if !defined(__HAIKU__) && !defined(__APPLE__) && !defined(__CYGWIN__) && !defined(__OpenBSD__) #define MI_HAS_SYSCALL_H #include #endif @@ -81,8 +81,6 @@ static int mi_prim_access(const char *fpath, int mode) { #elif (!defined(__APPLE__) || MAC_OS_X_VERSION_MIN_REQUIRED < 1070) // avoid unused warnings on macOS -#include - static int mi_prim_open(const char* fpath, int open_flags) { return open(fpath,open_flags); } @@ -761,7 +759,6 @@ bool _mi_prim_random_buf(void* buf, size_t buf_len) { #include #include -#include #include bool _mi_prim_random_buf(void* buf, size_t buf_len) { From 5634527fae4827d1ed470e093c2bac403cf82aaf Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 2 Mar 2024 15:26:42 -0800 Subject: [PATCH 028/119] add terminating quote --- include/mimalloc/prim.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h index 830b36c4..1a6adcc7 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -287,7 +287,7 @@ static inline mi_heap_t* mi_prim_get_default_heap(void); #if defined(MI_TLS_SLOT) # if !defined(MI_HAS_TLS_SLOT) -# error "trying to use a TLS slot for the default heap, but the mi_prim_tls_slot primitives are not defined +# error "trying to use a TLS slot for the default heap, but the mi_prim_tls_slot primitives are not defined" # endif static inline mi_heap_t* mi_prim_get_default_heap(void) { From b7d44378bb2840dcad4db5d921e552ddbc960361 Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 2 Mar 2024 15:32:35 -0800 Subject: [PATCH 029/119] avoid unused function warning on Solaris, PR #830 by @kulikjak --- src/prim/unix/prim.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c index 5a1088e0..94ae7946 100644 --- a/src/prim/unix/prim.c +++ b/src/prim/unix/prim.c @@ -51,7 +51,7 @@ terms of the MIT license. 
A copy of the license can be found in the file #include #endif -#if !defined(__HAIKU__) && !defined(__APPLE__) && !defined(__CYGWIN__) && !defined(__OpenBSD__) +#if !defined(__HAIKU__) && !defined(__APPLE__) && !defined(__CYGWIN__) && !defined(__OpenBSD__) && !defined(__sun) #define MI_HAS_SYSCALL_H #include #endif @@ -79,7 +79,7 @@ static int mi_prim_access(const char *fpath, int mode) { return syscall(SYS_access,fpath,mode); } -#elif (!defined(__APPLE__) || MAC_OS_X_VERSION_MIN_REQUIRED < 1070) // avoid unused warnings on macOS +#elif (!defined(__APPLE__) || MAC_OS_X_VERSION_MIN_REQUIRED < 1070) && !defined(__sun) // avoid unused warnings on macOS and Solaris static int mi_prim_open(const char* fpath, int open_flags) { return open(fpath,open_flags); From cc4500a024cc315f368e971fab556b6c17ec14ed Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 2 Mar 2024 15:36:57 -0800 Subject: [PATCH 030/119] ensure consistent types for template deduction, PR #834 by @dg0yt --- src/arena.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/arena.c b/src/arena.c index 790b6ac1..09afd890 100644 --- a/src/arena.c +++ b/src/arena.c @@ -482,7 +482,7 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t bitmap_idx, size_t // schedule decommit mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); if (expire != 0) { - mi_atomic_addi64_acq_rel(&arena->purge_expire, delay/10); // add smallish extra delay + mi_atomic_addi64_acq_rel(&arena->purge_expire, (mi_msecs_t)(delay/10)); // add smallish extra delay } else { mi_atomic_storei64_release(&arena->purge_expire, _mi_clock_now() + delay); @@ -526,7 +526,7 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi if (!force && expire > now) return false; // reset expire (if not already set concurrently) - mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire, 0); + mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire, (mi_msecs_t)0); // potential purges scheduled, walk through the bitmap bool any_purged = false; From 944ec1ab8acfc38137f716d85ceb7f896a39f852 Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 2 Mar 2024 15:47:07 -0800 Subject: [PATCH 031/119] Fix error: cannot use 'throw' with exceptions disabled #815, by @sergio-nsk --- src/alloc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/alloc.c b/src/alloc.c index b17fdbdc..484a3e5b 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -908,9 +908,13 @@ static bool mi_try_new_handler(bool nothrow) { #endif if (h==NULL) { _mi_error_message(ENOMEM, "out of memory in 'new'"); + #if defined(_CPPUNWIND) || defined(__cpp_exceptions) // exceptions are not always enabled if (!nothrow) { throw std::bad_alloc(); } + #else + MI_UNUSED(nothrow); + #endif return false; } else { From 7b398ad9244769974cc63d88c5e2dc5525fa02a2 Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 2 Mar 2024 15:51:51 -0800 Subject: [PATCH 032/119] delete pthread key at shutdown, PR #810 by @jkriegshauser --- src/prim/unix/prim.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c index 94ae7946..4490c058 100644 --- a/src/prim/unix/prim.c +++ b/src/prim/unix/prim.c @@ -829,7 +829,9 @@ void _mi_prim_thread_init_auto_done(void) { } void _mi_prim_thread_done_auto_done(void) { - // nothing to do + if (_mi_heap_default_key != (pthread_key_t)(-1)) { // do not leak the key, see issue #809 + pthread_key_delete(_mi_heap_default_key); + } } void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { From 
3fe3d540b67f69d4bb12bfa88c993b12af0da424 Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 2 Mar 2024 15:57:54 -0800 Subject: [PATCH 033/119] Fix incorrect MAP_HUGE_1GB check #793 --- src/prim/unix/prim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c index 4490c058..3e159e6c 100644 --- a/src/prim/unix/prim.c +++ b/src/prim/unix/prim.c @@ -279,7 +279,7 @@ static void* unix_mmap(void* addr, size_t size, size_t try_alignment, int protec *is_large = true; p = unix_mmap_prim(addr, size, try_alignment, protect_flags, lflags, lfd); #ifdef MAP_HUGE_1GB - if (p == NULL && (lflags & MAP_HUGE_1GB) != 0) { + if (p == NULL && (lflags & MAP_HUGE_1GB) == MAP_HUGE_1GB) { mi_huge_pages_available = false; // don't try huge 1GiB pages again _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (errno: %i)\n", errno); lflags = ((lflags & ~MAP_HUGE_1GB) | MAP_HUGE_2MB); From 5d22157dc8711e547608c7f31f1560b7e814d6c4 Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 2 Mar 2024 16:41:49 -0800 Subject: [PATCH 034/119] support tls_slot for PPC #781, by @barracuda156 --- include/mimalloc/prim.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h index 1a6adcc7..d14b885b 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -134,7 +134,7 @@ void _mi_prim_thread_associate_default_heap(mi_heap_t* heap); // However, we do still use it with older clang compilers and Apple OS (as we use TLS slot for the default heap there). #if defined(__GNUC__) && ( \ (defined(__GLIBC__) && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \ - || (defined(__APPLE__) && (defined(__x86_64__) || defined(__aarch64__))) \ + || (defined(__APPLE__) && (defined(__x86_64__) || defined(__aarch64__) || defined(__POWERPC__))) \ || (defined(__BIONIC__) && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \ || (defined(__FreeBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \ || (defined(__OpenBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \ @@ -165,6 +165,9 @@ static inline void* mi_prim_tls_slot(size_t slot) mi_attr_noexcept { __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb)); #endif res = tcb[slot]; + #elif defined(__APPLE__) && defined(__POWERPC__) // ppc, issue #781 + MI_UNUSED(ofs); + res = pthread_getspecific(slot); #endif return res; } @@ -192,6 +195,9 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb)); #endif tcb[slot] = value; + #elif defined(__APPLE__) && defined(__POWERPC__) // ppc, issue #781 + MI_UNUSED(ofs); + pthread_setspecific(slot, value); #endif } From c541a9b32e1dce320e5636c925eb6eb799a21da7 Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 2 Mar 2024 16:44:06 -0800 Subject: [PATCH 035/119] Revert setting hardcoded install paths on Haiku #788, by @begasus --- CMakeLists.txt | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bd98019b..1a483ecd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -263,10 +263,11 @@ if(MI_USE_CXX) endif() endif() -if(CMAKE_SYSTEM_NAME MATCHES "Haiku") - SET(CMAKE_INSTALL_LIBDIR ~/config/non-packaged/lib) - SET(CMAKE_INSTALL_INCLUDEDIR ~/config/non-packaged/headers) - endif() +# On Haiku use `-DCMAKE_INSTALL_PREFIX` instead, issue #788 +# 
if(CMAKE_SYSTEM_NAME MATCHES "Haiku") +# SET(CMAKE_INSTALL_LIBDIR ~/config/non-packaged/lib) +# SET(CMAKE_INSTALL_INCLUDEDIR ~/config/non-packaged/headers) +# endif() # Compiler flags if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU") From 128c7c1876f69812bfe497f2a3f49b5fbce9e0f8 Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 2 Mar 2024 16:55:13 -0800 Subject: [PATCH 036/119] cleanup thp disable a bit --- CMakeLists.txt | 2 +- src/prim/unix/prim.c | 31 +++++++++++++++---------------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b5464226..af00e2ee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,7 +28,7 @@ option(MI_DEBUG_UBSAN "Build with undefined-behavior sanitizer (needs clan option(MI_SKIP_COLLECT_ON_EXIT "Skip collecting memory on program exit" OFF) option(MI_NO_PADDING "Force no use of padding even in DEBUG mode etc." OFF) option(MI_INSTALL_TOPLEVEL "Install directly into $CMAKE_INSTALL_PREFIX instead of PREFIX/lib/mimalloc-version" OFF) -option(MI_NO_THP "Force disable transparent huge pages support on Linux/Android process wise only" OFF) +option(MI_NO_THP "Disable transparent huge pages support on Linux/Android for the mimalloc process only" OFF) # deprecated options option(MI_CHECK_FULL "Use full internal invariant checking in DEBUG mode (deprecated, use MI_DEBUG_FULL instead)" OFF) diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c index 33b83322..2035e1a4 100644 --- a/src/prim/unix/prim.c +++ b/src/prim/unix/prim.c @@ -31,7 +31,9 @@ terms of the MIT license. A copy of the license can be found in the file #if defined(__linux__) #include + #if defined(MI_NO_THP) #include + #endif #if defined(__GLIBC__) #include // linux mmap flags #else @@ -129,21 +131,8 @@ static bool unix_detect_overcommit(void) { return os_overcommit; } -void unix_set_thp(void) { -#if defined(__linux__) || defined(__ANDROID__) -#if MI_NO_THP - int val; - if (prctl(PR_GET_THP_DISABLE, &val, 0, 0, 0) != 0) { - // Most likely since distros often come with always/madvise settings. - val = 1; - // Disabling only for mimalloc process rather than touching system wide settings - (void)prctl(PR_SET_THP_DISABLE, &val, 0, 0, 0); - } -#endif -#endif -} - -void _mi_prim_mem_init( mi_os_mem_config_t* config ) { +void _mi_prim_mem_init( mi_os_mem_config_t* config ) +{ long psize = sysconf(_SC_PAGESIZE); if (psize > 0) { config->page_size = (size_t)psize; @@ -153,7 +142,17 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) { config->has_overcommit = unix_detect_overcommit(); config->must_free_whole = false; // mmap can free in parts config->has_virtual_reserve = true; // todo: check if this true for NetBSD? (for anonymous mmap with PROT_NONE) - unix_set_thp(); + + // disable transparent huge pages for this process? + #if defined(MI_NO_THP) && (defined(__linux__) || defined(__ANDROID__)) + int val = 0; + if (prctl(PR_GET_THP_DISABLE, &val, 0, 0, 0) != 0) { + // Most likely since distros often come with always/madvise settings. 
+ val = 1; + // Disabling only for mimalloc process rather than touching system wide settings + (void)prctl(PR_SET_THP_DISABLE, &val, 0, 0, 0); + } + #endif } From 16c3f1292c419eb928b7a9b8c0b9f3dc6733ba56 Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 2 Mar 2024 16:57:10 -0800 Subject: [PATCH 037/119] Fix compile error on OpenBSD #773, by @sundb --- src/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/init.c b/src/init.c index fe885a5b..fda17d70 100644 --- a/src/init.c +++ b/src/init.c @@ -425,7 +425,7 @@ void _mi_heap_set_default_direct(mi_heap_t* heap) { #if defined(MI_TLS_SLOT) mi_prim_tls_slot_set(MI_TLS_SLOT,heap); #elif defined(MI_TLS_PTHREAD_SLOT_OFS) - *mi_tls_pthread_heap_slot() = heap; + *mi_prim_tls_pthread_heap_slot() = heap; #elif defined(MI_TLS_PTHREAD) // we use _mi_heap_default_key #else From e1f6516fdade536905ebbcf76d4f7f1911019359 Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 2 Mar 2024 17:05:29 -0800 Subject: [PATCH 038/119] Fix undefined symbol errors when building for wasi #758, by @anuraaga --- include/mimalloc/internal.h | 6 ++++++ src/os.c | 4 ---- src/prim/wasi/prim.c | 5 +++++ 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 40401736..21cb42bf 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -296,6 +296,12 @@ static inline uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) { } } +// Align a pointer upwards +static inline void* mi_align_up_ptr(void* p, size_t alignment) { + return (void*)_mi_align_up((uintptr_t)p, alignment); +} + + // Divide upwards: `s <= _mi_divide_up(s,d)*d < s+d`. static inline uintptr_t _mi_divide_up(uintptr_t size, size_t divider) { mi_assert_internal(divider != 0); diff --git a/src/os.c b/src/os.c index 69ad2bf9..b98950a4 100644 --- a/src/os.c +++ b/src/os.c @@ -73,10 +73,6 @@ void _mi_os_init(void) { bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats); bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats); -static void* mi_align_up_ptr(void* p, size_t alignment) { - return (void*)_mi_align_up((uintptr_t)p, alignment); -} - static inline uintptr_t _mi_align_down(uintptr_t sz, size_t alignment) { mi_assert_internal(alignment != 0); uintptr_t mask = alignment - 1; diff --git a/src/prim/wasi/prim.c b/src/prim/wasi/prim.c index 50511f0b..f74acd2a 100644 --- a/src/prim/wasi/prim.c +++ b/src/prim/wasi/prim.c @@ -12,6 +12,9 @@ terms of the MIT license. 
A copy of the license can be found in the file #include "mimalloc/atomic.h" #include "mimalloc/prim.h" +#include // fputs +#include // getenv + //--------------------------------------------- // Initialize //--------------------------------------------- @@ -40,6 +43,8 @@ int _mi_prim_free(void* addr, size_t size ) { //--------------------------------------------- #if defined(MI_USE_SBRK) + #include // for sbrk + static void* mi_memory_grow( size_t size ) { void* p = sbrk(size); if (p == (void*)(-1)) return NULL; From 7df348e2eaf4c67454f9efe3c3aabd4e7a4417dd Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 2 Mar 2024 17:07:09 -0800 Subject: [PATCH 039/119] fix typo, #756 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index af00e2ee..0cc7e575 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -248,7 +248,7 @@ if(MI_DEBUG_UBSAN) message(WARNING "Can only use undefined-behavior sanitizer with clang++ (MI_DEBUG_UBSAN=ON but ignored)") endif() else() - message(WARNING "Can only use thread sanitizer with a debug build (CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE})") + message(WARNING "Can only use undefined-behavior sanitizer with a debug build (CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE})") endif() endif() From 683332c950dff8260eae521e776bc5a51078a4f1 Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 2 Mar 2024 17:12:58 -0800 Subject: [PATCH 040/119] Define reallocarr as weak to avoid symbol collision on gnu/Linux. #751 by @Romain-Geissler-1A --- include/mimalloc/internal.h | 3 +++ src/alloc-override.c | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 21cb42bf..2e07f763 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -30,14 +30,17 @@ terms of the MIT license. 
A copy of the license can be found in the file #define mi_decl_noinline __declspec(noinline) #define mi_decl_thread __declspec(thread) #define mi_decl_cache_align __declspec(align(MI_CACHE_LINE)) +#define mi_decl_weak #elif (defined(__GNUC__) && (__GNUC__ >= 3)) || defined(__clang__) // includes clang and icc #define mi_decl_noinline __attribute__((noinline)) #define mi_decl_thread __thread #define mi_decl_cache_align __attribute__((aligned(MI_CACHE_LINE))) +#define mi_decl_weak __attribute__((weak)) #else #define mi_decl_noinline #define mi_decl_thread __thread // hope for the best :-) #define mi_decl_cache_align +#define mi_decl_weak #endif #if defined(__EMSCRIPTEN__) && !defined(__wasi__) diff --git a/src/alloc-override.c b/src/alloc-override.c index 873065dc..7cf0bf2c 100644 --- a/src/alloc-override.c +++ b/src/alloc-override.c @@ -259,10 +259,11 @@ extern "C" { // no forwarding here due to aliasing/name mangling issues void cfree(void* p) { mi_free(p); } void* pvalloc(size_t size) { return mi_pvalloc(size); } -void* reallocarray(void* p, size_t count, size_t size) { return mi_reallocarray(p, count, size); } -int reallocarr(void* p, size_t count, size_t size) { return mi_reallocarr(p, count, size); } void* memalign(size_t alignment, size_t size) { return mi_memalign(alignment, size); } void* _aligned_malloc(size_t alignment, size_t size) { return mi_aligned_alloc(alignment, size); } +void* reallocarray(void* p, size_t count, size_t size) { return mi_reallocarray(p, count, size); } +// some systems define reallocarr so mark it as a weak symbol (#751) +mi_decl_weak int reallocarr(void* p, size_t count, size_t size) { return mi_reallocarr(p, count, size); } #if defined(__wasi__) // forward __libc interface (see PR #667) From 5fa620ac2c9b1c01290cc5e0ae9a0815dbc52df6 Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 2 Mar 2024 17:14:28 -0800 Subject: [PATCH 041/119] add bin/ to .gitattributes as export-ignore #746 --- .gitattributes | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitattributes b/.gitattributes index 0332e031..f083b107 100644 --- a/.gitattributes +++ b/.gitattributes @@ -10,3 +10,4 @@ *.dll binary *.lib binary *.exe binary +bin export-ignore From c6750c51abd596335adef084f1addd6e5707b7ed Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 2 Mar 2024 17:16:03 -0800 Subject: [PATCH 042/119] Fix new[] / delete mismatch. #745 --- test/main-override.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/main-override.cpp b/test/main-override.cpp index f9ac7327..64ea178b 100644 --- a/test/main-override.cpp +++ b/test/main-override.cpp @@ -100,7 +100,7 @@ static void various_tests() { auto tbuf = new unsigned char[sizeof(Test)]; t = new (tbuf) Test(42); t->~Test(); - delete tbuf; + delete[] tbuf; } class Static { From 43d40704b95266e5ef92e136aa7c7407e5f8fc85 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 2 Mar 2024 18:24:53 -0800 Subject: [PATCH 043/119] keep threadid 0 for abandoned_next to preserve invariants --- src/arena.c | 4 +++- src/segment.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/arena.c b/src/arena.c index fc8a79c6..c971c12e 100644 --- a/src/arena.c +++ b/src/arena.c @@ -749,6 +749,7 @@ size_t _mi_arena_segment_abandoned_count(void) { } // reclaim a specific abandoned segment; `true` on success. +// sets the thread_id. 
bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment ) { if (segment->memid.memkind != MI_MEM_ARENA) { @@ -783,6 +784,7 @@ bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment ) } // mark a specific segment as abandoned +// clears the thread_id. void _mi_arena_segment_mark_abandoned(mi_segment_t* segment) { mi_atomic_store_release(&segment->thread_id, 0); @@ -813,6 +815,7 @@ void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_arena_field_cursor_t* curre } // reclaim abandoned segments +// this does not set the thread id (so it appears as still abandoned) mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous ) { const int max_arena = (int)mi_atomic_load_relaxed(&mi_arena_count); @@ -845,7 +848,6 @@ mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* pr mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); mi_segment_t* segment = (mi_segment_t*)mi_arena_block_start(arena, bitmap_idx); mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0); - mi_atomic_store_release(&segment->thread_id, _mi_thread_id()); //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); return segment; } diff --git a/src/segment.c b/src/segment.c index 3db4e813..a50f0190 100644 --- a/src/segment.c +++ b/src/segment.c @@ -845,7 +845,9 @@ static bool mi_segment_check_free(mi_segment_t* segment, size_t block_size, bool // set `right_page_reclaimed` to `true` if it reclaimed a page of the right `block_size` that was not full. static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, size_t requested_block_size, bool* right_page_reclaimed, mi_segments_tld_t* tld) { if (right_page_reclaimed != NULL) { *right_page_reclaimed = false; } - mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == _mi_thread_id()); + // can be 0 still with abandoned_next, or already a thread id for segments outside an arena that are reclaimed on a free. 
+ mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0 || mi_atomic_load_relaxed(&segment->thread_id) == _mi_thread_id()); + mi_atomic_store_release(&segment->thread_id, _mi_thread_id()); segment->abandoned_visits = 0; mi_segments_track_size((long)segment->segment_size, tld); mi_assert_internal(segment->next == NULL && segment->prev == NULL); From cca3fc26b05a80cb90b45d4bed2af6584ff4327b Mon Sep 17 00:00:00 2001 From: Daan Date: Sun, 3 Mar 2024 08:41:25 -0800 Subject: [PATCH 044/119] fix macOS version checks, issues #829 --- src/prim/unix/prim.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c index 2035e1a4..e6dd7091 100644 --- a/src/prim/unix/prim.c +++ b/src/prim/unix/prim.c @@ -82,7 +82,8 @@ static int mi_prim_access(const char *fpath, int mode) { return syscall(SYS_access,fpath,mode); } -#elif (!defined(__APPLE__) || MAC_OS_X_VERSION_MIN_REQUIRED < 1070) && !defined(__sun) // avoid unused warnings on macOS and Solaris +#elif !defined(__sun) && \ + (!defined(__APPLE__) || (defined(MAC_OS_X_VERSION_10_7) && (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_7))) // avoid unused warnings on macOS and Solaris static int mi_prim_open(const char* fpath, int open_flags) { return open(fpath,open_flags); @@ -749,7 +750,7 @@ bool _mi_prim_getenv(const char* name, char* result, size_t result_size) { // Random //---------------------------------------------------------------- -#if defined(MAC_OS_X_VERSION_10_15) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_15 && MAC_OS_X_VERSION_MIN_REQUIRED >= 1070 +#if defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_15) && (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_15) #include #include @@ -762,7 +763,7 @@ bool _mi_prim_random_buf(void* buf, size_t buf_len) { #elif defined(__ANDROID__) || defined(__DragonFly__) || \ defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ defined(__sun) || \ - (defined(MAC_OS_X_VERSION_10_10) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_10 && MAC_OS_X_VERSION_MIN_REQUIRED >= 1070) + (defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_7) && (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7)) #include bool _mi_prim_random_buf(void* buf, size_t buf_len) { @@ -770,7 +771,7 @@ bool _mi_prim_random_buf(void* buf, size_t buf_len) { return true; } -#elif defined(__APPLE__) || defined(__linux__) || defined(__HAIKU__) // for old apple versions < 1070 (issue #829) +#elif defined(__APPLE__) || defined(__linux__) || defined(__HAIKU__) // also for old apple versions < 10.7 (issue #829) #include #include From 018c0ce2f432f62e7b6e35c3e35a62849a55bdc9 Mon Sep 17 00:00:00 2001 From: "microsoft-github-policy-service[bot]" <77245923+microsoft-github-policy-service[bot]@users.noreply.github.com> Date: Sun, 3 Mar 2024 16:41:34 +0000 Subject: [PATCH 045/119] Microsoft mandatory file --- SECURITY.md | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 SECURITY.md diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 00000000..b3c89efc --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,41 @@ + + +## Security + +Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and 
[Xamarin](https://github.com/xamarin). + +If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below. + +## Reporting Security Issues + +**Please do not report security vulnerabilities through public GitHub issues.** + +Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report). + +If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp). + +You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). + +Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: + + * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) + * Full paths of source file(s) related to the manifestation of the issue + * The location of the affected source code (tag/branch/commit or direct URL) + * Any special configuration required to reproduce the issue + * Step-by-step instructions to reproduce the issue + * Proof-of-concept or exploit code (if possible) + * Impact of the issue, including how an attacker might exploit the issue + +This information will help us triage your report more quickly. + +If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs. + +## Preferred Languages + +We prefer all communications to be in English. + +## Policy + +Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd). 
+ + From 027b22aaf25e3f76028d149a1b2a50ab92286699 Mon Sep 17 00:00:00 2001 From: Daan Date: Sun, 3 Mar 2024 09:37:46 -0800 Subject: [PATCH 046/119] add arena stats --- include/mimalloc/types.h | 3 +++ src/arena.c | 11 ++++++----- src/bitmap.c | 12 ++++++++---- src/bitmap.h | 2 +- src/init.c | 3 ++- src/stats.c | 3 +++ 6 files changed, 23 insertions(+), 11 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 06b96587..9758bf48 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -547,6 +547,9 @@ typedef struct mi_stats_s { mi_stat_counter_t normal_count; mi_stat_counter_t huge_count; mi_stat_counter_t giant_count; + mi_stat_counter_t arena_count; + mi_stat_counter_t arena_crossover_count; + mi_stat_counter_t arena_rollback_count; #if MI_STAT>1 mi_stat_count_t normal_bins[MI_BIN_HUGE+1]; #endif diff --git a/src/arena.c b/src/arena.c index 09afd890..b930958f 100644 --- a/src/arena.c +++ b/src/arena.c @@ -207,10 +207,10 @@ static void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex) { ----------------------------------------------------------- */ // claim the `blocks_inuse` bits -static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx) +static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) { size_t idx = 0; // mi_atomic_load_relaxed(&arena->search_idx); // start from last search; ok to be relaxed as the exact start does not matter - if (_mi_bitmap_try_find_from_claim_across(arena->blocks_inuse, arena->field_count, idx, blocks, bitmap_idx)) { + if (_mi_bitmap_try_find_from_claim_across(arena->blocks_inuse, arena->field_count, idx, blocks, bitmap_idx, stats)) { mi_atomic_store_relaxed(&arena->search_idx, mi_bitmap_index_field(*bitmap_idx)); // start search from found location next time around return true; }; @@ -229,7 +229,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar mi_assert_internal(mi_arena_id_index(arena->id) == arena_index); mi_bitmap_index_t bitmap_index; - if (!mi_arena_try_claim(arena, needed_bcount, &bitmap_index)) return NULL; + if (!mi_arena_try_claim(arena, needed_bcount, &bitmap_index, tld->stats)) return NULL; // claimed it! void* p = mi_arena_block_start(arena, bitmap_index); @@ -735,7 +735,7 @@ bool _mi_arena_contains(const void* p) { Add an arena. 
----------------------------------------------------------- */ -static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id) { +static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) { mi_assert_internal(arena != NULL); mi_assert_internal((uintptr_t)mi_atomic_load_ptr_relaxed(uint8_t,&arena->start) % MI_SEGMENT_ALIGN == 0); mi_assert_internal(arena->block_count > 0); @@ -746,6 +746,7 @@ static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id) { mi_atomic_decrement_acq_rel(&mi_arena_count); return false; } + mi_stat_counter_increase(stats->arena_count,1); arena->id = mi_arena_id_create(i); mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena); if (arena_id != NULL) { *arena_id = arena->id; } @@ -799,7 +800,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int mi_bitmap_index_t postidx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post); _mi_bitmap_claim(arena->blocks_inuse, fields, post, postidx, NULL); } - return mi_arena_add(arena, arena_id); + return mi_arena_add(arena, arena_id, &_mi_stats_main); } diff --git a/src/bitmap.c b/src/bitmap.c index 01064140..017295e7 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -182,7 +182,7 @@ bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t // Try to atomically claim a sequence of `count` bits starting from the field // at `idx` in `bitmap` and crossing into subsequent fields. Returns `true` on success. // Only needs to consider crossing into the next fields (see `mi_bitmap_try_find_from_claim_across`) -static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t idx, const size_t count, const size_t retries, mi_bitmap_index_t* bitmap_idx) +static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t idx, const size_t count, const size_t retries, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) { mi_assert_internal(bitmap_idx != NULL); @@ -242,6 +242,7 @@ static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bit } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)); // claimed! + mi_stat_counter_increase(stats->arena_crossover_count,1); *bitmap_idx = mi_bitmap_index_create(idx, initial_idx); return true; @@ -261,9 +262,10 @@ rollback: newmap = (map & ~initial_mask); } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)); } + mi_stat_counter_increase(stats->arena_rollback_count,1); // retry? (we make a recursive call instead of goto to be able to use const declarations) if (retries <= 2) { - return mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, retries+1, bitmap_idx); + return mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, retries+1, bitmap_idx, stats); } else { return false; @@ -273,7 +275,7 @@ rollback: // Find `count` bits of zeros and set them to 1 atomically; returns `true` on success. // Starts at idx, and wraps around to search in all `bitmap_fields` fields. 
-bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx) { +bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) { mi_assert_internal(count > 0); if (count <= 2) { // we don't bother with crossover fields for small counts @@ -285,13 +287,15 @@ bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitm for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) { if (idx >= bitmap_fields) { idx = 0; } // wrap // first try to claim inside a field + /* if (count <= MI_BITMAP_FIELD_BITS) { if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) { return true; } } + */ // if that fails, then try to claim across fields - if (mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, 0, bitmap_idx)) { + if (mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, 0, bitmap_idx, stats)) { return true; } } diff --git a/src/bitmap.h b/src/bitmap.h index 266f140a..156c4386 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -90,7 +90,7 @@ bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t // Find `count` bits of zeros and set them to 1 atomically; returns `true` on success. // Starts at idx, and wraps around to search in all `bitmap_fields` fields. -bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx); +bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats); // Set `count` bits at `bitmap_idx` to 0 atomically // Returns `true` if all `count` bits were 1 previously. 
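As a rough illustration of how the new counters can be observed (this sketch is not part of the commit; it assumes a build with statistics enabled, i.e. MI_STAT, and uses the public mi_stats_print API), a small test program could dump them after exercising the allocator — the labels "arenas", "-crossover" and "-rollback" correspond to the stats.c changes below:

#include <mimalloc.h>

int main(void) {
  // do some allocation work so that arena blocks get claimed and released
  for (int i = 0; i < 1000; i++) {
    void* p = mi_malloc(32 * 1024);
    mi_free(p);
  }
  // print the statistics, including the arena counters added in this commit
  mi_stats_print(NULL);
  return 0;
}
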
diff --git a/src/init.c b/src/init.c index fda17d70..30211764 100644 --- a/src/init.c +++ b/src/init.c @@ -78,7 +78,8 @@ const mi_page_t _mi_page_empty = { MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ MI_STAT_COUNT_NULL(), \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } \ + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ + { 0, 0 }, { 0, 0 }, { 0, 0 } \ MI_STAT_COUNT_END_NULL() // -------------------------------------------------------- diff --git a/src/stats.c b/src/stats.c index fa947e5d..8fbdfc45 100644 --- a/src/stats.c +++ b/src/stats.c @@ -340,6 +340,9 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_stat_print(&stats->pages_abandoned, "-abandoned", -1, out, arg); mi_stat_counter_print(&stats->pages_extended, "-extended", out, arg); mi_stat_counter_print(&stats->page_no_retire, "-noretire", out, arg); + mi_stat_counter_print(&stats->arena_count, "arenas", out, arg); + mi_stat_counter_print(&stats->arena_crossover_count, "-crossover", out, arg); + mi_stat_counter_print(&stats->arena_rollback_count, "-rollback", out, arg); mi_stat_counter_print(&stats->mmap_calls, "mmaps", out, arg); mi_stat_counter_print(&stats->commit_calls, "commits", out, arg); mi_stat_counter_print(&stats->reset_calls, "resets", out, arg); From 2473676e18cb2beab4ddc7d668ef23265d7d280b Mon Sep 17 00:00:00 2001 From: Daan Date: Sun, 3 Mar 2024 09:54:49 -0800 Subject: [PATCH 047/119] further fixes to macOS version checks by ensuring MAC_OS_X_VERSION_10_7 is always defined (issue #829) --- src/prim/unix/prim.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c index e6dd7091..87ad63b1 100644 --- a/src/prim/unix/prim.c +++ b/src/prim/unix/prim.c @@ -45,6 +45,9 @@ terms of the MIT license. 
A copy of the license can be found in the file #if !TARGET_IOS_IPHONE && !TARGET_IOS_SIMULATOR #include #endif + #if !defined(MAC_OS_X_VERSION_10_7) + #define MAC_OS_X_VERSION_10_7 1070 + #endif #elif defined(__FreeBSD__) || defined(__DragonFly__) #include #if __FreeBSD_version >= 1200000 @@ -83,7 +86,7 @@ static int mi_prim_access(const char *fpath, int mode) { } #elif !defined(__sun) && \ - (!defined(__APPLE__) || (defined(MAC_OS_X_VERSION_10_7) && (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_7))) // avoid unused warnings on macOS and Solaris + (!defined(__APPLE__) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_7)) // avoid unused warnings on macOS and Solaris static int mi_prim_open(const char* fpath, int open_flags) { return open(fpath,open_flags); @@ -763,7 +766,7 @@ bool _mi_prim_random_buf(void* buf, size_t buf_len) { #elif defined(__ANDROID__) || defined(__DragonFly__) || \ defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ defined(__sun) || \ - (defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_7) && (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7)) + (defined(__APPLE__) && (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7)) #include bool _mi_prim_random_buf(void* buf, size_t buf_len) { From fa10914f730cf63e562af55be8b6257b4a0d4862 Mon Sep 17 00:00:00 2001 From: Daan Date: Sun, 3 Mar 2024 13:19:27 -0800 Subject: [PATCH 048/119] track os stats in the main stats directly --- src/arena.c | 6 +++--- src/os.c | 25 ++++++++++++------------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/src/arena.c b/src/arena.c index b930958f..e6b826f8 100644 --- a/src/arena.c +++ b/src/arena.c @@ -456,7 +456,7 @@ static void mi_arena_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks, // and also undo the decommit stats (as it was already adjusted) mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits)); needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, stats); - _mi_stat_increase(&stats->committed, size); + if (needs_recommit) { _mi_stat_increase(&_mi_stats_main.committed, size); } } // clear the purged blocks @@ -613,7 +613,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi // was a direct OS allocation, pass through if (!all_committed && committed_size > 0) { // if partially committed, adjust the committed stats (as `_mi_os_free` will increase decommit by the full size) - _mi_stat_decrease(&stats->committed, committed_size); + _mi_stat_decrease(&_mi_stats_main.committed, committed_size); } _mi_os_free(p, size, memid, stats); } @@ -656,7 +656,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi if (committed_size > 0) { // if partially committed, adjust the committed stats (is it will be recommitted when re-using) // in the delayed purge, we now need to not count a decommit if the range is not marked as committed. - _mi_stat_decrease(&stats->committed, committed_size); + _mi_stat_decrease(&_mi_stats_main.committed, committed_size); } // note: if not all committed, it may be that the purge will reset/decommit the entire range // that contains already decommitted parts. 
Since purge consistently uses reset or decommit that diff --git a/src/os.c b/src/os.c index b98950a4..21ab9243 100644 --- a/src/os.c +++ b/src/os.c @@ -148,13 +148,13 @@ static void mi_os_free_huge_os_pages(void* p, size_t size, mi_stats_t* stats); static void mi_os_prim_free(void* addr, size_t size, bool still_committed, mi_stats_t* tld_stats) { MI_UNUSED(tld_stats); + mi_stats_t* stats = &_mi_stats_main; mi_assert_internal((size % _mi_os_page_size()) == 0); if (addr == NULL || size == 0) return; // || _mi_os_is_huge_reserved(addr) int err = _mi_prim_free(addr, size); if (err != 0) { _mi_warning_message("unable to free OS memory (error: %d (0x%x), size: 0x%zx bytes, address: %p)\n", err, err, size, addr); } - mi_stats_t* stats = &_mi_stats_main; if (still_committed) { _mi_stat_decrease(&stats->committed, size); } _mi_stat_decrease(&stats->reserved, size); } @@ -195,20 +195,22 @@ void _mi_os_free(void* p, size_t size, mi_memid_t memid, mi_stats_t* tld_stats) -------------------------------------------------------------- */ // Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned. -static void* mi_os_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, mi_stats_t* stats) { +static void* mi_os_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, mi_stats_t* tld_stats) { mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); mi_assert_internal(is_zero != NULL); mi_assert_internal(is_large != NULL); if (size == 0) return NULL; if (!commit) { allow_large = false; } if (try_alignment == 0) { try_alignment = 1; } // avoid 0 to ensure there will be no divide by zero when aligning - *is_zero = false; void* p = NULL; int err = _mi_prim_alloc(size, try_alignment, commit, allow_large, is_large, is_zero, &p); if (err != 0) { _mi_warning_message("unable to allocate OS memory (error: %d (0x%x), size: 0x%zx bytes, align: 0x%zx, commit: %d, allow large: %d)\n", err, err, size, try_alignment, commit, allow_large); } + + MI_UNUSED(tld_stats); + mi_stats_t* stats = &_mi_stats_main; mi_stat_counter_increase(stats->mmap_calls, 1); if (p != NULL) { _mi_stat_increase(&stats->reserved, size); @@ -296,10 +298,8 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit OS API: alloc and alloc_aligned ----------------------------------------------------------- */ -void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* tld_stats) { - MI_UNUSED(tld_stats); +void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* stats) { *memid = _mi_memid_none(); - mi_stats_t* stats = &_mi_stats_main; if (size == 0) return NULL; size = _mi_os_good_alloc_size(size); bool os_is_large = false; @@ -311,10 +311,9 @@ void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* tld_stats) { return p; } -void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* tld_stats) +void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* stats) { MI_UNUSED(&_mi_os_get_aligned_hint); // suppress unused warnings - MI_UNUSED(tld_stats); *memid = _mi_memid_none(); if (size == 0) return NULL; size = _mi_os_good_alloc_size(size); @@ -323,7 +322,7 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo bool os_is_large = false; bool os_is_zero = false; void* os_base = NULL; - void* p = 
mi_os_prim_alloc_aligned(size, alignment, commit, allow_large, &os_is_large, &os_is_zero, &os_base, &_mi_stats_main /*tld->stats*/ ); + void* p = mi_os_prim_alloc_aligned(size, alignment, commit, allow_large, &os_is_large, &os_is_zero, &os_base, stats ); if (p != NULL) { *memid = _mi_memid_create_os(commit, os_is_zero, os_is_large); memid->mem.os.base = os_base; @@ -340,7 +339,7 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo to use the actual start of the memory region. ----------------------------------------------------------- */ -void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t offset, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* tld_stats) { +void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t offset, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* stats) { mi_assert(offset <= MI_SEGMENT_SIZE); mi_assert(offset <= size); mi_assert((alignment % _mi_os_page_size()) == 0); @@ -348,20 +347,20 @@ void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t offse if (offset > MI_SEGMENT_SIZE) return NULL; if (offset == 0) { // regular aligned allocation - return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, tld_stats); + return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, stats); } else { // overallocate to align at an offset const size_t extra = _mi_align_up(offset, alignment) - offset; const size_t oversize = size + extra; - void* const start = _mi_os_alloc_aligned(oversize, alignment, commit, allow_large, memid, tld_stats); + void* const start = _mi_os_alloc_aligned(oversize, alignment, commit, allow_large, memid, stats); if (start == NULL) return NULL; void* const p = (uint8_t*)start + extra; mi_assert(_mi_is_aligned((uint8_t*)p + offset, alignment)); // decommit the overallocation at the start if (commit && extra > _mi_os_page_size()) { - _mi_os_decommit(start, extra, tld_stats); + _mi_os_decommit(start, extra, stats); } return p; } From 719496bac9a22bbfa6750af08ed4ed3381a2d645 Mon Sep 17 00:00:00 2001 From: Daan Date: Sun, 3 Mar 2024 13:22:24 -0800 Subject: [PATCH 049/119] fix clang warning --- src/arena.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/arena.c b/src/arena.c index e6b826f8..01030811 100644 --- a/src/arena.c +++ b/src/arena.c @@ -746,7 +746,7 @@ static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* mi_atomic_decrement_acq_rel(&mi_arena_count); return false; } - mi_stat_counter_increase(stats->arena_count,1); + _mi_stat_counter_increase(&stats->arena_count,1); arena->id = mi_arena_id_create(i); mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena); if (arena_id != NULL) { *arena_id = arena->id; } From 5fe83bf327529b897e9dca1ec17b5da05d47513a Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 3 Mar 2024 17:38:50 -0800 Subject: [PATCH 050/119] limit reclaim from free to half the segments to prevent a pure freeing thread to reclaim too many segments --- include/mimalloc/types.h | 2 ++ src/init.c | 2 +- src/segment.c | 18 ++++++++++++++++-- test/test-stress.c | 28 +++++++++++++++++----------- 4 files changed, 36 insertions(+), 14 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 6e8815d9..771059bf 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -375,6 +375,7 @@ typedef struct mi_segment_s { // segment fields struct mi_segment_s* next; // must be the first segment field after abandoned_next -- see 
`segment.c:segment_init` struct mi_segment_s* prev; + bool was_reclaimed; // true if it was reclaimed (used to limit on-free reclamation) size_t abandoned; // abandoned pages (i.e. the original owning thread stopped) (`abandoned <= used`) size_t abandoned_visits; // count how often this segment is visited in the abandoned list (to force reclaim if it is too long) @@ -600,6 +601,7 @@ typedef struct mi_segments_tld_s { size_t peak_count; // peak number of segments size_t current_size; // current size of all segments size_t peak_size; // peak size of all segments + size_t reclaim_count;// number of reclaimed (abandoned) segments mi_stats_t* stats; // points to tld stats mi_os_tld_t* os; // points to os stats } mi_segments_tld_t; diff --git a/src/init.c b/src/init.c index 30211764..7ec6e01e 100644 --- a/src/init.c +++ b/src/init.c @@ -121,7 +121,7 @@ static mi_tld_t tld_main = { 0, false, &_mi_heap_main, &_mi_heap_main, { { NULL, NULL }, {NULL ,NULL}, {NULL ,NULL, 0}, - 0, 0, 0, 0, + 0, 0, 0, 0, 0, &tld_main.stats, &tld_main.os }, // segments { 0, &tld_main.stats }, // os diff --git a/src/segment.c b/src/segment.c index a50f0190..00c9a570 100644 --- a/src/segment.c +++ b/src/segment.c @@ -473,6 +473,11 @@ static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_se segment->thread_id = 0; _mi_segment_map_freed_at(segment); mi_segments_track_size(-((long)segment_size),tld); + if (segment->was_reclaimed) { + tld->reclaim_count--; + segment->was_reclaimed = false; + } + if (MI_SECURE != 0) { mi_assert_internal(!segment->memid.is_pinned); mi_segment_protect(segment, false, tld->os); // ensure no more guard pages are set @@ -488,7 +493,7 @@ static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_se } MI_UNUSED(fully_committed); mi_assert_internal((fully_committed && committed_size == segment_size) || (!fully_committed && committed_size < segment_size)); - + _mi_abandoned_await_readers(); // prevent ABA issue if concurrent readers try to access our memory (that might be purged) _mi_arena_free(segment, segment_size, committed_size, segment->memid, tld->stats); } @@ -781,6 +786,10 @@ static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) { _mi_stat_increase(&tld->stats->segments_abandoned, 1); mi_segments_track_size(-((long)segment->segment_size), tld); segment->abandoned_visits = 0; + if (segment->was_reclaimed) { + tld->reclaim_count--; + segment->was_reclaimed = false; + } _mi_arena_segment_mark_abandoned(segment); } @@ -849,6 +858,8 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0 || mi_atomic_load_relaxed(&segment->thread_id) == _mi_thread_id()); mi_atomic_store_release(&segment->thread_id, _mi_thread_id()); segment->abandoned_visits = 0; + segment->was_reclaimed = true; + tld->reclaim_count++; mi_segments_track_size((long)segment->segment_size, tld); mi_assert_internal(segment->next == NULL && segment->prev == NULL); mi_assert_expensive(mi_segment_is_valid(segment, tld)); @@ -904,8 +915,11 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, // attempt to reclaim a particular segment (called from multi threaded free `alloc.c:mi_free_block_mt`) bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment) { if (mi_atomic_load_relaxed(&segment->thread_id) != 0) return false; // it is not abandoned + // don't reclaim more from a free than half the current segments + // this is to prevent a pure 
free-ing thread to start owning too many segments + if (heap->tld->segments.reclaim_count * 2 > heap->tld->segments.count) return false; if (_mi_arena_segment_clear_abandoned(segment)) { // atomically unabandon - mi_segment_t* res = mi_segment_reclaim(segment, heap, 0, NULL, &heap->tld->segments); + mi_segment_t* res = mi_segment_reclaim(segment, heap, 0, NULL, &heap->tld->segments); mi_assert_internal(res == segment); return (res != NULL); } diff --git a/test/test-stress.c b/test/test-stress.c index 7e6e9645..14b3c3ae 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -37,11 +37,12 @@ static int ITER = 50; // N full iterations destructing and re-creating a // static int THREADS = 8; // more repeatable if THREADS <= #processors // static int SCALE = 100; // scaling factor -#define STRESS // undefine for leak test +#define STRESS // undefine for leak test static bool allow_large_objects = true; // allow very large objects? (set to `true` if SCALE>100) static size_t use_one_size = 0; // use single object size of `N * sizeof(uintptr_t)`? +static bool main_participates = false; // main thread participates as a worker too // #define USE_STD_MALLOC #ifdef USE_STD_MALLOC @@ -196,10 +197,13 @@ static void test_stress(void) { free_items(p); } } - // mi_collect(false); -#if !defined(NDEBUG) || defined(MI_TSAN) + #ifndef NDEBUG + //mi_collect(false); + //mi_debug_show_arenas(); + #endif + #if !defined(NDEBUG) || defined(MI_TSAN) if ((n + 1) % 10 == 0) { printf("- iterations left: %3d\n", ITER - (n + 1)); } -#endif + #endif } } @@ -292,14 +296,15 @@ static void run_os_threads(size_t nthreads, void (*fun)(intptr_t)) { thread_entry_fun = fun; DWORD* tids = (DWORD*)custom_calloc(nthreads,sizeof(DWORD)); HANDLE* thandles = (HANDLE*)custom_calloc(nthreads,sizeof(HANDLE)); - for (uintptr_t i = 1; i < nthreads; i++) { + const size_t start = (main_participates ? 1 : 0); + for (size_t i = start; i < nthreads; i++) { thandles[i] = CreateThread(0, 8*1024, &thread_entry, (void*)(i), 0, &tids[i]); } - fun(0); // run the main thread as well - for (size_t i = 1; i < nthreads; i++) { + if (main_participates) fun(0); // run the main thread as well + for (size_t i = start; i < nthreads; i++) { WaitForSingleObject(thandles[i], INFINITE); } - for (size_t i = 1; i < nthreads; i++) { + for (size_t i = start; i < nthreads; i++) { CloseHandle(thandles[i]); } custom_free(tids); @@ -326,12 +331,13 @@ static void run_os_threads(size_t nthreads, void (*fun)(intptr_t)) { thread_entry_fun = fun; pthread_t* threads = (pthread_t*)custom_calloc(nthreads,sizeof(pthread_t)); memset(threads, 0, sizeof(pthread_t) * nthreads); + const size_t start = (main_participates ? 
1 : 0); //pthread_setconcurrency(nthreads); - for (size_t i = 1; i < nthreads; i++) { + for (size_t i = start; i < nthreads; i++) { pthread_create(&threads[i], NULL, &thread_entry, (void*)i); } - fun(0); // run the main thread as well - for (size_t i = 1; i < nthreads; i++) { + if (main_participates) fun(0); // run the main thread as well + for (size_t i = start; i < nthreads; i++) { pthread_join(threads[i], NULL); } custom_free(threads); From da520480254b8af2ff9d7bf413d92c46c6db52dd Mon Sep 17 00:00:00 2001 From: Daan Date: Sun, 3 Mar 2024 18:08:17 -0800 Subject: [PATCH 051/119] rename local to avoid warning in static build --- src/segment.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/segment.c b/src/segment.c index 00c9a570..a6522028 100644 --- a/src/segment.c +++ b/src/segment.c @@ -938,8 +938,8 @@ static long mi_segment_get_reclaim_tries(void) { // limit the tries to 10% (default) of the abandoned segments with at least 8 tries, and at most 1024. const size_t perc = (size_t)mi_option_get_clamp(mi_option_max_segment_reclaim, 0, 100); if (perc <= 0) return 0; - const size_t abandoned_count = _mi_arena_segment_abandoned_count(); - const size_t relative_count = (abandoned_count > 10000 ? (abandoned_count / 100) * perc : (abandoned_count * perc) / 100); // avoid overflow + const size_t total_count = _mi_arena_segment_abandoned_count(); + const size_t relative_count = (total_count > 10000 ? (total_count / 100) * perc : (total_count * perc) / 100); // avoid overflow long max_tries = (long)(relative_count < 8 ? 8 : (relative_count > 1024 ? 1024 : relative_count)); return max_tries; } From 2e96bc9ee46ae0ebbb0640cdacfe57813b519d54 Mon Sep 17 00:00:00 2001 From: Joshua Root Date: Tue, 5 Mar 2024 01:12:06 +1100 Subject: [PATCH 052/119] Fix min macOS for pressure_relief This field exists in the 10.7 and later SDKs. --- src/prim/osx/alloc-override-zone.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/prim/osx/alloc-override-zone.c b/src/prim/osx/alloc-override-zone.c index 0e0a99d9..9a317750 100644 --- a/src/prim/osx/alloc-override-zone.c +++ b/src/prim/osx/alloc-override-zone.c @@ -225,7 +225,9 @@ static malloc_zone_t mi_malloc_zone = { // switch to version 9+ on OSX 10.6 to support memalign. .memalign = &zone_memalign, .free_definite_size = &zone_free_definite_size, + #if defined(MAC_OS_X_VERSION_10_7) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_7) .pressure_relief = &zone_pressure_relief, + #endif #if defined(MAC_OS_X_VERSION_10_14) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_14) .claimed_address = &zone_claimed_address, #endif From f508ae552888d988cc828e70634061867e3a477b Mon Sep 17 00:00:00 2001 From: Joshua Root Date: Tue, 5 Mar 2024 01:28:10 +1100 Subject: [PATCH 053/119] Only interpose strndup if it exists Added in the macOS 10.7 SDK. 
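Both this patch and the previous one rely on the same availability guard from <AvailabilityMacros.h>: MAC_OS_X_VERSION_10_7 is only defined by a 10.7-or-newer SDK, and MAC_OS_X_VERSION_MAX_ALLOWED reflects the SDK actually being built against, so the guarded entry simply drops out when compiling with an older SDK. Below is a minimal sketch of the pattern, assuming a macOS build; the my_zone and my_pressure_relief names are illustrative and are not part of mimalloc.

    #include <AvailabilityMacros.h>
    #include <malloc/malloc.h>

    // hypothetical callback with the pressure_relief signature from <malloc/malloc.h>
    static size_t my_pressure_relief(malloc_zone_t* zone, size_t goal) {
      (void)zone; (void)goal;
      return 0;   // report that nothing was released
    }

    static malloc_zone_t my_zone = {
      .version = 9,   // the zone code above uses version 9+ (for memalign support)
      // ... the other zone callbacks are elided in this sketch ...
      #if defined(MAC_OS_X_VERSION_10_7) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_7)
      .pressure_relief = &my_pressure_relief,   // only compiled when the SDK declares the field
      #endif
    };

The same test guards the MI_INTERPOSE_MI(strndup) entry in the diff below, since strndup itself only appears in the 10.7 and later SDKs.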
--- src/alloc-override.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/alloc-override.c b/src/alloc-override.c index 7cf0bf2c..b2c94ce2 100644 --- a/src/alloc-override.c +++ b/src/alloc-override.c @@ -77,7 +77,9 @@ typedef struct mi_nothrow_s { int _tag; } mi_nothrow_t; MI_INTERPOSE_MI(calloc), MI_INTERPOSE_MI(realloc), MI_INTERPOSE_MI(strdup), + #if defined(MAC_OS_X_VERSION_10_7) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_7 MI_INTERPOSE_MI(strndup), + #endif MI_INTERPOSE_MI(realpath), MI_INTERPOSE_MI(posix_memalign), MI_INTERPOSE_MI(reallocf), From db52999d8567c628154fcb002cf636bc1c1697d9 Mon Sep 17 00:00:00 2001 From: Daan Date: Fri, 8 Mar 2024 09:01:26 -0800 Subject: [PATCH 054/119] update used block comment (issue #861) --- include/mimalloc/types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 771059bf..049e68e7 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -295,7 +295,7 @@ typedef struct mi_page_s { uint8_t retire_expire:7; // expiration count for retired blocks mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) - uint32_t used; // number of blocks in use (including blocks in `local_free` and `thread_free`) + uint32_t used; // number of blocks in use (including blocks in `thread_free`) uint32_t xblock_size; // size available in each block (always `>0`) mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) From 068dc014ec5f2bc32133e986a273b69e0c8a936a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Teodor=20Sp=C3=A6ren?= Date: Sat, 9 Mar 2024 14:13:33 +0100 Subject: [PATCH 055/119] Avoid compilation error when passing in heap to allocators Before it would not work to create the mi_heap_stl_allocator types with passing in a "mi_heap_t*", since sizeof is used and it gives a compilation error. This change fixes that. 
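The underlying rule: mimalloc.h only forward-declares mi_heap_t, and constructing a std::shared_ptr from a raw pointer instantiates the default deleter, which evaluates sizeof on the pointee; with an incomplete type that is rejected at compile time. Including mimalloc/types.h makes the type complete at the point of construction. A reduced sketch of the rule follows, with illustrative names (opaque_heap and make_holder are not mimalloc API).

    #include <memory>

    struct opaque_heap;                              // forward declaration only

    // std::shared_ptr<opaque_heap> make_holder(opaque_heap* hp) {
    //   return std::shared_ptr<opaque_heap>(hp);    // ill-formed here: the default deleter
    // }                                             // evaluates sizeof(opaque_heap)

    struct opaque_heap { int dummy; };               // definition now visible, playing the role
                                                     // of including mimalloc/types.h

    std::shared_ptr<opaque_heap> make_holder(opaque_heap* hp) {
      return std::shared_ptr<opaque_heap>(hp);       // fine: the type is complete at this point
    }

The follow-up patch below builds on this by passing a no-op deleter to the shared_ptr, so a user-provided heap is never freed or destroyed by the allocator itself.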
--- include/mimalloc.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/mimalloc.h b/include/mimalloc.h index e6693899..dfef2c69 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -487,6 +487,7 @@ template bool operator!=(const mi_stl_allocator& , const #define MI_HAS_HEAP_STL_ALLOCATOR 1 #include // std::shared_ptr +#include "mimalloc/types.h" // Common base class for STL allocators in a specific heap template struct _mi_heap_stl_allocator_common : public _mi_stl_allocator_common { From 3d89f6388e4b416a1ae588ef3fa28cdc54f67bf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Teodor=20Sp=C3=A6ren?= Date: Sat, 9 Mar 2024 15:14:32 +0100 Subject: [PATCH 056/119] Fix std::shared_pointer calling free on provided heap pointers --- include/mimalloc.h | 2 +- test/test-api.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 1 deletion(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index dfef2c69..c125932f 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -495,7 +495,7 @@ template struct _mi_heap_stl_allocator_common : publi using typename _mi_stl_allocator_common::value_type; using typename _mi_stl_allocator_common::pointer; - _mi_heap_stl_allocator_common(mi_heap_t* hp) : heap(hp) { } /* will not delete nor destroy the passed in heap */ + _mi_heap_stl_allocator_common(mi_heap_t* hp) : heap(hp, [](mi_heap_t*) {}) {} /* will not delete nor destroy the passed in heap */ #if (__cplusplus >= 201703L) // C++17 mi_decl_nodiscard T* allocate(size_type count) { return static_cast(mi_heap_alloc_new_n(this->heap.get(), count, sizeof(T))); } diff --git a/test/test-api.c b/test/test-api.c index 8dd24e1b..edc506b3 100644 --- a/test/test-api.c +++ b/test/test-api.c @@ -46,6 +46,11 @@ bool test_heap2(void); bool test_stl_allocator1(void); bool test_stl_allocator2(void); +bool test_stl_heap_allocator1(void); +bool test_stl_heap_allocator2(void); +bool test_stl_heap_allocator3(void); +bool test_stl_heap_allocator4(void); + bool mem_is_zero(uint8_t* p, size_t size) { if (p==NULL) return false; for (size_t i = 0; i < size; ++i) { @@ -304,6 +309,11 @@ int main(void) { CHECK("stl_allocator1", test_stl_allocator1()); CHECK("stl_allocator2", test_stl_allocator2()); + CHECK("stl_heap_allocator1", test_stl_heap_allocator1()); + CHECK("stl_heap_allocator2", test_stl_heap_allocator2()); + CHECK("stl_heap_allocator3", test_stl_heap_allocator3()); + CHECK("stl_heap_allocator4", test_stl_heap_allocator4()); + // --------------------------------------------------- // Done // ---------------------------------------------------[] @@ -357,3 +367,61 @@ bool test_stl_allocator2(void) { return true; #endif } + +bool test_stl_heap_allocator1(void) { +#ifdef __cplusplus + std::vector > vec; + vec.push_back(some_struct()); + vec.pop_back(); + return vec.size() == 0; +#else + return true; +#endif +} + +bool test_stl_heap_allocator2(void) { +#ifdef __cplusplus + std::vector > vec; + vec.push_back(some_struct()); + vec.pop_back(); + return vec.size() == 0; +#else + return true; +#endif +} + +bool test_stl_heap_allocator3(void) { +#ifdef __cplusplus + mi_heap_t* heap = mi_heap_new(); + bool good = false; + { + mi_heap_stl_allocator myAlloc(heap); + std::vector > vec(myAlloc); + vec.push_back(some_struct()); + vec.pop_back(); + good = vec.size() == 0; + } + mi_heap_delete(heap); + return good; +#else + return true; +#endif +} + +bool test_stl_heap_allocator4(void) { +#ifdef __cplusplus + mi_heap_t* heap = mi_heap_new(); + bool good = false; + { + mi_heap_destroy_stl_allocator 
myAlloc(heap); + std::vector > vec(myAlloc); + vec.push_back(some_struct()); + vec.pop_back(); + good = vec.size() == 0; + } + mi_heap_destroy(heap); + return good; +#else + return true; +#endif +} From a42707908fdaa4ff5fd09788d74ad70311e9bf85 Mon Sep 17 00:00:00 2001 From: Daan Date: Sun, 17 Mar 2024 06:33:40 -0700 Subject: [PATCH 057/119] fix typo, issue #866 --- doc/mimalloc-doc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/mimalloc-doc.h b/doc/mimalloc-doc.h index 4c23a5fa..01b13904 100644 --- a/doc/mimalloc-doc.h +++ b/doc/mimalloc-doc.h @@ -168,7 +168,7 @@ void* mi_expand(void* p, size_t newsize); /// @returns A pointer to a block of \a count * \a size bytes, or \a NULL /// if out of memory or if \a count * \a size overflows. /// -/// If there is no overflow, it behaves exactly like `mi_malloc(p,count*size)`. +/// If there is no overflow, it behaves exactly like `mi_malloc(count*size)`. /// @see mi_calloc() /// @see mi_zallocn() void* mi_mallocn(size_t count, size_t size); From 18ebeb8a83386ade978b2d867fafbd3b39641826 Mon Sep 17 00:00:00 2001 From: Daan Date: Sun, 17 Mar 2024 08:33:09 -0700 Subject: [PATCH 058/119] fix (benign) race condition on the page flags has_aligned flag and refactor free-ing code (issue 865) --- src/alloc.c | 323 ++++++++++++++++++++++++++++------------------------ 1 file changed, 175 insertions(+), 148 deletions(-) diff --git a/src/alloc.c b/src/alloc.c index 8a76d3d3..2576206f 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -26,7 +26,9 @@ terms of the MIT license. A copy of the license can be found in the file // Fast allocation in a page: just pop from the free list. // Fall back to generic allocation only if the list is empty. -extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept { +// Note: in release mode the (inlined) routine is about 7 instructions with a single test. 
+extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept +{ mi_assert_internal(page->xblock_size==0||mi_page_block_size(page) >= size); mi_block_t* const block = page->free; if mi_unlikely(block == NULL) { @@ -61,43 +63,43 @@ extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t siz } } -#if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN + #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN if (!zero && !mi_page_is_huge(page)) { memset(block, MI_DEBUG_UNINIT, mi_page_usable_block_size(page)); } -#elif (MI_SECURE!=0) + #elif (MI_SECURE!=0) if (!zero) { block->next = 0; } // don't leak internal data -#endif + #endif -#if (MI_STAT>0) + #if (MI_STAT>0) const size_t bsize = mi_page_usable_block_size(page); if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { mi_heap_stat_increase(heap, normal, bsize); mi_heap_stat_counter_increase(heap, normal_count, 1); -#if (MI_STAT>1) + #if (MI_STAT>1) const size_t bin = _mi_bin(bsize); mi_heap_stat_increase(heap, normal_bins[bin], 1); -#endif + #endif } -#endif + #endif -#if MI_PADDING // && !MI_TRACK_ENABLED + #if MI_PADDING // && !MI_TRACK_ENABLED mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + mi_page_usable_block_size(page)); ptrdiff_t delta = ((uint8_t*)padding - (uint8_t*)block - (size - MI_PADDING_SIZE)); - #if (MI_DEBUG>=2) - mi_assert_internal(delta >= 0 && mi_page_usable_block_size(page) >= (size - MI_PADDING_SIZE + delta)); - #endif + #if (MI_DEBUG>=2) + mi_assert_internal(delta >= 0 && mi_page_usable_block_size(page) >= (size - MI_PADDING_SIZE + delta)); + #endif mi_track_mem_defined(padding,sizeof(mi_padding_t)); // note: re-enable since mi_page_usable_block_size may set noaccess padding->canary = (uint32_t)(mi_ptr_encode(page,block,page->keys)); padding->delta = (uint32_t)(delta); - #if MI_PADDING_CHECK - if (!mi_page_is_huge(page)) { - uint8_t* fill = (uint8_t*)padding - delta; - const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? MI_MAX_ALIGN_SIZE : delta); // set at most N initial padding bytes - for (size_t i = 0; i < maxpad; i++) { fill[i] = MI_DEBUG_PADDING; } - } + #if MI_PADDING_CHECK + if (!mi_page_is_huge(page)) { + uint8_t* fill = (uint8_t*)padding - delta; + const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? 
MI_MAX_ALIGN_SIZE : delta); // set at most N initial padding bytes + for (size_t i = 0; i < maxpad; i++) { fill[i] = MI_DEBUG_PADDING; } + } + #endif #endif -#endif return block; } @@ -112,9 +114,11 @@ static inline mi_decl_restrict void* mi_heap_malloc_small_zero(mi_heap_t* heap, #if (MI_PADDING) if (size == 0) { size = sizeof(void*); } #endif + mi_page_t* page = _mi_heap_get_free_small_page(heap, size + MI_PADDING_SIZE); void* const p = _mi_page_malloc(heap, page, size + MI_PADDING_SIZE, zero); mi_track_malloc(p,size,zero); + #if MI_STAT>1 if (p != NULL) { if (!mi_heap_is_initialized(heap)) { heap = mi_prim_get_default_heap(); } @@ -403,115 +407,31 @@ static void mi_stat_huge_free(const mi_page_t* page) { // Free // ------------------------------------------------------ -// multi-threaded free (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) -static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* block) +// forward declaration of multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) +static mi_decl_noinline void mi_free_block_mt(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); + +// regular free of a (thread local) block pointer +// fast path written carefully to prevent spilling on the stack +static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool check_full) { - // first see if the segment was abandoned and we can reclaim it - mi_segment_t* const segment = _mi_page_segment(page); - if (mi_option_is_enabled(mi_option_abandoned_reclaim_on_free) && - #if MI_HUGE_PAGE_ABANDON - segment->page_kind != MI_PAGE_HUGE && - #endif - mi_atomic_load_relaxed(&segment->thread_id) == 0) - { - // the segment is abandoned, try to reclaim it into our heap - if (_mi_segment_attempt_reclaim(mi_heap_get_default(), segment)) { - mi_assert_internal(_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); - mi_free(block); // recursively free as now it will be a local free in our heap - return; - } - } - - // The padding check may access the non-thread-owned page for the key values. - // that is safe as these are constant and the page won't be freed (as the block is not freed yet). + // owning thread can free a block directly + if mi_unlikely(mi_check_is_double_free(page, block)) return; mi_check_padding(page, block); - _mi_padding_shrink(page, block, sizeof(mi_block_t)); // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection - - if (segment->page_kind == MI_PAGE_HUGE) { - #if MI_HUGE_PAGE_ABANDON - // huge page segments are always abandoned and can be freed immediately - mi_stat_huge_free(page); - _mi_segment_huge_page_free(segment, page, block); - return; - #else - // huge pages are special as they occupy the entire segment - // as these are large we reset the memory occupied by the page so it is available to other threads - // (as the owning thread needs to actually free the memory later). - _mi_segment_huge_page_reset(segment, page, block); - #endif - } - - #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading - memset(block, MI_DEBUG_FREED, mi_usable_size(block)); + mi_stat_free(page, block); + #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN + memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); #endif - - // Try to put the block on either the page-local thread free list, or the heap delayed free list. 
- mi_thread_free_t tfreex; - bool use_delayed; - mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free); - do { - use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE); - if mi_unlikely(use_delayed) { - // unlikely: this only happens on the first concurrent free in a page that is in the full list - tfreex = mi_tf_set_delayed(tfree,MI_DELAYED_FREEING); - } - else { - // usual: directly add to page thread_free list - mi_block_set_next(page, block, mi_tf_block(tfree)); - tfreex = mi_tf_set_block(tfree,block); - } - } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); - - if mi_unlikely(use_delayed) { - // racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`) - mi_heap_t* const heap = (mi_heap_t*)(mi_atomic_load_acquire(&page->xheap)); //mi_page_heap(page); - mi_assert_internal(heap != NULL); - if (heap != NULL) { - // add to the delayed free list of this heap. (do this atomically as the lock only protects heap memory validity) - mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); - do { - mi_block_set_nextx(heap,block,dfree, heap->keys); - } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block)); - } - - // and reset the MI_DELAYED_FREEING flag - tfree = mi_atomic_load_relaxed(&page->xthread_free); - do { - tfreex = tfree; - mi_assert_internal(mi_tf_delayed(tfree) == MI_DELAYED_FREEING); - tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE); - } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); + mi_track_free_size(p, mi_page_usable_size_of(page,block)); // faster then mi_usable_size as we already know the page and that p is unaligned + mi_block_set_next(page, block, page->local_free); + page->local_free = block; + if mi_unlikely(--page->used == 0) { // using this expression generates better code than: page->used--; if (mi_page_all_free(page)) + _mi_page_retire(page); } + else if mi_unlikely(check_full && mi_page_is_in_full(page)) { + _mi_page_unfull(page); + } } -// regular free -static inline void _mi_free_block(mi_page_t* page, bool local, mi_block_t* block) -{ - // and push it on the free list - //const size_t bsize = mi_page_block_size(page); - if mi_likely(local) { - // owning thread can free a block directly - if mi_unlikely(mi_check_is_double_free(page, block)) return; - mi_check_padding(page, block); - #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN - memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); - #endif - mi_block_set_next(page, block, page->local_free); - page->local_free = block; - page->used--; - if mi_unlikely(mi_page_all_free(page)) { - _mi_page_retire(page); - } - else if mi_unlikely(mi_page_is_in_full(page)) { - _mi_page_unfull(page); - } - } - else { - _mi_free_block_mt(page,block); - } -} - - // Adjust a block that was allocated aligned, to the actual start of the block in the page. 
mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p) { mi_assert_internal(page!=NULL && p!=NULL); @@ -520,17 +440,27 @@ mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* p return (mi_block_t*)((uintptr_t)p - adjust); } - -void mi_decl_noinline _mi_free_generic(const mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept { +// free a local pointer +static void mi_decl_noinline mi_free_generic_local(const mi_segment_t* segment, mi_page_t* page, void* p) mi_attr_noexcept { mi_block_t* const block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(segment, page, p) : (mi_block_t*)p); - mi_stat_free(page, block); // stat_free may access the padding - mi_track_free_size(block, mi_page_usable_size_of(page,block)); - _mi_free_block(page, is_local, block); + mi_free_block_local(page, block, true); +} + +// free a pointer owned by another thread +static void mi_decl_noinline mi_free_generic_mt(const mi_segment_t* segment, mi_page_t* page, void* p) mi_attr_noexcept { + mi_block_t* const block = _mi_page_ptr_unalign(segment, page, p); // don't check `has_aligned` flag to avoid a race (issue #865) + mi_free_block_mt(segment, page, block); +} + +// generic free (for runtime integration) +void mi_decl_noinline _mi_free_generic(const mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept { + if (is_local) mi_free_generic_local(segment,page,p); + else mi_free_generic_mt(segment,page,p); } // Get the segment data belonging to a pointer -// This is just a single `and` in assembly but does further checks in debug mode -// (and secure mode) if this was a valid pointer. +// This is just a single `and` in release mode but does further checks in debug mode +// (and secure mode) to see if this was a valid pointer. static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* msg) { MI_UNUSED(msg); @@ -566,7 +496,7 @@ static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* ms } // Free a block -// fast path written carefully to prevent spilling on the stack +// Fast path written carefully to prevent register spilling on the stack void mi_free(void* p) mi_attr_noexcept { if mi_unlikely(p == NULL) return; @@ -574,31 +504,20 @@ void mi_free(void* p) mi_attr_noexcept const bool is_local= (_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); mi_page_t* const page = _mi_segment_page_of(segment, p); - if mi_likely(is_local) { // thread-local free? - if mi_likely(page->flags.full_aligned == 0) // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) - { + if mi_likely(is_local) { // thread-local free? 
+ if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) + // thread-local, aligned, and not a full page mi_block_t* const block = (mi_block_t*)p; - if mi_unlikely(mi_check_is_double_free(page, block)) return; - mi_check_padding(page, block); - mi_stat_free(page, block); - #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN - memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); - #endif - mi_track_free_size(p, mi_page_usable_size_of(page,block)); // faster then mi_usable_size as we already know the page and that p is unaligned - mi_block_set_next(page, block, page->local_free); - page->local_free = block; - if mi_unlikely(--page->used == 0) { // using this expression generates better code than: page->used--; if (mi_page_all_free(page)) - _mi_page_retire(page); - } + mi_free_block_local(page,block,false /* no need to check if the page is full */); } else { // page is full or contains (inner) aligned blocks; use generic path - _mi_free_generic(segment, page, true, p); + mi_free_generic_local(segment, page, p); } } else { // not thread-local; use generic path - _mi_free_generic(segment, page, false, p); + mi_free_generic_mt(segment, page, p); } } @@ -623,10 +542,118 @@ bool _mi_free_delayed_block(mi_block_t* block) { _mi_page_free_collect(page, false); // and free the block (possibly freeing the page as well since used is updated) - _mi_free_block(page, true, block); + mi_free_block_local(page, block, true); return true; } +// ------------------------------------------------------ +// Multi-threaded Free (`_mt`) +// ------------------------------------------------------ + +// Push a block that is owned by another thread on its page-local thread free +// list or it's heap delayed free list. Such blocks are later collected by +// the owning thread in `_mi_free_delayed_block`. +static void mi_decl_noinline mi_free_block_delayed_mt( mi_page_t* page, mi_block_t* block ) +{ + // Try to put the block on either the page-local thread free list, + // or the heap delayed free list (if this is the first non-local free in that page) + mi_thread_free_t tfreex; + bool use_delayed; + mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free); + do { + use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE); + if mi_unlikely(use_delayed) { + // unlikely: this only happens on the first concurrent free in a page that is in the full list + tfreex = mi_tf_set_delayed(tfree,MI_DELAYED_FREEING); + } + else { + // usual: directly add to page thread_free list + mi_block_set_next(page, block, mi_tf_block(tfree)); + tfreex = mi_tf_set_block(tfree,block); + } + } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); + + // If this was the first non-local free, we need to push it on the heap delayed free list instead + if mi_unlikely(use_delayed) { + // racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`) + mi_heap_t* const heap = (mi_heap_t*)(mi_atomic_load_acquire(&page->xheap)); //mi_page_heap(page); + mi_assert_internal(heap != NULL); + if (heap != NULL) { + // add to the delayed free list of this heap. 
(do this atomically as the lock only protects heap memory validity) + mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); + do { + mi_block_set_nextx(heap,block,dfree, heap->keys); + } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block)); + } + + // and reset the MI_DELAYED_FREEING flag + tfree = mi_atomic_load_relaxed(&page->xthread_free); + do { + tfreex = tfree; + mi_assert_internal(mi_tf_delayed(tfree) == MI_DELAYED_FREEING); + tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE); + } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); + } +} + +// Multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) +static mi_decl_noinline void mi_free_block_mt(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) +{ + // first see if the segment was abandoned and if we can reclaim it into our thread + if (mi_option_is_enabled(mi_option_abandoned_reclaim_on_free) && + #if MI_HUGE_PAGE_ABANDON + segment->page_kind != MI_PAGE_HUGE && + #endif + mi_atomic_load_relaxed(&segment->thread_id) == 0) + { + // the segment is abandoned, try to reclaim it into our heap + if (_mi_segment_attempt_reclaim(mi_heap_get_default(), segment)) { + mi_assert_internal(_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); + mi_free(block); // recursively free as now it will be a local free in our heap + return; + } + } + + // The padding check may access the non-thread-owned page for the key values. + // that is safe as these are constant and the page won't be freed (as the block is not freed yet). + mi_check_padding(page, block); + + // adjust stats (after padding check and potential recursive `mi_free` above) + mi_stat_free(page, block); // stat_free may access the padding + mi_track_free_size(block, mi_page_usable_size_of(page,block)); + + // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection + _mi_padding_shrink(page, block, sizeof(mi_block_t)); + + if (segment->page_kind == MI_PAGE_HUGE) { + #if MI_HUGE_PAGE_ABANDON + // huge page segments are always abandoned and can be freed immediately + mi_stat_huge_free(page); + _mi_segment_huge_page_free(segment, page, block); + return; + #else + // huge pages are special as they occupy the entire segment + // as these are large we reset the memory occupied by the page so it is available to other threads + // (as the owning thread needs to actually free the memory later). 
+ _mi_segment_huge_page_reset(segment, page, block); + #endif + } + else { + #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading + memset(block, MI_DEBUG_FREED, mi_usable_size(block)); + #endif + } + + // and finally free the actual block by pushing it on the owning heap + // thread_delayed free list (or heap delayed free list) + mi_free_block_delayed_mt(page,block); +} + + +// ------------------------------------------------------ +// Usable size +// ------------------------------------------------------ + // Bytes available in a block mi_decl_noinline static size_t mi_page_usable_aligned_size_of(const mi_segment_t* segment, const mi_page_t* page, const void* p) mi_attr_noexcept { const mi_block_t* block = _mi_page_ptr_unalign(segment, page, p); From 355f44f373e765f4eb2bbfc91ade0735525e3a31 Mon Sep 17 00:00:00 2001 From: Daan Date: Sun, 17 Mar 2024 08:44:11 -0700 Subject: [PATCH 059/119] split free routines in a separate file --- src/alloc.c | 511 +-------------------------------------------------- src/free.c | 519 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 520 insertions(+), 510 deletions(-) create mode 100644 src/free.c diff --git a/src/alloc.c b/src/alloc.c index 2576206f..76d68d13 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -18,6 +18,7 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_IN_ALLOC_C #include "alloc-override.c" +#include "free.c" #undef MI_IN_ALLOC_C // ------------------------------------------------------ @@ -194,516 +195,6 @@ mi_decl_nodiscard mi_decl_restrict void* mi_zalloc(size_t size) mi_attr_noexcept } -// ------------------------------------------------------ -// Check for double free in secure and debug mode -// This is somewhat expensive so only enabled for secure mode 4 -// ------------------------------------------------------ - -#if (MI_ENCODE_FREELIST && (MI_SECURE>=4 || MI_DEBUG!=0)) -// linear check if the free list contains a specific element -static bool mi_list_contains(const mi_page_t* page, const mi_block_t* list, const mi_block_t* elem) { - while (list != NULL) { - if (elem==list) return true; - list = mi_block_next(page, list); - } - return false; -} - -static mi_decl_noinline bool mi_check_is_double_freex(const mi_page_t* page, const mi_block_t* block) { - // The decoded value is in the same page (or NULL). - // Walk the free lists to verify positively if it is already freed - if (mi_list_contains(page, page->free, block) || - mi_list_contains(page, page->local_free, block) || - mi_list_contains(page, mi_page_thread_free(page), block)) - { - _mi_error_message(EAGAIN, "double free detected of block %p with size %zu\n", block, mi_page_block_size(page)); - return true; - } - return false; -} - -#define mi_track_page(page,access) { size_t psize; void* pstart = _mi_page_start(_mi_page_segment(page),page,&psize); mi_track_mem_##access( pstart, psize); } - -static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) { - bool is_double_free = false; - mi_block_t* n = mi_block_nextx(page, block, page->keys); // pretend it is freed, and get the decoded first field - if (((uintptr_t)n & (MI_INTPTR_SIZE-1))==0 && // quick check: aligned pointer? - (n==NULL || mi_is_in_same_page(block, n))) // quick check: in same page or NULL? - { - // Suspicous: decoded value a in block is in the same page (or NULL) -- maybe a double free? 
- // (continue in separate function to improve code generation) - is_double_free = mi_check_is_double_freex(page, block); - } - return is_double_free; -} -#else -static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) { - MI_UNUSED(page); - MI_UNUSED(block); - return false; -} -#endif - -// --------------------------------------------------------------------------- -// Check for heap block overflow by setting up padding at the end of the block -// --------------------------------------------------------------------------- - -#if MI_PADDING // && !MI_TRACK_ENABLED -static bool mi_page_decode_padding(const mi_page_t* page, const mi_block_t* block, size_t* delta, size_t* bsize) { - *bsize = mi_page_usable_block_size(page); - const mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + *bsize); - mi_track_mem_defined(padding,sizeof(mi_padding_t)); - *delta = padding->delta; - uint32_t canary = padding->canary; - uintptr_t keys[2]; - keys[0] = page->keys[0]; - keys[1] = page->keys[1]; - bool ok = ((uint32_t)mi_ptr_encode(page,block,keys) == canary && *delta <= *bsize); - mi_track_mem_noaccess(padding,sizeof(mi_padding_t)); - return ok; -} - -// Return the exact usable size of a block. -static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) { - size_t bsize; - size_t delta; - bool ok = mi_page_decode_padding(page, block, &delta, &bsize); - mi_assert_internal(ok); mi_assert_internal(delta <= bsize); - return (ok ? bsize - delta : 0); -} - -// When a non-thread-local block is freed, it becomes part of the thread delayed free -// list that is freed later by the owning heap. If the exact usable size is too small to -// contain the pointer for the delayed list, then shrink the padding (by decreasing delta) -// so it will later not trigger an overflow error in `mi_free_block`. -void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) { - size_t bsize; - size_t delta; - bool ok = mi_page_decode_padding(page, block, &delta, &bsize); - mi_assert_internal(ok); - if (!ok || (bsize - delta) >= min_size) return; // usually already enough space - mi_assert_internal(bsize >= min_size); - if (bsize < min_size) return; // should never happen - size_t new_delta = (bsize - min_size); - mi_assert_internal(new_delta < bsize); - mi_padding_t* padding = (mi_padding_t*)((uint8_t*)block + bsize); - mi_track_mem_defined(padding,sizeof(mi_padding_t)); - padding->delta = (uint32_t)new_delta; - mi_track_mem_noaccess(padding,sizeof(mi_padding_t)); -} -#else -static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) { - MI_UNUSED(block); - return mi_page_usable_block_size(page); -} - -void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) { - MI_UNUSED(page); - MI_UNUSED(block); - MI_UNUSED(min_size); -} -#endif - -#if MI_PADDING && MI_PADDING_CHECK - -static bool mi_verify_padding(const mi_page_t* page, const mi_block_t* block, size_t* size, size_t* wrong) { - size_t bsize; - size_t delta; - bool ok = mi_page_decode_padding(page, block, &delta, &bsize); - *size = *wrong = bsize; - if (!ok) return false; - mi_assert_internal(bsize >= delta); - *size = bsize - delta; - if (!mi_page_is_huge(page)) { - uint8_t* fill = (uint8_t*)block + bsize - delta; - const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? 
MI_MAX_ALIGN_SIZE : delta); // check at most the first N padding bytes - mi_track_mem_defined(fill, maxpad); - for (size_t i = 0; i < maxpad; i++) { - if (fill[i] != MI_DEBUG_PADDING) { - *wrong = bsize - delta + i; - ok = false; - break; - } - } - mi_track_mem_noaccess(fill, maxpad); - } - return ok; -} - -static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { - size_t size; - size_t wrong; - if (!mi_verify_padding(page,block,&size,&wrong)) { - _mi_error_message(EFAULT, "buffer overflow in heap block %p of size %zu: write after %zu bytes\n", block, size, wrong ); - } -} - -#else - -static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { - MI_UNUSED(page); - MI_UNUSED(block); -} - -#endif - -// only maintain stats for smaller objects if requested -#if (MI_STAT>0) -static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { -#if (MI_STAT < 2) - MI_UNUSED(block); -#endif - mi_heap_t* const heap = mi_heap_get_default(); - const size_t bsize = mi_page_usable_block_size(page); -#if (MI_STAT>1) - const size_t usize = mi_page_usable_size_of(page, block); - mi_heap_stat_decrease(heap, malloc, usize); -#endif - if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { - mi_heap_stat_decrease(heap, normal, bsize); -#if (MI_STAT > 1) - mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], 1); -#endif - } -#if !MI_HUGE_PAGE_ABANDON - else { - const size_t bpsize = mi_page_block_size(page); - if (bpsize <= MI_HUGE_OBJ_SIZE_MAX) { - mi_heap_stat_decrease(heap, huge, bpsize); - } - else { - mi_heap_stat_decrease(heap, giant, bpsize); - } - } -#endif -} -#else -static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { - MI_UNUSED(page); MI_UNUSED(block); -} -#endif - -#if MI_HUGE_PAGE_ABANDON -#if (MI_STAT>0) -// maintain stats for huge objects -static void mi_stat_huge_free(const mi_page_t* page) { - mi_heap_t* const heap = mi_heap_get_default(); - const size_t bsize = mi_page_block_size(page); // to match stats in `page.c:mi_page_huge_alloc` - if (bsize <= MI_HUGE_OBJ_SIZE_MAX) { - mi_heap_stat_decrease(heap, huge, bsize); - } - else { - mi_heap_stat_decrease(heap, giant, bsize); - } -} -#else -static void mi_stat_huge_free(const mi_page_t* page) { - MI_UNUSED(page); -} -#endif -#endif - -// ------------------------------------------------------ -// Free -// ------------------------------------------------------ - -// forward declaration of multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) -static mi_decl_noinline void mi_free_block_mt(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); - -// regular free of a (thread local) block pointer -// fast path written carefully to prevent spilling on the stack -static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool check_full) -{ - // owning thread can free a block directly - if mi_unlikely(mi_check_is_double_free(page, block)) return; - mi_check_padding(page, block); - mi_stat_free(page, block); - #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN - memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); - #endif - mi_track_free_size(p, mi_page_usable_size_of(page,block)); // faster then mi_usable_size as we already know the page and that p is unaligned - mi_block_set_next(page, block, page->local_free); - page->local_free = block; - if mi_unlikely(--page->used == 0) { // using this expression generates better code than: page->used--; if (mi_page_all_free(page)) - _mi_page_retire(page); - } - else if mi_unlikely(check_full && 
mi_page_is_in_full(page)) { - _mi_page_unfull(page); - } -} - -// Adjust a block that was allocated aligned, to the actual start of the block in the page. -mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p) { - mi_assert_internal(page!=NULL && p!=NULL); - const size_t diff = (uint8_t*)p - _mi_page_start(segment, page, NULL); - const size_t adjust = (diff % mi_page_block_size(page)); - return (mi_block_t*)((uintptr_t)p - adjust); -} - -// free a local pointer -static void mi_decl_noinline mi_free_generic_local(const mi_segment_t* segment, mi_page_t* page, void* p) mi_attr_noexcept { - mi_block_t* const block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(segment, page, p) : (mi_block_t*)p); - mi_free_block_local(page, block, true); -} - -// free a pointer owned by another thread -static void mi_decl_noinline mi_free_generic_mt(const mi_segment_t* segment, mi_page_t* page, void* p) mi_attr_noexcept { - mi_block_t* const block = _mi_page_ptr_unalign(segment, page, p); // don't check `has_aligned` flag to avoid a race (issue #865) - mi_free_block_mt(segment, page, block); -} - -// generic free (for runtime integration) -void mi_decl_noinline _mi_free_generic(const mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept { - if (is_local) mi_free_generic_local(segment,page,p); - else mi_free_generic_mt(segment,page,p); -} - -// Get the segment data belonging to a pointer -// This is just a single `and` in release mode but does further checks in debug mode -// (and secure mode) to see if this was a valid pointer. -static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* msg) -{ - MI_UNUSED(msg); - mi_assert(p != NULL); - -#if (MI_DEBUG>0) - if mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0) { - _mi_error_message(EINVAL, "%s: invalid (unaligned) pointer: %p\n", msg, p); - return NULL; - } -#endif - - mi_segment_t* const segment = _mi_ptr_segment(p); - mi_assert_internal(segment != NULL); - -#if (MI_DEBUG>0) - if mi_unlikely(!mi_is_in_heap_region(p)) { - _mi_warning_message("%s: pointer might not point to a valid heap region: %p\n" - "(this may still be a valid very large allocation (over 64MiB))\n", msg, p); - if mi_likely(_mi_ptr_cookie(segment) == segment->cookie) { - _mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p); - } - } -#endif -#if (MI_DEBUG>0 || MI_SECURE>=4) - if mi_unlikely(_mi_ptr_cookie(segment) != segment->cookie) { - _mi_error_message(EINVAL, "%s: pointer does not point to a valid heap space: %p\n", msg, p); - return NULL; - } -#endif - - return segment; -} - -// Free a block -// Fast path written carefully to prevent register spilling on the stack -void mi_free(void* p) mi_attr_noexcept -{ - if mi_unlikely(p == NULL) return; - mi_segment_t* const segment = mi_checked_ptr_segment(p,"mi_free"); - const bool is_local= (_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); - mi_page_t* const page = _mi_segment_page_of(segment, p); - - if mi_likely(is_local) { // thread-local free? 
- if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) - // thread-local, aligned, and not a full page - mi_block_t* const block = (mi_block_t*)p; - mi_free_block_local(page,block,false /* no need to check if the page is full */); - } - else { - // page is full or contains (inner) aligned blocks; use generic path - mi_free_generic_local(segment, page, p); - } - } - else { - // not thread-local; use generic path - mi_free_generic_mt(segment, page, p); - } -} - -// return true if successful -bool _mi_free_delayed_block(mi_block_t* block) { - // get segment and page - const mi_segment_t* const segment = _mi_ptr_segment(block); - mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie); - mi_assert_internal(_mi_thread_id() == segment->thread_id); - mi_page_t* const page = _mi_segment_page_of(segment, block); - - // Clear the no-delayed flag so delayed freeing is used again for this page. - // This must be done before collecting the free lists on this page -- otherwise - // some blocks may end up in the page `thread_free` list with no blocks in the - // heap `thread_delayed_free` list which may cause the page to be never freed! - // (it would only be freed if we happen to scan it in `mi_page_queue_find_free_ex`) - if (!_mi_page_try_use_delayed_free(page, MI_USE_DELAYED_FREE, false /* dont overwrite never delayed */)) { - return false; - } - - // collect all other non-local frees to ensure up-to-date `used` count - _mi_page_free_collect(page, false); - - // and free the block (possibly freeing the page as well since used is updated) - mi_free_block_local(page, block, true); - return true; -} - -// ------------------------------------------------------ -// Multi-threaded Free (`_mt`) -// ------------------------------------------------------ - -// Push a block that is owned by another thread on its page-local thread free -// list or it's heap delayed free list. Such blocks are later collected by -// the owning thread in `_mi_free_delayed_block`. -static void mi_decl_noinline mi_free_block_delayed_mt( mi_page_t* page, mi_block_t* block ) -{ - // Try to put the block on either the page-local thread free list, - // or the heap delayed free list (if this is the first non-local free in that page) - mi_thread_free_t tfreex; - bool use_delayed; - mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free); - do { - use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE); - if mi_unlikely(use_delayed) { - // unlikely: this only happens on the first concurrent free in a page that is in the full list - tfreex = mi_tf_set_delayed(tfree,MI_DELAYED_FREEING); - } - else { - // usual: directly add to page thread_free list - mi_block_set_next(page, block, mi_tf_block(tfree)); - tfreex = mi_tf_set_block(tfree,block); - } - } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); - - // If this was the first non-local free, we need to push it on the heap delayed free list instead - if mi_unlikely(use_delayed) { - // racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`) - mi_heap_t* const heap = (mi_heap_t*)(mi_atomic_load_acquire(&page->xheap)); //mi_page_heap(page); - mi_assert_internal(heap != NULL); - if (heap != NULL) { - // add to the delayed free list of this heap. 
(do this atomically as the lock only protects heap memory validity) - mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); - do { - mi_block_set_nextx(heap,block,dfree, heap->keys); - } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block)); - } - - // and reset the MI_DELAYED_FREEING flag - tfree = mi_atomic_load_relaxed(&page->xthread_free); - do { - tfreex = tfree; - mi_assert_internal(mi_tf_delayed(tfree) == MI_DELAYED_FREEING); - tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE); - } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); - } -} - -// Multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) -static mi_decl_noinline void mi_free_block_mt(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) -{ - // first see if the segment was abandoned and if we can reclaim it into our thread - if (mi_option_is_enabled(mi_option_abandoned_reclaim_on_free) && - #if MI_HUGE_PAGE_ABANDON - segment->page_kind != MI_PAGE_HUGE && - #endif - mi_atomic_load_relaxed(&segment->thread_id) == 0) - { - // the segment is abandoned, try to reclaim it into our heap - if (_mi_segment_attempt_reclaim(mi_heap_get_default(), segment)) { - mi_assert_internal(_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); - mi_free(block); // recursively free as now it will be a local free in our heap - return; - } - } - - // The padding check may access the non-thread-owned page for the key values. - // that is safe as these are constant and the page won't be freed (as the block is not freed yet). - mi_check_padding(page, block); - - // adjust stats (after padding check and potential recursive `mi_free` above) - mi_stat_free(page, block); // stat_free may access the padding - mi_track_free_size(block, mi_page_usable_size_of(page,block)); - - // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection - _mi_padding_shrink(page, block, sizeof(mi_block_t)); - - if (segment->page_kind == MI_PAGE_HUGE) { - #if MI_HUGE_PAGE_ABANDON - // huge page segments are always abandoned and can be freed immediately - mi_stat_huge_free(page); - _mi_segment_huge_page_free(segment, page, block); - return; - #else - // huge pages are special as they occupy the entire segment - // as these are large we reset the memory occupied by the page so it is available to other threads - // (as the owning thread needs to actually free the memory later). 
- _mi_segment_huge_page_reset(segment, page, block); - #endif - } - else { - #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading - memset(block, MI_DEBUG_FREED, mi_usable_size(block)); - #endif - } - - // and finally free the actual block by pushing it on the owning heap - // thread_delayed free list (or heap delayed free list) - mi_free_block_delayed_mt(page,block); -} - - -// ------------------------------------------------------ -// Usable size -// ------------------------------------------------------ - -// Bytes available in a block -mi_decl_noinline static size_t mi_page_usable_aligned_size_of(const mi_segment_t* segment, const mi_page_t* page, const void* p) mi_attr_noexcept { - const mi_block_t* block = _mi_page_ptr_unalign(segment, page, p); - const size_t size = mi_page_usable_size_of(page, block); - const ptrdiff_t adjust = (uint8_t*)p - (uint8_t*)block; - mi_assert_internal(adjust >= 0 && (size_t)adjust <= size); - return (size - adjust); -} - -static inline size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noexcept { - if (p == NULL) return 0; - const mi_segment_t* const segment = mi_checked_ptr_segment(p, msg); - const mi_page_t* const page = _mi_segment_page_of(segment, p); - if mi_likely(!mi_page_has_aligned(page)) { - const mi_block_t* block = (const mi_block_t*)p; - return mi_page_usable_size_of(page, block); - } - else { - // split out to separate routine for improved code generation - return mi_page_usable_aligned_size_of(segment, page, p); - } -} - -mi_decl_nodiscard size_t mi_usable_size(const void* p) mi_attr_noexcept { - return _mi_usable_size(p, "mi_usable_size"); -} - - -// ------------------------------------------------------ -// Allocation extensions -// ------------------------------------------------------ - -void mi_free_size(void* p, size_t size) mi_attr_noexcept { - MI_UNUSED_RELEASE(size); - mi_assert(p == NULL || size <= _mi_usable_size(p,"mi_free_size")); - mi_free(p); -} - -void mi_free_size_aligned(void* p, size_t size, size_t alignment) mi_attr_noexcept { - MI_UNUSED_RELEASE(alignment); - mi_assert(((uintptr_t)p % alignment) == 0); - mi_free_size(p,size); -} - -void mi_free_aligned(void* p, size_t alignment) mi_attr_noexcept { - MI_UNUSED_RELEASE(alignment); - mi_assert(((uintptr_t)p % alignment) == 0); - mi_free(p); -} - mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept { size_t total; if (mi_count_size_overflow(count,size,&total)) return NULL; diff --git a/src/free.c b/src/free.c new file mode 100644 index 00000000..7761cb6a --- /dev/null +++ b/src/free.c @@ -0,0 +1,519 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018-2024, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. 
+-----------------------------------------------------------------------------*/ +#if !defined(MI_IN_ALLOC_C) +#error "this file should be included from 'alloc.c' (so aliases can work from alloc-override)" +#endif + +// ------------------------------------------------------ +// Check for double free in secure and debug mode +// This is somewhat expensive so only enabled for secure mode 4 +// ------------------------------------------------------ + +#if (MI_ENCODE_FREELIST && (MI_SECURE>=4 || MI_DEBUG!=0)) +// linear check if the free list contains a specific element +static bool mi_list_contains(const mi_page_t* page, const mi_block_t* list, const mi_block_t* elem) { + while (list != NULL) { + if (elem==list) return true; + list = mi_block_next(page, list); + } + return false; +} + +static mi_decl_noinline bool mi_check_is_double_freex(const mi_page_t* page, const mi_block_t* block) { + // The decoded value is in the same page (or NULL). + // Walk the free lists to verify positively if it is already freed + if (mi_list_contains(page, page->free, block) || + mi_list_contains(page, page->local_free, block) || + mi_list_contains(page, mi_page_thread_free(page), block)) + { + _mi_error_message(EAGAIN, "double free detected of block %p with size %zu\n", block, mi_page_block_size(page)); + return true; + } + return false; +} + +#define mi_track_page(page,access) { size_t psize; void* pstart = _mi_page_start(_mi_page_segment(page),page,&psize); mi_track_mem_##access( pstart, psize); } + +static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) { + bool is_double_free = false; + mi_block_t* n = mi_block_nextx(page, block, page->keys); // pretend it is freed, and get the decoded first field + if (((uintptr_t)n & (MI_INTPTR_SIZE-1))==0 && // quick check: aligned pointer? + (n==NULL || mi_is_in_same_page(block, n))) // quick check: in same page or NULL? + { + // Suspicous: decoded value a in block is in the same page (or NULL) -- maybe a double free? + // (continue in separate function to improve code generation) + is_double_free = mi_check_is_double_freex(page, block); + } + return is_double_free; +} +#else +static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) { + MI_UNUSED(page); + MI_UNUSED(block); + return false; +} +#endif + +// --------------------------------------------------------------------------- +// Check for heap block overflow by setting up padding at the end of the block +// --------------------------------------------------------------------------- + +#if MI_PADDING // && !MI_TRACK_ENABLED +static bool mi_page_decode_padding(const mi_page_t* page, const mi_block_t* block, size_t* delta, size_t* bsize) { + *bsize = mi_page_usable_block_size(page); + const mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + *bsize); + mi_track_mem_defined(padding,sizeof(mi_padding_t)); + *delta = padding->delta; + uint32_t canary = padding->canary; + uintptr_t keys[2]; + keys[0] = page->keys[0]; + keys[1] = page->keys[1]; + bool ok = ((uint32_t)mi_ptr_encode(page,block,keys) == canary && *delta <= *bsize); + mi_track_mem_noaccess(padding,sizeof(mi_padding_t)); + return ok; +} + +// Return the exact usable size of a block. +static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) { + size_t bsize; + size_t delta; + bool ok = mi_page_decode_padding(page, block, &delta, &bsize); + mi_assert_internal(ok); mi_assert_internal(delta <= bsize); + return (ok ? 
bsize - delta : 0); +} + +// When a non-thread-local block is freed, it becomes part of the thread delayed free +// list that is freed later by the owning heap. If the exact usable size is too small to +// contain the pointer for the delayed list, then shrink the padding (by decreasing delta) +// so it will later not trigger an overflow error in `mi_free_block`. +void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) { + size_t bsize; + size_t delta; + bool ok = mi_page_decode_padding(page, block, &delta, &bsize); + mi_assert_internal(ok); + if (!ok || (bsize - delta) >= min_size) return; // usually already enough space + mi_assert_internal(bsize >= min_size); + if (bsize < min_size) return; // should never happen + size_t new_delta = (bsize - min_size); + mi_assert_internal(new_delta < bsize); + mi_padding_t* padding = (mi_padding_t*)((uint8_t*)block + bsize); + mi_track_mem_defined(padding,sizeof(mi_padding_t)); + padding->delta = (uint32_t)new_delta; + mi_track_mem_noaccess(padding,sizeof(mi_padding_t)); +} +#else +static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) { + MI_UNUSED(block); + return mi_page_usable_block_size(page); +} + +void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) { + MI_UNUSED(page); + MI_UNUSED(block); + MI_UNUSED(min_size); +} +#endif + +#if MI_PADDING && MI_PADDING_CHECK + +static bool mi_verify_padding(const mi_page_t* page, const mi_block_t* block, size_t* size, size_t* wrong) { + size_t bsize; + size_t delta; + bool ok = mi_page_decode_padding(page, block, &delta, &bsize); + *size = *wrong = bsize; + if (!ok) return false; + mi_assert_internal(bsize >= delta); + *size = bsize - delta; + if (!mi_page_is_huge(page)) { + uint8_t* fill = (uint8_t*)block + bsize - delta; + const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? 
MI_MAX_ALIGN_SIZE : delta); // check at most the first N padding bytes + mi_track_mem_defined(fill, maxpad); + for (size_t i = 0; i < maxpad; i++) { + if (fill[i] != MI_DEBUG_PADDING) { + *wrong = bsize - delta + i; + ok = false; + break; + } + } + mi_track_mem_noaccess(fill, maxpad); + } + return ok; +} + +static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { + size_t size; + size_t wrong; + if (!mi_verify_padding(page,block,&size,&wrong)) { + _mi_error_message(EFAULT, "buffer overflow in heap block %p of size %zu: write after %zu bytes\n", block, size, wrong ); + } +} + +#else + +static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { + MI_UNUSED(page); + MI_UNUSED(block); +} + +#endif + +// only maintain stats for smaller objects if requested +#if (MI_STAT>0) +static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { +#if (MI_STAT < 2) + MI_UNUSED(block); +#endif + mi_heap_t* const heap = mi_heap_get_default(); + const size_t bsize = mi_page_usable_block_size(page); +#if (MI_STAT>1) + const size_t usize = mi_page_usable_size_of(page, block); + mi_heap_stat_decrease(heap, malloc, usize); +#endif + if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { + mi_heap_stat_decrease(heap, normal, bsize); +#if (MI_STAT > 1) + mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], 1); +#endif + } +#if !MI_HUGE_PAGE_ABANDON + else { + const size_t bpsize = mi_page_block_size(page); + if (bpsize <= MI_HUGE_OBJ_SIZE_MAX) { + mi_heap_stat_decrease(heap, huge, bpsize); + } + else { + mi_heap_stat_decrease(heap, giant, bpsize); + } + } +#endif +} +#else +static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { + MI_UNUSED(page); MI_UNUSED(block); +} +#endif + +#if MI_HUGE_PAGE_ABANDON +#if (MI_STAT>0) +// maintain stats for huge objects +static void mi_stat_huge_free(const mi_page_t* page) { + mi_heap_t* const heap = mi_heap_get_default(); + const size_t bsize = mi_page_block_size(page); // to match stats in `page.c:mi_page_huge_alloc` + if (bsize <= MI_HUGE_OBJ_SIZE_MAX) { + mi_heap_stat_decrease(heap, huge, bsize); + } + else { + mi_heap_stat_decrease(heap, giant, bsize); + } +} +#else +static void mi_stat_huge_free(const mi_page_t* page) { + MI_UNUSED(page); +} +#endif +#endif + +// ------------------------------------------------------ +// Free +// ------------------------------------------------------ + +// forward declaration of multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) +static mi_decl_noinline void mi_free_block_mt(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); + +// regular free of a (thread local) block pointer +// fast path written carefully to prevent spilling on the stack +static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool check_full) +{ + // owning thread can free a block directly + if mi_unlikely(mi_check_is_double_free(page, block)) return; + mi_check_padding(page, block); + mi_stat_free(page, block); + #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN + memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); + #endif + mi_track_free_size(p, mi_page_usable_size_of(page,block)); // faster then mi_usable_size as we already know the page and that p is unaligned + mi_block_set_next(page, block, page->local_free); + page->local_free = block; + if mi_unlikely(--page->used == 0) { // using this expression generates better code than: page->used--; if (mi_page_all_free(page)) + _mi_page_retire(page); + } + else if mi_unlikely(check_full && 
mi_page_is_in_full(page)) { + _mi_page_unfull(page); + } +} + +// Adjust a block that was allocated aligned, to the actual start of the block in the page. +mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p) { + mi_assert_internal(page!=NULL && p!=NULL); + const size_t diff = (uint8_t*)p - _mi_page_start(segment, page, NULL); + const size_t adjust = (diff % mi_page_block_size(page)); + return (mi_block_t*)((uintptr_t)p - adjust); +} + +// free a local pointer +static void mi_decl_noinline mi_free_generic_local(const mi_segment_t* segment, mi_page_t* page, void* p) mi_attr_noexcept { + mi_block_t* const block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(segment, page, p) : (mi_block_t*)p); + mi_free_block_local(page, block, true); +} + +// free a pointer owned by another thread +static void mi_decl_noinline mi_free_generic_mt(const mi_segment_t* segment, mi_page_t* page, void* p) mi_attr_noexcept { + mi_block_t* const block = _mi_page_ptr_unalign(segment, page, p); // don't check `has_aligned` flag to avoid a race (issue #865) + mi_free_block_mt(segment, page, block); +} + +// generic free (for runtime integration) +void mi_decl_noinline _mi_free_generic(const mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept { + if (is_local) mi_free_generic_local(segment,page,p); + else mi_free_generic_mt(segment,page,p); +} + +// Get the segment data belonging to a pointer +// This is just a single `and` in release mode but does further checks in debug mode +// (and secure mode) to see if this was a valid pointer. +static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* msg) +{ + MI_UNUSED(msg); + mi_assert(p != NULL); + +#if (MI_DEBUG>0) + if mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0) { + _mi_error_message(EINVAL, "%s: invalid (unaligned) pointer: %p\n", msg, p); + return NULL; + } +#endif + + mi_segment_t* const segment = _mi_ptr_segment(p); + mi_assert_internal(segment != NULL); + +#if (MI_DEBUG>0) + if mi_unlikely(!mi_is_in_heap_region(p)) { + _mi_warning_message("%s: pointer might not point to a valid heap region: %p\n" + "(this may still be a valid very large allocation (over 64MiB))\n", msg, p); + if mi_likely(_mi_ptr_cookie(segment) == segment->cookie) { + _mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p); + } + } +#endif +#if (MI_DEBUG>0 || MI_SECURE>=4) + if mi_unlikely(_mi_ptr_cookie(segment) != segment->cookie) { + _mi_error_message(EINVAL, "%s: pointer does not point to a valid heap space: %p\n", msg, p); + return NULL; + } +#endif + + return segment; +} + +// Free a block +// Fast path written carefully to prevent register spilling on the stack +void mi_free(void* p) mi_attr_noexcept +{ + if mi_unlikely(p == NULL) return; + mi_segment_t* const segment = mi_checked_ptr_segment(p,"mi_free"); + const bool is_local= (_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); + mi_page_t* const page = _mi_segment_page_of(segment, p); + + if mi_likely(is_local) { // thread-local free? 
+ if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) + // thread-local, aligned, and not a full page + mi_block_t* const block = (mi_block_t*)p; + mi_free_block_local(page,block,false /* no need to check if the page is full */); + } + else { + // page is full or contains (inner) aligned blocks; use generic path + mi_free_generic_local(segment, page, p); + } + } + else { + // not thread-local; use generic path + mi_free_generic_mt(segment, page, p); + } +} + +// return true if successful +bool _mi_free_delayed_block(mi_block_t* block) { + // get segment and page + const mi_segment_t* const segment = _mi_ptr_segment(block); + mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie); + mi_assert_internal(_mi_thread_id() == segment->thread_id); + mi_page_t* const page = _mi_segment_page_of(segment, block); + + // Clear the no-delayed flag so delayed freeing is used again for this page. + // This must be done before collecting the free lists on this page -- otherwise + // some blocks may end up in the page `thread_free` list with no blocks in the + // heap `thread_delayed_free` list which may cause the page to be never freed! + // (it would only be freed if we happen to scan it in `mi_page_queue_find_free_ex`) + if (!_mi_page_try_use_delayed_free(page, MI_USE_DELAYED_FREE, false /* dont overwrite never delayed */)) { + return false; + } + + // collect all other non-local frees to ensure up-to-date `used` count + _mi_page_free_collect(page, false); + + // and free the block (possibly freeing the page as well since used is updated) + mi_free_block_local(page, block, true); + return true; +} + +// ------------------------------------------------------ +// Multi-threaded Free (`_mt`) +// ------------------------------------------------------ + +// Push a block that is owned by another thread on its page-local thread free +// list or it's heap delayed free list. Such blocks are later collected by +// the owning thread in `_mi_free_delayed_block`. +static void mi_decl_noinline mi_free_block_delayed_mt( mi_page_t* page, mi_block_t* block ) +{ + // Try to put the block on either the page-local thread free list, + // or the heap delayed free list (if this is the first non-local free in that page) + mi_thread_free_t tfreex; + bool use_delayed; + mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free); + do { + use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE); + if mi_unlikely(use_delayed) { + // unlikely: this only happens on the first concurrent free in a page that is in the full list + tfreex = mi_tf_set_delayed(tfree,MI_DELAYED_FREEING); + } + else { + // usual: directly add to page thread_free list + mi_block_set_next(page, block, mi_tf_block(tfree)); + tfreex = mi_tf_set_block(tfree,block); + } + } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); + + // If this was the first non-local free, we need to push it on the heap delayed free list instead + if mi_unlikely(use_delayed) { + // racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`) + mi_heap_t* const heap = (mi_heap_t*)(mi_atomic_load_acquire(&page->xheap)); //mi_page_heap(page); + mi_assert_internal(heap != NULL); + if (heap != NULL) { + // add to the delayed free list of this heap. 
(do this atomically as the lock only protects heap memory validity) + mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); + do { + mi_block_set_nextx(heap,block,dfree, heap->keys); + } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block)); + } + + // and reset the MI_DELAYED_FREEING flag + tfree = mi_atomic_load_relaxed(&page->xthread_free); + do { + tfreex = tfree; + mi_assert_internal(mi_tf_delayed(tfree) == MI_DELAYED_FREEING); + tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE); + } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); + } +} + +// Multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) +static mi_decl_noinline void mi_free_block_mt(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) +{ + // first see if the segment was abandoned and if we can reclaim it into our thread + if (mi_option_is_enabled(mi_option_abandoned_reclaim_on_free) && + #if MI_HUGE_PAGE_ABANDON + segment->page_kind != MI_PAGE_HUGE && + #endif + mi_atomic_load_relaxed(&segment->thread_id) == 0) + { + // the segment is abandoned, try to reclaim it into our heap + if (_mi_segment_attempt_reclaim(mi_heap_get_default(), segment)) { + mi_assert_internal(_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); + mi_free(block); // recursively free as now it will be a local free in our heap + return; + } + } + + // The padding check may access the non-thread-owned page for the key values. + // that is safe as these are constant and the page won't be freed (as the block is not freed yet). + mi_check_padding(page, block); + + // adjust stats (after padding check and potential recursive `mi_free` above) + mi_stat_free(page, block); // stat_free may access the padding + mi_track_free_size(block, mi_page_usable_size_of(page,block)); + + // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection + _mi_padding_shrink(page, block, sizeof(mi_block_t)); + + if (segment->page_kind == MI_PAGE_HUGE) { + #if MI_HUGE_PAGE_ABANDON + // huge page segments are always abandoned and can be freed immediately + mi_stat_huge_free(page); + _mi_segment_huge_page_free(segment, page, block); + return; + #else + // huge pages are special as they occupy the entire segment + // as these are large we reset the memory occupied by the page so it is available to other threads + // (as the owning thread needs to actually free the memory later). 
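A rough sketch of what "resetting" a huge page can mean at the call below; this is an assumption about the general mechanism (return the physical pages to the OS while the virtual range stays mapped for the owning thread), not the actual `_mi_segment_huge_page_reset` implementation, and the `madvise` call is Linux-specific:

// Illustrative sketch only -- not mimalloc source.
// A non-owning thread gives the physical memory of a huge block back to the OS;
// the mapping stays valid, so the owning thread can still perform the real free later.
#include <sys/mman.h>
#include <stddef.h>

static int example_reset_range(void* start, size_t size) {
  // drops the physical pages; the range remains mapped and reads back as zero
  return madvise(start, size, MADV_DONTNEED);
}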
+ _mi_segment_huge_page_reset(segment, page, block); + #endif + } + else { + #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading + memset(block, MI_DEBUG_FREED, mi_usable_size(block)); + #endif + } + + // and finally free the actual block by pushing it on the owning heap + // thread_delayed free list (or heap delayed free list) + mi_free_block_delayed_mt(page,block); +} + + +// ------------------------------------------------------ +// Usable size +// ------------------------------------------------------ + +// Bytes available in a block +mi_decl_noinline static size_t mi_page_usable_aligned_size_of(const mi_segment_t* segment, const mi_page_t* page, const void* p) mi_attr_noexcept { + const mi_block_t* block = _mi_page_ptr_unalign(segment, page, p); + const size_t size = mi_page_usable_size_of(page, block); + const ptrdiff_t adjust = (uint8_t*)p - (uint8_t*)block; + mi_assert_internal(adjust >= 0 && (size_t)adjust <= size); + return (size - adjust); +} + +static inline size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noexcept { + if (p == NULL) return 0; + const mi_segment_t* const segment = mi_checked_ptr_segment(p, msg); + const mi_page_t* const page = _mi_segment_page_of(segment, p); + if mi_likely(!mi_page_has_aligned(page)) { + const mi_block_t* block = (const mi_block_t*)p; + return mi_page_usable_size_of(page, block); + } + else { + // split out to separate routine for improved code generation + return mi_page_usable_aligned_size_of(segment, page, p); + } +} + +mi_decl_nodiscard size_t mi_usable_size(const void* p) mi_attr_noexcept { + return _mi_usable_size(p, "mi_usable_size"); +} + + +// ------------------------------------------------------ +// Allocation extensions +// ------------------------------------------------------ + +void mi_free_size(void* p, size_t size) mi_attr_noexcept { + MI_UNUSED_RELEASE(size); + mi_assert(p == NULL || size <= _mi_usable_size(p,"mi_free_size")); + mi_free(p); +} + +void mi_free_size_aligned(void* p, size_t size, size_t alignment) mi_attr_noexcept { + MI_UNUSED_RELEASE(alignment); + mi_assert(((uintptr_t)p % alignment) == 0); + mi_free_size(p,size); +} + +void mi_free_aligned(void* p, size_t alignment) mi_attr_noexcept { + MI_UNUSED_RELEASE(alignment); + mi_assert(((uintptr_t)p % alignment) == 0); + mi_free(p); +} From cc809b0cd4b99a564b00224cb2e66e4d881f62cd Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 18 Mar 2024 01:40:03 -0700 Subject: [PATCH 060/119] take 16 bits from used field to create a fast unalign path --- include/mimalloc/internal.h | 2 +- include/mimalloc/types.h | 24 +++++++++++++----------- src/alloc.c | 2 +- src/free.c | 17 +++++++++++------ src/init.c | 18 ++++++++++-------- src/page.c | 12 +++++++++++- 6 files changed, 47 insertions(+), 28 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 96f3922e..72544c3d 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -202,7 +202,7 @@ void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, siz void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept; mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p); bool _mi_free_delayed_block(mi_block_t* block); -void _mi_free_generic(const mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept; // for runtime integration +void _mi_free_generic(mi_segment_t* segment, 
mi_page_t* page, bool is_local, void* p) mi_attr_noexcept; // for runtime integration void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size); // "libc.c" diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 049e68e7..c624e5b4 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -273,7 +273,7 @@ typedef uintptr_t mi_thread_free_t; // and 12 are still good for address calculation) // - To limit the structure size, the `xblock_size` is 32-bits only; for // blocks > MI_HUGE_BLOCK_SIZE the size is determined from the segment page size -// - `thread_free` uses the bottom bits as a delayed-free flags to optimize +// - `xthread_free` uses the bottom bits as a delayed-free flags to optimize // concurrent frees where only the first concurrent free adds to the owning // heap `thread_delayed_free` list (see `alloc.c:mi_free_block_mt`). // The invariant is that no-delayed-free is only set if there is @@ -295,19 +295,21 @@ typedef struct mi_page_s { uint8_t retire_expire:7; // expiration count for retired blocks mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) - uint32_t used; // number of blocks in use (including blocks in `thread_free`) - uint32_t xblock_size; // size available in each block (always `>0`) mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) - + uint16_t used; // number of blocks in use (including blocks in `thread_free`) + uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift == block_size)` (used for quick block start finding for aligned pointers) + uint8_t block_offset_adj; // if not zero, then `(page_start - (uint8_t*)page - 8*(block_offset_adj-1)) % block_size == 0)` (used for quick block start finding for aligned pointers) + uint32_t xblock_size; // size available in each block (always `>0`) + #if (MI_ENCODE_FREELIST || MI_PADDING) uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`) or padding canary - #endif + #endif _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads _Atomic(uintptr_t) xheap; - - struct mi_page_s* next; // next page owned by this thread with the same `block_size` - struct mi_page_s* prev; // previous page owned by this thread with the same `block_size` + + struct mi_page_s* next; // next page owned by the heap with the same `block_size` + struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` } mi_page_t; @@ -386,8 +388,8 @@ typedef struct mi_segment_s { uintptr_t cookie; // verify addresses in secure mode: `_mi_ptr_cookie(segment) == segment->cookie` // layout like this to optimize access in `mi_free` - size_t page_shift; // `1 << page_shift` == the page sizes == `page->block_size * page->reserved` (unless the first page, then `-segment_info_size`). _Atomic(mi_threadid_t) thread_id; // unique id of the thread owning this segment + size_t page_shift; // `1 << page_shift` == the page sizes == `page->block_size * page->reserved` (unless the first page, then `-segment_info_size`). mi_page_kind_t page_kind; // kind of pages: small, medium, large, or huge mi_page_t pages[1]; // up to `MI_SMALL_PAGES_PER_SEGMENT` pages } mi_segment_t; @@ -446,8 +448,6 @@ typedef struct mi_padding_s { // A heap owns a set of pages. 
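As a side note on the two page fields introduced above (`block_size_shift` and `block_offset_adj`): a minimal sketch of the fast unalign path they enable, assuming only that the block size is a power of two. The helper and its names below are hypothetical and not part of mimalloc:

// Illustrative sketch only -- not mimalloc source.
// For power-of-two block sizes the interior-pointer adjustment becomes a mask
// instead of a modulo, which is the idea behind the `_mi_page_ptr_unalign` change.
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static void* example_block_start(void* page_start, void* p, size_t block_size, unsigned shift) {
  size_t diff   = (size_t)((uint8_t*)p - (uint8_t*)page_start);
  size_t adjust = (shift != 0 ? (diff & (((size_t)1 << shift) - 1))  // fast path: power-of-two size
                              : (diff % block_size));                // general path
  return (uint8_t*)p - adjust;
}

int main(void) {
  uint8_t area[1024];
  void* start = example_block_start(area, area + 70, 64, 6);  // 64-byte blocks, shift = 6
  printf("%d\n", (int)((uint8_t*)start - area));              // prints 64
  return 0;
}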
struct mi_heap_s { mi_tld_t* tld; - mi_page_t* pages_free_direct[MI_PAGES_DIRECT]; // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size. - mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin") _Atomic(mi_block_t*) thread_delayed_free; mi_threadid_t thread_id; // thread this heap belongs too mi_arena_id_t arena_id; // arena id if the heap belongs to a specific arena (or 0) @@ -459,6 +459,8 @@ struct mi_heap_s { size_t page_retired_max; // largest retired index into the `pages` array. mi_heap_t* next; // list of heaps per thread bool no_reclaim; // `true` if this heap should not reclaim abandoned pages + mi_page_t* pages_free_direct[MI_PAGES_DIRECT]; // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size. + mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin") }; diff --git a/src/alloc.c b/src/alloc.c index 76d68d13..3a38a226 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -37,8 +37,8 @@ extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t siz } mi_assert_internal(block != NULL && _mi_ptr_page(block) == page); // pop from the free list - page->used++; page->free = mi_block_next(page, block); + page->used++; mi_assert_internal(page->free == NULL || _mi_ptr_page(page->free) == page); #if MI_DEBUG>3 if (page->free_is_zero) { diff --git a/src/free.c b/src/free.c index 7761cb6a..d0fcf133 100644 --- a/src/free.c +++ b/src/free.c @@ -249,25 +249,30 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool // Adjust a block that was allocated aligned, to the actual start of the block in the page. mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p) { mi_assert_internal(page!=NULL && p!=NULL); - const size_t diff = (uint8_t*)p - _mi_page_start(segment, page, NULL); - const size_t adjust = (diff % mi_page_block_size(page)); + const size_t diff = (mi_likely(page->block_offset_adj != 0) + ? (uint8_t*)p - (uint8_t*)page - 8*(page->block_offset_adj-1) + : (uint8_t*)p - _mi_page_start(segment, page, NULL)); + + const size_t adjust = (mi_likely(page->block_size_shift != 0) + ? diff & (((size_t)1 << page->block_size_shift) - 1) + : diff % mi_page_block_size(page)); return (mi_block_t*)((uintptr_t)p - adjust); } // free a local pointer -static void mi_decl_noinline mi_free_generic_local(const mi_segment_t* segment, mi_page_t* page, void* p) mi_attr_noexcept { +static void mi_decl_noinline mi_free_generic_local(mi_segment_t* segment, mi_page_t* page, void* p) mi_attr_noexcept { mi_block_t* const block = (mi_page_has_aligned(page) ? 
_mi_page_ptr_unalign(segment, page, p) : (mi_block_t*)p); mi_free_block_local(page, block, true); } // free a pointer owned by another thread -static void mi_decl_noinline mi_free_generic_mt(const mi_segment_t* segment, mi_page_t* page, void* p) mi_attr_noexcept { +static void mi_decl_noinline mi_free_generic_mt(mi_segment_t* segment, mi_page_t* page, void* p) mi_attr_noexcept { mi_block_t* const block = _mi_page_ptr_unalign(segment, page, p); // don't check `has_aligned` flag to avoid a race (issue #865) mi_free_block_mt(segment, page, block); } // generic free (for runtime integration) -void mi_decl_noinline _mi_free_generic(const mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept { +void mi_decl_noinline _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept { if (is_local) mi_free_generic_local(segment,page,p); else mi_free_generic_mt(segment,page,p); } @@ -469,7 +474,7 @@ static mi_decl_noinline void mi_free_block_mt(mi_segment_t* segment, mi_page_t* // ------------------------------------------------------ // Bytes available in a block -mi_decl_noinline static size_t mi_page_usable_aligned_size_of(const mi_segment_t* segment, const mi_page_t* page, const void* p) mi_attr_noexcept { +static size_t mi_decl_noinline mi_page_usable_aligned_size_of(const mi_segment_t* segment, const mi_page_t* page, const void* p) mi_attr_noexcept { const mi_block_t* block = _mi_page_ptr_unalign(segment, page, p); const size_t size = mi_page_usable_size_of(page, block); const ptrdiff_t adjust = (uint8_t*)p - (uint8_t*)block; diff --git a/src/init.c b/src/init.c index 7ec6e01e..11471760 100644 --- a/src/init.c +++ b/src/init.c @@ -21,9 +21,11 @@ const mi_page_t _mi_page_empty = { false, // is_zero 0, // retire_expire NULL, // free - 0, // used - 0, // xblock_size NULL, // local_free + 0, // used + 0, // block size shift + 0, // block offset adj + 0, // xblock_size #if (MI_PADDING || MI_ENCODE_FREELIST) { 0, 0 }, #endif @@ -93,8 +95,6 @@ const mi_page_t _mi_page_empty = { mi_decl_cache_align const mi_heap_t _mi_heap_empty = { NULL, - MI_SMALL_PAGES_EMPTY, - MI_PAGE_QUEUES_EMPTY, MI_ATOMIC_VAR_INIT(NULL), 0, // tid 0, // cookie @@ -104,7 +104,9 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = { 0, // page count MI_BIN_FULL, 0, // page retired min/max NULL, // next - false + false, + MI_SMALL_PAGES_EMPTY, + MI_PAGE_QUEUES_EMPTY }; @@ -130,8 +132,6 @@ static mi_tld_t tld_main = { mi_heap_t _mi_heap_main = { &tld_main, - MI_SMALL_PAGES_EMPTY, - MI_PAGE_QUEUES_EMPTY, MI_ATOMIC_VAR_INIT(NULL), 0, // thread id 0, // initial cookie @@ -141,7 +141,9 @@ mi_heap_t _mi_heap_main = { 0, // page count MI_BIN_FULL, 0, // page retired min/max NULL, // next heap - false // can reclaim + false, // can reclaim + MI_SMALL_PAGES_EMPTY, + MI_PAGE_QUEUES_EMPTY }; bool _mi_process_is_initialized = false; // set to `true` in `mi_process_init`. diff --git a/src/page.c b/src/page.c index 5fefc3b5..5930a430 100644 --- a/src/page.c +++ b/src/page.c @@ -660,7 +660,6 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi mi_page_set_heap(page, heap); size_t page_size; const void* page_start = _mi_segment_page_start(segment, page, block_size, &page_size, NULL); - MI_UNUSED(page_start); mi_track_mem_noaccess(page_start,page_size); page->xblock_size = (block_size < MI_HUGE_BLOCK_SIZE ? 
(uint32_t)block_size : MI_HUGE_BLOCK_SIZE); mi_assert_internal(page_size / block_size < (1L<<16)); @@ -677,6 +676,15 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi mi_assert_expensive(!page->is_zero_init || mi_mem_is_zero(page_start, page_size)); } #endif + if (_mi_is_power_of_two(block_size) && block_size > 0) { + page->block_size_shift = (uint32_t)(mi_ctz((uintptr_t)block_size)); + } + const ptrdiff_t start_offset = (uint8_t*)page_start - (uint8_t*)page; + const ptrdiff_t start_adjust = start_offset % block_size; + if (start_offset >= 0 && (start_adjust % 8) == 0 && (start_adjust/8) < 255) { + page->block_offset_adj = (uint8_t)((start_adjust/8) + 1); + } + mi_assert_internal(page->capacity == 0); mi_assert_internal(page->free == NULL); @@ -690,6 +698,8 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi mi_assert_internal(page->keys[0] != 0); mi_assert_internal(page->keys[1] != 0); #endif + mi_assert_internal(page->block_size_shift == 0 || (block_size == (1UL << page->block_size_shift))); + mi_assert_internal(page->block_offset_adj == 0 || (((uint8_t*)page_start - (uint8_t*)page - 8*(page->block_offset_adj-1))) % block_size == 0); mi_assert_expensive(mi_page_is_valid_init(page)); // initialize an initial free list From d08b4219e9bdbc83c92cd50e1d54b24a939a2271 Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 18 Mar 2024 03:32:06 -0700 Subject: [PATCH 061/119] improve used decrement code gen --- src/free.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/free.c b/src/free.c index d0fcf133..6cdac123 100644 --- a/src/free.c +++ b/src/free.c @@ -238,7 +238,9 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool mi_track_free_size(p, mi_page_usable_size_of(page,block)); // faster then mi_usable_size as we already know the page and that p is unaligned mi_block_set_next(page, block, page->local_free); page->local_free = block; - if mi_unlikely(--page->used == 0) { // using this expression generates better code than: page->used--; if (mi_page_all_free(page)) + const uint32_t used = page->used - 1; + page->used = used; + if mi_unlikely(used == 0) { // generates better code than: --page->used == 0 _mi_page_retire(page); } else if mi_unlikely(check_full && mi_page_is_in_full(page)) { From 34d37fa2048f3bd65d017bf4f295b3b5b97ae98e Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 18 Mar 2024 03:47:54 -0700 Subject: [PATCH 062/119] nicer organisation of free.c --- src/free.c | 431 +++++++++++++++++++++++++++-------------------------- 1 file changed, 222 insertions(+), 209 deletions(-) diff --git a/src/free.c b/src/free.c index 6cdac123..e7547aa3 100644 --- a/src/free.c +++ b/src/free.c @@ -8,214 +8,12 @@ terms of the MIT license. 
A copy of the license can be found in the file #error "this file should be included from 'alloc.c' (so aliases can work from alloc-override)" #endif -// ------------------------------------------------------ -// Check for double free in secure and debug mode -// This is somewhat expensive so only enabled for secure mode 4 -// ------------------------------------------------------ +// forward declarations +static void mi_check_padding(const mi_page_t* page, const mi_block_t* block); +static bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block); +static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block); +static void mi_stat_free(const mi_page_t* page, const mi_block_t* block); -#if (MI_ENCODE_FREELIST && (MI_SECURE>=4 || MI_DEBUG!=0)) -// linear check if the free list contains a specific element -static bool mi_list_contains(const mi_page_t* page, const mi_block_t* list, const mi_block_t* elem) { - while (list != NULL) { - if (elem==list) return true; - list = mi_block_next(page, list); - } - return false; -} - -static mi_decl_noinline bool mi_check_is_double_freex(const mi_page_t* page, const mi_block_t* block) { - // The decoded value is in the same page (or NULL). - // Walk the free lists to verify positively if it is already freed - if (mi_list_contains(page, page->free, block) || - mi_list_contains(page, page->local_free, block) || - mi_list_contains(page, mi_page_thread_free(page), block)) - { - _mi_error_message(EAGAIN, "double free detected of block %p with size %zu\n", block, mi_page_block_size(page)); - return true; - } - return false; -} - -#define mi_track_page(page,access) { size_t psize; void* pstart = _mi_page_start(_mi_page_segment(page),page,&psize); mi_track_mem_##access( pstart, psize); } - -static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) { - bool is_double_free = false; - mi_block_t* n = mi_block_nextx(page, block, page->keys); // pretend it is freed, and get the decoded first field - if (((uintptr_t)n & (MI_INTPTR_SIZE-1))==0 && // quick check: aligned pointer? - (n==NULL || mi_is_in_same_page(block, n))) // quick check: in same page or NULL? - { - // Suspicous: decoded value a in block is in the same page (or NULL) -- maybe a double free? - // (continue in separate function to improve code generation) - is_double_free = mi_check_is_double_freex(page, block); - } - return is_double_free; -} -#else -static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) { - MI_UNUSED(page); - MI_UNUSED(block); - return false; -} -#endif - -// --------------------------------------------------------------------------- -// Check for heap block overflow by setting up padding at the end of the block -// --------------------------------------------------------------------------- - -#if MI_PADDING // && !MI_TRACK_ENABLED -static bool mi_page_decode_padding(const mi_page_t* page, const mi_block_t* block, size_t* delta, size_t* bsize) { - *bsize = mi_page_usable_block_size(page); - const mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + *bsize); - mi_track_mem_defined(padding,sizeof(mi_padding_t)); - *delta = padding->delta; - uint32_t canary = padding->canary; - uintptr_t keys[2]; - keys[0] = page->keys[0]; - keys[1] = page->keys[1]; - bool ok = ((uint32_t)mi_ptr_encode(page,block,keys) == canary && *delta <= *bsize); - mi_track_mem_noaccess(padding,sizeof(mi_padding_t)); - return ok; -} - -// Return the exact usable size of a block. 
-static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) { - size_t bsize; - size_t delta; - bool ok = mi_page_decode_padding(page, block, &delta, &bsize); - mi_assert_internal(ok); mi_assert_internal(delta <= bsize); - return (ok ? bsize - delta : 0); -} - -// When a non-thread-local block is freed, it becomes part of the thread delayed free -// list that is freed later by the owning heap. If the exact usable size is too small to -// contain the pointer for the delayed list, then shrink the padding (by decreasing delta) -// so it will later not trigger an overflow error in `mi_free_block`. -void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) { - size_t bsize; - size_t delta; - bool ok = mi_page_decode_padding(page, block, &delta, &bsize); - mi_assert_internal(ok); - if (!ok || (bsize - delta) >= min_size) return; // usually already enough space - mi_assert_internal(bsize >= min_size); - if (bsize < min_size) return; // should never happen - size_t new_delta = (bsize - min_size); - mi_assert_internal(new_delta < bsize); - mi_padding_t* padding = (mi_padding_t*)((uint8_t*)block + bsize); - mi_track_mem_defined(padding,sizeof(mi_padding_t)); - padding->delta = (uint32_t)new_delta; - mi_track_mem_noaccess(padding,sizeof(mi_padding_t)); -} -#else -static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) { - MI_UNUSED(block); - return mi_page_usable_block_size(page); -} - -void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) { - MI_UNUSED(page); - MI_UNUSED(block); - MI_UNUSED(min_size); -} -#endif - -#if MI_PADDING && MI_PADDING_CHECK - -static bool mi_verify_padding(const mi_page_t* page, const mi_block_t* block, size_t* size, size_t* wrong) { - size_t bsize; - size_t delta; - bool ok = mi_page_decode_padding(page, block, &delta, &bsize); - *size = *wrong = bsize; - if (!ok) return false; - mi_assert_internal(bsize >= delta); - *size = bsize - delta; - if (!mi_page_is_huge(page)) { - uint8_t* fill = (uint8_t*)block + bsize - delta; - const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? 
MI_MAX_ALIGN_SIZE : delta); // check at most the first N padding bytes - mi_track_mem_defined(fill, maxpad); - for (size_t i = 0; i < maxpad; i++) { - if (fill[i] != MI_DEBUG_PADDING) { - *wrong = bsize - delta + i; - ok = false; - break; - } - } - mi_track_mem_noaccess(fill, maxpad); - } - return ok; -} - -static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { - size_t size; - size_t wrong; - if (!mi_verify_padding(page,block,&size,&wrong)) { - _mi_error_message(EFAULT, "buffer overflow in heap block %p of size %zu: write after %zu bytes\n", block, size, wrong ); - } -} - -#else - -static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { - MI_UNUSED(page); - MI_UNUSED(block); -} - -#endif - -// only maintain stats for smaller objects if requested -#if (MI_STAT>0) -static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { -#if (MI_STAT < 2) - MI_UNUSED(block); -#endif - mi_heap_t* const heap = mi_heap_get_default(); - const size_t bsize = mi_page_usable_block_size(page); -#if (MI_STAT>1) - const size_t usize = mi_page_usable_size_of(page, block); - mi_heap_stat_decrease(heap, malloc, usize); -#endif - if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { - mi_heap_stat_decrease(heap, normal, bsize); -#if (MI_STAT > 1) - mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], 1); -#endif - } -#if !MI_HUGE_PAGE_ABANDON - else { - const size_t bpsize = mi_page_block_size(page); - if (bpsize <= MI_HUGE_OBJ_SIZE_MAX) { - mi_heap_stat_decrease(heap, huge, bpsize); - } - else { - mi_heap_stat_decrease(heap, giant, bpsize); - } - } -#endif -} -#else -static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { - MI_UNUSED(page); MI_UNUSED(block); -} -#endif - -#if MI_HUGE_PAGE_ABANDON -#if (MI_STAT>0) -// maintain stats for huge objects -static void mi_stat_huge_free(const mi_page_t* page) { - mi_heap_t* const heap = mi_heap_get_default(); - const size_t bsize = mi_page_block_size(page); // to match stats in `page.c:mi_page_huge_alloc` - if (bsize <= MI_HUGE_OBJ_SIZE_MAX) { - mi_heap_stat_decrease(heap, huge, bsize); - } - else { - mi_heap_stat_decrease(heap, giant, bsize); - } -} -#else -static void mi_stat_huge_free(const mi_page_t* page) { - MI_UNUSED(page); -} -#endif -#endif // ------------------------------------------------------ // Free @@ -417,8 +215,12 @@ static void mi_decl_noinline mi_free_block_delayed_mt( mi_page_t* page, mi_block } } +#if MI_HUGE_PAGE_ABANDON +static void mi_stat_huge_free(const mi_page_t* page); +#endif + // Multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) -static mi_decl_noinline void mi_free_block_mt(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) +static void mi_decl_noinline mi_free_block_mt(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) { // first see if the segment was abandoned and if we can reclaim it into our thread if (mi_option_is_enabled(mi_option_abandoned_reclaim_on_free) && @@ -504,7 +306,7 @@ mi_decl_nodiscard size_t mi_usable_size(const void* p) mi_attr_noexcept { // ------------------------------------------------------ -// Allocation extensions +// Free variants // ------------------------------------------------------ void mi_free_size(void* p, size_t size) mi_attr_noexcept { @@ -524,3 +326,214 @@ void mi_free_aligned(void* p, size_t alignment) mi_attr_noexcept { mi_assert(((uintptr_t)p % alignment) == 0); mi_free(p); } + + +// ------------------------------------------------------ +// Check for double free in secure and 
debug mode +// This is somewhat expensive so only enabled for secure mode 4 +// ------------------------------------------------------ + +#if (MI_ENCODE_FREELIST && (MI_SECURE>=4 || MI_DEBUG!=0)) +// linear check if the free list contains a specific element +static bool mi_list_contains(const mi_page_t* page, const mi_block_t* list, const mi_block_t* elem) { + while (list != NULL) { + if (elem==list) return true; + list = mi_block_next(page, list); + } + return false; +} + +static mi_decl_noinline bool mi_check_is_double_freex(const mi_page_t* page, const mi_block_t* block) { + // The decoded value is in the same page (or NULL). + // Walk the free lists to verify positively if it is already freed + if (mi_list_contains(page, page->free, block) || + mi_list_contains(page, page->local_free, block) || + mi_list_contains(page, mi_page_thread_free(page), block)) + { + _mi_error_message(EAGAIN, "double free detected of block %p with size %zu\n", block, mi_page_block_size(page)); + return true; + } + return false; +} + +#define mi_track_page(page,access) { size_t psize; void* pstart = _mi_page_start(_mi_page_segment(page),page,&psize); mi_track_mem_##access( pstart, psize); } + +static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) { + bool is_double_free = false; + mi_block_t* n = mi_block_nextx(page, block, page->keys); // pretend it is freed, and get the decoded first field + if (((uintptr_t)n & (MI_INTPTR_SIZE-1))==0 && // quick check: aligned pointer? + (n==NULL || mi_is_in_same_page(block, n))) // quick check: in same page or NULL? + { + // Suspicous: decoded value a in block is in the same page (or NULL) -- maybe a double free? + // (continue in separate function to improve code generation) + is_double_free = mi_check_is_double_freex(page, block); + } + return is_double_free; +} +#else +static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) { + MI_UNUSED(page); + MI_UNUSED(block); + return false; +} +#endif + + +// --------------------------------------------------------------------------- +// Check for heap block overflow by setting up padding at the end of the block +// --------------------------------------------------------------------------- + +#if MI_PADDING // && !MI_TRACK_ENABLED +static bool mi_page_decode_padding(const mi_page_t* page, const mi_block_t* block, size_t* delta, size_t* bsize) { + *bsize = mi_page_usable_block_size(page); + const mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + *bsize); + mi_track_mem_defined(padding,sizeof(mi_padding_t)); + *delta = padding->delta; + uint32_t canary = padding->canary; + uintptr_t keys[2]; + keys[0] = page->keys[0]; + keys[1] = page->keys[1]; + bool ok = ((uint32_t)mi_ptr_encode(page,block,keys) == canary && *delta <= *bsize); + mi_track_mem_noaccess(padding,sizeof(mi_padding_t)); + return ok; +} + +// Return the exact usable size of a block. +static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) { + size_t bsize; + size_t delta; + bool ok = mi_page_decode_padding(page, block, &delta, &bsize); + mi_assert_internal(ok); mi_assert_internal(delta <= bsize); + return (ok ? bsize - delta : 0); +} + +// When a non-thread-local block is freed, it becomes part of the thread delayed free +// list that is freed later by the owning heap. 
If the exact usable size is too small to +// contain the pointer for the delayed list, then shrink the padding (by decreasing delta) +// so it will later not trigger an overflow error in `mi_free_block`. +void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) { + size_t bsize; + size_t delta; + bool ok = mi_page_decode_padding(page, block, &delta, &bsize); + mi_assert_internal(ok); + if (!ok || (bsize - delta) >= min_size) return; // usually already enough space + mi_assert_internal(bsize >= min_size); + if (bsize < min_size) return; // should never happen + size_t new_delta = (bsize - min_size); + mi_assert_internal(new_delta < bsize); + mi_padding_t* padding = (mi_padding_t*)((uint8_t*)block + bsize); + mi_track_mem_defined(padding,sizeof(mi_padding_t)); + padding->delta = (uint32_t)new_delta; + mi_track_mem_noaccess(padding,sizeof(mi_padding_t)); +} +#else +static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) { + MI_UNUSED(block); + return mi_page_usable_block_size(page); +} + +void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) { + MI_UNUSED(page); + MI_UNUSED(block); + MI_UNUSED(min_size); +} +#endif + +#if MI_PADDING && MI_PADDING_CHECK + +static bool mi_verify_padding(const mi_page_t* page, const mi_block_t* block, size_t* size, size_t* wrong) { + size_t bsize; + size_t delta; + bool ok = mi_page_decode_padding(page, block, &delta, &bsize); + *size = *wrong = bsize; + if (!ok) return false; + mi_assert_internal(bsize >= delta); + *size = bsize - delta; + if (!mi_page_is_huge(page)) { + uint8_t* fill = (uint8_t*)block + bsize - delta; + const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? MI_MAX_ALIGN_SIZE : delta); // check at most the first N padding bytes + mi_track_mem_defined(fill, maxpad); + for (size_t i = 0; i < maxpad; i++) { + if (fill[i] != MI_DEBUG_PADDING) { + *wrong = bsize - delta + i; + ok = false; + break; + } + } + mi_track_mem_noaccess(fill, maxpad); + } + return ok; +} + +static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { + size_t size; + size_t wrong; + if (!mi_verify_padding(page,block,&size,&wrong)) { + _mi_error_message(EFAULT, "buffer overflow in heap block %p of size %zu: write after %zu bytes\n", block, size, wrong ); + } +} + +#else + +static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { + MI_UNUSED(page); + MI_UNUSED(block); +} + +#endif + +// only maintain stats for smaller objects if requested +#if (MI_STAT>0) +static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { +#if (MI_STAT < 2) + MI_UNUSED(block); +#endif + mi_heap_t* const heap = mi_heap_get_default(); + const size_t bsize = mi_page_usable_block_size(page); +#if (MI_STAT>1) + const size_t usize = mi_page_usable_size_of(page, block); + mi_heap_stat_decrease(heap, malloc, usize); +#endif + if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { + mi_heap_stat_decrease(heap, normal, bsize); +#if (MI_STAT > 1) + mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], 1); +#endif + } +#if !MI_HUGE_PAGE_ABANDON + else { + const size_t bpsize = mi_page_block_size(page); + if (bpsize <= MI_HUGE_OBJ_SIZE_MAX) { + mi_heap_stat_decrease(heap, huge, bpsize); + } + else { + mi_heap_stat_decrease(heap, giant, bpsize); + } + } +#endif +} +#else +static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { + MI_UNUSED(page); MI_UNUSED(block); +} +#endif + +#if MI_HUGE_PAGE_ABANDON +#if (MI_STAT>0) +// maintain stats for huge 
objects +static void mi_stat_huge_free(const mi_page_t* page) { + mi_heap_t* const heap = mi_heap_get_default(); + const size_t bsize = mi_page_block_size(page); // to match stats in `page.c:mi_page_huge_alloc` + if (bsize <= MI_HUGE_OBJ_SIZE_MAX) { + mi_heap_stat_decrease(heap, huge, bsize); + } + else { + mi_heap_stat_decrease(heap, giant, bsize); + } +} +#else +static void mi_stat_huge_free(const mi_page_t* page) { + MI_UNUSED(page); +} +#endif +#endif From 9085596eab602d868129ec44b62a8f6ea7c40d16 Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 23 Mar 2024 08:57:29 -0700 Subject: [PATCH 063/119] update comment --- include/mimalloc/types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index c624e5b4..7ab0a325 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -297,8 +297,8 @@ typedef struct mi_page_s { mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) uint16_t used; // number of blocks in use (including blocks in `thread_free`) - uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift == block_size)` (used for quick block start finding for aligned pointers) - uint8_t block_offset_adj; // if not zero, then `(page_start - (uint8_t*)page - 8*(block_offset_adj-1)) % block_size == 0)` (used for quick block start finding for aligned pointers) + uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift == block_size)` (used for fast path in `free.c:_mi_page_ptr_unalign`) + uint8_t block_offset_adj; // if not zero, then `(page_start - (uint8_t*)page - 8*(block_offset_adj-1)) % block_size == 0)` (used for fast path in `free.c:_mi_page_ptr_unalign`) uint32_t xblock_size; // size available in each block (always `>0`) #if (MI_ENCODE_FREELIST || MI_PADDING) From 60c4a0fe5608603c2a7c0d45eaa1c73ca830c275 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sun, 24 Mar 2024 08:10:35 -0700 Subject: [PATCH 064/119] fix compilation warnings for new uint16_t size for used field --- include/mimalloc/internal.h | 10 ++--- include/mimalloc/types.h | 12 +++--- src/free.c | 75 ++++++++++++++++++++++--------------- src/init.c | 16 ++++---- src/page.c | 8 ++-- 5 files changed, 68 insertions(+), 53 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 72544c3d..3aad1ba4 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -30,7 +30,7 @@ terms of the MIT license. A copy of the license can be found in the file #define mi_decl_noinline __declspec(noinline) #define mi_decl_thread __declspec(thread) #define mi_decl_cache_align __declspec(align(MI_CACHE_LINE)) -#define mi_decl_weak +#define mi_decl_weak #elif (defined(__GNUC__) && (__GNUC__ >= 3)) || defined(__clang__) // includes clang and icc #define mi_decl_noinline __attribute__((noinline)) #define mi_decl_thread __thread @@ -40,7 +40,7 @@ terms of the MIT license. 
A copy of the license can be found in the file #define mi_decl_noinline #define mi_decl_thread __thread // hope for the best :-) #define mi_decl_cache_align -#define mi_decl_weak +#define mi_decl_weak #endif #if defined(__EMSCRIPTEN__) && !defined(__wasi__) @@ -91,7 +91,7 @@ void _mi_thread_data_collect(void); // os.c void _mi_os_init(void); // called from process init -void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* stats); +void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* stats); void _mi_os_free(void* p, size_t size, mi_memid_t memid, mi_stats_t* stats); void _mi_os_free_ex(void* p, size_t size, bool still_committed, mi_memid_t memid, mi_stats_t* stats); @@ -132,8 +132,8 @@ void _mi_arena_segment_mark_abandoned(mi_segment_t* segment); size_t _mi_arena_segment_abandoned_count(void); typedef struct mi_arena_field_cursor_s { // abstract - mi_arena_id_t start; - int count; + mi_arena_id_t start; + int count; size_t bitmap_idx; } mi_arena_field_cursor_t; void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_arena_field_cursor_t* current); diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 7ab0a325..ad0aabe9 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -300,14 +300,14 @@ typedef struct mi_page_s { uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift == block_size)` (used for fast path in `free.c:_mi_page_ptr_unalign`) uint8_t block_offset_adj; // if not zero, then `(page_start - (uint8_t*)page - 8*(block_offset_adj-1)) % block_size == 0)` (used for fast path in `free.c:_mi_page_ptr_unalign`) uint32_t xblock_size; // size available in each block (always `>0`) - + #if (MI_ENCODE_FREELIST || MI_PADDING) uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`) or padding canary - #endif + #endif _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads _Atomic(uintptr_t) xheap; - + struct mi_page_s* next; // next page owned by the heap with the same `block_size` struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` } mi_page_t; @@ -373,7 +373,7 @@ typedef struct mi_segment_s { bool allow_decommit; bool allow_purge; size_t segment_size; // for huge pages this may be different from `MI_SEGMENT_SIZE` - + // segment fields struct mi_segment_s* next; // must be the first segment field after abandoned_next -- see `segment.c:segment_init` struct mi_segment_s* prev; @@ -450,7 +450,7 @@ struct mi_heap_s { mi_tld_t* tld; _Atomic(mi_block_t*) thread_delayed_free; mi_threadid_t thread_id; // thread this heap belongs too - mi_arena_id_t arena_id; // arena id if the heap belongs to a specific arena (or 0) + mi_arena_id_t arena_id; // arena id if the heap belongs to a specific arena (or 0) uintptr_t cookie; // random cookie to verify pointers (see `_mi_ptr_cookie`) uintptr_t keys[2]; // two random keys used to encode the `thread_delayed_free` list mi_random_ctx_t random; // random number context used for secure allocation @@ -460,7 +460,7 @@ struct mi_heap_s { mi_heap_t* next; // list of heaps per thread bool no_reclaim; // `true` if this heap should not reclaim abandoned pages mi_page_t* pages_free_direct[MI_PAGES_DIRECT]; // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size. 
- mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin") + mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin") }; diff --git a/src/free.c b/src/free.c index e7547aa3..4e031028 100644 --- a/src/free.c +++ b/src/free.c @@ -6,6 +6,11 @@ terms of the MIT license. A copy of the license can be found in the file -----------------------------------------------------------------------------*/ #if !defined(MI_IN_ALLOC_C) #error "this file should be included from 'alloc.c' (so aliases can work from alloc-override)" +// add includes help an IDE +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "mimalloc/atomic.h" +#include "mimalloc/prim.h" // _mi_prim_thread_id() #endif // forward declarations @@ -26,7 +31,7 @@ static mi_decl_noinline void mi_free_block_mt(mi_segment_t* segment, mi_page_t* // fast path written carefully to prevent spilling on the stack static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool check_full) { - // owning thread can free a block directly + // checks if mi_unlikely(mi_check_is_double_free(page, block)) return; mi_check_padding(page, block); mi_stat_free(page, block); @@ -34,47 +39,57 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); #endif mi_track_free_size(p, mi_page_usable_size_of(page,block)); // faster then mi_usable_size as we already know the page and that p is unaligned + + // actual free: push on the local free list mi_block_set_next(page, block, page->local_free); page->local_free = block; - const uint32_t used = page->used - 1; - page->used = used; - if mi_unlikely(used == 0) { // generates better code than: --page->used == 0 + if mi_unlikely(--page->used == 0) { _mi_page_retire(page); } else if mi_unlikely(check_full && mi_page_is_in_full(page)) { _mi_page_unfull(page); - } + } } // Adjust a block that was allocated aligned, to the actual start of the block in the page. mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p) { mi_assert_internal(page!=NULL && p!=NULL); - const size_t diff = (mi_likely(page->block_offset_adj != 0) - ? (uint8_t*)p - (uint8_t*)page - 8*(page->block_offset_adj-1) - : (uint8_t*)p - _mi_page_start(segment, page, NULL)); - - const size_t adjust = (mi_likely(page->block_size_shift != 0) - ? diff & (((size_t)1 << page->block_size_shift) - 1) - : diff % mi_page_block_size(page)); + + size_t diff; + if mi_likely(page->block_offset_adj != 0) { + diff = (uint8_t*)p - (uint8_t*)page - 8 * (page->block_offset_adj - 1); + } + else { + diff = (uint8_t*)p - _mi_page_start(segment, page, NULL); + } + + size_t adjust; + if mi_likely(page->block_size_shift != 0) { + adjust = diff & (((size_t)1 << page->block_size_shift) - 1); + } + else { + adjust = diff % mi_page_block_size(page); + } + return (mi_block_t*)((uintptr_t)p - adjust); } -// free a local pointer -static void mi_decl_noinline mi_free_generic_local(mi_segment_t* segment, mi_page_t* page, void* p) mi_attr_noexcept { +// free a local pointer (page parameter comes first for better codegen) +static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept { mi_block_t* const block = (mi_page_has_aligned(page) ? 
_mi_page_ptr_unalign(segment, page, p) : (mi_block_t*)p); mi_free_block_local(page, block, true); } -// free a pointer owned by another thread -static void mi_decl_noinline mi_free_generic_mt(mi_segment_t* segment, mi_page_t* page, void* p) mi_attr_noexcept { +// free a pointer owned by another thread (page parameter comes first for better codegen) +static void mi_decl_noinline mi_free_generic_mt(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept { mi_block_t* const block = _mi_page_ptr_unalign(segment, page, p); // don't check `has_aligned` flag to avoid a race (issue #865) mi_free_block_mt(segment, page, block); } // generic free (for runtime integration) void mi_decl_noinline _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept { - if (is_local) mi_free_generic_local(segment,page,p); - else mi_free_generic_mt(segment,page,p); + if (is_local) mi_free_generic_local(page,segment,p); + else mi_free_generic_mt(page,segment,p); } // Get the segment data belonging to a pointer @@ -127,16 +142,16 @@ void mi_free(void* p) mi_attr_noexcept if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) // thread-local, aligned, and not a full page mi_block_t* const block = (mi_block_t*)p; - mi_free_block_local(page,block,false /* no need to check if the page is full */); + mi_free_block_local(page, block, false /* no need to check if the page is full */); } else { // page is full or contains (inner) aligned blocks; use generic path - mi_free_generic_local(segment, page, p); + mi_free_generic_local(page, segment, p); } } else { // not thread-local; use generic path - mi_free_generic_mt(segment, page, p); + mi_free_generic_mt(page, segment, p); } } @@ -174,7 +189,7 @@ bool _mi_free_delayed_block(mi_block_t* block) { // the owning thread in `_mi_free_delayed_block`. static void mi_decl_noinline mi_free_block_delayed_mt( mi_page_t* page, mi_block_t* block ) { - // Try to put the block on either the page-local thread free list, + // Try to put the block on either the page-local thread free list, // or the heap delayed free list (if this is the first non-local free in that page) mi_thread_free_t tfreex; bool use_delayed; @@ -217,17 +232,17 @@ static void mi_decl_noinline mi_free_block_delayed_mt( mi_page_t* page, mi_block #if MI_HUGE_PAGE_ABANDON static void mi_stat_huge_free(const mi_page_t* page); -#endif +#endif // Multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) static void mi_decl_noinline mi_free_block_mt(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) { // first see if the segment was abandoned and if we can reclaim it into our thread - if (mi_option_is_enabled(mi_option_abandoned_reclaim_on_free) && + if (mi_option_is_enabled(mi_option_abandoned_reclaim_on_free) && #if MI_HUGE_PAGE_ABANDON - segment->page_kind != MI_PAGE_HUGE && + segment->page_kind != MI_PAGE_HUGE && #endif - mi_atomic_load_relaxed(&segment->thread_id) == 0) + mi_atomic_load_relaxed(&segment->thread_id) == 0) { // the segment is abandoned, try to reclaim it into our heap if (_mi_segment_attempt_reclaim(mi_heap_get_default(), segment)) { @@ -240,13 +255,13 @@ static void mi_decl_noinline mi_free_block_mt(mi_segment_t* segment, mi_page_t* // The padding check may access the non-thread-owned page for the key values. 
// that is safe as these are constant and the page won't be freed (as the block is not freed yet). mi_check_padding(page, block); - + // adjust stats (after padding check and potential recursive `mi_free` above) mi_stat_free(page, block); // stat_free may access the padding mi_track_free_size(block, mi_page_usable_size_of(page,block)); - + // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection - _mi_padding_shrink(page, block, sizeof(mi_block_t)); + _mi_padding_shrink(page, block, sizeof(mi_block_t)); if (segment->page_kind == MI_PAGE_HUGE) { #if MI_HUGE_PAGE_ABANDON @@ -266,7 +281,7 @@ static void mi_decl_noinline mi_free_block_mt(mi_segment_t* segment, mi_page_t* memset(block, MI_DEBUG_FREED, mi_usable_size(block)); #endif } - + // and finally free the actual block by pushing it on the owning heap // thread_delayed free list (or heap delayed free list) mi_free_block_delayed_mt(page,block); diff --git a/src/init.c b/src/init.c index 11471760..8a20daca 100644 --- a/src/init.c +++ b/src/init.c @@ -224,7 +224,7 @@ static mi_thread_data_t* mi_thread_data_zalloc(void) { is_zero = memid.initially_zero; } } - + if (td != NULL && !is_zero) { _mi_memzero_aligned(td, offsetof(mi_thread_data_t,memid)); } @@ -399,23 +399,23 @@ void mi_thread_done(void) mi_attr_noexcept { _mi_thread_done(NULL); } -void _mi_thread_done(mi_heap_t* heap) +void _mi_thread_done(mi_heap_t* heap) { // calling with NULL implies using the default heap - if (heap == NULL) { - heap = mi_prim_get_default_heap(); + if (heap == NULL) { + heap = mi_prim_get_default_heap(); if (heap == NULL) return; } // prevent re-entrancy through heap_done/heap_set_default_direct (issue #699) if (!mi_heap_is_initialized(heap)) { - return; + return; } // adjust stats mi_atomic_decrement_relaxed(&thread_count); _mi_stat_decrease(&_mi_stats_main.threads, 1); - + // check thread-id as on Windows shutdown with FLS the main (exit) thread may call this on thread-local heaps... if (heap->thread_id != _mi_thread_id()) return; @@ -437,7 +437,7 @@ void _mi_heap_set_default_direct(mi_heap_t* heap) { // ensure the default heap is passed to `_mi_thread_done` // setting to a non-NULL value also ensures `mi_thread_done` is called. - _mi_prim_thread_associate_default_heap(heap); + _mi_prim_thread_associate_default_heap(heap); } @@ -597,7 +597,7 @@ static void mi_cdecl mi_process_done(void) { // release any thread specific resources and ensure _mi_thread_done is called on all but the main thread _mi_prim_thread_done_auto_done(); - + #ifndef MI_SKIP_COLLECT_ON_EXIT #if (MI_DEBUG || !defined(MI_SHARED_LIB)) // free all memory if possible on process exit. 
This is not needed for a stand-alone process diff --git a/src/page.c b/src/page.c index 5930a430..8721a063 100644 --- a/src/page.c +++ b/src/page.c @@ -192,8 +192,8 @@ static void _mi_page_thread_free_collect(mi_page_t* page) if (head == NULL) return; // find the tail -- also to get a proper count (without data races) - uint32_t max_count = page->capacity; // cannot collect more than capacity - uint32_t count = 1; + size_t max_count = page->capacity; // cannot collect more than capacity + size_t count = 1; mi_block_t* tail = head; mi_block_t* next; while ((next = mi_block_next(page,tail)) != NULL && count <= max_count) { @@ -211,7 +211,7 @@ static void _mi_page_thread_free_collect(mi_page_t* page) page->local_free = head; // update counts now - page->used -= count; + page->used -= (uint16_t)count; } void _mi_page_free_collect(mi_page_t* page, bool force) { @@ -677,7 +677,7 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi } #endif if (_mi_is_power_of_two(block_size) && block_size > 0) { - page->block_size_shift = (uint32_t)(mi_ctz((uintptr_t)block_size)); + page->block_size_shift = (uint8_t)(mi_ctz((uintptr_t)block_size)); } const ptrdiff_t start_offset = (uint8_t*)page_start - (uint8_t*)page; const ptrdiff_t start_adjust = start_offset % block_size; From 4f809aadb7663d67758db84c12d2fcb8b877b46b Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sun, 24 Mar 2024 08:29:56 -0700 Subject: [PATCH 065/119] use free field for expiration instead of used --- src/free.c | 2 +- src/page.c | 23 ++++++++++---------- src/segment.c | 59 +++++++++++++++++++++++++++++++-------------------- 3 files changed, 49 insertions(+), 35 deletions(-) diff --git a/src/free.c b/src/free.c index 4e031028..9579eecb 100644 --- a/src/free.c +++ b/src/free.c @@ -57,7 +57,7 @@ mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* p size_t diff; if mi_likely(page->block_offset_adj != 0) { - diff = (uint8_t*)p - (uint8_t*)page - 8 * (page->block_offset_adj - 1); + diff = (uint8_t*)p - (uint8_t*)page - (8*(page->block_offset_adj - 1)); } else { diff = (uint8_t*)p - _mi_page_start(segment, page, NULL); diff --git a/src/page.c b/src/page.c index 8721a063..d9e416b2 100644 --- a/src/page.c +++ b/src/page.c @@ -261,7 +261,7 @@ void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) { #if MI_HUGE_PAGE_ABANDON mi_assert_internal(_mi_page_segment(page)->page_kind != MI_PAGE_HUGE); #endif - + // TODO: push on full queue immediately if it is full? 
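A note on the `block_offset_adj` fast path adjusted in free.c above: the field stores `(page_start - page) % block_size` in units of 8 bytes, offset by one so that zero means "not usable" (later patches in this series switch the unit from 8 to MI_MAX_ALIGN_SIZE). A minimal standalone sketch of the encode/decode pair, with hypothetical helper names that are not patch code:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

// encode at page init: only when the remainder is a small multiple of 8
static uint8_t offset_adj_encode(ptrdiff_t start_adjust) {
  assert(start_adjust >= 0 && (start_adjust % 8) == 0 && (start_adjust / 8) < 255);
  return (uint8_t)((start_adjust / 8) + 1);
}

// decode in the unalign fast path: recover the remainder without a segment lookup
static ptrdiff_t offset_adj_decode(uint8_t adj) {
  return 8 * ((ptrdiff_t)adj - 1);
}
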
mi_page_queue_t* pq = mi_page_queue(heap, mi_page_block_size(page)); mi_page_queue_push(heap, pq, page); @@ -676,16 +676,17 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi mi_assert_expensive(!page->is_zero_init || mi_mem_is_zero(page_start, page_size)); } #endif - if (_mi_is_power_of_two(block_size) && block_size > 0) { + if (block_size > 0 && _mi_is_power_of_two(block_size)) { page->block_size_shift = (uint8_t)(mi_ctz((uintptr_t)block_size)); } - const ptrdiff_t start_offset = (uint8_t*)page_start - (uint8_t*)page; - const ptrdiff_t start_adjust = start_offset % block_size; - if (start_offset >= 0 && (start_adjust % 8) == 0 && (start_adjust/8) < 255) { - page->block_offset_adj = (uint8_t)((start_adjust/8) + 1); + if (block_size > 0) { + const ptrdiff_t start_offset = (uint8_t*)page_start - (uint8_t*)page; + const ptrdiff_t start_adjust = start_offset % block_size; + if (start_offset >= 0 && (start_adjust % 8) == 0 && (start_adjust/8) < 255) { + page->block_offset_adj = (uint8_t)((start_adjust/8) + 1); + } } - - + mi_assert_internal(page->capacity == 0); mi_assert_internal(page->free == NULL); mi_assert_internal(page->used == 0); @@ -723,7 +724,7 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p while (page != NULL) { mi_page_t* next = page->next; // remember next - #if MI_STAT + #if MI_STAT count++; #endif @@ -880,7 +881,7 @@ static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignme else { // otherwise find a page with free blocks in our size segregated queues #if MI_PADDING - mi_assert_internal(size >= MI_PADDING_SIZE); + mi_assert_internal(size >= MI_PADDING_SIZE); #endif return mi_find_free_page(heap, size); } @@ -896,7 +897,7 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al // initialize if necessary if mi_unlikely(!mi_heap_is_initialized(heap)) { - heap = mi_heap_get_default(); // calls mi_thread_init + heap = mi_heap_get_default(); // calls mi_thread_init if mi_unlikely(!mi_heap_is_initialized(heap)) { return NULL; } } mi_assert_internal(mi_heap_is_initialized(heap)); diff --git a/src/segment.c b/src/segment.c index a6522028..7d406a96 100644 --- a/src/segment.c +++ b/src/segment.c @@ -237,12 +237,12 @@ static void mi_page_purge(mi_segment_t* segment, mi_page_t* page, mi_segments_tl mi_assert_internal(!page->segment_in_use); if (!segment->allow_purge) return; mi_assert_internal(page->used == 0); + mi_assert_internal(page->free == NULL); mi_assert_expensive(!mi_pages_purge_contains(page, tld)); size_t psize; void* start = mi_segment_raw_page_start(segment, page, &psize); const bool needs_recommit = _mi_os_purge(start, psize, tld->stats); if (needs_recommit) { page->is_committed = false; } - page->used = 0; } static bool mi_page_ensure_committed(mi_segment_t* segment, mi_page_t* page, mi_segments_tld_t* tld) { @@ -258,6 +258,7 @@ static bool mi_page_ensure_committed(mi_segment_t* segment, mi_page_t* page, mi_ if (!ok) return false; // failed to commit! page->is_committed = true; page->used = 0; + page->free = NULL; page->is_zero_init = is_zero; if (gsize > 0) { mi_segment_protect_range(start + psize, gsize, true); @@ -270,18 +271,30 @@ static bool mi_page_ensure_committed(mi_segment_t* segment, mi_page_t* page, mi_ The free page queue ----------------------------------------------------------- */ -// we re-use the `used` field for the expiration counter. 
Since this is a -// a 32-bit field while the clock is always 64-bit we need to guard -// against overflow, we use substraction to check for expiry which work +// we re-use the `free` field for the expiration counter. Since this is a +// a pointer size field while the clock is always 64-bit we need to guard +// against overflow, we use substraction to check for expiry which works // as long as the reset delay is under (2^30 - 1) milliseconds (~12 days) -static void mi_page_purge_set_expire(mi_page_t* page) { - mi_assert_internal(page->used == 0); - uint32_t expire = (uint32_t)_mi_clock_now() + mi_option_get(mi_option_purge_delay); - page->used = expire; +static uint32_t mi_page_get_expire( mi_page_t* page ) { + return (uint32_t)((uintptr_t)page->free); } +static void mi_page_set_expire( mi_page_t* page, uint32_t expire ) { + page->free = (mi_block_t*)((uintptr_t)expire); +} + +static void mi_page_purge_set_expire(mi_page_t* page) { + mi_assert_internal(mi_page_get_expire(page)==0); + uint32_t expire = (uint32_t)_mi_clock_now() + mi_option_get(mi_option_purge_delay); + mi_page_set_expire(page, expire); +} + +// we re-use the `free` field for the expiration counter. Since this is a +// a pointer size field while the clock is always 64-bit we need to guard +// against overflow, we use substraction to check for expiry which work +// as long as the reset delay is under (2^30 - 1) milliseconds (~12 days) static bool mi_page_purge_is_expired(mi_page_t* page, mi_msecs_t now) { - int32_t expire = (int32_t)(page->used); + int32_t expire = (int32_t)mi_page_get_expire(page); return (((int32_t)now - expire) >= 0); } @@ -320,14 +333,14 @@ static void mi_page_purge_remove(mi_page_t* page, mi_segments_tld_t* tld) { mi_page_queue_t* pq = &tld->pages_purge; mi_assert_internal(pq!=NULL); mi_assert_internal(!page->segment_in_use); - mi_assert_internal(page->used != 0); + mi_assert_internal(mi_page_get_expire(page) != 0); mi_assert_internal(mi_pages_purge_contains(page, tld)); if (page->prev != NULL) page->prev->next = page->next; if (page->next != NULL) page->next->prev = page->prev; if (page == pq->last) pq->last = page->prev; if (page == pq->first) pq->first = page->next; page->next = page->prev = NULL; - page->used = 0; + mi_page_set_expire(page,0); } static void mi_segment_remove_all_purges(mi_segment_t* segment, bool force_purge, mi_segments_tld_t* tld) { @@ -493,7 +506,7 @@ static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_se } MI_UNUSED(fully_committed); mi_assert_internal((fully_committed && committed_size == segment_size) || (!fully_committed && committed_size < segment_size)); - + _mi_abandoned_await_readers(); // prevent ABA issue if concurrent readers try to access our memory (that might be purged) _mi_arena_free(segment, segment_size, committed_size, segment->memid, tld->stats); } @@ -592,7 +605,7 @@ static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind, if (segment == NULL) return NULL; mi_assert_internal(segment != NULL && (uintptr_t)segment % MI_SEGMENT_SIZE == 0); mi_assert_internal(segment->memid.is_pinned ? segment->memid.initially_committed : true); - + // zero the segment info (but not the `mem` fields) ptrdiff_t ofs = offsetof(mi_segment_t, next); _mi_memzero((uint8_t*)segment + ofs, info_size - ofs); @@ -746,21 +759,21 @@ Abandonment When threads terminate, they can leave segments with live blocks (reached through other threads). 
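The subtraction-based expiry test above deserves a small worked example: the 64-bit clock is truncated to 32 bits, the difference is computed modulo 2^32, and reading that difference as signed keeps the comparison correct across wrap-around as long as the real distance between `now` and the expiration time stays well below 2^31 milliseconds. A standalone sketch, not part of the patch, assuming the usual two's-complement conversion:

#include <stdint.h>
#include <stdio.h>

static int expired(uint32_t now32, uint32_t expire32) {
  return (int32_t)(now32 - expire32) >= 0;        // modular difference, read as signed
}

int main(void) {
  const uint32_t expire = 0xFFFFFFF0u;            // expiry set just before the 32-bit clock wraps
  printf("%d\n", expired(0xFFFFFFF5u, expire));   // 1: a few ms past the expiry
  printf("%d\n", expired(0x00000010u, expire));   // 1: past the expiry, clock has wrapped
  printf("%d\n", expired(0xFFFFFF00u, expire));   // 0: still 240 ms before the expiry
  return 0;
}
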
Such segments are "abandoned" and will be reclaimed by other threads to -reuse their pages and/or free them eventually. The +reuse their pages and/or free them eventually. The `thread_id` of such segments is 0. When a block is freed in an abandoned segment, the segment -is reclaimed into that thread. +is reclaimed into that thread. Moreover, if threads are looking for a fresh segment, they will first consider abondoned segments -- these can be found -by scanning the arena memory -(segments outside arena memoryare only reclaimed by a free). +by scanning the arena memory +(segments outside arena memoryare only reclaimed by a free). ----------------------------------------------------------- */ // legacy: Wait until there are no more pending reads on segments that used to be in the abandoned list void _mi_abandoned_await_readers(void) { - // nothing needed + // nothing needed } /* ----------------------------------------------------------- @@ -914,12 +927,12 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, // attempt to reclaim a particular segment (called from multi threaded free `alloc.c:mi_free_block_mt`) bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment) { - if (mi_atomic_load_relaxed(&segment->thread_id) != 0) return false; // it is not abandoned + if (mi_atomic_load_relaxed(&segment->thread_id) != 0) return false; // it is not abandoned // don't reclaim more from a free than half the current segments // this is to prevent a pure free-ing thread to start owning too many segments - if (heap->tld->segments.reclaim_count * 2 > heap->tld->segments.count) return false; + if (heap->tld->segments.reclaim_count * 2 > heap->tld->segments.count) return false; if (_mi_arena_segment_clear_abandoned(segment)) { // atomically unabandon - mi_segment_t* res = mi_segment_reclaim(segment, heap, 0, NULL, &heap->tld->segments); + mi_segment_t* res = mi_segment_reclaim(segment, heap, 0, NULL, &heap->tld->segments); mi_assert_internal(res == segment); return (res != NULL); } @@ -946,11 +959,11 @@ static long mi_segment_get_reclaim_tries(void) { static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size, mi_page_kind_t page_kind, bool* reclaimed, mi_segments_tld_t* tld) { - *reclaimed = false; + *reclaimed = false; mi_segment_t* segment; mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap,¤t); long max_tries = mi_segment_get_reclaim_tries(); - while ((max_tries-- > 0) && ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL)) + while ((max_tries-- > 0) && ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL)) { segment->abandoned_visits++; // todo: an arena exclusive heap will potentially visit many abandoned unsuitable segments From ea6137a5017a407ffedafd2757ee6d4a840668fc Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sun, 24 Mar 2024 09:01:58 -0700 Subject: [PATCH 066/119] use MI_MAX_ALIGN_SIZE to adjust block_offset_adj --- include/mimalloc/types.h | 4 ++-- src/page.c | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index ad0aabe9..5bc49aa0 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -297,8 +297,8 @@ typedef struct mi_page_s { mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) uint16_t used; // number of blocks in use (including blocks in `thread_free`) - uint8_t 
block_size_shift; // if not zero, then `(1 << block_size_shift == block_size)` (used for fast path in `free.c:_mi_page_ptr_unalign`) - uint8_t block_offset_adj; // if not zero, then `(page_start - (uint8_t*)page - 8*(block_offset_adj-1)) % block_size == 0)` (used for fast path in `free.c:_mi_page_ptr_unalign`) + uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`) + uint8_t block_offset_adj; // if not zero, then `(mi_page_start(_,page,_) - (uint8_t*)page - MI_MAX_ALIGN_SIZE*(block_offset_adj-1)) % block_size == 0)` (only used for fast path in `free.c:_mi_page_ptr_unalign`) uint32_t xblock_size; // size available in each block (always `>0`) #if (MI_ENCODE_FREELIST || MI_PADDING) diff --git a/src/page.c b/src/page.c index d9e416b2..912f969a 100644 --- a/src/page.c +++ b/src/page.c @@ -682,8 +682,10 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi if (block_size > 0) { const ptrdiff_t start_offset = (uint8_t*)page_start - (uint8_t*)page; const ptrdiff_t start_adjust = start_offset % block_size; - if (start_offset >= 0 && (start_adjust % 8) == 0 && (start_adjust/8) < 255) { - page->block_offset_adj = (uint8_t)((start_adjust/8) + 1); + if (start_offset >= 0 && (start_adjust % MI_MAX_ALIGN_SIZE) == 0 && (start_adjust / MI_MAX_ALIGN_SIZE) < 255) { + const ptrdiff_t adjust = (start_adjust / MI_MAX_ALIGN_SIZE); + mi_assert_internal(adjust + 1 == (uint8_t)(adjust + 1)); + page->block_offset_adj = (uint8_t)(adjust + 1); } } @@ -700,7 +702,7 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi mi_assert_internal(page->keys[1] != 0); #endif mi_assert_internal(page->block_size_shift == 0 || (block_size == (1UL << page->block_size_shift))); - mi_assert_internal(page->block_offset_adj == 0 || (((uint8_t*)page_start - (uint8_t*)page - 8*(page->block_offset_adj-1))) % block_size == 0); + mi_assert_internal(page->block_offset_adj == 0 || (((uint8_t*)page_start - (uint8_t*)page - MI_MAX_ALIGN_SIZE*(page->block_offset_adj-1))) % block_size == 0); mi_assert_expensive(mi_page_is_valid_init(page)); // initialize an initial free list From 86475a7b9bbd5fdd756cd89a579c3d8368708e2f Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sun, 24 Mar 2024 09:03:01 -0700 Subject: [PATCH 067/119] use MI_MAX_ALIGN_SIZE to adjust block_offset_adj --- src/free.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/free.c b/src/free.c index 9579eecb..7a5a7806 100644 --- a/src/free.c +++ b/src/free.c @@ -57,7 +57,7 @@ mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* p size_t diff; if mi_likely(page->block_offset_adj != 0) { - diff = (uint8_t*)p - (uint8_t*)page - (8*(page->block_offset_adj - 1)); + diff = (uint8_t*)p - (uint8_t*)page - (MI_MAX_ALIGN_SIZE*(page->block_offset_adj - 1)); } else { diff = (uint8_t*)p - _mi_page_start(segment, page, NULL); From 07ae64bd81bc411d740e93f8f98551150e769d1f Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sun, 24 Mar 2024 10:39:22 -0700 Subject: [PATCH 068/119] faster check for NULL in mi_free by combining with masking --- CMakeLists.txt | 21 ++++++++++++++++++--- include/mimalloc/internal.h | 11 +++++++++-- src/free.c | 13 +++++++------ 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0cc7e575..7c0f67af 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -84,6 +84,17 @@ endif() # Process options # 
----------------------------------------------------------------------------- +# put -Wall early so other warnings can be disabled selectively +if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang") + list(APPEND mi_cflags -Wall -Wextra -Wpedantic) +endif() +if(CMAKE_C_COMPILER_ID MATCHES "GNU") + list(APPEND mi_cflags -Wall -Wextra) +endif() +if(CMAKE_C_COMPILER_ID MATCHES "Intel") + list(APPEND mi_cflags -Wall) +endif() + if(CMAKE_C_COMPILER_ID MATCHES "MSVC|Intel") set(MI_USE_CXX "ON") endif() @@ -186,6 +197,10 @@ endif() if(MI_SEE_ASM) message(STATUS "Generate assembly listings (MI_SEE_ASM=ON)") list(APPEND mi_cflags -save-temps) + if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang") + message(STATUS "No GNU Line marker") + list(APPEND mi_cflags -Wno-gnu-line-marker) + endif() endif() if(MI_CHECK_FULL) @@ -279,17 +294,17 @@ endif() # Compiler flags if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU") - list(APPEND mi_cflags -Wall -Wextra -Wno-unknown-pragmas -fvisibility=hidden) + list(APPEND mi_cflags -Wno-unknown-pragmas -fvisibility=hidden) if(NOT MI_USE_CXX) list(APPEND mi_cflags -Wstrict-prototypes) endif() if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang") - list(APPEND mi_cflags -Wpedantic -Wno-static-in-inline) + list(APPEND mi_cflags -Wno-static-in-inline) endif() endif() if(CMAKE_C_COMPILER_ID MATCHES "Intel") - list(APPEND mi_cflags -Wall -fvisibility=hidden) + list(APPEND mi_cflags -fvisibility=hidden) endif() if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM_NAME MATCHES "Haiku") diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 3aad1ba4..02200594 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -416,13 +416,19 @@ static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t si // Large aligned blocks may be aligned at N*MI_SEGMENT_SIZE (inside a huge segment > MI_SEGMENT_SIZE), // and we need align "down" to the segment info which is `MI_SEGMENT_SIZE` bytes before it; // therefore we align one byte before `p`. +// We check for NULL afterwards on 64-bit systems to improve codegen for `mi_free`. static inline mi_segment_t* _mi_ptr_segment(const void* p) { - mi_assert_internal(p != NULL); - return (mi_segment_t*)(((uintptr_t)p - 1) & ~MI_SEGMENT_MASK); + mi_segment_t* const segment = (mi_segment_t*)(((uintptr_t)p - 1) & ~MI_SEGMENT_MASK); + #if MI_INTPTR_SIZE <= 4 + return (p==NULL ? NULL : segment); + #else + return ((intptr_t)segment <= 0 ? 
NULL : segment); + #endif } // Segment belonging to a page static inline mi_segment_t* _mi_page_segment(const mi_page_t* page) { + mi_assert_internal(page!=NULL); mi_segment_t* segment = _mi_ptr_segment(page); mi_assert_internal(segment == NULL || page == &segment->pages[page->segment_idx]); return segment; @@ -454,6 +460,7 @@ static inline uint8_t* _mi_page_start(const mi_segment_t* segment, const mi_page // Get the page containing the pointer static inline mi_page_t* _mi_ptr_page(void* p) { + mi_assert_internal(p!=NULL); return _mi_segment_page_of(_mi_ptr_segment(p), p); } diff --git a/src/free.c b/src/free.c index 7a5a7806..0e560e53 100644 --- a/src/free.c +++ b/src/free.c @@ -98,7 +98,6 @@ void mi_decl_noinline _mi_free_generic(mi_segment_t* segment, mi_page_t* page, b static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* msg) { MI_UNUSED(msg); - mi_assert(p != NULL); #if (MI_DEBUG>0) if mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0) { @@ -108,7 +107,7 @@ static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* ms #endif mi_segment_t* const segment = _mi_ptr_segment(p); - mi_assert_internal(segment != NULL); + if mi_unlikely(segment==NULL) return segment; #if (MI_DEBUG>0) if mi_unlikely(!mi_is_in_heap_region(p)) { @@ -133,10 +132,11 @@ static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* ms // Fast path written carefully to prevent register spilling on the stack void mi_free(void* p) mi_attr_noexcept { - if mi_unlikely(p == NULL) return; mi_segment_t* const segment = mi_checked_ptr_segment(p,"mi_free"); - const bool is_local= (_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); - mi_page_t* const page = _mi_segment_page_of(segment, p); + if mi_unlikely(segment==NULL) return; + + const bool is_local = (_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); + mi_page_t* const page = _mi_segment_page_of(segment, p); if mi_likely(is_local) { // thread-local free? 
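  // A worked illustration of the sign-based NULL check added to `_mi_ptr_segment` by this
  // patch (illustration only, assuming mainstream 64-bit platforms where user-space
  // addresses keep the top bit clear):
  //   p == NULL:  ((uintptr_t)NULL - 1) & ~MI_SEGMENT_MASK  ==  UINTPTR_MAX & ~MI_SEGMENT_MASK,
  //               which has the top bit set, so `(intptr_t)segment <= 0` holds and
  //               `_mi_ptr_segment` returns NULL -- caught by the early return above.
  //   p valid:    the computed segment base is a positive address and is returned unchanged.
  // This folds the NULL test into masking work `mi_free` performs anyway, replacing the
  // up-front `if (p == NULL) return;` branch that this patch removes.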
if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) @@ -158,6 +158,7 @@ void mi_free(void* p) mi_attr_noexcept // return true if successful bool _mi_free_delayed_block(mi_block_t* block) { // get segment and page + mi_assert_internal(block!=NULL); const mi_segment_t* const segment = _mi_ptr_segment(block); mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie); mi_assert_internal(_mi_thread_id() == segment->thread_id); @@ -302,8 +303,8 @@ static size_t mi_decl_noinline mi_page_usable_aligned_size_of(const mi_segment_t } static inline size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noexcept { - if (p == NULL) return 0; const mi_segment_t* const segment = mi_checked_ptr_segment(p, msg); + if mi_unlikely(segment==NULL) return 0; const mi_page_t* const page = _mi_segment_page_of(segment, p); if mi_likely(!mi_page_has_aligned(page)) { const mi_block_t* block = (const mi_block_t*)p; From f141ca12a49da87b7740a9cbd67ebf14d31d145c Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sun, 24 Mar 2024 10:53:09 -0700 Subject: [PATCH 069/119] add extra runtime check to ensure we never insert large or huge pages in the segment free queue (issue #870) --- src/segment.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/segment.c b/src/segment.c index 7d406a96..e7e7d2cc 100644 --- a/src/segment.c +++ b/src/segment.c @@ -746,8 +746,10 @@ void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld) } else if (segment->used + 1 == segment->capacity) { mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM); // for now we only support small and medium pages - // move back to segments free list - mi_segment_insert_in_free_queue(segment,tld); + if (segment->page_kind <= MI_PAGE_MEDIUM) { + // move back to segments free list + mi_segment_insert_in_free_queue(segment,tld); + } } } } From 6688b45fbdd2b3091a852e6db37454a3a7897061 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sun, 24 Mar 2024 10:57:02 -0700 Subject: [PATCH 070/119] rename MI_ALIGNMENT_MAX to MI_BLOCK_ALIGNMENT_MAX for clarity --- doc/mimalloc-doc.h | 4 ++-- include/mimalloc/types.h | 4 ++-- src/alloc-aligned.c | 10 ++++----- src/os.c | 48 ++++++++++++++++++++-------------------- src/page.c | 2 +- src/segment.c | 2 +- test/test-api.c | 16 +++++++------- 7 files changed, 43 insertions(+), 43 deletions(-) diff --git a/doc/mimalloc-doc.h b/doc/mimalloc-doc.h index 01b13904..47a8a6b9 100644 --- a/doc/mimalloc-doc.h +++ b/doc/mimalloc-doc.h @@ -499,11 +499,11 @@ void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_m /// \{ /// The maximum supported alignment size (currently 1MiB). -#define MI_ALIGNMENT_MAX (1024*1024UL) +#define MI_BLOCK_ALIGNMENT_MAX (1024*1024UL) /// Allocate \a size bytes aligned by \a alignment. /// @param size number of bytes to allocate. -/// @param alignment the minimal alignment of the allocated memory. Must be less than #MI_ALIGNMENT_MAX. +/// @param alignment the minimal alignment of the allocated memory. Must be less than #MI_BLOCK_ALIGNMENT_MAX. /// @returns pointer to the allocated memory or \a NULL if out of memory. /// The returned pointer is aligned by \a alignment, i.e. /// `(uintptr_t)p % alignment == 0`. 
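For reference, a usage sketch of the limit being renamed here (illustrative, not part of the patch): alignments above MI_BLOCK_ALIGNMENT_MAX are still accepted, but are served from a dedicated huge segment rather than by over-allocating inside a normal page. As in test/test-api.c, the internal types header is included just to get the constant.

#include <assert.h>
#include <stdint.h>
#include "mimalloc.h"
#include "mimalloc/types.h"   // for MI_BLOCK_ALIGNMENT_MAX, as in test/test-api.c

int main(void) {
  // an alignment above MI_BLOCK_ALIGNMENT_MAX goes to a dedicated huge-page segment
  void* p = mi_malloc_aligned(1024, 2 * MI_BLOCK_ALIGNMENT_MAX);
  assert(p != NULL && ((uintptr_t)p % (2 * MI_BLOCK_ALIGNMENT_MAX)) == 0);
  mi_free(p);
  return 0;
}
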
diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 5bc49aa0..d088b305 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -193,8 +193,8 @@ typedef int32_t mi_ssize_t; // Used as a special value to encode block sizes in 32 bits. #define MI_HUGE_BLOCK_SIZE ((uint32_t)MI_HUGE_OBJ_SIZE_MAX) -// Alignments over MI_ALIGNMENT_MAX are allocated in dedicated huge page segments -#define MI_ALIGNMENT_MAX (MI_SEGMENT_SIZE >> 1) +// Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in dedicated huge page segments +#define MI_BLOCK_ALIGNMENT_MAX (MI_SEGMENT_SIZE >> 1) // ------------------------------------------------------ diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c index 0907811e..5f60b2fc 100644 --- a/src/alloc-aligned.c +++ b/src/alloc-aligned.c @@ -33,7 +33,7 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t* void* p; size_t oversize; - if mi_unlikely(alignment > MI_ALIGNMENT_MAX) { + if mi_unlikely(alignment > MI_BLOCK_ALIGNMENT_MAX) { // use OS allocation for very large alignment and allocate inside a huge page (dedicated segment with 1 page) // This can support alignments >= MI_SEGMENT_SIZE by ensuring the object can be aligned at a point in the // first (and single) page such that the segment info is `MI_SEGMENT_SIZE` bytes before it (so it can be found by aligning the pointer down) @@ -47,7 +47,7 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t* oversize = (size <= MI_SMALL_SIZE_MAX ? MI_SMALL_SIZE_MAX + 1 /* ensure we use generic malloc path */ : size); p = _mi_heap_malloc_zero_ex(heap, oversize, false, alignment); // the page block size should be large enough to align in the single huge page block // zero afterwards as only the area from the aligned_p may be committed! 
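The non-huge branch just below over-allocates and then aligns the returned pointer inside the larger block. The rounding it relies on is the usual power-of-two align-up; a hypothetical standalone helper (not the mimalloc-internal one) looks like this:

#include <stddef.h>
#include <stdint.h>

// align `p` up to `alignment`, which must be a power of two
static void* align_up_ptr(void* p, size_t alignment) {
  const uintptr_t mask = (uintptr_t)alignment - 1;
  return (void*)(((uintptr_t)p + mask) & ~mask);
}
// over-allocating roughly `size + alignment - 1` bytes guarantees that the
// aligned pointer still leaves `size` usable bytes inside the original block
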
- if (p == NULL) return NULL; + if (p == NULL) return NULL; } else { // otherwise over-allocate @@ -73,9 +73,9 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t* mi_assert_internal(((uintptr_t)aligned_p + offset) % alignment == 0); mi_assert_internal(mi_usable_size(aligned_p)>=size); mi_assert_internal(mi_usable_size(p) == mi_usable_size(aligned_p)+adjust); - + // now zero the block if needed - if (alignment > MI_ALIGNMENT_MAX) { + if (alignment > MI_BLOCK_ALIGNMENT_MAX) { // for the tracker, on huge aligned allocations only from the start of the large block is defined mi_track_mem_undefined(aligned_p, size); if (zero) { @@ -85,7 +85,7 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t* if (p != aligned_p) { mi_track_align(p,aligned_p,adjust,mi_usable_size(aligned_p)); - } + } return aligned_p; } diff --git a/src/os.c b/src/os.c index 21ab9243..09ae367d 100644 --- a/src/os.c +++ b/src/os.c @@ -29,7 +29,7 @@ bool _mi_os_has_overcommit(void) { return mi_os_mem_config.has_overcommit; } -bool _mi_os_has_virtual_reserve(void) { +bool _mi_os_has_virtual_reserve(void) { return mi_os_mem_config.has_virtual_reserve; } @@ -180,7 +180,7 @@ void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t me } } else { - // nothing to do + // nothing to do mi_assert(memid.memkind < MI_MEM_OS); } } @@ -203,25 +203,25 @@ static void* mi_os_prim_alloc(size_t size, size_t try_alignment, bool commit, bo if (!commit) { allow_large = false; } if (try_alignment == 0) { try_alignment = 1; } // avoid 0 to ensure there will be no divide by zero when aligning *is_zero = false; - void* p = NULL; + void* p = NULL; int err = _mi_prim_alloc(size, try_alignment, commit, allow_large, is_large, is_zero, &p); if (err != 0) { _mi_warning_message("unable to allocate OS memory (error: %d (0x%x), size: 0x%zx bytes, align: 0x%zx, commit: %d, allow large: %d)\n", err, err, size, try_alignment, commit, allow_large); } - + MI_UNUSED(tld_stats); mi_stats_t* stats = &_mi_stats_main; mi_stat_counter_increase(stats->mmap_calls, 1); if (p != NULL) { _mi_stat_increase(&stats->reserved, size); - if (commit) { - _mi_stat_increase(&stats->committed, size); + if (commit) { + _mi_stat_increase(&stats->committed, size); // seems needed for asan (or `mimalloc-test-api` fails) #ifdef MI_TRACK_ASAN if (*is_zero) { mi_track_mem_defined(p,size); } else { mi_track_mem_undefined(p,size); } #endif - } + } } return p; } @@ -258,7 +258,7 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit // over-allocate uncommitted (virtual) memory p = mi_os_prim_alloc(over_size, 1 /*alignment*/, false /* commit? */, false /* allow_large */, is_large, is_zero, stats); if (p == NULL) return NULL; - + // set p to the aligned part in the full region // note: this is dangerous on Windows as VirtualFree needs the actual base pointer // this is handled though by having the `base` field in the memid's @@ -274,7 +274,7 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit // overallocate... p = mi_os_prim_alloc(over_size, 1, commit, false, is_large, is_zero, stats); if (p == NULL) return NULL; - + // and selectively unmap parts around the over-allocated area. 
(noop on sbrk) void* aligned_p = mi_align_up_ptr(p, alignment); size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p; @@ -285,7 +285,7 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit if (post_size > 0) { mi_os_prim_free((uint8_t*)aligned_p + mid_size, post_size, commit, stats); } // we can return the aligned pointer on `mmap` (and sbrk) systems p = aligned_p; - *base = aligned_p; // since we freed the pre part, `*base == p`. + *base = aligned_p; // since we freed the pre part, `*base == p`. } } @@ -307,7 +307,7 @@ void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* stats) { void* p = mi_os_prim_alloc(size, 0, true, false, &os_is_large, &os_is_zero, stats); if (p != NULL) { *memid = _mi_memid_create_os(true, os_is_zero, os_is_large); - } + } return p; } @@ -318,7 +318,7 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo if (size == 0) return NULL; size = _mi_os_good_alloc_size(size); alignment = _mi_align_up(alignment, _mi_os_page_size()); - + bool os_is_large = false; bool os_is_zero = false; void* os_base = NULL; @@ -333,7 +333,7 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo /* ----------------------------------------------------------- OS aligned allocation with an offset. This is used - for large alignments > MI_ALIGNMENT_MAX. We use a large mimalloc + for large alignments > MI_BLOCK_ALIGNMENT_MAX. We use a large mimalloc page where the object can be aligned at an offset from the start of the segment. As we may need to overallocate, we need to free such pointers using `mi_free_aligned` to use the actual start of the memory region. @@ -396,7 +396,7 @@ static void* mi_os_page_align_area_conservative(void* addr, size_t size, size_t* bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats) { MI_UNUSED(tld_stats); - mi_stats_t* stats = &_mi_stats_main; + mi_stats_t* stats = &_mi_stats_main; if (is_zero != NULL) { *is_zero = false; } _mi_stat_increase(&stats->committed, size); // use size for precise commit vs. decommit _mi_stat_counter_increase(&stats->commit_calls, 1); @@ -406,21 +406,21 @@ bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats void* start = mi_os_page_align_areax(false /* conservative? 
*/, addr, size, &csize); if (csize == 0) return true; - // commit + // commit bool os_is_zero = false; - int err = _mi_prim_commit(start, csize, &os_is_zero); + int err = _mi_prim_commit(start, csize, &os_is_zero); if (err != 0) { _mi_warning_message("cannot commit OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", err, err, start, csize); return false; } - if (os_is_zero && is_zero != NULL) { + if (os_is_zero && is_zero != NULL) { *is_zero = true; mi_assert_expensive(mi_mem_is_zero(start, csize)); } // note: the following seems required for asan (otherwise `mimalloc-test-stress` fails) #ifdef MI_TRACK_ASAN if (os_is_zero) { mi_track_mem_defined(start,csize); } - else { mi_track_mem_undefined(start,csize); } + else { mi_track_mem_undefined(start,csize); } #endif return true; } @@ -434,11 +434,11 @@ static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit, mi_ // page align size_t csize; void* start = mi_os_page_align_area_conservative(addr, size, &csize); - if (csize == 0) return true; + if (csize == 0) return true; // decommit *needs_recommit = true; - int err = _mi_prim_decommit(start,csize,needs_recommit); + int err = _mi_prim_decommit(start,csize,needs_recommit); if (err != 0) { _mi_warning_message("cannot decommit OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", err, err, start, csize); } @@ -456,7 +456,7 @@ bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* tld_stats) { // but may be used later again. This will release physical memory // pages and reduce swapping while keeping the memory committed. // We page align to a conservative area inside the range to reset. -bool _mi_os_reset(void* addr, size_t size, mi_stats_t* stats) { +bool _mi_os_reset(void* addr, size_t size, mi_stats_t* stats) { // page align conservatively within the range size_t csize; void* start = mi_os_page_align_area_conservative(addr, size, &csize); @@ -476,7 +476,7 @@ bool _mi_os_reset(void* addr, size_t size, mi_stats_t* stats) { } -// either resets or decommits memory, returns true if the memory needs +// either resets or decommits memory, returns true if the memory needs // to be recommitted if it is to be re-used later on. bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, mi_stats_t* stats) { @@ -489,7 +489,7 @@ bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, mi_stats_t* stats) { bool needs_recommit = true; mi_os_decommit_ex(p, size, &needs_recommit, stats); - return needs_recommit; + return needs_recommit; } else { if (allow_reset) { // this can sometimes be not allowed if the range is not fully committed @@ -499,7 +499,7 @@ bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, mi_stats_t* stats) } } -// either resets or decommits memory, returns true if the memory needs +// either resets or decommits memory, returns true if the memory needs // to be recommitted if it is to be re-used later on. bool _mi_os_purge(void* p, size_t size, mi_stats_t * stats) { return _mi_os_purge_ex(p, size, true, stats); diff --git a/src/page.c b/src/page.c index 912f969a..63780d63 100644 --- a/src/page.c +++ b/src/page.c @@ -831,7 +831,7 @@ void mi_register_deferred_free(mi_deferred_free_fun* fn, void* arg) mi_attr_noex // Because huge pages contain just one block, and the segment contains // just that page, we always treat them as abandoned and any thread // that frees the block can free the whole page and segment directly. -// Huge pages are also use if the requested alignment is very large (> MI_ALIGNMENT_MAX). 
+// Huge pages are also use if the requested alignment is very large (> MI_BLOCK_ALIGNMENT_MAX). static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_alignment) { size_t block_size = _mi_os_good_alloc_size(size); mi_assert_internal(mi_bin(block_size) == MI_BIN_HUGE || page_alignment > 0); diff --git a/src/segment.c b/src/segment.c index e7e7d2cc..359815ce 100644 --- a/src/segment.c +++ b/src/segment.c @@ -1189,7 +1189,7 @@ void _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_bloc mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { mi_page_t* page; - if mi_unlikely(page_alignment > MI_ALIGNMENT_MAX) { + if mi_unlikely(page_alignment > MI_BLOCK_ALIGNMENT_MAX) { mi_assert_internal(_mi_is_power_of_two(page_alignment)); mi_assert_internal(page_alignment >= MI_SEGMENT_SIZE); //mi_assert_internal((MI_SEGMENT_SIZE % page_alignment) == 0); diff --git a/test/test-api.c b/test/test-api.c index 8dd24e1b..6dd2bc7f 100644 --- a/test/test-api.c +++ b/test/test-api.c @@ -34,7 +34,7 @@ we therefore test the API over various inputs. Please add more tests :-) #include "mimalloc.h" // #include "mimalloc/internal.h" -#include "mimalloc/types.h" // for MI_DEBUG and MI_ALIGNMENT_MAX +#include "mimalloc/types.h" // for MI_DEBUG and MI_BLOCK_ALIGNMENT_MAX #include "testhelper.h" @@ -59,7 +59,7 @@ bool mem_is_zero(uint8_t* p, size_t size) { // --------------------------------------------------------------------------- int main(void) { mi_option_disable(mi_option_verbose); - + // --------------------------------------------------- // Malloc // --------------------------------------------------- @@ -154,7 +154,7 @@ int main(void) { }; CHECK_BODY("malloc-aligned6") { bool ok = true; - for (size_t align = 1; align <= MI_ALIGNMENT_MAX && ok; align *= 2) { + for (size_t align = 1; align <= MI_BLOCK_ALIGNMENT_MAX && ok; align *= 2) { void* ps[8]; for (int i = 0; i < 8 && ok; i++) { ps[i] = mi_malloc_aligned(align*13 // size @@ -170,16 +170,16 @@ int main(void) { result = ok; }; CHECK_BODY("malloc-aligned7") { - void* p = mi_malloc_aligned(1024,MI_ALIGNMENT_MAX); + void* p = mi_malloc_aligned(1024,MI_BLOCK_ALIGNMENT_MAX); mi_free(p); - result = ((uintptr_t)p % MI_ALIGNMENT_MAX) == 0; + result = ((uintptr_t)p % MI_BLOCK_ALIGNMENT_MAX) == 0; }; CHECK_BODY("malloc-aligned8") { bool ok = true; for (int i = 0; i < 5 && ok; i++) { int n = (1 << i); - void* p = mi_malloc_aligned(1024, n * MI_ALIGNMENT_MAX); - ok = ((uintptr_t)p % (n*MI_ALIGNMENT_MAX)) == 0; + void* p = mi_malloc_aligned(1024, n * MI_BLOCK_ALIGNMENT_MAX); + ok = ((uintptr_t)p % (n*MI_BLOCK_ALIGNMENT_MAX)) == 0; mi_free(p); } result = ok; @@ -187,7 +187,7 @@ int main(void) { CHECK_BODY("malloc-aligned9") { bool ok = true; void* p[8]; - size_t sizes[8] = { 8, 512, 1024 * 1024, MI_ALIGNMENT_MAX, MI_ALIGNMENT_MAX + 1, 2 * MI_ALIGNMENT_MAX, 8 * MI_ALIGNMENT_MAX, 0 }; + size_t sizes[8] = { 8, 512, 1024 * 1024, MI_BLOCK_ALIGNMENT_MAX, MI_BLOCK_ALIGNMENT_MAX + 1, 2 * MI_BLOCK_ALIGNMENT_MAX, 8 * MI_BLOCK_ALIGNMENT_MAX, 0 }; for (int i = 0; i < 28 && ok; i++) { int align = (1 << i); for (int j = 0; j < 8 && ok; j++) { From b5665f0eec417a73a8abcdb00f3a95165b165527 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sun, 24 Mar 2024 14:17:17 -0700 Subject: [PATCH 071/119] add full block_size and page_start to page info --- ide/vs2022/mimalloc.vcxproj | 6 +++ ide/vs2022/mimalloc.vcxproj.filters | 3 ++ include/mimalloc/internal.h | 21 +++----- 
include/mimalloc/types.h | 31 +++++------ src/alloc.c | 10 ++-- src/free.c | 16 +----- src/heap.c | 14 ++--- src/init.c | 9 ++-- src/page-queue.c | 30 ++++++----- src/page.c | 83 ++++++++++++----------------- src/segment.c | 37 +++++++------ src/stats.c | 7 +-- 12 files changed, 119 insertions(+), 148 deletions(-) diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index 11da11c3..3e11d0fe 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -217,6 +217,12 @@ false + + true + true + true + true + diff --git a/ide/vs2022/mimalloc.vcxproj.filters b/ide/vs2022/mimalloc.vcxproj.filters index bb5c8ce9..a387f5a5 100644 --- a/ide/vs2022/mimalloc.vcxproj.filters +++ b/ide/vs2022/mimalloc.vcxproj.filters @@ -58,6 +58,9 @@ Sources + + Sources + diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 02200594..21dc9d62 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -147,7 +147,7 @@ void _mi_segment_map_freed_at(const mi_segment_t* segment); mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld, mi_os_tld_t* os_tld); void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld); void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld); -uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t block_size, size_t* page_size, size_t* pre_size); // page start for any page +uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size, size_t* pre_size); // page start for any page #if MI_HUGE_PAGE_ABANDON void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); @@ -452,10 +452,9 @@ static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const } // Quick page start for initialized pages -static inline uint8_t* _mi_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size) { - const size_t bsize = page->xblock_size; - mi_assert_internal(bsize > 0 && (bsize%sizeof(void*)) == 0); - return _mi_segment_page_start(segment, page, bsize, page_size, NULL); +static inline uint8_t* mi_page_start(const mi_page_t* page) { + mi_assert_internal(page->page_start != NULL); + return page->page_start; } // Get the page containing the pointer @@ -466,16 +465,8 @@ static inline mi_page_t* _mi_ptr_page(void* p) { // Get the block size of a page (special case for huge objects) static inline size_t mi_page_block_size(const mi_page_t* page) { - const size_t bsize = page->xblock_size; - mi_assert_internal(bsize > 0); - if mi_likely(bsize < MI_HUGE_BLOCK_SIZE) { - return bsize; - } - else { - size_t psize; - _mi_segment_page_start(_mi_page_segment(page), page, bsize, &psize, NULL); - return psize; - } + mi_assert_internal(page->block_size > 0); + return page->block_size; } static inline bool mi_page_is_huge(const mi_page_t* page) { diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index d088b305..69d59527 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -181,7 +181,6 @@ typedef int32_t mi_ssize_t; #define MI_MEDIUM_OBJ_SIZE_MAX (MI_MEDIUM_PAGE_SIZE/4) // 128KiB #define MI_LARGE_OBJ_SIZE_MAX (MI_LARGE_PAGE_SIZE/2) // 2MiB #define MI_LARGE_OBJ_WSIZE_MAX (MI_LARGE_OBJ_SIZE_MAX/MI_INTPTR_SIZE) -#define MI_HUGE_OBJ_SIZE_MAX (2*MI_INTPTR_SIZE*MI_SEGMENT_SIZE) // (must match MI_REGION_MAX_ALLOC_SIZE in memory.c) // Maximum number of size classes. 
(spaced exponentially in 12.5% increments) #define MI_BIN_HUGE (73U) @@ -190,9 +189,6 @@ typedef int32_t mi_ssize_t; #error "mimalloc internal: define more bins" #endif -// Used as a special value to encode block sizes in 32 bits. -#define MI_HUGE_BLOCK_SIZE ((uint32_t)MI_HUGE_OBJ_SIZE_MAX) - // Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in dedicated huge page segments #define MI_BLOCK_ALIGNMENT_MAX (MI_SEGMENT_SIZE >> 1) @@ -258,7 +254,6 @@ typedef uintptr_t mi_thread_free_t; // implement a monotonic heartbeat. The `thread_free` list is needed for // avoiding atomic operations in the common case. // -// // `used - |thread_free|` == actual blocks that are in use (alive) // `used - |thread_free| + |free| + |local_free| == capacity` // @@ -266,16 +261,13 @@ typedef uintptr_t mi_thread_free_t; // the number of memory accesses in the `mi_page_all_free` function(s). // // Notes: -// - Access is optimized for `mi_free` and `mi_page_alloc` (in `alloc.c`) +// - Access is optimized for `free.c:mi_free` and `alloc.c:mi_page_alloc` // - Using `uint16_t` does not seem to slow things down -// - The size is 8 words on 64-bit which helps the page index calculations -// (and 10 words on 32-bit, and encoded free lists add 2 words. Sizes 10 -// and 12 are still good for address calculation) -// - To limit the structure size, the `xblock_size` is 32-bits only; for -// blocks > MI_HUGE_BLOCK_SIZE the size is determined from the segment page size +// - The size is 10 words on 64-bit which helps the page index calculations +// (and 14 words on 32-bit, and encoded free lists add 2 words) // - `xthread_free` uses the bottom bits as a delayed-free flags to optimize // concurrent frees where only the first concurrent free adds to the owning -// heap `thread_delayed_free` list (see `alloc.c:mi_free_block_mt`). +// heap `thread_delayed_free` list (see `free.c:mi_free_block_mt`). // The invariant is that no-delayed-free is only set if there is // at least one block that will be added, or as already been added, to // the owning heap `thread_delayed_free` list. 
This guarantees that pages @@ -290,16 +282,16 @@ typedef struct mi_page_s { // layout like this to optimize access in `mi_malloc` and `mi_free` uint16_t capacity; // number of blocks committed, must be the first field, see `segment.c:page_clear` uint16_t reserved; // number of blocks reserved in memory + uint16_t used; // number of blocks in use (including blocks in `thread_free`) mi_page_flags_t flags; // `in_full` and `has_aligned` flags (8 bits) + uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`) uint8_t free_is_zero:1; // `true` if the blocks in the free list are zero initialized uint8_t retire_expire:7; // expiration count for retired blocks - + // padding mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) - uint16_t used; // number of blocks in use (including blocks in `thread_free`) - uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`) - uint8_t block_offset_adj; // if not zero, then `(mi_page_start(_,page,_) - (uint8_t*)page - MI_MAX_ALIGN_SIZE*(block_offset_adj-1)) % block_size == 0)` (only used for fast path in `free.c:_mi_page_ptr_unalign`) - uint32_t xblock_size; // size available in each block (always `>0`) + size_t block_size; // size available in each block (always `>0`) + uint8_t* page_start; // start of the page area containing the blocks #if (MI_ENCODE_FREELIST || MI_PADDING) uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`) or padding canary @@ -310,6 +302,10 @@ typedef struct mi_page_s { struct mi_page_s* next; // next page owned by the heap with the same `block_size` struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` + + #if MI_INTPTR_SIZE==4 // pad to 14 words on 32-bit + void* padding[1]; + #endif } mi_page_t; @@ -548,7 +544,6 @@ typedef struct mi_stats_s { mi_stat_counter_t searches; mi_stat_counter_t normal_count; mi_stat_counter_t huge_count; - mi_stat_counter_t giant_count; mi_stat_counter_t arena_count; mi_stat_counter_t arena_crossover_count; mi_stat_counter_t arena_rollback_count; diff --git a/src/alloc.c b/src/alloc.c index 3a38a226..8b6c4de0 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -30,7 +30,7 @@ terms of the MIT license. A copy of the license can be found in the file // Note: in release mode the (inlined) routine is about 7 instructions with a single test. extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept { - mi_assert_internal(page->xblock_size==0||mi_page_block_size(page) >= size); + mi_assert_internal(page->block_size == 0 /* empty heap */ || mi_page_block_size(page) >= size); mi_block_t* const block = page->free; if mi_unlikely(block == NULL) { return _mi_malloc_generic(heap, size, zero, 0); @@ -53,14 +53,14 @@ extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t siz // zero the block? 
note: we need to zero the full block size (issue #63) if mi_unlikely(zero) { - mi_assert_internal(page->xblock_size != 0); // do not call with zero'ing for huge blocks (see _mi_malloc_generic) - mi_assert_internal(page->xblock_size >= MI_PADDING_SIZE); + mi_assert_internal(page->block_size != 0); // do not call with zero'ing for huge blocks (see _mi_malloc_generic) + mi_assert_internal(page->block_size >= MI_PADDING_SIZE); if (page->free_is_zero) { block->next = 0; - mi_track_mem_defined(block, page->xblock_size - MI_PADDING_SIZE); + mi_track_mem_defined(block, page->block_size - MI_PADDING_SIZE); } else { - _mi_memzero_aligned(block, page->xblock_size - MI_PADDING_SIZE); + _mi_memzero_aligned(block, page->block_size - MI_PADDING_SIZE); } } diff --git a/src/free.c b/src/free.c index 0e560e53..c66de6f6 100644 --- a/src/free.c +++ b/src/free.c @@ -55,14 +55,7 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p) { mi_assert_internal(page!=NULL && p!=NULL); - size_t diff; - if mi_likely(page->block_offset_adj != 0) { - diff = (uint8_t*)p - (uint8_t*)page - (MI_MAX_ALIGN_SIZE*(page->block_offset_adj - 1)); - } - else { - diff = (uint8_t*)p - _mi_page_start(segment, page, NULL); - } - + size_t diff = (uint8_t*)p - page->page_start; size_t adjust; if mi_likely(page->block_size_shift != 0) { adjust = diff & (((size_t)1 << page->block_size_shift) - 1); @@ -519,12 +512,7 @@ static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { #if !MI_HUGE_PAGE_ABANDON else { const size_t bpsize = mi_page_block_size(page); - if (bpsize <= MI_HUGE_OBJ_SIZE_MAX) { - mi_heap_stat_decrease(heap, huge, bpsize); - } - else { - mi_heap_stat_decrease(heap, giant, bpsize); - } + mi_heap_stat_decrease(heap, huge, bpsize); } #endif } diff --git a/src/heap.c b/src/heap.c index 18cfc706..21cdfa46 100644 --- a/src/heap.c +++ b/src/heap.c @@ -289,12 +289,7 @@ static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_ // stats const size_t bsize = mi_page_block_size(page); if (bsize > MI_LARGE_OBJ_SIZE_MAX) { - if (bsize > MI_HUGE_OBJ_SIZE_MAX) { - mi_heap_stat_decrease(heap, giant, bsize); - } - else { - mi_heap_stat_decrease(heap, huge, bsize); - } + mi_heap_stat_decrease(heap, huge, bsize); } #if (MI_STAT) _mi_page_free_collect(page, false); // update used count @@ -467,8 +462,7 @@ static bool mi_heap_page_check_owned(mi_heap_t* heap, mi_page_queue_t* pq, mi_pa MI_UNUSED(heap); MI_UNUSED(pq); bool* found = (bool*)vfound; - mi_segment_t* segment = _mi_page_segment(page); - void* start = _mi_page_start(segment, page, NULL); + void* start = mi_page_start(page); void* end = (uint8_t*)start + (page->capacity * mi_page_block_size(page)); *found = (p >= start && p < end); return (!*found); // continue if not found @@ -514,7 +508,7 @@ static bool mi_heap_area_visit_blocks(const mi_heap_area_ex_t* xarea, mi_block_v const size_t bsize = mi_page_block_size(page); const size_t ubsize = mi_page_usable_block_size(page); // without padding size_t psize; - uint8_t* pstart = _mi_page_start(_mi_page_segment(page), page, &psize); + uint8_t* pstart = _mi_segment_page_start(_mi_page_segment(page), page, &psize, NULL); if (page->capacity == 1) { // optimize page with one block @@ -581,7 +575,7 @@ static bool mi_heap_visit_areas_page(mi_heap_t* heap, mi_page_queue_t* pq, mi_pa xarea.page = page; xarea.area.reserved = page->reserved * bsize; xarea.area.committed = page->capacity * 
bsize; - xarea.area.blocks = _mi_page_start(_mi_page_segment(page), page, NULL); + xarea.area.blocks = mi_page_start(page); xarea.area.used = page->used; // number of blocks in use (#553) xarea.area.block_size = ubsize; xarea.area.full_block_size = bsize; diff --git a/src/init.c b/src/init.c index 8a20daca..79175f81 100644 --- a/src/init.c +++ b/src/init.c @@ -17,15 +17,15 @@ const mi_page_t _mi_page_empty = { 0, false, false, false, 0, // capacity 0, // reserved capacity + 0, // used { 0 }, // flags + 0, // block size shift false, // is_zero 0, // retire_expire NULL, // free NULL, // local_free - 0, // used - 0, // block size shift - 0, // block offset adj - 0, // xblock_size + 0, // block_size + NULL, // page_start #if (MI_PADDING || MI_ENCODE_FREELIST) { 0, 0 }, #endif @@ -78,7 +78,6 @@ const mi_page_t _mi_page_empty = { MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ - MI_STAT_COUNT_NULL(), \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 } \ diff --git a/src/page-queue.c b/src/page-queue.c index fbfe2811..095f9b81 100644 --- a/src/page-queue.c +++ b/src/page-queue.c @@ -11,6 +11,10 @@ terms of the MIT license. A copy of the license can be found in the file #ifndef MI_IN_PAGE_C #error "this file should be included from 'page.c'" +// include to help an IDE +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "mimalloc/atomic.h" #endif /* ----------------------------------------------------------- @@ -138,20 +142,20 @@ static bool mi_heap_contains_queue(const mi_heap_t* heap, const mi_page_queue_t* #endif static mi_page_queue_t* mi_page_queue_of(const mi_page_t* page) { - uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : mi_bin(page->xblock_size)); + uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : mi_bin(mi_page_block_size(page))); mi_heap_t* heap = mi_page_heap(page); mi_assert_internal(heap != NULL && bin <= MI_BIN_FULL); mi_page_queue_t* pq = &heap->pages[bin]; - mi_assert_internal(bin >= MI_BIN_HUGE || page->xblock_size == pq->block_size); + mi_assert_internal(bin >= MI_BIN_HUGE || mi_page_block_size(page) == pq->block_size); mi_assert_expensive(mi_page_queue_contains(pq, page)); return pq; } static mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t* page) { - uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : mi_bin(page->xblock_size)); + uint8_t bin = (mi_page_is_in_full(page) ? 
MI_BIN_FULL : mi_bin(mi_page_block_size(page))); mi_assert_internal(bin <= MI_BIN_FULL); mi_page_queue_t* pq = &heap->pages[bin]; - mi_assert_internal(mi_page_is_in_full(page) || page->xblock_size == pq->block_size); + mi_assert_internal(mi_page_is_in_full(page) || mi_page_block_size(page) == pq->block_size); return pq; } @@ -206,7 +210,7 @@ static bool mi_page_queue_is_empty(mi_page_queue_t* queue) { static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) { mi_assert_internal(page != NULL); mi_assert_expensive(mi_page_queue_contains(queue, page)); - mi_assert_internal(page->xblock_size == queue->block_size || (page->xblock_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); + mi_assert_internal(mi_page_block_size(page) == queue->block_size || (mi_page_block_size(page) > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); mi_heap_t* heap = mi_page_heap(page); if (page->prev != NULL) page->prev->next = page->next; if (page->next != NULL) page->next->prev = page->prev; @@ -231,8 +235,8 @@ static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_ #if MI_HUGE_PAGE_ABANDON mi_assert_internal(_mi_page_segment(page)->page_kind != MI_PAGE_HUGE); #endif - mi_assert_internal(page->xblock_size == queue->block_size || - (page->xblock_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_huge(queue)) || + mi_assert_internal(mi_page_block_size(page) == queue->block_size || + (mi_page_block_size(page) > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); mi_page_set_in_full(page, mi_page_queue_is_full(queue)); @@ -258,11 +262,13 @@ static void mi_page_queue_enqueue_from(mi_page_queue_t* to, mi_page_queue_t* fro mi_assert_internal(page != NULL); mi_assert_expensive(mi_page_queue_contains(from, page)); mi_assert_expensive(!mi_page_queue_contains(to, page)); - mi_assert_internal((page->xblock_size == to->block_size && page->xblock_size == from->block_size) || - (page->xblock_size == to->block_size && mi_page_queue_is_full(from)) || - (page->xblock_size == from->block_size && mi_page_queue_is_full(to)) || - (page->xblock_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_huge(to)) || - (page->xblock_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_full(to))); + const size_t bsize = mi_page_block_size(page); + MI_UNUSED(bsize); + mi_assert_internal((bsize == to->block_size && bsize == from->block_size) || + (bsize == to->block_size && mi_page_queue_is_full(from)) || + (bsize == from->block_size && mi_page_queue_is_full(to)) || + (bsize > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_huge(to)) || + (bsize > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_full(to))); mi_heap_t* heap = mi_page_heap(page); if (page->prev != NULL) page->prev->next = page->next; diff --git a/src/page.c b/src/page.c index 63780d63..ef58d883 100644 --- a/src/page.c +++ b/src/page.c @@ -59,7 +59,7 @@ static inline uint8_t* mi_page_area(const mi_page_t* page) { static bool mi_page_list_is_valid(mi_page_t* page, mi_block_t* p) { size_t psize; - uint8_t* page_area = _mi_page_start(_mi_page_segment(page), page, &psize); + uint8_t* page_area = _mi_segment_page_start(_mi_page_segment(page), page, &psize, NULL); mi_block_t* start = (mi_block_t*)page_area; mi_block_t* end = (mi_block_t*)(page_area + psize); while(p != NULL) { @@ -78,14 +78,14 @@ static bool mi_page_list_is_valid(mi_page_t* page, mi_block_t* p) { } 
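With `page_start` and `block_size` now stored directly on the page, the geometry used throughout this patch becomes explicit: block `i` lives at `page_start + i*block_size`, and an interior pointer maps back to its block without consulting the segment (which is why the `block_offset_adj` path in free.c could be dropped above). A hypothetical standalone sketch; the struct and function names are illustrative, not mimalloc's:

#include <stddef.h>
#include <stdint.h>

typedef struct example_page_s {
  uint8_t* page_start;   // start of the block area
  size_t   block_size;   // size of each block (always > 0)
  uint16_t capacity;     // number of blocks committed
} example_page_t;

// map an interior pointer back to the start of its block
static void* example_block_start(const example_page_t* pg, const void* p) {
  const size_t diff   = (size_t)((const uint8_t*)p - pg->page_start);
  const size_t adjust = diff % pg->block_size;   // a mask is used instead when block_size is a power of two
  return (uint8_t*)((uintptr_t)p - adjust);
}
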
static bool mi_page_is_valid_init(mi_page_t* page) { - mi_assert_internal(page->xblock_size > 0); + mi_assert_internal(mi_page_block_size(page) > 0); mi_assert_internal(page->used <= page->capacity); mi_assert_internal(page->capacity <= page->reserved); const size_t bsize = mi_page_block_size(page); mi_segment_t* segment = _mi_page_segment(page); - uint8_t* start = _mi_page_start(segment,page,NULL); - mi_assert_internal(start == _mi_segment_page_start(segment,page,bsize,NULL,NULL)); + uint8_t* start = mi_page_start(page); + mi_assert_internal(start == _mi_segment_page_start(segment,page,NULL,NULL)); //mi_assert_internal(start + page->capacity*page->block_size == page->top); mi_assert_internal(mi_page_list_is_valid(page,page->free)); @@ -283,10 +283,9 @@ static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size #if MI_HUGE_PAGE_ABANDON mi_assert_internal(pq==NULL || _mi_page_segment(page)->page_kind != MI_PAGE_HUGE); #endif - mi_assert_internal(pq!=NULL || page->xblock_size != 0); mi_assert_internal(pq!=NULL || mi_page_block_size(page) >= block_size); // a fresh page was found, initialize it - const size_t full_block_size = ((pq == NULL || mi_page_queue_is_huge(pq)) ? mi_page_block_size(page) : block_size); // see also: mi_segment_huge_page_alloc + const size_t full_block_size = (pq == NULL || mi_page_is_huge(page) ? mi_page_block_size(page) : block_size); // see also: mi_segment_huge_page_alloc mi_assert_internal(full_block_size >= block_size); mi_page_init(heap, page, full_block_size, heap->tld); mi_heap_stat_increase(heap, pages, 1); @@ -425,7 +424,7 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) { _mi_segment_page_free(page, force, segments_tld); } -#define MI_MAX_RETIRE_SIZE MI_LARGE_OBJ_SIZE_MAX +#define MI_MAX_RETIRE_SIZE MI_LARGE_OBJ_SIZE_MAX // should be less than size for MI_BIN_HUGE #define MI_RETIRE_CYCLES (16) // Retire a page with no more used blocks @@ -448,10 +447,12 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept { // how to check this efficiently though... // for now, we don't retire if it is the only page left of this size class. mi_page_queue_t* pq = mi_page_queue_of(page); - if mi_likely(page->xblock_size <= MI_MAX_RETIRE_SIZE && !mi_page_queue_is_special(pq)) { // not too large && not full or huge queue? + const size_t bsize = mi_page_block_size(page); + if mi_likely(bsize < MI_MAX_RETIRE_SIZE) { // not too large && not full or huge queue? + mi_assert_internal(!mi_page_queue_is_special(pq)); if (pq->last==page && pq->first==page) { // the only page in the queue? mi_stat_counter_increase(_mi_stats_main.page_no_retire,1); - page->retire_expire = (page->xblock_size <= MI_SMALL_OBJ_SIZE_MAX ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4); + page->retire_expire = (bsize <= MI_SMALL_OBJ_SIZE_MAX ? 
MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4); mi_heap_t* heap = mi_page_heap(page); mi_assert_internal(pq >= heap->pages); const size_t index = pq - heap->pages; @@ -514,7 +515,7 @@ static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* co #endif mi_assert_internal(page->capacity + extend <= page->reserved); mi_assert_internal(bsize == mi_page_block_size(page)); - void* const page_area = _mi_page_start(_mi_page_segment(page), page, NULL); + void* const page_area = mi_page_start(page); // initialize a randomized free list // set up `slice_count` slices to alternate between @@ -572,7 +573,7 @@ static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, co #endif mi_assert_internal(page->capacity + extend <= page->reserved); mi_assert_internal(bsize == mi_page_block_size(page)); - void* const page_area = _mi_page_start(_mi_page_segment(page), page, NULL ); + void* const page_area = mi_page_start(page); mi_block_t* const start = mi_page_block_at(page, page_area, bsize, page->capacity); @@ -616,15 +617,15 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) size_t page_size; //uint8_t* page_start = - _mi_page_start(_mi_page_segment(page), page, &page_size); + _mi_segment_page_start(_mi_page_segment(page), page, &page_size, NULL); mi_stat_counter_increase(tld->stats.pages_extended, 1); // calculate the extend count - const size_t bsize = (page->xblock_size < MI_HUGE_BLOCK_SIZE ? page->xblock_size : page_size); + const size_t bsize = mi_page_block_size(page); size_t extend = page->reserved - page->capacity; mi_assert_internal(extend > 0); - size_t max_extend = (bsize >= MI_MAX_EXTEND_SIZE ? MI_MIN_EXTEND : MI_MAX_EXTEND_SIZE/(uint32_t)bsize); + size_t max_extend = (bsize >= MI_MAX_EXTEND_SIZE ? MI_MIN_EXTEND : MI_MAX_EXTEND_SIZE/bsize); if (max_extend < MI_MIN_EXTEND) { max_extend = MI_MIN_EXTEND; } mi_assert_internal(max_extend > 0); @@ -658,10 +659,10 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi mi_assert_internal(block_size > 0); // set fields mi_page_set_heap(page, heap); + page->block_size = block_size; size_t page_size; - const void* page_start = _mi_segment_page_start(segment, page, block_size, &page_size, NULL); - mi_track_mem_noaccess(page_start,page_size); - page->xblock_size = (block_size < MI_HUGE_BLOCK_SIZE ? 
(uint32_t)block_size : MI_HUGE_BLOCK_SIZE); + page->page_start = _mi_segment_page_start(segment, page, &page_size, NULL); + mi_track_mem_noaccess(page->page_start,page_size); mi_assert_internal(page_size / block_size < (1L<<16)); page->reserved = (uint16_t)(page_size / block_size); mi_assert_internal(page->reserved > 0); @@ -673,20 +674,14 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi #if MI_DEBUG>2 if (page->is_zero_init) { mi_track_mem_defined(page_start, page_size); - mi_assert_expensive(!page->is_zero_init || mi_mem_is_zero(page_start, page_size)); + mi_assert_expensive(!page->is_zero_init || mi_mem_is_zero(page->page_start, page_size)); } #endif if (block_size > 0 && _mi_is_power_of_two(block_size)) { page->block_size_shift = (uint8_t)(mi_ctz((uintptr_t)block_size)); } - if (block_size > 0) { - const ptrdiff_t start_offset = (uint8_t*)page_start - (uint8_t*)page; - const ptrdiff_t start_adjust = start_offset % block_size; - if (start_offset >= 0 && (start_adjust % MI_MAX_ALIGN_SIZE) == 0 && (start_adjust / MI_MAX_ALIGN_SIZE) < 255) { - const ptrdiff_t adjust = (start_adjust / MI_MAX_ALIGN_SIZE); - mi_assert_internal(adjust + 1 == (uint8_t)(adjust + 1)); - page->block_offset_adj = (uint8_t)(adjust + 1); - } + else { + page->block_size_shift = 0; } mi_assert_internal(page->capacity == 0); @@ -701,8 +696,7 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi mi_assert_internal(page->keys[0] != 0); mi_assert_internal(page->keys[1] != 0); #endif - mi_assert_internal(page->block_size_shift == 0 || (block_size == (1UL << page->block_size_shift))); - mi_assert_internal(page->block_offset_adj == 0 || (((uint8_t*)page_start - (uint8_t*)page - MI_MAX_ALIGN_SIZE*(page->block_offset_adj-1))) % block_size == 0); + mi_assert_internal(page->block_size_shift == 0 || (block_size == ((size_t)1 << page->block_size_shift))); mi_assert_expensive(mi_page_is_valid_init(page)); // initialize an initial free list @@ -827,40 +821,31 @@ void mi_register_deferred_free(mi_deferred_free_fun* fn, void* arg) mi_attr_noex General allocation ----------------------------------------------------------- */ -// A huge page is allocated directly without being in a queue. -// Because huge pages contain just one block, and the segment contains -// just that page, we always treat them as abandoned and any thread -// that frees the block can free the whole page and segment directly. -// Huge pages are also use if the requested alignment is very large (> MI_BLOCK_ALIGNMENT_MAX). +// Huge pages contain just one block, and the segment contains just that page. +// Huge pages are also use if the requested alignment is very large (> MI_BLOCK_ALIGNMENT_MAX) +// so their size is not always `> MI_LARGE_OBJ_SIZE_MAX`. 
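The rewritten comment above notes that huge pages hold a single block and are also used when the requested alignment exceeds MI_BLOCK_ALIGNMENT_MAX, so a huge page's block size is not necessarily larger than MI_LARGE_OBJ_SIZE_MAX. A hedged sketch of that routing decision is below; the constants and the `needs_huge_page` helper are illustrative stand-ins, not mimalloc's definitions.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

// Illustrative thresholds only; the real values live in mimalloc's types.h.
#define LARGE_OBJ_SIZE_MAX   (128 * 1024)
#define BLOCK_ALIGNMENT_MAX  (64 * 1024)

// A request takes the single-block "huge page" path either because the object
// is too large for the size bins, or because the alignment is so large that
// the page must be over-allocated to satisfy it.
static bool needs_huge_page(size_t size, size_t alignment) {
  return (size > LARGE_OBJ_SIZE_MAX) || (alignment > BLOCK_ALIGNMENT_MAX);
}

int main(void) {
  // A small but heavily over-aligned request still goes to the huge-page path,
  // which is why a huge page's block size can be <= LARGE_OBJ_SIZE_MAX.
  printf("%d\n", needs_huge_page(4096, 2u * 1024 * 1024));  // 1
  printf("%d\n", needs_huge_page(256 * 1024, 16));          // 1
  printf("%d\n", needs_huge_page(4096, 16));                // 0
  return 0;
}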
static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_alignment) { size_t block_size = _mi_os_good_alloc_size(size); mi_assert_internal(mi_bin(block_size) == MI_BIN_HUGE || page_alignment > 0); #if MI_HUGE_PAGE_ABANDON mi_page_queue_t* pq = NULL; #else - mi_page_queue_t* pq = mi_page_queue(heap, MI_HUGE_OBJ_SIZE_MAX); // not block_size as that can be low if the page_alignment > 0 - mi_assert_internal(mi_page_queue_is_huge(pq)); + mi_page_queue_t* pq = mi_page_queue(heap, block_size); + // mi_assert_internal(mi_page_queue_is_huge(pq)); #endif - mi_page_t* page = mi_page_fresh_alloc(heap, pq, block_size,page_alignment); - if (page != NULL) { - const size_t bsize = mi_page_block_size(page); // note: not `mi_page_usable_block_size` as `size` includes padding already - mi_assert_internal(bsize >= size); + mi_page_t* page = mi_page_fresh_alloc(heap, pq, block_size, page_alignment); + if (page != NULL) { + mi_assert_internal(mi_page_block_size(page) >= size); mi_assert_internal(mi_page_immediate_available(page)); mi_assert_internal(_mi_page_segment(page)->page_kind==MI_PAGE_HUGE); + mi_assert_internal(mi_page_is_huge(page)); mi_assert_internal(_mi_page_segment(page)->used==1); #if MI_HUGE_PAGE_ABANDON mi_assert_internal(_mi_page_segment(page)->thread_id==0); // abandoned, not in the huge queue mi_page_set_heap(page, NULL); #endif - - if (bsize > MI_HUGE_OBJ_SIZE_MAX) { - mi_heap_stat_increase(heap, giant, bsize); - mi_heap_stat_counter_increase(heap, giant_count, 1); - } - else { - mi_heap_stat_increase(heap, huge, bsize); - mi_heap_stat_counter_increase(heap, huge_count, 1); - } + mi_heap_stat_increase(heap, huge, mi_page_block_size(page)); + mi_heap_stat_counter_increase(heap, huge_count, 1); } return page; } @@ -927,7 +912,7 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al mi_assert_internal(mi_page_block_size(page) >= size); // and try again, this time succeeding! (i.e. this should never recurse through _mi_page_malloc) - if mi_unlikely(zero && page->xblock_size == 0) { + if mi_unlikely(zero && page->block_size == 0) { // note: we cannot call _mi_page_malloc with zeroing for huge blocks; we zero it afterwards in that case. 
void* p = _mi_page_malloc(heap, page, size, false); mi_assert_internal(p != NULL); diff --git a/src/segment.c b/src/segment.c index 359815ce..a4fd26e9 100644 --- a/src/segment.c +++ b/src/segment.c @@ -412,13 +412,13 @@ static uint8_t* mi_segment_raw_page_start(const mi_segment_t* segment, const mi_ #endif if (page_size != NULL) *page_size = psize; - mi_assert_internal(page->xblock_size == 0 || _mi_ptr_page(p) == page); + mi_assert_internal(page->block_size == 0 || _mi_ptr_page(p) == page); mi_assert_internal(_mi_ptr_segment(p) == segment); return p; } // Start of the page available memory; can be used on uninitialized pages (only `segment_idx` must be set) -uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t block_size, size_t* page_size, size_t* pre_size) +static uint8_t* mi_segment_page_start_ex(const mi_segment_t* segment, const mi_page_t* page, size_t block_size, size_t* page_size, size_t* pre_size) { size_t psize; uint8_t* p = mi_segment_raw_page_start(segment, page, &psize); @@ -437,11 +437,15 @@ uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* pa } if (page_size != NULL) *page_size = psize; - mi_assert_internal(page->xblock_size==0 || _mi_ptr_page(p) == page); + mi_assert_internal(_mi_ptr_page(p) == page); mi_assert_internal(_mi_ptr_segment(p) == segment); return p; } +uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size, size_t* pre_size) { + return mi_segment_page_start_ex(segment, page, mi_page_block_size(page), page_size, pre_size); +} + static size_t mi_segment_calculate_sizes(size_t capacity, size_t required, size_t* pre_size, size_t* info_size) { const size_t minsize = sizeof(mi_segment_t) + ((capacity - 1) * sizeof(mi_page_t)) + 16 /* padding */; @@ -707,15 +711,19 @@ static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, mi_seg page->is_zero_init = false; page->segment_in_use = false; - // zero the page data, but not the segment fields and capacity, and block_size (for page size calculations) - uint32_t block_size = page->xblock_size; + // zero the page data, but not the segment fields and capacity, page start, and block_size (for page size calculations) + size_t block_size = page->block_size; + uint8_t block_size_shift = page->block_size_shift; + uint8_t* page_start = page->page_start; uint16_t capacity = page->capacity; uint16_t reserved = page->reserved; ptrdiff_t ofs = offsetof(mi_page_t,capacity); _mi_memzero((uint8_t*)page + ofs, sizeof(*page) - ofs); page->capacity = capacity; page->reserved = reserved; - page->xblock_size = block_size; + page->block_size = block_size; + page->block_size_shift = block_size_shift; + page->page_start = page_start; segment->used--; // schedule purge @@ -831,7 +839,6 @@ void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld) { // Possibly clear pages and check if free space is available static bool mi_segment_check_free(mi_segment_t* segment, size_t block_size, bool* all_pages_free) { - mi_assert_internal(block_size < MI_HUGE_BLOCK_SIZE); bool has_page = false; size_t pages_used = 0; size_t pages_used_empty = 0; @@ -847,7 +854,7 @@ static bool mi_segment_check_free(mi_segment_t* segment, size_t block_size, bool pages_used_empty++; has_page = true; } - else if (page->xblock_size == block_size && mi_page_has_any_available(page)) { + else if (mi_page_block_size(page) == block_size && mi_page_has_any_available(page)) { // a page has available free blocks of the right size has_page = true; } 
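The `mi_segment_page_clear` change above zeroes the page struct from the `capacity` field onward and then restores the few fields that must survive the clear (`capacity`, `reserved`, `block_size`, `block_size_shift`, `page_start`). A small sketch of that save/memset/restore pattern on a hypothetical struct, not mimalloc's `mi_page_t`:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct page_s {
  uint8_t   idx;          // fields before `capacity` are never cleared
  uint16_t  capacity;     // first field of the cleared region
  uint16_t  reserved;
  uint16_t  used;
  void*     free_list;
  size_t    block_size;   // preserved across the clear
  uint8_t*  page_start;   // preserved across the clear
} page_t;

static void page_clear(page_t* page) {
  // save the fields that must survive
  uint16_t capacity   = page->capacity;
  uint16_t reserved   = page->reserved;
  size_t   block_size = page->block_size;
  uint8_t* page_start = page->page_start;
  // zero everything from `capacity` to the end of the struct
  ptrdiff_t ofs = offsetof(page_t, capacity);
  memset((uint8_t*)page + ofs, 0, sizeof(*page) - ofs);
  // restore the preserved fields
  page->capacity   = capacity;
  page->reserved   = reserved;
  page->block_size = block_size;
  page->page_start = page_start;
}

int main(void) {
  uint8_t area[64];
  page_t p = { 1, 8, 8, 5, area, 32, area };
  page_clear(&p);
  printf("used=%u block_size=%zu\n", (unsigned)p.used, p.block_size);  // used=0 block_size=32
  return 0;
}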
@@ -901,7 +908,7 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, else { // otherwise reclaim it into the heap _mi_page_reclaim(heap, page); - if (requested_block_size == page->xblock_size && mi_page_has_any_available(page)) { + if (requested_block_size == mi_page_block_size(page) && mi_page_has_any_available(page)) { if (right_page_reclaimed != NULL) { *right_page_reclaimed = true; } } } @@ -1008,7 +1015,7 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size, static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t block_size, mi_page_kind_t page_kind, size_t page_shift, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { mi_assert_internal(page_kind <= MI_PAGE_LARGE); - mi_assert_internal(block_size < MI_HUGE_BLOCK_SIZE); + mi_assert_internal(block_size <= MI_LARGE_OBJ_SIZE_MAX); // 1. try to reclaim an abandoned segment bool reclaimed; @@ -1077,7 +1084,7 @@ static mi_page_t* mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, mi_p mi_assert_internal(page != NULL); #if MI_DEBUG>=2 && !MI_TRACK_ENABLED // && !MI_TSAN // verify it is committed - _mi_segment_page_start(_mi_page_segment(page), page, sizeof(void*), NULL, NULL)[0] = 0; + mi_segment_page_start_ex(_mi_page_segment(page), page, sizeof(void*), NULL, NULL)[0] = 0; #endif return page; } @@ -1100,7 +1107,7 @@ static mi_page_t* mi_segment_large_page_alloc(mi_heap_t* heap, size_t block_size mi_page_t* page = mi_segment_find_free(segment, tld); mi_assert_internal(page != NULL); #if MI_DEBUG>=2 && !MI_TRACK_ENABLED // && !MI_TSAN - _mi_segment_page_start(segment, page, sizeof(void*), NULL, NULL)[0] = 0; + mi_segment_page_start_ex(segment, page, sizeof(void*), NULL, NULL)[0] = 0; #endif return page; } @@ -1117,11 +1124,11 @@ static mi_page_t* mi_segment_huge_page_alloc(size_t size, size_t page_alignment, mi_page_t* page = mi_segment_find_free(segment, tld); mi_assert_internal(page != NULL); - // for huge pages we initialize the xblock_size as we may + // for huge pages we initialize the block_size as we may // overallocate to accommodate large alignments. size_t psize; - uint8_t* start = _mi_segment_page_start(segment, page, 0, &psize, NULL); - page->xblock_size = (psize > MI_HUGE_BLOCK_SIZE ? MI_HUGE_BLOCK_SIZE : (uint32_t)psize); + uint8_t* start = mi_segment_page_start_ex(segment, page, 0, &psize, NULL); + page->block_size = psize; // reset the part of the page that will not be used; this can be quite large (close to MI_SEGMENT_SIZE) if (page_alignment > 0 && segment->allow_decommit && page->is_committed) { diff --git a/src/stats.c b/src/stats.c index 8fbdfc45..5dfd713b 100644 --- a/src/stats.c +++ b/src/stats.c @@ -117,8 +117,7 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) { mi_stat_counter_add(&stats->page_no_retire, &src->page_no_retire, 1); mi_stat_counter_add(&stats->searches, &src->searches, 1); mi_stat_counter_add(&stats->normal_count, &src->normal_count, 1); - mi_stat_counter_add(&stats->huge_count, &src->huge_count, 1); - mi_stat_counter_add(&stats->giant_count, &src->giant_count, 1); + mi_stat_counter_add(&stats->huge_count, &src->huge_count, 1); #if MI_STAT>1 for (size_t i = 0; i <= MI_BIN_HUGE; i++) { if (src->normal_bins[i].allocated > 0 || src->normal_bins[i].freed > 0) { @@ -316,12 +315,10 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) #endif #if MI_STAT mi_stat_print(&stats->normal, "normal", (stats->normal_count.count == 0 ? 
1 : -(stats->normal.allocated / stats->normal_count.count)), out, arg); - mi_stat_print(&stats->huge, "huge", (stats->huge_count.count == 0 ? 1 : -(stats->huge.allocated / stats->huge_count.count)), out, arg); - mi_stat_print(&stats->giant, "giant", (stats->giant_count.count == 0 ? 1 : -(stats->giant.allocated / stats->giant_count.count)), out, arg); + mi_stat_print(&stats->huge, "huge", (stats->huge_count.count == 0 ? 1 : -(stats->huge.allocated / stats->huge_count.count)), out, arg); mi_stat_count_t total = { 0,0,0,0 }; mi_stat_add(&total, &stats->normal, 1); mi_stat_add(&total, &stats->huge, 1); - mi_stat_add(&total, &stats->giant, 1); mi_stat_print(&total, "total", 1, out, arg); #endif #if MI_STAT>1 From a8a53e3e85fbe8c8f997078399ee089880614ebf Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sun, 24 Mar 2024 14:50:15 -0700 Subject: [PATCH 072/119] fix double counting of free-ing for non-thread-local free calls --- include/mimalloc/internal.h | 2 +- src/alloc-aligned.c | 2 +- src/free.c | 27 ++++++++++++++------------- src/options.c | 2 +- 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 21dc9d62..29943357 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -200,7 +200,7 @@ void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size, bool void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept; void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept; // called from `_mi_heap_malloc_aligned` void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept; -mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p); +mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p); bool _mi_free_delayed_block(mi_block_t* block); void _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept; // for runtime integration void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size); diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c index 5f60b2fc..b63c5e43 100644 --- a/src/alloc-aligned.c +++ b/src/alloc-aligned.c @@ -69,7 +69,7 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t* // todo: expand padding if overallocated ? 
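The `segment` parameter can be dropped from `_mi_page_ptr_unalign` because the page now records its own start address and block size, which is all that is needed to map an aligned interior pointer back to the start of its block. A sketch of that adjustment follows, including the power-of-two shift fast path that `block_size_shift` enables; the `page_t` type and `ptr_unalign` name here are stand-ins, not the mimalloc definitions.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef struct page_s {
  uint8_t* page_start;        // first block of the page
  size_t   block_size;        // size of every block in this page
  uint8_t  block_size_shift;  // nonzero iff block_size == 1 << block_size_shift
} page_t;

// Round an interior pointer down to the start of the block that contains it.
static void* ptr_unalign(const page_t* page, const void* p) {
  size_t diff = (size_t)((const uint8_t*)p - page->page_start);
  size_t adjust;
  if (page->block_size_shift != 0) {
    adjust = diff & (((size_t)1 << page->block_size_shift) - 1);  // fast path
  }
  else {
    adjust = diff % page->block_size;                             // general case
  }
  return page->page_start + (diff - adjust);
}

int main(void) {
  uint8_t area[1024];
  page_t page = { area, 64, 6 };                 // 64-byte blocks, 64 == 1 << 6
  void* inner = area + 3 * 64 + 17;              // somewhere inside block 3
  printf("%d\n", ptr_unalign(&page, inner) == area + 3 * 64);  // prints 1
  return 0;
}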
mi_assert_internal(mi_page_usable_block_size(_mi_ptr_page(p)) >= adjust + size); - mi_assert_internal(p == _mi_page_ptr_unalign(_mi_ptr_segment(aligned_p), _mi_ptr_page(aligned_p), aligned_p)); + mi_assert_internal(p == _mi_page_ptr_unalign(_mi_ptr_page(aligned_p), aligned_p)); mi_assert_internal(((uintptr_t)aligned_p + offset) % alignment == 0); mi_assert_internal(mi_usable_size(aligned_p)>=size); mi_assert_internal(mi_usable_size(p) == mi_usable_size(aligned_p)+adjust); diff --git a/src/free.c b/src/free.c index c66de6f6..39443ccf 100644 --- a/src/free.c +++ b/src/free.c @@ -29,17 +29,17 @@ static mi_decl_noinline void mi_free_block_mt(mi_segment_t* segment, mi_page_t* // regular free of a (thread local) block pointer // fast path written carefully to prevent spilling on the stack -static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool check_full) +static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool track_stats, bool check_full) { // checks if mi_unlikely(mi_check_is_double_free(page, block)) return; mi_check_padding(page, block); - mi_stat_free(page, block); + if (track_stats) { mi_stat_free(page, block); } #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); #endif - mi_track_free_size(p, mi_page_usable_size_of(page,block)); // faster then mi_usable_size as we already know the page and that p is unaligned - + if (track_stats) { mi_track_free_size(p, mi_page_usable_size_of(page, block)); } // faster then mi_usable_size as we already know the page and that p is unaligned + // actual free: push on the local free list mi_block_set_next(page, block, page->local_free); page->local_free = block; @@ -52,7 +52,7 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool } // Adjust a block that was allocated aligned, to the actual start of the block in the page. -mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p) { +mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p) { mi_assert_internal(page!=NULL && p!=NULL); size_t diff = (uint8_t*)p - page->page_start; @@ -69,13 +69,14 @@ mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* p // free a local pointer (page parameter comes first for better codegen) static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept { - mi_block_t* const block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(segment, page, p) : (mi_block_t*)p); - mi_free_block_local(page, block, true); + MI_UNUSED(segment); + mi_block_t* const block = (mi_page_has_aligned(page) ? 
_mi_page_ptr_unalign(page, p) : (mi_block_t*)p); + mi_free_block_local(page, block, true, true); } // free a pointer owned by another thread (page parameter comes first for better codegen) static void mi_decl_noinline mi_free_generic_mt(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept { - mi_block_t* const block = _mi_page_ptr_unalign(segment, page, p); // don't check `has_aligned` flag to avoid a race (issue #865) + mi_block_t* const block = _mi_page_ptr_unalign(page, p); // don't check `has_aligned` flag to avoid a race (issue #865) mi_free_block_mt(segment, page, block); } @@ -135,7 +136,7 @@ void mi_free(void* p) mi_attr_noexcept if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) // thread-local, aligned, and not a full page mi_block_t* const block = (mi_block_t*)p; - mi_free_block_local(page, block, false /* no need to check if the page is full */); + mi_free_block_local(page, block, true, false /* no need to check if the page is full */); } else { // page is full or contains (inner) aligned blocks; use generic path @@ -170,7 +171,7 @@ bool _mi_free_delayed_block(mi_block_t* block) { _mi_page_free_collect(page, false); // and free the block (possibly freeing the page as well since used is updated) - mi_free_block_local(page, block, true); + mi_free_block_local(page, block, false /* stats have already been adjusted */, true); return true; } @@ -287,8 +288,8 @@ static void mi_decl_noinline mi_free_block_mt(mi_segment_t* segment, mi_page_t* // ------------------------------------------------------ // Bytes available in a block -static size_t mi_decl_noinline mi_page_usable_aligned_size_of(const mi_segment_t* segment, const mi_page_t* page, const void* p) mi_attr_noexcept { - const mi_block_t* block = _mi_page_ptr_unalign(segment, page, p); +static size_t mi_decl_noinline mi_page_usable_aligned_size_of(const mi_page_t* page, const void* p) mi_attr_noexcept { + const mi_block_t* block = _mi_page_ptr_unalign(page, p); const size_t size = mi_page_usable_size_of(page, block); const ptrdiff_t adjust = (uint8_t*)p - (uint8_t*)block; mi_assert_internal(adjust >= 0 && (size_t)adjust <= size); @@ -305,7 +306,7 @@ static inline size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noe } else { // split out to separate routine for improved code generation - return mi_page_usable_aligned_size_of(segment, page, p); + return mi_page_usable_aligned_size_of(page, p); } } diff --git a/src/options.c b/src/options.c index f8e928d0..8a84d344 100644 --- a/src/options.c +++ b/src/options.c @@ -91,7 +91,7 @@ static mi_option_desc_t options[_mi_option_last] = { 10, UNINIT, MI_OPTION(arena_purge_mult) }, // purge delay multiplier for arena's { 1, UNINIT, MI_OPTION_LEGACY(purge_extend_delay, decommit_extend_delay) }, - { 1, UNINIT, MI_OPTION(abandoned_reclaim_on_free) }, // reclaim an abandoned segment on a free + { 0, UNINIT, MI_OPTION(abandoned_reclaim_on_free) }, // reclaim an abandoned segment on a free }; static void mi_option_init(mi_option_desc_t* desc); From 9c96d05ee435a8931c685f33cc115f58765e530b Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sun, 24 Mar 2024 14:52:50 -0700 Subject: [PATCH 073/119] abandoned reclaim on free is on by default --- src/free.c | 6 +++--- src/options.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/free.c b/src/free.c index 39443ccf..87847f21 100644 --- a/src/free.c +++ b/src/free.c @@ -25,7 
+25,7 @@ static void mi_stat_free(const mi_page_t* page, const mi_block_t* block); // ------------------------------------------------------ // forward declaration of multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) -static mi_decl_noinline void mi_free_block_mt(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); +static mi_decl_noinline void mi_free_block_mt(mi_page_t* page, mi_segment_t* segment, mi_block_t* block); // regular free of a (thread local) block pointer // fast path written carefully to prevent spilling on the stack @@ -77,7 +77,7 @@ static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, mi_segment_t // free a pointer owned by another thread (page parameter comes first for better codegen) static void mi_decl_noinline mi_free_generic_mt(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept { mi_block_t* const block = _mi_page_ptr_unalign(page, p); // don't check `has_aligned` flag to avoid a race (issue #865) - mi_free_block_mt(segment, page, block); + mi_free_block_mt(page, segment, block); } // generic free (for runtime integration) @@ -230,7 +230,7 @@ static void mi_stat_huge_free(const mi_page_t* page); #endif // Multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) -static void mi_decl_noinline mi_free_block_mt(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) +static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_segment_t* segment, mi_block_t* block) { // first see if the segment was abandoned and if we can reclaim it into our thread if (mi_option_is_enabled(mi_option_abandoned_reclaim_on_free) && diff --git a/src/options.c b/src/options.c index 8a84d344..f8e928d0 100644 --- a/src/options.c +++ b/src/options.c @@ -91,7 +91,7 @@ static mi_option_desc_t options[_mi_option_last] = { 10, UNINIT, MI_OPTION(arena_purge_mult) }, // purge delay multiplier for arena's { 1, UNINIT, MI_OPTION_LEGACY(purge_extend_delay, decommit_extend_delay) }, - { 0, UNINIT, MI_OPTION(abandoned_reclaim_on_free) }, // reclaim an abandoned segment on a free + { 1, UNINIT, MI_OPTION(abandoned_reclaim_on_free) }, // reclaim an abandoned segment on a free }; static void mi_option_init(mi_option_desc_t* desc); From 006ae2d055ea6a7d847621963dc85a8c39423fa7 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 24 Mar 2024 17:07:28 -0700 Subject: [PATCH 074/119] add is_huge page flag to ensure the right page queue is returned (see #868) --- include/mimalloc/internal.h | 4 +++- include/mimalloc/types.h | 5 +++-- src/alloc.c | 2 +- src/init.c | 8 +++++--- src/page-queue.c | 34 ++++++++++++++++++---------------- src/page.c | 19 +++++++++---------- src/segment.c | 9 +++++++-- 7 files changed, 46 insertions(+), 35 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 29943357..4df8ca68 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -470,7 +470,9 @@ static inline size_t mi_page_block_size(const mi_page_t* page) { } static inline bool mi_page_is_huge(const mi_page_t* page) { - return (_mi_page_segment(page)->page_kind == MI_PAGE_HUGE); + mi_assert_internal((page->is_huge && _mi_page_segment(page)->page_kind == MI_PAGE_HUGE) || + (!page->is_huge && _mi_page_segment(page)->page_kind != MI_PAGE_HUGE)); + return page->is_huge; } // Get the usable block size of a page without fixed padding. 
diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 69d59527..6b22c83e 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2023, Microsoft Research, Daan Leijen +Copyright (c) 2018-2024, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -278,6 +278,7 @@ typedef struct mi_page_s { uint8_t segment_in_use:1; // `true` if the segment allocated this page uint8_t is_committed:1; // `true` if the page virtual memory is committed uint8_t is_zero_init:1; // `true` if the page was initially zero initialized + uint8_t is_huge:1; // `true` if the page is in a huge segment // layout like this to optimize access in `mi_malloc` and `mi_free` uint16_t capacity; // number of blocks committed, must be the first field, see `segment.c:page_clear` @@ -285,7 +286,7 @@ typedef struct mi_page_s { uint16_t used; // number of blocks in use (including blocks in `thread_free`) mi_page_flags_t flags; // `in_full` and `has_aligned` flags (8 bits) uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`) - uint8_t free_is_zero:1; // `true` if the blocks in the free list are zero initialized + uint8_t free_is_zero:1; // `true` if the blocks in the free list are zero initialized uint8_t retire_expire:7; // expiration count for retired blocks // padding mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) diff --git a/src/alloc.c b/src/alloc.c index 8b6c4de0..2e03eca0 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2022, Microsoft Research, Daan Leijen +Copyright (c) 2018-2024, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. diff --git a/src/init.c b/src/init.c index 79175f81..604809ad 100644 --- a/src/init.c +++ b/src/init.c @@ -14,7 +14,8 @@ terms of the MIT license. 
A copy of the license can be found in the file // Empty page used to initialize the small free pages array const mi_page_t _mi_page_empty = { - 0, false, false, false, + 0, + false, false, false, false, 0, // capacity 0, // reserved capacity 0, // used @@ -78,9 +79,10 @@ const mi_page_t _mi_page_empty = { MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ + MI_STAT_COUNT_NULL(), \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ - { 0, 0 }, { 0, 0 }, { 0, 0 } \ + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } \ MI_STAT_COUNT_END_NULL() // -------------------------------------------------------- diff --git a/src/page-queue.c b/src/page-queue.c index 095f9b81..e4bfde14 100644 --- a/src/page-queue.c +++ b/src/page-queue.c @@ -1,5 +1,5 @@ /*---------------------------------------------------------------------------- -Copyright (c) 2018-2020, Microsoft Research, Daan Leijen +Copyright (c) 2018-2024, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -141,21 +141,21 @@ static bool mi_heap_contains_queue(const mi_heap_t* heap, const mi_page_queue_t* } #endif -static mi_page_queue_t* mi_page_queue_of(const mi_page_t* page) { - uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : mi_bin(mi_page_block_size(page))); - mi_heap_t* heap = mi_page_heap(page); - mi_assert_internal(heap != NULL && bin <= MI_BIN_FULL); +static mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t* page) { + mi_assert_internal(heap!=NULL); + uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : (mi_page_is_huge(page) ? MI_BIN_HUGE : mi_bin(mi_page_block_size(page)))); + mi_assert_internal(bin <= MI_BIN_FULL); mi_page_queue_t* pq = &heap->pages[bin]; - mi_assert_internal(bin >= MI_BIN_HUGE || mi_page_block_size(page) == pq->block_size); - mi_assert_expensive(mi_page_queue_contains(pq, page)); + mi_assert_internal((mi_page_block_size(page) == pq->block_size) || + (mi_page_is_huge(page) && mi_page_queue_is_huge(pq)) || + (mi_page_is_in_full(page) && mi_page_queue_is_full(pq))); return pq; } -static mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t* page) { - uint8_t bin = (mi_page_is_in_full(page) ? 
MI_BIN_FULL : mi_bin(mi_page_block_size(page))); - mi_assert_internal(bin <= MI_BIN_FULL); - mi_page_queue_t* pq = &heap->pages[bin]; - mi_assert_internal(mi_page_is_in_full(page) || mi_page_block_size(page) == pq->block_size); +static mi_page_queue_t* mi_page_queue_of(const mi_page_t* page) { + mi_heap_t* heap = mi_page_heap(page); + mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page); + mi_assert_expensive(mi_page_queue_contains(pq, page)); return pq; } @@ -210,7 +210,9 @@ static bool mi_page_queue_is_empty(mi_page_queue_t* queue) { static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) { mi_assert_internal(page != NULL); mi_assert_expensive(mi_page_queue_contains(queue, page)); - mi_assert_internal(mi_page_block_size(page) == queue->block_size || (mi_page_block_size(page) > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); + mi_assert_internal(mi_page_block_size(page) == queue->block_size || + (mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) || + (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); mi_heap_t* heap = mi_page_heap(page); if (page->prev != NULL) page->prev->next = page->next; if (page->next != NULL) page->next->prev = page->prev; @@ -236,7 +238,7 @@ static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_ mi_assert_internal(_mi_page_segment(page)->page_kind != MI_PAGE_HUGE); #endif mi_assert_internal(mi_page_block_size(page) == queue->block_size || - (mi_page_block_size(page) > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_huge(queue)) || + (mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); mi_page_set_in_full(page, mi_page_queue_is_full(queue)); @@ -267,8 +269,8 @@ static void mi_page_queue_enqueue_from(mi_page_queue_t* to, mi_page_queue_t* fro mi_assert_internal((bsize == to->block_size && bsize == from->block_size) || (bsize == to->block_size && mi_page_queue_is_full(from)) || (bsize == from->block_size && mi_page_queue_is_full(to)) || - (bsize > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_huge(to)) || - (bsize > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_full(to))); + (mi_page_is_huge(page) && mi_page_queue_is_huge(to)) || + (mi_page_is_huge(page) && mi_page_queue_is_full(to))); mi_heap_t* heap = mi_page_heap(page); if (page->prev != NULL) page->prev->next = page->next; diff --git a/src/page.c b/src/page.c index ef58d883..d36421f4 100644 --- a/src/page.c +++ b/src/page.c @@ -1,5 +1,5 @@ /*---------------------------------------------------------------------------- -Copyright (c) 2018-2020, Microsoft Research, Daan Leijen +Copyright (c) 2018-2024, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -82,7 +82,7 @@ static bool mi_page_is_valid_init(mi_page_t* page) { mi_assert_internal(page->used <= page->capacity); mi_assert_internal(page->capacity <= page->reserved); - const size_t bsize = mi_page_block_size(page); + // const size_t bsize = mi_page_block_size(page); mi_segment_t* segment = _mi_page_segment(page); uint8_t* start = mi_page_start(page); mi_assert_internal(start == _mi_segment_page_start(segment,page,NULL,NULL)); @@ -448,8 +448,7 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept { // for now, we don't retire if it is the only page left of this size class. 
mi_page_queue_t* pq = mi_page_queue_of(page); const size_t bsize = mi_page_block_size(page); - if mi_likely(bsize < MI_MAX_RETIRE_SIZE) { // not too large && not full or huge queue? - mi_assert_internal(!mi_page_queue_is_special(pq)); + if mi_likely( /* bsize < MI_MAX_RETIRE_SIZE && */ !mi_page_queue_is_special(pq)) { // not full or huge queue? if (pq->last==page && pq->first==page) { // the only page in the queue? mi_stat_counter_increase(_mi_stats_main.page_no_retire,1); page->retire_expire = (bsize <= MI_SMALL_OBJ_SIZE_MAX ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4); @@ -662,7 +661,7 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi page->block_size = block_size; size_t page_size; page->page_start = _mi_segment_page_start(segment, page, &page_size, NULL); - mi_track_mem_noaccess(page->page_start,page_size); + mi_track_mem_noaccess(page->page_start,page_size); mi_assert_internal(page_size / block_size < (1L<<16)); page->reserved = (uint16_t)(page_size / block_size); mi_assert_internal(page->reserved > 0); @@ -821,7 +820,7 @@ void mi_register_deferred_free(mi_deferred_free_fun* fn, void* arg) mi_attr_noex General allocation ----------------------------------------------------------- */ -// Huge pages contain just one block, and the segment contains just that page. +// Huge pages contain just one block, and the segment contains just that page. // Huge pages are also use if the requested alignment is very large (> MI_BLOCK_ALIGNMENT_MAX) // so their size is not always `> MI_LARGE_OBJ_SIZE_MAX`. static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_alignment) { @@ -830,15 +829,15 @@ static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_a #if MI_HUGE_PAGE_ABANDON mi_page_queue_t* pq = NULL; #else - mi_page_queue_t* pq = mi_page_queue(heap, block_size); - // mi_assert_internal(mi_page_queue_is_huge(pq)); + mi_page_queue_t* pq = mi_page_queue(heap, MI_LARGE_OBJ_SIZE_MAX+1); // always in the huge queue regardless of the block size + mi_assert_internal(mi_page_queue_is_huge(pq)); #endif mi_page_t* page = mi_page_fresh_alloc(heap, pq, block_size, page_alignment); - if (page != NULL) { + if (page != NULL) { mi_assert_internal(mi_page_block_size(page) >= size); mi_assert_internal(mi_page_immediate_available(page)); - mi_assert_internal(_mi_page_segment(page)->page_kind==MI_PAGE_HUGE); mi_assert_internal(mi_page_is_huge(page)); + mi_assert_internal(_mi_page_segment(page)->page_kind == MI_PAGE_HUGE); mi_assert_internal(_mi_page_segment(page)->used==1); #if MI_HUGE_PAGE_ABANDON mi_assert_internal(_mi_page_segment(page)->thread_id==0); // abandoned, not in the huge queue diff --git a/src/segment.c b/src/segment.c index a4fd26e9..cec3079e 100644 --- a/src/segment.c +++ b/src/segment.c @@ -142,6 +142,7 @@ static bool mi_segment_is_valid(const mi_segment_t* segment, mi_segments_tld_t* mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie); mi_assert_internal(segment->used <= segment->capacity); mi_assert_internal(segment->abandoned <= segment->used); + mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM || segment->capacity == 1); size_t nfree = 0; for (size_t i = 0; i < segment->capacity; i++) { const mi_page_t* const page = &segment->pages[i]; @@ -151,6 +152,7 @@ static bool mi_segment_is_valid(const mi_segment_t* segment, mi_segments_tld_t* if (page->segment_in_use) { mi_assert_expensive(!mi_pages_purge_contains(page, tld)); } + if (segment->page_kind == MI_PAGE_HUGE) mi_assert_internal(page->is_huge); } 
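The queue selection in this patch keys on the new `is_huge` flag rather than on block-size comparisons, so an over-aligned (but small) huge page still lands in the huge queue. A stand-alone sketch of that bin selection is below; the bin numbers, `size_bin` map, and `page_t` layout are toy placeholders, not the real `mi_bin` or `mi_page_t`.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

// Placeholder bin layout: 0..BIN_HUGE-1 are size bins, then huge, then full.
#define BIN_HUGE  73
#define BIN_FULL  74

typedef struct page_s {
  size_t block_size;
  bool   in_full;
  bool   is_huge;
} page_t;

// Toy size-class map; the real mi_bin() is a carefully tuned lookup.
static unsigned size_bin(size_t size) {
  unsigned bin = 0;
  while (size > 8 && bin < BIN_HUGE - 1) { size >>= 1; bin++; }
  return bin;
}

static unsigned page_bin(const page_t* page) {
  if (page->in_full) return BIN_FULL;   // full pages park in the full queue
  if (page->is_huge) return BIN_HUGE;   // huge pages always use the huge queue
  return size_bin(page->block_size);    // otherwise index by size class
}

int main(void) {
  page_t small_huge = { 4096, false, true };   // small block, but in a huge segment
  page_t normal     = { 4096, false, false };
  printf("%u %u\n", page_bin(&small_huge), page_bin(&normal));  // huge bin vs size bin
  return 0;
}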
mi_assert_internal(nfree + segment->used == segment->capacity); // mi_assert_internal(segment->thread_id == _mi_thread_id() || (segment->thread_id==0)); // or 0 @@ -615,11 +617,13 @@ static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind, _mi_memzero((uint8_t*)segment + ofs, info_size - ofs); // initialize pages info + const bool is_huge = (page_kind == MI_PAGE_HUGE); for (size_t i = 0; i < capacity; i++) { mi_assert_internal(i <= 255); segment->pages[i].segment_idx = (uint8_t)i; segment->pages[i].is_committed = segment->memid.initially_committed; segment->pages[i].is_zero_init = segment->memid.initially_zero; + segment->pages[i].is_huge = is_huge; } // initialize @@ -753,7 +757,7 @@ void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld) mi_segment_abandon(segment,tld); } else if (segment->used + 1 == segment->capacity) { - mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM); // for now we only support small and medium pages + mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM); // large and huge pages are always the single page in a segment if (segment->page_kind <= MI_PAGE_MEDIUM) { // move back to segments free list mi_segment_insert_in_free_queue(segment,tld); @@ -1123,13 +1127,14 @@ static mi_page_t* mi_segment_huge_page_alloc(size_t size, size_t page_alignment, #endif mi_page_t* page = mi_segment_find_free(segment, tld); mi_assert_internal(page != NULL); + mi_assert_internal(page->is_huge); // for huge pages we initialize the block_size as we may // overallocate to accommodate large alignments. size_t psize; uint8_t* start = mi_segment_page_start_ex(segment, page, 0, &psize, NULL); page->block_size = psize; - + // reset the part of the page that will not be used; this can be quite large (close to MI_SEGMENT_SIZE) if (page_alignment > 0 && segment->allow_decommit && page->is_committed) { uint8_t* aligned_p = (uint8_t*)_mi_align_up((uintptr_t)start, page_alignment); From 00228021776780326de4a99e540eadc002909cda Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Mon, 25 Mar 2024 15:25:04 -0700 Subject: [PATCH 075/119] commend and make at least 8 tries for reclaim --- include/mimalloc/internal.h | 5 ++-- include/mimalloc/types.h | 53 +++++++++++++++++++++++-------------- src/heap.c | 2 +- src/init.c | 6 ++--- src/libc.c | 4 +-- src/page.c | 9 ++++--- src/segment.c | 51 ++++++++++++++++++----------------- src/stats.c | 46 +++++++++++++++----------------- 8 files changed, 96 insertions(+), 80 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 4df8ca68..12436ca4 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -147,7 +147,7 @@ void _mi_segment_map_freed_at(const mi_segment_t* segment); mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld, mi_os_tld_t* os_tld); void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld); void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld); -uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size, size_t* pre_size); // page start for any page +uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size); #if MI_HUGE_PAGE_ABANDON void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); @@ -454,6 +454,7 @@ static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const // Quick page start for 
initialized pages static inline uint8_t* mi_page_start(const mi_page_t* page) { mi_assert_internal(page->page_start != NULL); + mi_assert_expensive(_mi_segment_page_start(_mi_page_segment(page),page,NULL) == page->page_start); return page->page_start; } @@ -466,7 +467,7 @@ static inline mi_page_t* _mi_ptr_page(void* p) { // Get the block size of a page (special case for huge objects) static inline size_t mi_page_block_size(const mi_page_t* page) { mi_assert_internal(page->block_size > 0); - return page->block_size; + return page->block_size; } static inline bool mi_page_is_huge(const mi_page_t* page) { diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 6b22c83e..adfd7838 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -16,6 +16,8 @@ terms of the MIT license. A copy of the license can be found in the file // are allocated. // mi_page_t : a mimalloc page (usually 64KiB or 512KiB) from // where objects are allocated. +// Note: we write "OS page" for OS memory pages while +// using plain "page" for mimalloc pages (`mi_page_t`). // -------------------------------------------------------------------------- @@ -89,10 +91,11 @@ terms of the MIT license. A copy of the license can be found in the file #endif -// We used to abandon huge pages but to eagerly deallocate if freed from another thread, -// but that makes it not possible to visit them during a heap walk or include them in a -// `mi_heap_destroy`. We therefore instead reset/decommit the huge blocks if freed from -// another thread so most memory is available until it gets properly freed by the owning thread. +// We used to abandon huge pages in order to eagerly deallocate it if freed from another thread. +// Unfortunately, that makes it not possible to visit them during a heap walk or include them in a +// `mi_heap_destroy`. We therefore instead reset/decommit the huge blocks nowadays if freed from +// another thread so the memory becomes "virtually" available (and eventually gets properly freed by +// the owning thread). // #define MI_HUGE_PAGE_ABANDON 1 @@ -160,7 +163,7 @@ typedef int32_t mi_ssize_t; #define MI_SMALL_PAGE_SHIFT (13 + MI_INTPTR_SHIFT) // 64KiB #define MI_MEDIUM_PAGE_SHIFT ( 3 + MI_SMALL_PAGE_SHIFT) // 512KiB #define MI_LARGE_PAGE_SHIFT ( 3 + MI_MEDIUM_PAGE_SHIFT) // 4MiB -#define MI_SEGMENT_SHIFT ( MI_LARGE_PAGE_SHIFT) // 4MiB +#define MI_SEGMENT_SHIFT ( MI_LARGE_PAGE_SHIFT) // 4MiB -- must be equal to `MI_LARGE_PAGE_SHIFT` // Derived constants #define MI_SEGMENT_SIZE (MI_ZU(1)<0`) uint8_t* page_start; // start of the page area containing the blocks @@ -304,7 +308,7 @@ typedef struct mi_page_s { struct mi_page_s* next; // next page owned by the heap with the same `block_size` struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` - #if MI_INTPTR_SIZE==4 // pad to 14 words on 32-bit + #if MI_INTPTR_SIZE==4 // pad to 12 words on 32-bit void* padding[1]; #endif } mi_page_t; @@ -319,17 +323,22 @@ typedef enum mi_page_kind_e { MI_PAGE_SMALL, // small blocks go into 64KiB pages inside a segment MI_PAGE_MEDIUM, // medium blocks go into 512KiB pages inside a segment MI_PAGE_LARGE, // larger blocks go into a single page spanning a whole segment - MI_PAGE_HUGE // huge blocks (>512KiB) are put into a single page in a segment of the exact size (but still 2MiB aligned) + MI_PAGE_HUGE // a huge page is a single page in a segment of variable size (but still 2MiB aligned) + // used for blocks `> MI_LARGE_OBJ_SIZE_MAX` or an aligment `> MI_BLOCK_ALIGNMENT_MAX`. 
} mi_page_kind_t; +// --------------------------------------------------------------- +// a memory id tracks the provenance of arena/OS allocated memory +// --------------------------------------------------------------- + // Memory can reside in arena's, direct OS allocated, or statically allocated. The memid keeps track of this. typedef enum mi_memkind_e { MI_MEM_NONE, // not allocated MI_MEM_EXTERNAL, // not owned by mimalloc but provided externally (via `mi_manage_os_memory` for example) MI_MEM_STATIC, // allocated in a static area and should not be freed (for arena meta data for example) MI_MEM_OS, // allocated from the OS - MI_MEM_OS_HUGE, // allocated as huge os pages + MI_MEM_OS_HUGE, // allocated as huge OS pages (usually 1GiB, pinned to physical memory) MI_MEM_OS_REMAP, // allocated in a remapable area (i.e. using `mremap`) MI_MEM_ARENA // allocated from an arena (the usual case) } mi_memkind_t; @@ -346,7 +355,7 @@ typedef struct mi_memid_os_info { typedef struct mi_memid_arena_info { size_t block_index; // index in the arena mi_arena_id_t id; // arena id (>= 1) - bool is_exclusive; // the arena can only be used for specific arena allocations + bool is_exclusive; // this arena can only be used for specific arena allocations } mi_memid_arena_info_t; typedef struct mi_memid_s { @@ -354,19 +363,22 @@ typedef struct mi_memid_s { mi_memid_os_info_t os; // only used for MI_MEM_OS mi_memid_arena_info_t arena; // only used for MI_MEM_ARENA } mem; - bool is_pinned; // `true` if we cannot decommit/reset/protect in this memory (e.g. when allocated using large OS pages) + bool is_pinned; // `true` if we cannot decommit/reset/protect in this memory (e.g. when allocated using large (2Mib) or huge (1GiB) OS pages) bool initially_committed;// `true` if the memory was originally allocated as committed bool initially_zero; // `true` if the memory was originally zero initialized mi_memkind_t memkind; } mi_memid_t; -// Segments are large allocated memory blocks (2MiB on 64 bit) from -// the OS. Inside segments we allocated fixed size _pages_ that -// contain blocks. +// --------------------------------------------------------------- +// Segments contain mimalloc pages +// --------------------------------------------------------------- + +// Segments are large allocated memory blocks (2MiB on 64 bit) from the OS. +// Inside segments we allocated fixed size _pages_ that contain blocks. 
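The memid comments above describe how every allocation records where its memory came from (external, static, OS, huge OS pages, remappable, or an arena) so the right release path can be chosen later. A reduced sketch of that kind of tagged provenance record and a dispatch on it; the enum, struct, and `release` function are simplified placeholders shaped after the hunk above, not mimalloc's `mi_memid_t`.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

typedef enum memkind_e {
  MEM_NONE,      // not allocated
  MEM_EXTERNAL,  // provided by the embedder, never freed by us
  MEM_STATIC,    // static metadata, never freed
  MEM_OS,        // plain OS allocation
  MEM_ARENA      // carved out of a pre-reserved arena
} memkind_t;

typedef struct memid_s {
  memkind_t kind;
  bool      is_pinned;                       // cannot be decommitted/reset
  union {
    struct { size_t block_index; } arena;    // only meaningful for MEM_ARENA
  } mem;
} memid_t;

static void release(void* p, size_t size, memid_t id) {
  switch (id.kind) {
    case MEM_OS:    printf("munmap-like free of %zu bytes at %p\n", size, p); break;
    case MEM_ARENA: printf("return block %zu to its arena bitmap\n", id.mem.arena.block_index); break;
    default:        printf("nothing to do for kind %d\n", (int)id.kind); break;
  }
}

int main(void) {
  memid_t id = { MEM_ARENA, false, { { 17 } } };
  release(NULL, 4096, id);
  return 0;
}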
typedef struct mi_segment_s { // constant fields - mi_memid_t memid; // id for the os-level memory manager + mi_memid_t memid; // memory id to track provenance bool allow_decommit; bool allow_purge; size_t segment_size; // for huge pages this may be different from `MI_SEGMENT_SIZE` @@ -572,6 +584,7 @@ void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); #define mi_heap_stat_increase(heap,stat,amount) mi_stat_increase( (heap)->tld->stats.stat, amount) #define mi_heap_stat_decrease(heap,stat,amount) mi_stat_decrease( (heap)->tld->stats.stat, amount) + // ------------------------------------------------------ // Thread Local data // ------------------------------------------------------ diff --git a/src/heap.c b/src/heap.c index 21cdfa46..bcb5a41c 100644 --- a/src/heap.c +++ b/src/heap.c @@ -508,7 +508,7 @@ static bool mi_heap_area_visit_blocks(const mi_heap_area_ex_t* xarea, mi_block_v const size_t bsize = mi_page_block_size(page); const size_t ubsize = mi_page_usable_block_size(page); // without padding size_t psize; - uint8_t* pstart = _mi_segment_page_start(_mi_page_segment(page), page, &psize, NULL); + uint8_t* pstart = _mi_segment_page_start(_mi_page_segment(page), page, &psize); if (page->capacity == 1) { // optimize page with one block diff --git a/src/init.c b/src/init.c index 604809ad..f5fa2ad8 100644 --- a/src/init.c +++ b/src/init.c @@ -14,17 +14,17 @@ terms of the MIT license. A copy of the license can be found in the file // Empty page used to initialize the small free pages array const mi_page_t _mi_page_empty = { - 0, + 0, false, false, false, false, 0, // capacity 0, // reserved capacity - 0, // used { 0 }, // flags - 0, // block size shift false, // is_zero 0, // retire_expire NULL, // free NULL, // local_free + 0, // used + 0, // block size shift 0, // block_size NULL, // page_start #if (MI_PADDING || MI_ENCODE_FREELIST) diff --git a/src/libc.c b/src/libc.c index f1412722..dd6b4007 100644 --- a/src/libc.c +++ b/src/libc.c @@ -210,7 +210,7 @@ void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) { if (c == 'x' || c == 'u') { if (numtype == 'z') x = va_arg(args, size_t); else if (numtype == 't') x = va_arg(args, uintptr_t); // unsigned ptrdiff_t - else if (numtype == 'L') x = va_arg(args, unsigned long long); + else if (numtype == 'L') x = (uintptr_t)va_arg(args, unsigned long long); else x = va_arg(args, unsigned long); } else if (c == 'p') { @@ -231,7 +231,7 @@ void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) { intptr_t x = 0; if (numtype == 'z') x = va_arg(args, intptr_t ); else if (numtype == 't') x = va_arg(args, ptrdiff_t); - else if (numtype == 'L') x = va_arg(args, long long); + else if (numtype == 'L') x = (intptr_t)va_arg(args, long long); else x = va_arg(args, long); char pre = 0; if (x < 0) { diff --git a/src/page.c b/src/page.c index d36421f4..7a333cb4 100644 --- a/src/page.c +++ b/src/page.c @@ -59,7 +59,7 @@ static inline uint8_t* mi_page_area(const mi_page_t* page) { static bool mi_page_list_is_valid(mi_page_t* page, mi_block_t* p) { size_t psize; - uint8_t* page_area = _mi_segment_page_start(_mi_page_segment(page), page, &psize, NULL); + uint8_t* page_area = _mi_segment_page_start(_mi_page_segment(page), page, &psize); mi_block_t* start = (mi_block_t*)page_area; mi_block_t* end = (mi_block_t*)(page_area + psize); while(p != NULL) { @@ -85,7 +85,8 @@ static bool mi_page_is_valid_init(mi_page_t* page) { // const size_t bsize = mi_page_block_size(page); mi_segment_t* segment = 
_mi_page_segment(page); uint8_t* start = mi_page_start(page); - mi_assert_internal(start == _mi_segment_page_start(segment,page,NULL,NULL)); + mi_assert_internal(start == _mi_segment_page_start(segment,page,NULL)); + mi_assert_internal(page->is_huge == (segment->page_kind == MI_PAGE_HUGE)); //mi_assert_internal(start + page->capacity*page->block_size == page->top); mi_assert_internal(mi_page_list_is_valid(page,page->free)); @@ -616,7 +617,7 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) size_t page_size; //uint8_t* page_start = - _mi_segment_page_start(_mi_page_segment(page), page, &page_size, NULL); + _mi_segment_page_start(_mi_page_segment(page), page, &page_size); mi_stat_counter_increase(tld->stats.pages_extended, 1); // calculate the extend count @@ -660,7 +661,7 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi mi_page_set_heap(page, heap); page->block_size = block_size; size_t page_size; - page->page_start = _mi_segment_page_start(segment, page, &page_size, NULL); + page->page_start = _mi_segment_page_start(segment, page, &page_size); mi_track_mem_noaccess(page->page_start,page_size); mi_assert_internal(page_size / block_size < (1L<<16)); page->reserved = (uint16_t)(page_size / block_size); diff --git a/src/segment.c b/src/segment.c index cec3079e..ff20b504 100644 --- a/src/segment.c +++ b/src/segment.c @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2020, Microsoft Research, Daan Leijen +Copyright (c) 2018-2024, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -25,14 +25,15 @@ static uint8_t* mi_segment_raw_page_start(const mi_segment_t* segment, const mi_ - small pages (64KiB), 64 in one segment - medium pages (512KiB), 8 in one segment - large pages (4MiB), 1 in one segment - - huge blocks > MI_LARGE_OBJ_SIZE_MAX become large segment with 1 page + - huge segments have 1 page in one segment that can be larger than `MI_SEGMENT_SIZE`. + it is used for blocks `> MI_LARGE_OBJ_SIZE_MAX` or with alignment `> MI_BLOCK_ALIGNMENT_MAX`. - In any case the memory for a segment is virtual and usually committed on demand. + The memory for a segment is usually committed on demand. (i.e. we are careful to not touch the memory until we actually allocate a block there) - If a thread ends, it "abandons" pages with used blocks - and there is an abandoned segment list whose segments can - be reclaimed by still running threads, much like work-stealing. + If a thread ends, it "abandons" pages that still contain live blocks. + Such segments are abondoned and these can be reclaimed by still running threads, + (much like work-stealing). 
-------------------------------------------------------------------------------- */ @@ -142,7 +143,7 @@ static bool mi_segment_is_valid(const mi_segment_t* segment, mi_segments_tld_t* mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie); mi_assert_internal(segment->used <= segment->capacity); mi_assert_internal(segment->abandoned <= segment->used); - mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM || segment->capacity == 1); + mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM || segment->capacity == 1); // one large or huge page per segment size_t nfree = 0; for (size_t i = 0; i < segment->capacity; i++) { const mi_page_t* const page = &segment->pages[i]; @@ -152,7 +153,7 @@ static bool mi_segment_is_valid(const mi_segment_t* segment, mi_segments_tld_t* if (page->segment_in_use) { mi_assert_expensive(!mi_pages_purge_contains(page, tld)); } - if (segment->page_kind == MI_PAGE_HUGE) mi_assert_internal(page->is_huge); + mi_assert_internal(page->is_huge == (segment->page_kind == MI_PAGE_HUGE)); } mi_assert_internal(nfree + segment->used == segment->capacity); // mi_assert_internal(segment->thread_id == _mi_thread_id() || (segment->thread_id==0)); // or 0 @@ -420,11 +421,11 @@ static uint8_t* mi_segment_raw_page_start(const mi_segment_t* segment, const mi_ } // Start of the page available memory; can be used on uninitialized pages (only `segment_idx` must be set) -static uint8_t* mi_segment_page_start_ex(const mi_segment_t* segment, const mi_page_t* page, size_t block_size, size_t* page_size, size_t* pre_size) +uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size) { size_t psize; uint8_t* p = mi_segment_raw_page_start(segment, page, &psize); - if (pre_size != NULL) *pre_size = 0; + const size_t block_size = mi_page_block_size(page); if (page->segment_idx == 0 && block_size > 0 && segment->page_kind <= MI_PAGE_MEDIUM) { // for small and medium objects, ensure the page start is aligned with the block size (PR#66 by kickunderscore) size_t adjust = block_size - ((uintptr_t)p % block_size); @@ -432,7 +433,7 @@ static uint8_t* mi_segment_page_start_ex(const mi_segment_t* segment, const mi_p if (adjust < block_size) { p += adjust; psize -= adjust; - if (pre_size != NULL) *pre_size = adjust; + // if (pre_size != NULL) *pre_size = adjust; } mi_assert_internal((uintptr_t)p % block_size == 0); } @@ -444,9 +445,6 @@ static uint8_t* mi_segment_page_start_ex(const mi_segment_t* segment, const mi_p return p; } -uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size, size_t* pre_size) { - return mi_segment_page_start_ex(segment, page, mi_page_block_size(page), page_size, pre_size); -} static size_t mi_segment_calculate_sizes(size_t capacity, size_t required, size_t* pre_size, size_t* info_size) { @@ -961,26 +959,31 @@ void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld) { } static long mi_segment_get_reclaim_tries(void) { - // limit the tries to 10% (default) of the abandoned segments with at least 8 tries, and at most 1024. + // limit the tries to 10% (default) of the abandoned segments with at least 8 and at most 1024 tries. const size_t perc = (size_t)mi_option_get_clamp(mi_option_max_segment_reclaim, 0, 100); if (perc <= 0) return 0; const size_t total_count = _mi_arena_segment_abandoned_count(); + if (total_count == 0) return 0; const size_t relative_count = (total_count > 10000 ? 
(total_count / 100) * perc : (total_count * perc) / 100); // avoid overflow - long max_tries = (long)(relative_count < 8 ? 8 : (relative_count > 1024 ? 1024 : relative_count)); + long max_tries = (long)(relative_count <= 1 ? 1 : (relative_count > 1024 ? 1024 : relative_count)); + if (max_tries < 8 && total_count > 8) { max_tries = 8; } return max_tries; } static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size, mi_page_kind_t page_kind, bool* reclaimed, mi_segments_tld_t* tld) { *reclaimed = false; - mi_segment_t* segment; - mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap,&current); long max_tries = mi_segment_get_reclaim_tries(); + if (max_tries <= 0) return NULL; + + mi_segment_t* segment; + mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap, &current); while ((max_tries-- > 0) && ((segment = _mi_arena_segment_clear_abandoned_next(&current)) != NULL)) { segment->abandoned_visits++; - // todo: an arena exclusive heap will potentially visit many abandoned unsuitable segments - // and push them into the visited list and use many tries. Perhaps we can skip non-suitable ones in a better way? + // todo: should we respect numa affinity for abandoned reclaim? perhaps only for the first visit? + // todo: an arena exclusive heap will potentially visit many abandoned unsuitable segments and use many tries + // Perhaps we can skip non-suitable ones in a better way? bool is_suitable = _mi_heap_memid_is_suitable(heap, segment->memid); bool all_pages_free; bool has_page = mi_segment_check_free(segment,block_size,&all_pages_free); // try to free up pages (due to concurrent frees) @@ -1088,7 +1091,7 @@ static mi_page_t* mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, mi_p mi_assert_internal(page != NULL); #if MI_DEBUG>=2 && !MI_TRACK_ENABLED // && !MI_TSAN // verify it is committed - mi_segment_page_start_ex(_mi_page_segment(page), page, sizeof(void*), NULL, NULL)[0] = 0; + mi_segment_raw_page_start(_mi_page_segment(page), page, NULL)[0] = 0; #endif return page; } @@ -1111,7 +1114,7 @@ static mi_page_t* mi_segment_large_page_alloc(mi_heap_t* heap, size_t block_size mi_page_t* page = mi_segment_find_free(segment, tld); mi_assert_internal(page != NULL); #if MI_DEBUG>=2 && !MI_TRACK_ENABLED // && !MI_TSAN - mi_segment_page_start_ex(segment, page, sizeof(void*), NULL, NULL)[0] = 0; + mi_segment_raw_page_start(segment, page, NULL)[0] = 0; #endif return page; } @@ -1132,9 +1135,9 @@ static mi_page_t* mi_segment_huge_page_alloc(size_t size, size_t page_alignment, // for huge pages we initialize the block_size as we may // overallocate to accommodate large alignments.
size_t psize; - uint8_t* start = mi_segment_page_start_ex(segment, page, 0, &psize, NULL); + uint8_t* start = mi_segment_raw_page_start(segment, page, &psize); page->block_size = psize; - + // reset the part of the page that will not be used; this can be quite large (close to MI_SEGMENT_SIZE) if (page_alignment > 0 && segment->allow_decommit && page->is_committed) { uint8_t* aligned_p = (uint8_t*)_mi_align_up((uintptr_t)start, page_alignment); diff --git a/src/stats.c b/src/stats.c index 5dfd713b..99cf89c5 100644 --- a/src/stats.c +++ b/src/stats.c @@ -174,13 +174,28 @@ static void mi_print_count(int64_t n, int64_t unit, mi_output_fun* out, void* ar static void mi_stat_print_ex(const mi_stat_count_t* stat, const char* msg, int64_t unit, mi_output_fun* out, void* arg, const char* notok ) { _mi_fprintf(out, arg,"%10s:", msg); - if (unit > 0) { - mi_print_amount(stat->peak, unit, out, arg); - mi_print_amount(stat->allocated, unit, out, arg); - mi_print_amount(stat->freed, unit, out, arg); - mi_print_amount(stat->current, unit, out, arg); - mi_print_amount(unit, 1, out, arg); - mi_print_count(stat->allocated, unit, out, arg); + if (unit != 0) { + if (unit > 0) { + mi_print_amount(stat->peak, unit, out, arg); + mi_print_amount(stat->allocated, unit, out, arg); + mi_print_amount(stat->freed, unit, out, arg); + mi_print_amount(stat->current, unit, out, arg); + mi_print_amount(unit, 1, out, arg); + mi_print_count(stat->allocated, unit, out, arg); + } + else { + mi_print_amount(stat->peak, -1, out, arg); + mi_print_amount(stat->allocated, -1, out, arg); + mi_print_amount(stat->freed, -1, out, arg); + mi_print_amount(stat->current, -1, out, arg); + if (unit == -1) { + _mi_fprintf(out, arg, "%24s", ""); + } + else { + mi_print_amount(-unit, 1, out, arg); + mi_print_count((stat->allocated / -unit), 0, out, arg); + } + } if (stat->allocated > stat->freed) { _mi_fprintf(out, arg, " "); _mi_fprintf(out, arg, (notok == NULL ? "not all freed" : notok)); @@ -190,23 +205,6 @@ static void mi_stat_print_ex(const mi_stat_count_t* stat, const char* msg, int64 _mi_fprintf(out, arg, " ok\n"); } } - else if (unit<0) { - mi_print_amount(stat->peak, -1, out, arg); - mi_print_amount(stat->allocated, -1, out, arg); - mi_print_amount(stat->freed, -1, out, arg); - mi_print_amount(stat->current, -1, out, arg); - if (unit==-1) { - _mi_fprintf(out, arg, "%24s", ""); - } - else { - mi_print_amount(-unit, 1, out, arg); - mi_print_count((stat->allocated / -unit), 0, out, arg); - } - if (stat->allocated > stat->freed) - _mi_fprintf(out, arg, " not all freed!\n"); - else - _mi_fprintf(out, arg, " ok\n"); - } else { mi_print_amount(stat->peak, 1, out, arg); mi_print_amount(stat->allocated, 1, out, arg); From 460278f1102bf74d49aa7de402fde1053377fbb7 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Mon, 25 Mar 2024 16:02:20 -0700 Subject: [PATCH 076/119] comments --- src/free.c | 41 +++++++---------------------------------- src/segment.c | 2 +- 2 files changed, 8 insertions(+), 35 deletions(-) diff --git a/src/free.c b/src/free.c index 87847f21..59c20aed 100644 --- a/src/free.c +++ b/src/free.c @@ -71,7 +71,7 @@ mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p) { static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept { MI_UNUSED(segment); mi_block_t* const block = (mi_page_has_aligned(page) ? 
_mi_page_ptr_unalign(page, p) : (mi_block_t*)p); - mi_free_block_local(page, block, true, true); + mi_free_block_local(page, block, true /* track stats */, true /* check for a full page */); } // free a pointer owned by another thread (page parameter comes first for better codegen) @@ -136,7 +136,7 @@ void mi_free(void* p) mi_attr_noexcept if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) // thread-local, aligned, and not a full page mi_block_t* const block = (mi_block_t*)p; - mi_free_block_local(page, block, true, false /* no need to check if the page is full */); + mi_free_block_local(page, block, true /* track stats */, false /* no need to check if the page is full */); } else { // page is full or contains (inner) aligned blocks; use generic path @@ -167,11 +167,11 @@ bool _mi_free_delayed_block(mi_block_t* block) { return false; } - // collect all other non-local frees to ensure up-to-date `used` count + // collect all other non-local frees (move from `thread_free` to `free`) to ensure up-to-date `used` count _mi_page_free_collect(page, false); - // and free the block (possibly freeing the page as well since used is updated) - mi_free_block_local(page, block, false /* stats have already been adjusted */, true); + // and free the block (possibly freeing the page as well since `used` is updated) + mi_free_block_local(page, block, false /* stats have already been adjusted */, true /* check for a full page */); return true; } @@ -225,10 +225,6 @@ static void mi_decl_noinline mi_free_block_delayed_mt( mi_page_t* page, mi_block } } -#if MI_HUGE_PAGE_ABANDON -static void mi_stat_huge_free(const mi_page_t* page); -#endif - // Multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_segment_t* segment, mi_block_t* block) { @@ -251,7 +247,7 @@ static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_segment_t* seg // that is safe as these are constant and the page won't be freed (as the block is not freed yet). 
mi_check_padding(page, block); - // adjust stats (after padding check and potential recursive `mi_free` above) + // adjust stats (after padding check and potentially recursive `mi_free` above) mi_stat_free(page, block); // stat_free may access the padding mi_track_free_size(block, mi_page_usable_size_of(page,block)); @@ -261,7 +257,6 @@ static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_segment_t* seg if (segment->page_kind == MI_PAGE_HUGE) { #if MI_HUGE_PAGE_ABANDON // huge page segments are always abandoned and can be freed immediately - mi_stat_huge_free(page); _mi_segment_huge_page_free(segment, page, block); return; #else @@ -510,35 +505,13 @@ static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], 1); #endif } -#if !MI_HUGE_PAGE_ABANDON else { - const size_t bpsize = mi_page_block_size(page); + const size_t bpsize = mi_page_block_size(page); // match stat in page.c:mi_huge_page_alloc mi_heap_stat_decrease(heap, huge, bpsize); } -#endif } #else static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { MI_UNUSED(page); MI_UNUSED(block); } #endif - -#if MI_HUGE_PAGE_ABANDON -#if (MI_STAT>0) -// maintain stats for huge objects -static void mi_stat_huge_free(const mi_page_t* page) { - mi_heap_t* const heap = mi_heap_get_default(); - const size_t bsize = mi_page_block_size(page); // to match stats in `page.c:mi_page_huge_alloc` - if (bsize <= MI_HUGE_OBJ_SIZE_MAX) { - mi_heap_stat_decrease(heap, huge, bsize); - } - else { - mi_heap_stat_decrease(heap, giant, bsize); - } -} -#else -static void mi_stat_huge_free(const mi_page_t* page) { - MI_UNUSED(page); -} -#endif -#endif diff --git a/src/segment.c b/src/segment.c index ff20b504..91ff9adb 100644 --- a/src/segment.c +++ b/src/segment.c @@ -1167,7 +1167,7 @@ void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block mi_block_set_next(page, block, page->free); page->free = block; page->used--; - page->is_zero = false; + page->is_zero_init = false; mi_assert(page->used == 0); mi_tld_t* tld = heap->tld; mi_segments_track_size((long)segment->segment_size, &tld->segments); From 1d8997236cb8e09103436491c8aa34548aa2cb24 Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 25 Mar 2024 16:28:15 -0700 Subject: [PATCH 077/119] add comment on concurrent access in ptr_unalign --- src/free.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/free.c b/src/free.c index 59c20aed..bf5498a4 100644 --- a/src/free.c +++ b/src/free.c @@ -52,6 +52,9 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool } // Adjust a block that was allocated aligned, to the actual start of the block in the page. +// note: this can be called from `mi_free_generic_mt` where a non-owning thread accesses the +// `page_start` and `block_size` fields; however these are constant and the page won't be +// deallocated (as the block we are freeing keeps it alive) and thus safe to read concurrently. 
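// An illustrative aside (a simplified sketch, not part of this patch; the helper name and
// parameter list are invented): the adjustment described above only needs the constant
// `page_start` and `block_size` values, roughly:
static void* mi_ptr_unalign_sketch(const uint8_t* page_start, size_t block_size, const void* p) {
  const size_t diff   = (size_t)((uintptr_t)p - (uintptr_t)page_start);  // byte offset of `p` inside the page area
  const size_t adjust = diff % block_size;                               // distance past the start of its block
  return (void*)((uintptr_t)p - adjust);                                 // start of the enclosing block (needs <stdint.h>/<stddef.h>)
}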
mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p) { mi_assert_internal(page!=NULL && p!=NULL); From a5228992366b7ff498635238e09aa6bf6fea642d Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 25 Mar 2024 16:32:19 -0700 Subject: [PATCH 078/119] fix compilation with ASAN --- src/free.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/free.c b/src/free.c index bf5498a4..43e1f76f 100644 --- a/src/free.c +++ b/src/free.c @@ -38,7 +38,7 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); #endif - if (track_stats) { mi_track_free_size(p, mi_page_usable_size_of(page, block)); } // faster then mi_usable_size as we already know the page and that p is unaligned + if (track_stats) { mi_track_free_size(block, mi_page_usable_size_of(page, block)); } // faster then mi_usable_size as we already know the page and that p is unaligned // actual free: push on the local free list mi_block_set_next(page, block, page->local_free); From 6b4f3f6223d7bc3ec5f4119ef14ed45582c2033d Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 25 Mar 2024 16:37:46 -0700 Subject: [PATCH 079/119] further ASAN fix --- src/page.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/page.c b/src/page.c index 7a333cb4..efcd8d91 100644 --- a/src/page.c +++ b/src/page.c @@ -673,8 +673,8 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi page->free_is_zero = page->is_zero_init; #if MI_DEBUG>2 if (page->is_zero_init) { - mi_track_mem_defined(page_start, page_size); - mi_assert_expensive(!page->is_zero_init || mi_mem_is_zero(page->page_start, page_size)); + mi_track_mem_defined(page->page_start, page_size); + mi_assert_expensive(mi_mem_is_zero(page->page_start, page_size)); } #endif if (block_size > 0 && _mi_is_power_of_two(block_size)) { From 8f7d1e9a41bb0182166aac6a8d4d8b00f60ed032 Mon Sep 17 00:00:00 2001 From: Daan Date: Fri, 29 Mar 2024 11:17:21 -0700 Subject: [PATCH 080/119] fix free in realpath when using ASAN --- src/alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/alloc.c b/src/alloc.c index 2e03eca0..32175b0c 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -396,7 +396,8 @@ char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) char* rname = realpath(fname, NULL); if (rname == NULL) return NULL; char* result = mi_heap_strdup(heap, rname); - free(rname); // use regular free! 
(which may be redirected to our free but that's ok) + mi_cfree(rname); // use checked free (which may be redirected to our free but that's ok) + // note: with ASAN realpath is intercepted and mi_cfree may leak the returned pointer :-( return result; } /* From 10721ddbfd5827cca9a81a06ab6fccdd33a8a0f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Teodor=20Sp=C3=A6ren?= Date: Sun, 31 Mar 2024 23:18:52 +0200 Subject: [PATCH 081/119] Remove unneeded include --- include/mimalloc.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index c125932f..b3f60a34 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -487,7 +487,6 @@ template bool operator!=(const mi_stl_allocator& , const #define MI_HAS_HEAP_STL_ALLOCATOR 1 #include // std::shared_ptr -#include "mimalloc/types.h" // Common base class for STL allocators in a specific heap template struct _mi_heap_stl_allocator_common : public _mi_stl_allocator_common { From 764aa44598f3669952ce65d064f14ac725671116 Mon Sep 17 00:00:00 2001 From: Daan Date: Fri, 19 Apr 2024 09:38:40 -0700 Subject: [PATCH 082/119] remove macOS preprocessor macros that do not exist (anymore). issue #879 --- src/prim/unix/prim.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c index 87ad63b1..9c4ecd4b 100644 --- a/src/prim/unix/prim.c +++ b/src/prim/unix/prim.c @@ -42,8 +42,8 @@ terms of the MIT license. A copy of the license can be found in the file #elif defined(__APPLE__) #include #include - #if !TARGET_IOS_IPHONE && !TARGET_IOS_SIMULATOR - #include + #if !defined(TARGET_OS_OSX) || TARGET_OS_OSX // see issue #879, used to be (!TARGET_IOS_IPHONE && !TARGET_IOS_SIMULATOR) + #include // VM_MAKE_TAG, VM_FLAGS_SUPERPAGE_SIZE_2MB, etc. 
#endif #if !defined(MAC_OS_X_VERSION_10_7) #define MAC_OS_X_VERSION_10_7 1070 From 06b510c42d23e19602c595e5172bd7363a73714d Mon Sep 17 00:00:00 2001 From: Daan Date: Fri, 19 Apr 2024 09:41:24 -0700 Subject: [PATCH 083/119] fix build pipeline for ASAN --- test/test-api.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/test-api.c b/test/test-api.c index 6dd2bc7f..8b6378bf 100644 --- a/test/test-api.c +++ b/test/test-api.c @@ -295,11 +295,13 @@ int main(void) { // --------------------------------------------------- // various // --------------------------------------------------- + #if !defined(MI_TRACK_ASAN) // realpath may leak with ASAN enabled (as the ASAN allocator intercepts it) CHECK_BODY("realpath") { char* s = mi_realpath( ".", NULL ); // printf("realpath: %s\n",s); mi_free(s); }; + #endif CHECK("stl_allocator1", test_stl_allocator1()); CHECK("stl_allocator2", test_stl_allocator2()); From 5050b630389bb4d2013d1e82cc1b2c2d9b65b963 Mon Sep 17 00:00:00 2001 From: Daan Date: Fri, 19 Apr 2024 09:53:21 -0700 Subject: [PATCH 084/119] define MI_MAX_ALLOC_SIZE as PTRDIFF_MAX (related to #877) --- include/mimalloc/types.h | 2 ++ src/page.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index adfd7838..5d219d68 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -195,6 +195,8 @@ typedef int32_t mi_ssize_t; // Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in dedicated huge page segments #define MI_BLOCK_ALIGNMENT_MAX (MI_SEGMENT_SIZE >> 1) +// we never allocate more than PTRDIFF_MAX (see also ) +#define MI_MAX_ALLOC_SIZE PTRDIFF_MAX // ------------------------------------------------------ // Mimalloc pages contain allocated blocks diff --git a/src/page.c b/src/page.c index efcd8d91..3b3ba2f9 100644 --- a/src/page.c +++ b/src/page.c @@ -857,7 +857,7 @@ static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignme // huge allocation? const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size` if mi_unlikely(req_size > (MI_LARGE_OBJ_SIZE_MAX - MI_PADDING_SIZE) || huge_alignment > 0) { - if mi_unlikely(req_size > PTRDIFF_MAX) { // we don't allocate more than PTRDIFF_MAX (see ) + if mi_unlikely(req_size > MI_MAX_ALLOC_SIZE) { _mi_error_message(EOVERFLOW, "allocation request is too large (%zu bytes)\n", req_size); return NULL; } From bb1fafa1bbea44e0421256b9c99dfdf67452873a Mon Sep 17 00:00:00 2001 From: Daan Date: Fri, 19 Apr 2024 10:34:04 -0700 Subject: [PATCH 085/119] forward strdup/strndup to avoid leaks on macOS -- addresses PR #769 --- src/alloc-override.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/alloc-override.c b/src/alloc-override.c index b2c94ce2..92536976 100644 --- a/src/alloc-override.c +++ b/src/alloc-override.c @@ -130,11 +130,16 @@ typedef struct mi_nothrow_s { int _tag; } mi_nothrow_t; // cannot override malloc unless using a dll. // we just override new/delete which does work in a static library. 
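// An illustrative aside (an assumption about the forwarding macros, not text from the patch):
// when overriding happens by definition rather than by symbol aliasing, the MI_FORWARDx
// forwarders below amount to plain wrappers, roughly:
//   void* malloc(size_t size)              { return mi_malloc(size); }
//   void  free(void* p)                    { mi_free(p); }
//   char* strdup(const char* s)            { return mi_strdup(s); }
//   char* strndup(const char* s, size_t n) { return mi_strndup(s, n); }
// Forwarding strdup/strndup matters because, as noted below, some C libraries allocate the
// copy with a more primitive call than the overridden malloc, so releasing it through the
// overridden free would otherwise mismatch (the macOS leaks this patch addresses).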
#else - // On all other systems forward to our API + // On all other systems forward allocation primitives to our API mi_decl_export void* malloc(size_t size) MI_FORWARD1(mi_malloc, size) mi_decl_export void* calloc(size_t size, size_t n) MI_FORWARD2(mi_calloc, size, n) mi_decl_export void* realloc(void* p, size_t newsize) MI_FORWARD2(mi_realloc, p, newsize) - mi_decl_export void free(void* p) MI_FORWARD0(mi_free, p) + mi_decl_export void free(void* p) MI_FORWARD0(mi_free, p) + // In principle we do not need to forward `strdup`/`strndup` but on some systems these do not use `malloc` internally (but a more primitive call) + mi_decl_export char* strdup(const char* str) MI_FORWARD1(mi_strdup, str) + #if !defined(__APPLE__) || (defined(MAC_OS_X_VERSION_10_7) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_7) + mi_decl_export char* strndup(const char* str, size_t n) MI_FORWARD2(mi_strndup, str, n) + #endif #endif #if (defined(__GNUC__) || defined(__clang__)) && !defined(__APPLE__) From 70eb7fb3900a9f5dfde4451ae931414815299b6b Mon Sep 17 00:00:00 2001 From: Daan Date: Fri, 19 Apr 2024 12:01:26 -0700 Subject: [PATCH 086/119] collect arenas even if not on the main thread (issue #878) --- src/arena.c | 2 +- src/heap.c | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/arena.c b/src/arena.c index bb5a3725..b78d69f8 100644 --- a/src/arena.c +++ b/src/arena.c @@ -709,7 +709,7 @@ static void mi_arenas_unsafe_destroy(void) { // Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired void _mi_arena_collect(bool force_purge, mi_stats_t* stats) { - mi_arenas_try_purge(force_purge, true /* visit all */, stats); + mi_arenas_try_purge(force_purge, force_purge /* visit all? */, stats); } // destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` diff --git a/src/heap.c b/src/heap.c index bcb5a41c..f5c1c840 100644 --- a/src/heap.c +++ b/src/heap.c @@ -121,6 +121,8 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) if (heap==NULL || !mi_heap_is_initialized(heap)) return; _mi_deferred_free(heap, collect >= MI_FORCE); + const bool force = (collect >= MI_FORCE); + // note: never reclaim on collect but leave it to threads that need storage to reclaim if ( #ifdef NDEBUG @@ -145,22 +147,24 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) _mi_heap_delayed_free_all(heap); // collect retired pages - _mi_heap_collect_retired(heap, collect >= MI_FORCE); + _mi_heap_collect_retired(heap, force); // collect all pages owned by this thread mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL); mi_assert_internal( collect != MI_ABANDON || mi_atomic_load_ptr_acquire(mi_block_t,&heap->thread_delayed_free) == NULL ); // collect segment and thread caches - if (collect >= MI_FORCE) { + if (force) { _mi_segment_thread_collect(&heap->tld->segments); } - // collect arenas on program-exit (or shared library unload) - if (collect >= MI_FORCE && _mi_is_main_thread() && mi_heap_is_backing(heap)) { + // if forced, collect thread data cache on program-exit (or shared library unload) + if (force && _mi_is_main_thread() && mi_heap_is_backing(heap)) { _mi_thread_data_collect(); // collect thread data cache - _mi_arena_collect(true /* force purge */, &heap->tld->stats); } + + // collect arenas + _mi_arena_collect(force /* force purge? 
*/, &heap->tld->stats); } void _mi_heap_collect_abandon(mi_heap_t* heap) { From 0ea2e04902209cf9d6a8e427648d18c39e4d4216 Mon Sep 17 00:00:00 2001 From: Daan Date: Fri, 19 Apr 2024 12:06:27 -0700 Subject: [PATCH 087/119] dont purge arenas on collect unless it is an explicit force (issue #878) --- src/heap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/heap.c b/src/heap.c index f5c1c840..328c708c 100644 --- a/src/heap.c +++ b/src/heap.c @@ -119,9 +119,9 @@ static bool mi_heap_page_never_delayed_free(mi_heap_t* heap, mi_page_queue_t* pq static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) { if (heap==NULL || !mi_heap_is_initialized(heap)) return; - _mi_deferred_free(heap, collect >= MI_FORCE); const bool force = (collect >= MI_FORCE); + _mi_deferred_free(heap, force); // note: never reclaim on collect but leave it to threads that need storage to reclaim if ( @@ -163,8 +163,8 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) _mi_thread_data_collect(); // collect thread data cache } - // collect arenas - _mi_arena_collect(force /* force purge? */, &heap->tld->stats); + // collect arenas (this is program wide so don't force purges on abandonment of threads) + _mi_arena_collect(collect == MI_FORCE /* force purge? */, &heap->tld->stats); } void _mi_heap_collect_abandon(mi_heap_t* heap) { From 32e065bb326bf630a37b35f0d1790608c5fe133b Mon Sep 17 00:00:00 2001 From: Daan Date: Fri, 19 Apr 2024 12:11:58 -0700 Subject: [PATCH 088/119] rename segment_thread_collect to segment_collect --- include/mimalloc/internal.h | 2 +- src/heap.c | 4 +--- src/segment.c | 4 ++-- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 12436ca4..9b364f86 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -155,7 +155,7 @@ void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi void _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); #endif -void _mi_segment_thread_collect(mi_segments_tld_t* tld); +void _mi_segment_collect(bool force, mi_segments_tld_t* tld); void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld); void _mi_abandoned_await_readers(void); bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment); diff --git a/src/heap.c b/src/heap.c index 328c708c..cf66c3bd 100644 --- a/src/heap.c +++ b/src/heap.c @@ -154,9 +154,7 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) mi_assert_internal( collect != MI_ABANDON || mi_atomic_load_ptr_acquire(mi_block_t,&heap->thread_delayed_free) == NULL ); // collect segment and thread caches - if (force) { - _mi_segment_thread_collect(&heap->tld->segments); - } + _mi_segment_collect(force, &heap->tld->segments); // if forced, collect thread data cache on program-exit (or shared library unload) if (force && _mi_is_main_thread() && mi_heap_is_backing(heap)) { diff --git a/src/segment.c b/src/segment.c index 91ff9adb..de413cc2 100644 --- a/src/segment.c +++ b/src/segment.c @@ -516,8 +516,8 @@ static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_se } // called by threads that are terminating to free cached segments -void _mi_segment_thread_collect(mi_segments_tld_t* tld) { - MI_UNUSED(tld); +void _mi_segment_collect(bool force, mi_segments_tld_t* tld) { + MI_UNUSED(force); MI_UNUSED(tld); #if MI_DEBUG>=2 if (!_mi_is_main_thread()) { mi_assert_internal(tld->pages_purge.first == NULL); 
From 2b7530e1831237a88ecb8f1774b7cab019642792 Mon Sep 17 00:00:00 2001 From: Daan Date: Fri, 19 Apr 2024 12:33:17 -0700 Subject: [PATCH 089/119] add segment_collect for forced heap_collect --- include/mimalloc/internal.h | 2 +- src/heap.c | 7 ++++--- src/segment.c | 12 +++--------- 3 files changed, 8 insertions(+), 13 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 9b364f86..47aa46ff 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -155,7 +155,7 @@ void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi void _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); #endif -void _mi_segment_collect(bool force, mi_segments_tld_t* tld); +void _mi_segment_collect(mi_segment_t* segment, bool force, mi_segments_tld_t* tld); void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld); void _mi_abandoned_await_readers(void); bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment); diff --git a/src/heap.c b/src/heap.c index cf66c3bd..a878f32b 100644 --- a/src/heap.c +++ b/src/heap.c @@ -104,6 +104,10 @@ static bool mi_heap_page_collect(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t // still used blocks but the thread is done; abandon the page _mi_page_abandon(page, pq); } + if (collect == MI_FORCE) { + mi_segment_t* segment = _mi_page_segment(page); + _mi_segment_collect(segment, true /* force? */, &heap->tld->segments); + } return true; // don't break } @@ -153,9 +157,6 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL); mi_assert_internal( collect != MI_ABANDON || mi_atomic_load_ptr_acquire(mi_block_t,&heap->thread_delayed_free) == NULL ); - // collect segment and thread caches - _mi_segment_collect(force, &heap->tld->segments); - // if forced, collect thread data cache on program-exit (or shared library unload) if (force && _mi_is_main_thread() && mi_heap_is_backing(heap)) { _mi_thread_data_collect(); // collect thread data cache diff --git a/src/segment.c b/src/segment.c index de413cc2..8ea76bfe 100644 --- a/src/segment.c +++ b/src/segment.c @@ -515,15 +515,9 @@ static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_se _mi_arena_free(segment, segment_size, committed_size, segment->memid, tld->stats); } -// called by threads that are terminating to free cached segments -void _mi_segment_collect(bool force, mi_segments_tld_t* tld) { - MI_UNUSED(force); MI_UNUSED(tld); -#if MI_DEBUG>=2 - if (!_mi_is_main_thread()) { - mi_assert_internal(tld->pages_purge.first == NULL); - mi_assert_internal(tld->pages_purge.last == NULL); - } -#endif +// called from `heap_collect`. This can be called per-page. 
+void _mi_segment_collect(mi_segment_t* segment, bool force, mi_segments_tld_t* tld) { + MI_UNUSED(segment); MI_UNUSED(force); MI_UNUSED(tld); } From bf5932c3c6ca52184690f88f0d4815c92a3ad42b Mon Sep 17 00:00:00 2001 From: Daan Date: Fri, 19 Apr 2024 12:43:05 -0700 Subject: [PATCH 090/119] use better purging for segments_collect --- include/mimalloc/internal.h | 4 ++-- src/arena.c | 4 ++-- src/heap.c | 9 ++++----- src/segment.c | 14 +++++++------- 4 files changed, 15 insertions(+), 16 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 47aa46ff..c944a126 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -124,7 +124,7 @@ void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld); bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id); bool _mi_arena_contains(const void* p); -void _mi_arena_collect(bool force_purge, mi_stats_t* stats); +void _mi_arenas_collect(bool force_purge, mi_stats_t* stats); void _mi_arena_unsafe_destroy_all(mi_stats_t* stats); bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment); @@ -155,7 +155,7 @@ void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi void _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); #endif -void _mi_segment_collect(mi_segment_t* segment, bool force, mi_segments_tld_t* tld); +void _mi_segments_collect(bool force, mi_segments_tld_t* tld); void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld); void _mi_abandoned_await_readers(void); bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment); diff --git a/src/arena.c b/src/arena.c index b78d69f8..6e5e21b7 100644 --- a/src/arena.c +++ b/src/arena.c @@ -708,7 +708,7 @@ static void mi_arenas_unsafe_destroy(void) { } // Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired -void _mi_arena_collect(bool force_purge, mi_stats_t* stats) { +void _mi_arenas_collect(bool force_purge, mi_stats_t* stats) { mi_arenas_try_purge(force_purge, force_purge /* visit all? */, stats); } @@ -716,7 +716,7 @@ void _mi_arena_collect(bool force_purge, mi_stats_t* stats) { // for dynamic libraries that are unloaded and need to release all their allocated memory. void _mi_arena_unsafe_destroy_all(mi_stats_t* stats) { mi_arenas_unsafe_destroy(); - _mi_arena_collect(true /* force purge */, stats); // purge non-owned arenas + _mi_arenas_collect(true /* force purge */, stats); // purge non-owned arenas } // Is a pointer inside any of our arenas? diff --git a/src/heap.c b/src/heap.c index a878f32b..f7f6c8e8 100644 --- a/src/heap.c +++ b/src/heap.c @@ -104,10 +104,6 @@ static bool mi_heap_page_collect(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t // still used blocks but the thread is done; abandon the page _mi_page_abandon(page, pq); } - if (collect == MI_FORCE) { - mi_segment_t* segment = _mi_page_segment(page); - _mi_segment_collect(segment, true /* force? 
*/, &heap->tld->segments); - } return true; // don't break } @@ -157,13 +153,16 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL); mi_assert_internal( collect != MI_ABANDON || mi_atomic_load_ptr_acquire(mi_block_t,&heap->thread_delayed_free) == NULL ); + // collect segments (purge pages, this can be expensive so don't force on abandonment) + _mi_segments_collect(collect == MI_FORCE, &heap->tld->segments); + // if forced, collect thread data cache on program-exit (or shared library unload) if (force && _mi_is_main_thread() && mi_heap_is_backing(heap)) { _mi_thread_data_collect(); // collect thread data cache } // collect arenas (this is program wide so don't force purges on abandonment of threads) - _mi_arena_collect(collect == MI_FORCE /* force purge? */, &heap->tld->stats); + _mi_arenas_collect(collect == MI_FORCE /* force purge? */, &heap->tld->stats); } void _mi_heap_collect_abandon(mi_heap_t* heap) { diff --git a/src/segment.c b/src/segment.c index 8ea76bfe..4bb4aaf5 100644 --- a/src/segment.c +++ b/src/segment.c @@ -362,14 +362,14 @@ static void mi_segment_remove_all_purges(mi_segment_t* segment, bool force_purge } } -static void mi_pages_try_purge(mi_segments_tld_t* tld) { +static void mi_pages_try_purge(bool force, mi_segments_tld_t* tld) { if (mi_option_get(mi_option_purge_delay) < 0) return; // purging is not allowed mi_msecs_t now = _mi_clock_now(); mi_page_queue_t* pq = &tld->pages_purge; // from oldest up to the first that has not expired yet mi_page_t* page = pq->last; - while (page != NULL && mi_page_purge_is_expired(page,now)) { + while (page != NULL && (force || mi_page_purge_is_expired(page,now))) { mi_page_t* const prev = page->prev; // save previous field mi_page_purge_remove(page, tld); // remove from the list to maintain invariant for mi_page_purge mi_page_purge(_mi_page_segment(page), page, tld); @@ -515,9 +515,9 @@ static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_se _mi_arena_free(segment, segment_size, committed_size, segment->memid, tld->stats); } -// called from `heap_collect`. This can be called per-page. -void _mi_segment_collect(mi_segment_t* segment, bool force, mi_segments_tld_t* tld) { - MI_UNUSED(segment); MI_UNUSED(force); MI_UNUSED(tld); +// called from `heap_collect`. +void _mi_segments_collect(bool force, mi_segments_tld_t* tld) { + mi_pages_try_purge(force,tld); } @@ -734,7 +734,7 @@ void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld) mi_assert(page != NULL); mi_segment_t* segment = _mi_page_segment(page); mi_assert_expensive(mi_segment_is_valid(segment,tld)); - mi_pages_try_purge(tld); + mi_pages_try_purge(false /*force?*/, tld); // mark it as free now mi_segment_page_clear(segment, page, tld); @@ -793,7 +793,7 @@ static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) { // Potentially force purge. Only abandoned segments in arena memory can be // reclaimed without a free so if a segment is not from an arena we force purge here to be conservative. 
- mi_pages_try_purge(tld); + mi_pages_try_purge(false /*force?*/,tld); const bool force_purge = (segment->memid.memkind != MI_MEM_ARENA) || mi_option_is_enabled(mi_option_abandoned_page_purge); mi_segment_remove_all_purges(segment, force_purge, tld); From 09e91ec911f97b71814d5a394b8d51ce25243a52 Mon Sep 17 00:00:00 2001 From: Daan Date: Fri, 19 Apr 2024 12:44:23 -0700 Subject: [PATCH 091/119] add assert for segment purges --- src/segment.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/segment.c b/src/segment.c index 4bb4aaf5..b3fc60ee 100644 --- a/src/segment.c +++ b/src/segment.c @@ -518,6 +518,12 @@ static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_se // called from `heap_collect`. void _mi_segments_collect(bool force, mi_segments_tld_t* tld) { mi_pages_try_purge(force,tld); + #if MI_DEBUG>=2 + if (!_mi_is_main_thread()) { + mi_assert_internal(tld->pages_purge.first == NULL); + mi_assert_internal(tld->pages_purge.last == NULL); + } + #endif } From 9830c0db4269b47350c71f2dc49653e28feae3f6 Mon Sep 17 00:00:00 2001 From: Daan Date: Fri, 19 Apr 2024 13:31:31 -0700 Subject: [PATCH 092/119] redefine mi_nothrow_t to be a pointer to (probably) fix issue #840 to match WASI signatures --- src/alloc-override.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/alloc-override.c b/src/alloc-override.c index 92536976..c44bb0d9 100644 --- a/src/alloc-override.c +++ b/src/alloc-override.c @@ -23,7 +23,7 @@ mi_decl_externc size_t malloc_good_size(size_t size); #endif // helper definition for C override of C++ new -typedef struct mi_nothrow_s { int _tag; } mi_nothrow_t; +typedef void* mi_nothrow_t; // ------------------------------------------------------ // Override system malloc From f7df734c6f8d08afa977ca2687058a3df533c64c Mon Sep 17 00:00:00 2001 From: Daan Date: Fri, 19 Apr 2024 13:44:29 -0700 Subject: [PATCH 093/119] add further C++ delete signatures to override from C (issue #863) --- src/alloc-override.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/alloc-override.c b/src/alloc-override.c index c44bb0d9..75afc202 100644 --- a/src/alloc-override.c +++ b/src/alloc-override.c @@ -199,11 +199,17 @@ typedef void* mi_nothrow_t; void _ZdaPv(void* p) MI_FORWARD0(mi_free,p) // delete[] void _ZdlPvm(void* p, size_t n) MI_FORWARD02(mi_free_size,p,n) void _ZdaPvm(void* p, size_t n) MI_FORWARD02(mi_free_size,p,n) + void _ZdlPvSt11align_val_t(void* p, size_t al) { mi_free_aligned(p,al); } void _ZdaPvSt11align_val_t(void* p, size_t al) { mi_free_aligned(p,al); } void _ZdlPvmSt11align_val_t(void* p, size_t n, size_t al) { mi_free_size_aligned(p,n,al); } void _ZdaPvmSt11align_val_t(void* p, size_t n, size_t al) { mi_free_size_aligned(p,n,al); } + void _ZdlPvRKSt9nothrow_t(void* p, mi_nothrow_t tag) { MI_UNUSED(tag); mi_free(p); } // operator delete(void*, std::nothrow_t const&) + void _ZdaPvRKSt9nothrow_t(void* p, mi_nothrow_t tag) { MI_UNUSED(tag); mi_free(p); } // operator delete[](void*, std::nothrow_t const&) + void _ZdlPvSt11align_val_tRKSt9nothrow_t(void* p, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); mi_free_aligned(p,al); } // operator delete(void*, std::align_val_t, std::nothrow_t const&) + void _ZdaPvSt11align_val_tRKSt9nothrow_t(void* p, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); mi_free_aligned(p,al); } // operator delete[](void*, std::align_val_t, std::nothrow_t const&) + #if (MI_INTPTR_SIZE==8) void* _Znwm(size_t n) MI_FORWARD1(mi_new,n) // new 64-bit void* _Znam(size_t n) MI_FORWARD1(mi_new,n) // new[] 64-bit From 
336f83fbd1ca67bb6edba3a1c99e9b0230159336 Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 20 Apr 2024 16:09:45 -0700 Subject: [PATCH 094/119] use __builtin_thread_pointer on arm64 with older gcc compilers (issue #851) --- include/mimalloc/prim.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h index d14b885b..ebb31df2 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -203,12 +203,23 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce #endif +// Do we have __builtin_thread_pointer? (do not make this a compound test as it fails on older gcc's, see issue #851) +#if defined(__has_builtin) +#if __has_builtin(__builtin_thread_pointer) +#define MI_HAS_BUILTIN_THREAD_POINTER 1 +#endif +#elif defined(__GNUC__) && (__GNUC__ >= 7) && defined(__aarch64__) // special case aarch64 for older gcc versions (issue #851) +#define MI_HAS_BUILTIN_THREAD_POINTER 1 +#endif + + // defined in `init.c`; do not use these directly extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from extern bool _mi_process_is_initialized; // has mi_process_init been called? static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept; +// Get a unique id for the current thread. #if defined(_WIN32) #define WIN32_LEAN_AND_MEAN @@ -218,7 +229,7 @@ static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { return (uintptr_t)NtCurrentTeb(); } -#elif defined(__has_builtin) && __has_builtin(__builtin_thread_pointer) && \ +#elif MI_HAS_BUILTIN_THREAD_POINTER && \ (!defined(__APPLE__)) && /* on apple (M1) the wrong register is read (tpidr_el0 instead of tpidrro_el0) so fall back to TLS slot assembly ()*/ \ (!defined(__clang_major__) || __clang_major__ >= 14) // older clang versions emit bad code; fall back to using the TLS slot () From e46c1145a510807e3b22b078ec70c1d38f4f9fbc Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 20 Apr 2024 16:19:59 -0700 Subject: [PATCH 095/119] add separate MI_LIBC_MUSL option (issue #644) --- CMakeLists.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c0f67af..a6c95dc4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,6 +19,7 @@ option(MI_OSX_INTERPOSE "Use interpose to override standard malloc on macOS" option(MI_OSX_ZONE "Use malloc zone to override standard malloc on macOS" ON) option(MI_WIN_REDIRECT "Use redirection module ('mimalloc-redirect') on Windows if compiling mimalloc as a DLL" ON) option(MI_LOCAL_DYNAMIC_TLS "Use slightly slower, dlopen-compatible TLS mechanism (Unix)" OFF) +option(MI_LIBC_MUSL "Set this when linking with musl libc" OFF) option(MI_BUILD_SHARED "Build shared library" ON) option(MI_BUILD_STATIC "Build static library" ON) option(MI_BUILD_OBJECT "Build object library" ON) @@ -286,6 +287,12 @@ if(CMAKE_SYSTEM_NAME MATCHES "Linux|Android") endif() endif() +if(MI_LIBC_MUSL) + message(STATUS "Assume using musl libc (MI_LIBC_MUSL=ON) (this implies MI_LOCAL_DYNAMIC_TLS=ON)") + set(MI_LOCAL_DYNAMIC_TLS "ON") + list(APPEND mi_defines MI_LIBC_MUSL=1) +endif() + # On Haiku use `-DCMAKE_INSTALL_PREFIX` instead, issue #788 # if(CMAKE_SYSTEM_NAME MATCHES "Haiku") # SET(CMAKE_INSTALL_LIBDIR ~/config/non-packaged/lib) From 79ab7c63d7e86a063f547b72549a05c73e03b841 Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 20 Apr 2024 16:37:09 -0700 Subject: [PATCH 096/119] disable transparent huge pages for a process too if the allow_large_os_pages option is set to false --- 
src/prim/unix/prim.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c index 9c4ecd4b..a7812cb6 100644 --- a/src/prim/unix/prim.c +++ b/src/prim/unix/prim.c @@ -148,13 +148,20 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) config->has_virtual_reserve = true; // todo: check if this true for NetBSD? (for anonymous mmap with PROT_NONE) // disable transparent huge pages for this process? - #if defined(MI_NO_THP) && (defined(__linux__) || defined(__ANDROID__)) - int val = 0; - if (prctl(PR_GET_THP_DISABLE, &val, 0, 0, 0) != 0) { - // Most likely since distros often come with always/madvise settings. - val = 1; - // Disabling only for mimalloc process rather than touching system wide settings - (void)prctl(PR_SET_THP_DISABLE, &val, 0, 0, 0); + #if (defined(__linux__) || defined(__ANDROID__)) && defined(PR_GET_THP_DISABLE) + #if defined(MI_NO_THP) + if (true) + #else + if (!mi_option_is_enabled(mi_option_allow_large_os_pages)) // disable THP also if large OS pages are not allowed in the options + #endif + { + int val = 0; + if (prctl(PR_GET_THP_DISABLE, &val, 0, 0, 0) != 0) { + // Most likely since distros often come with always/madvise settings. + val = 1; + // Disabling only for mimalloc process rather than touching system wide settings + (void)prctl(PR_SET_THP_DISABLE, &val, 0, 0, 0); + } } #endif } From 7247b9e3269af797d3fc05efd6da6f0d73c81ee8 Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 20 Apr 2024 16:45:49 -0700 Subject: [PATCH 097/119] allow configuring page and segment sizes (pr #753 and pr #862) --- include/mimalloc/types.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 5d219d68..761c7278 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -160,10 +160,18 @@ typedef int32_t mi_ssize_t; // Main tuning parameters for segment and page sizes // Sizes for 64-bit, divide by two for 32-bit +#ifndef MI_SMALL_PAGE_SHIFT #define MI_SMALL_PAGE_SHIFT (13 + MI_INTPTR_SHIFT) // 64KiB +#endif +#ifndef MI_MEDIUM_PAGE_SHIFT #define MI_MEDIUM_PAGE_SHIFT ( 3 + MI_SMALL_PAGE_SHIFT) // 512KiB +#endif +#ifndef MI_LARGE_PAGE_SHIFT #define MI_LARGE_PAGE_SHIFT ( 3 + MI_MEDIUM_PAGE_SHIFT) // 4MiB +#endif +#ifndef MI_SEGMENT_SHIFT #define MI_SEGMENT_SHIFT ( MI_LARGE_PAGE_SHIFT) // 4MiB -- must be equal to `MI_LARGE_PAGE_SHIFT` +#endif // Derived constants #define MI_SEGMENT_SIZE (MI_ZU(1)< Date: Sat, 20 Apr 2024 16:58:58 -0700 Subject: [PATCH 098/119] only define WIN32_LEAN_AND_MEAN if needed --- include/mimalloc/atomic.h | 4 ++++ include/mimalloc/prim.h | 2 ++ include/mimalloc/track.h | 2 ++ 3 files changed, 8 insertions(+) diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index f4bde7f4..807c4da8 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -133,7 +133,9 @@ static inline void mi_atomic_maxi64_relaxed(volatile int64_t* p, int64_t x) { #elif defined(_MSC_VER) // MSVC C compilation wrapper that uses Interlocked operations to model C11 atomics. 
+#ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN +#endif #include #include #ifdef _WIN64 @@ -327,7 +329,9 @@ static inline void mi_atomic_yield(void) { std::this_thread::yield(); } #elif defined(_WIN32) +#ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN +#endif #include static inline void mi_atomic_yield(void) { YieldProcessor(); diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h index ebb31df2..f8a40323 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -222,7 +222,9 @@ static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept; // Get a unique id for the current thread. #if defined(_WIN32) +#ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN +#endif #include static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { // Windows: works on Intel and ARM in both 32- and 64-bit diff --git a/include/mimalloc/track.h b/include/mimalloc/track.h index 9545f750..a659d940 100644 --- a/include/mimalloc/track.h +++ b/include/mimalloc/track.h @@ -82,7 +82,9 @@ defined, undefined, or not accessible at all: #define MI_TRACK_HEAP_DESTROY 1 #define MI_TRACK_TOOL "ETW" +#ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN +#endif #include #include "../src/prim/windows/etw.h" From 88aa84727d42dd3d237b6a2fc5b3f08afb304e75 Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 20 Apr 2024 17:12:09 -0700 Subject: [PATCH 099/119] fix spelling errors (pr #710) --- doc/doxyfile | 2 +- doc/mimalloc-doc.h | 10 +++++----- docs/bench.html | 2 +- readme.md | 4 ++-- src/arena.c | 4 ++-- src/bitmap.h | 2 +- src/free.c | 2 +- src/heap.c | 2 +- src/page.c | 2 +- 9 files changed, 15 insertions(+), 15 deletions(-) diff --git a/doc/doxyfile b/doc/doxyfile index 55cae8bf..d03a70f5 100644 --- a/doc/doxyfile +++ b/doc/doxyfile @@ -466,7 +466,7 @@ LOOKUP_CACHE_SIZE = 0 # than 0 to get more control over the balance between CPU load and processing # speed. At this moment only the input processing can be done using multiple # threads. Since this is still an experimental feature the default is set to 1, -# which efficively disables parallel processing. Please report any issues you +# which effectively disables parallel processing. Please report any issues you # encounter. Generating dot graphs in parallel is controlled by the # DOT_NUM_THREADS setting. # Minimum value: 0, maximum value: 32, default value: 1. diff --git a/doc/mimalloc-doc.h b/doc/mimalloc-doc.h index 47a8a6b9..f7f358a7 100644 --- a/doc/mimalloc-doc.h +++ b/doc/mimalloc-doc.h @@ -441,7 +441,7 @@ bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_la /// @param pages The number of 1GiB pages to reserve. /// @param numa_nodes The number of nodes do evenly divide the pages over, or 0 for using the actual number of NUMA nodes. /// @param timeout_msecs Maximum number of milli-seconds to try reserving, or 0 for no timeout. -/// @returns 0 if successfull, \a ENOMEM if running out of memory, or \a ETIMEDOUT if timed out. +/// @returns 0 if successful, \a ENOMEM if running out of memory, or \a ETIMEDOUT if timed out. /// /// The reserved memory is used by mimalloc to satisfy allocations. /// May quit before \a timeout_msecs are expired if it estimates it will take more than @@ -455,7 +455,7 @@ int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t /// @param pages The number of 1GiB pages to reserve. /// @param numa_node The NUMA node where the memory is reserved (start at 0). 
/// @param timeout_msecs Maximum number of milli-seconds to try reserving, or 0 for no timeout. -/// @returns 0 if successfull, \a ENOMEM if running out of memory, or \a ETIMEDOUT if timed out. +/// @returns 0 if successful, \a ENOMEM if running out of memory, or \a ETIMEDOUT if timed out. /// /// The reserved memory is used by mimalloc to satisfy allocations. /// May quit before \a timeout_msecs are expired if it estimates it will take more than @@ -468,7 +468,7 @@ int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msec /// Is the C runtime \a malloc API redirected? /// @returns \a true if all malloc API calls are redirected to mimalloc. /// -/// Currenty only used on Windows. +/// Currently only used on Windows. bool mi_is_redirected(); /// Return process information (time and memory usage). @@ -558,7 +558,7 @@ mi_heap_t* mi_heap_new(); /// Delete a previously allocated heap. /// This will release resources and migrate any -/// still allocated blocks in this heap (efficienty) +/// still allocated blocks in this heap (efficiently) /// to the default heap. /// /// If \a heap is the default heap, the default @@ -888,7 +888,7 @@ void mi_free_aligned(void* p, size_t alignment); /// /// Note: use the `mimalloc-new-delete.h` header to override the \a new /// and \a delete operators globally. The wrappers here are mostly -/// for convience for library writers that need to interface with +/// for convenience for library writers that need to interface with /// mimalloc from C++. /// /// \{ diff --git a/docs/bench.html b/docs/bench.html index d54f5fd6..213ff24b 100644 --- a/docs/bench.html +++ b/docs/bench.html @@ -100,7 +100,7 @@ $(document).ready(function(){initNavTree('bench.html',''); initResizable(); });

We tested mimalloc against many other top allocators over a wide range of benchmarks, ranging from various real world programs to synthetic benchmarks that see how the allocator behaves under more extreme circumstances.

In our benchmarks, mimalloc always outperforms all other leading allocators (jemalloc, tcmalloc, Hoard, etc) (Jan 2021), and usually uses less memory (up to 25% more in the worst case). A nice property is that it does consistently well over the wide range of benchmarks.

-See the Performance section in the mimalloc repository for benchmark results, or the the technical report for detailed benchmark results.
+See the Performance section in the mimalloc repository for benchmark results, or the technical report for detailed benchmark results.

diff --git a/readme.md b/readme.md index 2772bcb7..4dea1086 100644 --- a/readme.md +++ b/readme.md @@ -91,7 +91,7 @@ Note: the `v2.x` version has a new algorithm for managing internal mimalloc page abstraction layer to make it easier to port and separate platform dependent code (in `src/prim`). Fixed C++ STL compilation on older Microsoft C++ compilers, and various small bug fixes. * 2022-12-23, `v1.7.9`, `v2.0.9`: Supports building with [asan](#asan) and improved [Valgrind](#valgrind) support. - Support abitrary large alignments (in particular for `std::pmr` pools). + Support arbitrary large alignments (in particular for `std::pmr` pools). Added C++ STL allocators attached to a specific heap (thanks @vmarkovtsev). Heap walks now visit all object (including huge objects). Support Windows nano server containers (by Johannes Schindelin,@dscho). Various small bug fixes. @@ -224,7 +224,7 @@ target_link_libraries(myapp PUBLIC mimalloc-static) to link with the static library. See `test\CMakeLists.txt` for an example. For best performance in C++ programs, it is also recommended to override the -global `new` and `delete` operators. For convience, mimalloc provides +global `new` and `delete` operators. For convenience, mimalloc provides [`mimalloc-new-delete.h`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc-new-delete.h) which does this for you -- just include it in a single(!) source file in your project. In C++, mimalloc also provides the `mi_stl_allocator` struct which implements the `std::allocator` interface. diff --git a/src/arena.c b/src/arena.c index 6e5e21b7..fccab871 100644 --- a/src/arena.c +++ b/src/arena.c @@ -630,12 +630,12 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi // checks if (arena == NULL) { - _mi_error_message(EINVAL, "trying to free from non-existent arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); + _mi_error_message(EINVAL, "trying to free from an invalid arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); return; } mi_assert_internal(arena->field_count > mi_bitmap_index_field(bitmap_idx)); if (arena->field_count <= mi_bitmap_index_field(bitmap_idx)) { - _mi_error_message(EINVAL, "trying to free from non-existent arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); + _mi_error_message(EINVAL, "trying to free from an invalid arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); return; } diff --git a/src/bitmap.h b/src/bitmap.h index 156c4386..a1e7686a 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -7,7 +7,7 @@ terms of the MIT license. A copy of the license can be found in the file /* ---------------------------------------------------------------------------- Concurrent bitmap that can set/reset sequences of bits atomically, -represeted as an array of fields where each field is a machine word (`size_t`) +represented as an array of fields where each field is a machine word (`size_t`) There are two api's; the standard one cannot have sequences that cross between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS). diff --git a/src/free.c b/src/free.c index 43e1f76f..c065d2f3 100644 --- a/src/free.c +++ b/src/free.c @@ -372,7 +372,7 @@ static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block if (((uintptr_t)n & (MI_INTPTR_SIZE-1))==0 && // quick check: aligned pointer? (n==NULL || mi_is_in_same_page(block, n))) // quick check: in same page or NULL? { - // Suspicous: decoded value a in block is in the same page (or NULL) -- maybe a double free? 
+ // Suspicious: decoded value a in block is in the same page (or NULL) -- maybe a double free? // (continue in separate function to improve code generation) is_double_free = mi_check_is_double_freex(page, block); } diff --git a/src/heap.c b/src/heap.c index f7f6c8e8..772a6051 100644 --- a/src/heap.c +++ b/src/heap.c @@ -415,7 +415,7 @@ void mi_heap_delete(mi_heap_t* heap) if (heap==NULL || !mi_heap_is_initialized(heap)) return; if (!mi_heap_is_backing(heap)) { - // tranfer still used pages to the backing heap + // transfer still used pages to the backing heap mi_heap_absorb(heap->tld->heap_backing, heap); } else { diff --git a/src/page.c b/src/page.c index 3b3ba2f9..7e188522 100644 --- a/src/page.c +++ b/src/page.c @@ -460,7 +460,7 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept { if (index < heap->page_retired_min) heap->page_retired_min = index; if (index > heap->page_retired_max) heap->page_retired_max = index; mi_assert_internal(mi_page_all_free(page)); - return; // dont't free after all + return; // don't free after all } } From ce783df58f2546e52d89df3411bb9980f7716c9e Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 20 Apr 2024 17:18:09 -0700 Subject: [PATCH 100/119] fix spelling errors (pr #710) --- src/bitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bitmap.c b/src/bitmap.c index 017295e7..976ba72c 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -7,7 +7,7 @@ terms of the MIT license. A copy of the license can be found in the file /* ---------------------------------------------------------------------------- Concurrent bitmap that can set/reset sequences of bits atomically, -represeted as an array of fields where each field is a machine word (`size_t`) +represented as an array of fields where each field is a machine word (`size_t`) There are two api's; the standard one cannot have sequences that cross between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS). From a527f751619303c89c8967f64ed75015839d0a56 Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 22 Apr 2024 10:10:20 -0700 Subject: [PATCH 101/119] add disallow_arena_alloc option --- include/mimalloc.h | 52 ++++++++++++++++++++++++---------------------- src/arena.c | 30 +++++++++++++------------- src/options.c | 7 ++++--- 3 files changed, 47 insertions(+), 42 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index b3f60a34..9848d531 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -317,41 +317,43 @@ mi_decl_export int mi_reserve_huge_os_pages(size_t pages, double max_secs, size typedef enum mi_option_e { // stable options - mi_option_show_errors, // print error messages - mi_option_show_stats, // print statistics on termination - mi_option_verbose, // print verbose messages - // the following options are experimental (see src/options.h) - mi_option_eager_commit, // eager commit segments? (after `eager_commit_delay` segments) (=1) - mi_option_arena_eager_commit, // eager commit arenas? 
Use 2 to enable just on overcommit systems (=2) - mi_option_purge_decommits, // should a memory purge decommit (or only reset) (=1) - mi_option_allow_large_os_pages, // allow large (2MiB) OS pages, implies eager commit - mi_option_reserve_huge_os_pages, // reserve N huge OS pages (1GiB/page) at startup - mi_option_reserve_huge_os_pages_at, // reserve huge OS pages at a specific NUMA node - mi_option_reserve_os_memory, // reserve specified amount of OS memory in an arena at startup + mi_option_show_errors, // print error messages + mi_option_show_stats, // print statistics on termination + mi_option_verbose, // print verbose messages + // advanced options + mi_option_eager_commit, // eager commit segments? (after `eager_commit_delay` segments) (=1) + mi_option_arena_eager_commit, // eager commit arenas? Use 2 to enable just on overcommit systems (=2) + mi_option_purge_decommits, // should a memory purge decommit? (=1). Set to 0 to use memory reset on a purge (instead of decommit) + mi_option_allow_large_os_pages, // allow large (2 or 4 MiB) OS pages, implies eager commit. If false, also disables THP for the process. + mi_option_reserve_huge_os_pages, // reserve N huge OS pages (1GiB pages) at startup + mi_option_reserve_huge_os_pages_at, // reserve huge OS pages at a specific NUMA node + mi_option_reserve_os_memory, // reserve specified amount of OS memory in an arena at startup mi_option_deprecated_segment_cache, mi_option_deprecated_page_reset, - mi_option_abandoned_page_purge, // immediately purge delayed purges on thread termination + mi_option_abandoned_page_purge, // immediately purge delayed purges on thread termination mi_option_deprecated_segment_reset, - mi_option_eager_commit_delay, - mi_option_purge_delay, // memory purging is delayed by N milli seconds; use 0 for immediate purging or -1 for no purging at all. - mi_option_use_numa_nodes, // 0 = use all available numa nodes, otherwise use at most N nodes. - mi_option_limit_os_alloc, // 1 = do not use OS memory for allocation (but only programmatically reserved arenas) - mi_option_os_tag, // tag used for OS logging (macOS only for now) - mi_option_max_errors, // issue at most N error messages - mi_option_max_warnings, // issue at most N warning messages - mi_option_max_segment_reclaim, - mi_option_destroy_on_exit, // if set, release all memory on exit; sometimes used for dynamic unloading but can be unsafe. - mi_option_arena_reserve, // initial memory size in KiB for arena reservation (1GiB on 64-bit) - mi_option_arena_purge_mult, + mi_option_eager_commit_delay, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) + mi_option_purge_delay, // memory purging is delayed by N milli seconds; use 0 for immediate purging or -1 for no purging at all. (=10) + mi_option_use_numa_nodes, // 0 = use all available numa nodes, otherwise use at most N nodes. + mi_option_disallow_os_alloc, // 1 = do not use OS memory for allocation (but only programmatically reserved arenas) + mi_option_os_tag, // tag used for OS logging (macOS only for now) (=100) + mi_option_max_errors, // issue at most N error messages + mi_option_max_warnings, // issue at most N warning messages + mi_option_max_segment_reclaim, // max. 
percentage of the abandoned segments can be reclaimed per try (=10%) + mi_option_destroy_on_exit, // if set, release all memory on exit; sometimes used for dynamic unloading but can be unsafe + mi_option_arena_reserve, // initial memory size in KiB for arena reservation (= 1 GiB on 64-bit) + mi_option_arena_purge_mult, // multiplier for `purge_delay` for the purging delay for arenas (=10) mi_option_purge_extend_delay, - mi_option_abandoned_reclaim_on_free, // reclaim abandoned segments on a free + mi_option_abandoned_reclaim_on_free, // allow to reclaim an abandoned segment on a free (=1) + mi_option_disallow_arena_alloc, // 1 = do not use arena's for allocation (except if using specific arena id's) _mi_option_last, // legacy option names mi_option_large_os_pages = mi_option_allow_large_os_pages, mi_option_eager_region_commit = mi_option_arena_eager_commit, mi_option_reset_decommits = mi_option_purge_decommits, mi_option_reset_delay = mi_option_purge_delay, - mi_option_abandoned_page_reset = mi_option_abandoned_page_purge + mi_option_abandoned_page_reset = mi_option_abandoned_page_purge, + mi_option_limit_os_alloc = mi_option_disallow_os_alloc } mi_option_t; diff --git a/src/arena.c b/src/arena.c index fccab871..511fe2fd 100644 --- a/src/arena.c +++ b/src/arena.c @@ -353,7 +353,7 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re if (arena_reserve == 0) return false; if (!_mi_os_has_virtual_reserve()) { - arena_reserve = arena_reserve/4; // be conservative if virtual reserve is not supported (for some embedded systems for example) + arena_reserve = arena_reserve/4; // be conservative if virtual reserve is not supported (for WASM for example) } arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_BLOCK_SIZE); if (arena_count >= 8 && arena_count <= 128) { @@ -366,7 +366,7 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = _mi_os_has_overcommit(); } else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; } - return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive */, arena_id) == 0); + return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id) == 0); } @@ -380,24 +380,26 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset const int numa_node = _mi_os_numa_node(tld); // current numa node // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) - if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN && align_offset == 0) { - void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); - if (p != NULL) return p; + if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) || req_arena_id != _mi_arena_id_none()) { // is arena allocation allowed? 
+ if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN && align_offset == 0) { + void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); + if (p != NULL) return p; - // otherwise, try to first eagerly reserve a new arena - if (req_arena_id == _mi_arena_id_none()) { - mi_arena_id_t arena_id = 0; - if (mi_arena_reserve(size, allow_large, req_arena_id, &arena_id)) { - // and try allocate in there - mi_assert_internal(req_arena_id == _mi_arena_id_none()); - p = mi_arena_try_alloc_at_id(arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); - if (p != NULL) return p; + // otherwise, try to first eagerly reserve a new arena + if (req_arena_id == _mi_arena_id_none()) { + mi_arena_id_t arena_id = 0; + if (mi_arena_reserve(size, allow_large, req_arena_id, &arena_id)) { + // and try allocate in there + mi_assert_internal(req_arena_id == _mi_arena_id_none()); + p = mi_arena_try_alloc_at_id(arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); + if (p != NULL) return p; + } } } } // if we cannot use OS allocation, return NULL - if (mi_option_is_enabled(mi_option_limit_os_alloc) || req_arena_id != _mi_arena_id_none()) { + if (mi_option_is_enabled(mi_option_disallow_os_alloc) || req_arena_id != _mi_arena_id_none()) { errno = ENOMEM; return NULL; } diff --git a/src/options.c b/src/options.c index f8e928d0..78e9377c 100644 --- a/src/options.c +++ b/src/options.c @@ -65,7 +65,7 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION_LEGACY(allow_large_os_pages,large_os_pages) }, // use large OS pages, use only with eager commit to prevent fragmentation of VMA's { 0, UNINIT, MI_OPTION(reserve_huge_os_pages) }, // per 1GiB huge pages {-1, UNINIT, MI_OPTION(reserve_huge_os_pages_at) }, // reserve huge pages at node N - { 0, UNINIT, MI_OPTION(reserve_os_memory) }, + { 0, UNINIT, MI_OPTION(reserve_os_memory) }, // reserve OS memory in advance { 0, UNINIT, MI_OPTION(deprecated_segment_cache) }, // cache N segments per thread { 0, UNINIT, MI_OPTION(deprecated_page_reset) }, // reset page memory on free { 0, UNINIT, MI_OPTION(abandoned_page_purge) }, // purge free page memory when a thread terminates @@ -77,7 +77,7 @@ static mi_option_desc_t options[_mi_option_last] = #endif { 10, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. 
- { 0, UNINIT, MI_OPTION(limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) + { 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose { 16, UNINIT, MI_OPTION(max_errors) }, // maximum errors that are output { 16, UNINIT, MI_OPTION(max_warnings) }, // maximum warnings that are output @@ -91,7 +91,8 @@ static mi_option_desc_t options[_mi_option_last] = { 10, UNINIT, MI_OPTION(arena_purge_mult) }, // purge delay multiplier for arena's { 1, UNINIT, MI_OPTION_LEGACY(purge_extend_delay, decommit_extend_delay) }, - { 1, UNINIT, MI_OPTION(abandoned_reclaim_on_free) }, // reclaim an abandoned segment on a free + { 1, UNINIT, MI_OPTION(abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free + { 0, UNINIT, MI_OPTION(disallow_arena_alloc) }, // 1 = do not use arena's for allocation (except if using specific arena id's) }; static void mi_option_init(mi_option_desc_t* desc); From c469e3d519e2b5da8876d22d644240f67b47933f Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 22 Apr 2024 10:32:53 -0700 Subject: [PATCH 102/119] add release notes --- readme.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/readme.md b/readme.md index 4dea1086..00e1c389 100644 --- a/readme.md +++ b/readme.md @@ -80,6 +80,13 @@ Note: the `v2.x` version has a new algorithm for managing internal mimalloc page and fragmentation compared to mimalloc `v1.x` (especially for large workloads). Should otherwise have similar performance (see [below](#performance)); please report if you observe any significant performance regression. +* 2024-04-22, `v1.8.4`, `v2.1.4`: Fixes various bugs and build issues. Improved performance on aligned allocation. + Free-ing code is refactored into a separate module (`free.c`). New approach to collection of abandoned segments: When + a thread terminates the segments it owns are abandoned (containing still live objects) and these can be + reclaimed by other threads. We no longer use a list of abandoned segments but this is now done using bitmaps in arena's + which is more concurrent (and more aggressive). Abandoned memory can now also be reclaimed if a thread frees an object in + an abandoned page (which can be disabled using `mi_option_abandoned_reclaim_on_free`). + * 2023-04-24, `v1.8.2`, `v2.1.2`: Fixes build issues on freeBSD, musl, and C17 (UE 5.1.1). Reduce code size/complexity by removing regions and segment-cache's and only use arenas with improved memory purging -- this may improve memory usage as well for larger services. Renamed options for consistency. Improved Valgrind and ASAN checking. @@ -298,8 +305,9 @@ Further options for large workloads and services: at runtime. Setting `N` to 1 may avoid problems in some virtual environments. Also, setting it to a lower number than the actual NUMA nodes is fine and will only cause threads to potentially allocate more memory across actual NUMA nodes (but this can happen in any case as NUMA local allocation is always a best effort but not guaranteed). -- `MIMALLOC_ALLOW_LARGE_OS_PAGES=1`: use large OS pages (2MiB) when available; for some workloads this can significantly - improve performance. 
Use `MIMALLOC_VERBOSE` to check if the large OS pages are enabled -- usually one needs +- `MIMALLOC_ALLOW_LARGE_OS_PAGES=1`: use large OS pages (2 or 4MiB) when available; for some workloads this can significantly + improve performance. When this option is disabled it also disables transparent huge pages (THP) for the process + (on Linux and Android). Use `MIMALLOC_VERBOSE` to check if the large OS pages are enabled -- usually one needs to explicitly allow large OS pages (as on [Windows][windows-huge] and [Linux][linux-huge]). However, sometimes the OS is very slow to reserve contiguous physical memory for large OS pages so use with care on systems that can have fragmented memory (for that reason, we generally recommend to use `MIMALLOC_RESERVE_HUGE_OS_PAGES` instead whenever possible). From 96819a3f1d8262b900dbbce5e4d2b631a3ede7e6 Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 22 Apr 2024 11:00:42 -0700 Subject: [PATCH 103/119] update readme --- readme.md | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/readme.md b/readme.md index 00e1c389..b92c7d33 100644 --- a/readme.md +++ b/readme.md @@ -80,12 +80,15 @@ Note: the `v2.x` version has a new algorithm for managing internal mimalloc page and fragmentation compared to mimalloc `v1.x` (especially for large workloads). Should otherwise have similar performance (see [below](#performance)); please report if you observe any significant performance regression. -* 2024-04-22, `v1.8.4`, `v2.1.4`: Fixes various bugs and build issues. Improved performance on aligned allocation. - Free-ing code is refactored into a separate module (`free.c`). New approach to collection of abandoned segments: When +* 2024-04-22, `v1.8.4`, `v2.1.4`: Fixes various bugs and build issues. Add `MI_LIBC_MUSL` cmake flag for musl builds. + Free-ing code is refactored into a separate module (`free.c`). Mimalloc page info is simplified with the block size + directly available (and new `block_size_shift` to improve aligned block free-ing). + New approach to collection of abandoned segments: When a thread terminates the segments it owns are abandoned (containing still live objects) and these can be reclaimed by other threads. We no longer use a list of abandoned segments but this is now done using bitmaps in arena's which is more concurrent (and more aggressive). Abandoned memory can now also be reclaimed if a thread frees an object in - an abandoned page (which can be disabled using `mi_option_abandoned_reclaim_on_free`). + an abandoned page (which can be disabled using `mi_option_abandoned_reclaim_on_free`). The option `mi_option_max_segment_reclaim` + gives a maximum percentage of abandoned segments that can be reclaimed per try (=10%). * 2023-04-24, `v1.8.2`, `v2.1.2`: Fixes build issues on freeBSD, musl, and C17 (UE 5.1.1). Reduce code size/complexity by removing regions and segment-cache's and only use arenas with improved memory purging -- this may improve memory @@ -151,7 +154,7 @@ mimalloc is used in various large scale low-latency services and programs, for e ## Windows -Open `ide/vs2019/mimalloc.sln` in Visual Studio 2019 and build. +Open `ide/vs2022/mimalloc.sln` in Visual Studio 2022 and build. The `mimalloc` project builds a static library (in `out/msvc-x64`), while the `mimalloc-override` project builds a DLL for overriding malloc in the entire program. 
@@ -287,17 +290,23 @@ You can set further options either programmatically (using [`mi_option_set`](htt Advanced options: +- `MIMALLOC_ARENA_EAGER_COMMIT=2`: turns on eager commit for the large arenas (usually 1GiB) from which mimalloc + allocates segments and pages. Set this to 2 (default) to + only enable this on overcommit systems (e.g. Linux). Set this to 1 to enable explicitly on other systems + as well (like Windows or macOS) which may improve performance (as the whole arena is committed at once). + Note that eager commit only increases the commit but not the actual the peak resident set + (rss) so it is generally ok to enable this. - `MIMALLOC_PURGE_DELAY=N`: the delay in `N` milli-seconds (by default `10`) after which mimalloc will purge OS pages that are not in use. This signals to the OS that the underlying physical memory can be reused which can reduce memory fragmentation especially in long running (server) programs. Setting `N` to `0` purges immediately when a page becomes unused which can improve memory usage but also decreases performance. Setting `N` to a higher value like `100` can improve performance (sometimes by a lot) at the cost of potentially using more memory at times. - Setting it to `-1` disables purging completely. -- `MIMALLOC_ARENA_EAGER_COMMIT=1`: turns on eager commit for the large arenas (usually 1GiB) from which mimalloc - allocates segments and pages. This is by default - only enabled on overcommit systems (e.g. Linux) but enabling it explicitly on other systems (like Windows or macOS) - may improve performance. Note that eager commit only increases the commit but not the actual the peak resident set - (rss) so it is generally ok to enable this. + Setting it to `-1` disables purging completely. +- `MIMALLOC_PURGE_DECOMMITS=1`: By default "purging" memory means unused memory is decommitted (`MEM_DECOMMIT` on Windows, + `MADV_DONTNEED` (which decresease rss immediately) on `mmap` systems). Set this to 0 to instead "reset" unused + memory on a purge (`MEM_RESET` on Windows, generally `MADV_FREE` (which does not decrease rss immediately) on `mmap` systems). + Mimalloc generally does not "free" OS memory but only "purges" OS memory, in other words, it tries to keep virtual + address ranges and decommits within those ranges (to make the underlying physical memory available to other processes). Further options for large workloads and services: @@ -306,9 +315,9 @@ Further options for large workloads and services: the actual NUMA nodes is fine and will only cause threads to potentially allocate more memory across actual NUMA nodes (but this can happen in any case as NUMA local allocation is always a best effort but not guaranteed). - `MIMALLOC_ALLOW_LARGE_OS_PAGES=1`: use large OS pages (2 or 4MiB) when available; for some workloads this can significantly - improve performance. When this option is disabled it also disables transparent huge pages (THP) for the process + improve performance. When this option is disabled, it also disables transparent huge pages (THP) for the process (on Linux and Android). Use `MIMALLOC_VERBOSE` to check if the large OS pages are enabled -- usually one needs - to explicitly allow large OS pages (as on [Windows][windows-huge] and [Linux][linux-huge]). However, sometimes + to explicitly give permissions for large OS pages (as on [Windows][windows-huge] and [Linux][linux-huge]). 
However, sometimes the OS is very slow to reserve contiguous physical memory for large OS pages so use with care on systems that can have fragmented memory (for that reason, we generally recommend to use `MIMALLOC_RESERVE_HUGE_OS_PAGES` instead whenever possible). - `MIMALLOC_RESERVE_HUGE_OS_PAGES=N`: where `N` is the number of 1GiB _huge_ OS pages. This reserves the huge pages at @@ -317,11 +326,12 @@ Further options for large workloads and services: OS pages, use with care as reserving contiguous physical memory can take a long time when memory is fragmented (but reserving the huge pages is done at startup only once). - Note that we usually need to explicitly enable huge OS pages (as on [Windows][windows-huge] and [Linux][linux-huge])). + Note that we usually need to explicitly give permission for huge OS pages (as on [Windows][windows-huge] and [Linux][linux-huge])). With huge OS pages, it may be beneficial to set the setting `MIMALLOC_EAGER_COMMIT_DELAY=N` (`N` is 1 by default) to delay the initial `N` segments (of 4MiB) of a thread to not allocate in the huge OS pages; this prevents threads that are short lived - and allocate just a little to take up space in the huge OS page area (which cannot be purged). + and allocate just a little to take up space in the huge OS page area (which cannot be purged as huge OS pages are pinned + to physical memory). The huge pages are usually allocated evenly among NUMA nodes. We can use `MIMALLOC_RESERVE_HUGE_OS_PAGES_AT=N` where `N` is the numa node (starting at 0) to allocate all the huge pages at a specific numa node instead. From 3324e8c1e7c1e2c48d0f4a7ff40b5366175f5363 Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 22 Apr 2024 11:07:02 -0700 Subject: [PATCH 104/119] update readme --- readme.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/readme.md b/readme.md index b92c7d33..5ca499d6 100644 --- a/readme.md +++ b/readme.md @@ -12,8 +12,8 @@ is a general purpose allocator with excellent [performance](#performance) charac Initially developed by Daan Leijen for the runtime systems of the [Koka](https://koka-lang.github.io) and [Lean](https://github.com/leanprover/lean) languages. -Latest release tag: `v2.1.2` (2023-04-24). -Latest stable tag: `v1.8.2` (2023-04-24). +Latest release tag: `v2.1.4` (2024-04-22). +Latest stable tag: `v1.8.4` (2024-04-22). mimalloc is a drop-in replacement for `malloc` and can be used in other programs without code changes, for example, on dynamically linked ELF-based systems (Linux, BSD, etc.) you can use it as: From 6b6607ff5906be4afdc768c8c0676b595ec1afe2 Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 22 Apr 2024 11:16:27 -0700 Subject: [PATCH 105/119] update readme --- readme.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/readme.md b/readme.md index 5ca499d6..05268a87 100644 --- a/readme.md +++ b/readme.md @@ -12,8 +12,8 @@ is a general purpose allocator with excellent [performance](#performance) charac Initially developed by Daan Leijen for the runtime systems of the [Koka](https://koka-lang.github.io) and [Lean](https://github.com/leanprover/lean) languages. -Latest release tag: `v2.1.4` (2024-04-22). -Latest stable tag: `v1.8.4` (2024-04-22). +Latest release tag: `v2.1.4` (2024-04-22). +Latest v1 tag: `v1.8.4` (2024-04-22). mimalloc is a drop-in replacement for `malloc` and can be used in other programs without code changes, for example, on dynamically linked ELF-based systems (Linux, BSD, etc.) you can use it as: @@ -72,11 +72,13 @@ Enjoy! 
* `master`: latest stable release (based on `dev-slice`). * `dev`: development branch for mimalloc v1. Use this branch for submitting PR's. -* `dev-slice`: development branch for mimalloc v2. This branch is downstream of `dev`. +* `dev-slice`: development branch for mimalloc v2. This branch is downstream of `dev` (and is essentially equal to `dev` except for +`src/segment.c`) ### Releases -Note: the `v2.x` version has a new algorithm for managing internal mimalloc pages that tends to use reduce memory usage +Note: the `v2.x` version has a different algorithm for managing internal mimalloc pages (as slices) that tends to use reduce +memory usage and fragmentation compared to mimalloc `v1.x` (especially for large workloads). Should otherwise have similar performance (see [below](#performance)); please report if you observe any significant performance regression. From 77eb3a366b0355a0a90e2a0a8b6ac03757b4a1f6 Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 22 Apr 2024 11:25:26 -0700 Subject: [PATCH 106/119] bump version to 1.8.5 for further development --- cmake/mimalloc-config-version.cmake | 2 +- include/mimalloc.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/mimalloc-config-version.cmake b/cmake/mimalloc-config-version.cmake index e9b7d113..af85d979 100644 --- a/cmake/mimalloc-config-version.cmake +++ b/cmake/mimalloc-config-version.cmake @@ -1,6 +1,6 @@ set(mi_version_major 1) set(mi_version_minor 8) -set(mi_version_patch 4) +set(mi_version_patch 5) set(mi_version ${mi_version_major}.${mi_version_minor}) set(PACKAGE_VERSION ${mi_version}) diff --git a/include/mimalloc.h b/include/mimalloc.h index 9848d531..7523c8a2 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file #ifndef MIMALLOC_H #define MIMALLOC_H -#define MI_MALLOC_VERSION 184 // major + 2 digits minor +#define MI_MALLOC_VERSION 185 // major + 2 digits minor // ------------------------------------------------------ // Compiler specific attributes From 146f9d2333bc0aeed1593dc659b17e65b38a33e7 Mon Sep 17 00:00:00 2001 From: Daan Date: Wed, 24 Apr 2024 19:48:04 -0700 Subject: [PATCH 107/119] make TLS slot default instead of __builtin_thread_pointer. Potentially fixes build errors on various platforms (see issue #883) --- include/mimalloc/prim.h | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h index f8a40323..6063d44a 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -203,11 +203,14 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce #endif -// Do we have __builtin_thread_pointer? (do not make this a compound test as it fails on older gcc's, see issue #851) -#if defined(__has_builtin) -#if __has_builtin(__builtin_thread_pointer) -#define MI_HAS_BUILTIN_THREAD_POINTER 1 -#endif +// Do we have __builtin_thread_pointer? 
(do not make this a compound test as that fails on older gcc's, see issue #851) +#ifdef __has_builtin + #if __has_builtin(__builtin_thread_pointer) + #if (!defined(__APPLE__)) && /* on apple (M1) the wrong register is read (tpidr_el0 instead of tpidrro_el0) so fall back to TLS slot assembly ()*/ \ + (!defined(__clang_major__) || __clang_major__ >= 14) // older clang versions emit bad code; fall back to using the TLS slot () + #define MI_HAS_BUILTIN_THREAD_POINTER 1 + #endif + #endif #elif defined(__GNUC__) && (__GNUC__ >= 7) && defined(__aarch64__) // special case aarch64 for older gcc versions (issue #851) #define MI_HAS_BUILTIN_THREAD_POINTER 1 #endif @@ -231,15 +234,6 @@ static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { return (uintptr_t)NtCurrentTeb(); } -#elif MI_HAS_BUILTIN_THREAD_POINTER && \ - (!defined(__APPLE__)) && /* on apple (M1) the wrong register is read (tpidr_el0 instead of tpidrro_el0) so fall back to TLS slot assembly ()*/ \ - (!defined(__clang_major__) || __clang_major__ >= 14) // older clang versions emit bad code; fall back to using the TLS slot () - -static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { - // Works on most Unix based platforms - return (uintptr_t)__builtin_thread_pointer(); -} - #elif defined(MI_HAS_TLS_SLOT) static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { @@ -255,6 +249,13 @@ static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { #endif } +#elif MI_HAS_BUILTIN_THREAD_POINTER + +static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { + // Works on most Unix based platforms + return (uintptr_t)__builtin_thread_pointer(); +} + #else // otherwise use portable C, taking the address of a thread local variable (this is still very fast on most platforms). From cc3c14f2ed374f908e60a3bf29c1dff84fc8cfc2 Mon Sep 17 00:00:00 2001 From: Daan Date: Fri, 26 Apr 2024 23:34:10 -0700 Subject: [PATCH 108/119] use builtin_thread_pointer only on non-apple arm64 (issue #883 and #851) --- include/mimalloc/prim.h | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h index 6063d44a..89266817 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -203,16 +203,14 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce #endif -// Do we have __builtin_thread_pointer? (do not make this a compound test as that fails on older gcc's, see issue #851) -#ifdef __has_builtin - #if __has_builtin(__builtin_thread_pointer) - #if (!defined(__APPLE__)) && /* on apple (M1) the wrong register is read (tpidr_el0 instead of tpidrro_el0) so fall back to TLS slot assembly ()*/ \ - (!defined(__clang_major__) || __clang_major__ >= 14) // older clang versions emit bad code; fall back to using the TLS slot () - #define MI_HAS_BUILTIN_THREAD_POINTER 1 - #endif - #endif -#elif defined(__GNUC__) && (__GNUC__ >= 7) && defined(__aarch64__) // special case aarch64 for older gcc versions (issue #851) -#define MI_HAS_BUILTIN_THREAD_POINTER 1 +// Do we have __builtin_thread_pointer? This would be the preferred way to get a unique thread id +// but unfortunately, it seems we cannot test for this reliably at this time (see issue #883) +// Nevertheless, it seems needed on older graviton platforms (see issue #851). +// For now, we only enable this for specific platforms. 
+#if defined(__GNUC__) && (__GNUC__ >= 7) && defined(__aarch64__) /* special case aarch64 for older gcc versions (issue #851) */ \ + && !defined(__APPLE__) /* on apple (M1) the wrong register is read (tpidr_el0 instead of tpidrro_el0) so fall back to TLS slot assembly ()*/ \ + && (!defined(__clang_major__) || __clang_major__ >= 14) /* older clang versions emit bad code; fall back to using the TLS slot () */ +#define MI_USE_BUILTIN_THREAD_POINTER 1 #endif @@ -234,6 +232,13 @@ static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { return (uintptr_t)NtCurrentTeb(); } +#elif MI_USE_BUILTIN_THREAD_POINTER + +static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { + // Works on most Unix based platforms with recent compilers + return (uintptr_t)__builtin_thread_pointer(); +} + #elif defined(MI_HAS_TLS_SLOT) static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { @@ -249,13 +254,6 @@ static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { #endif } -#elif MI_HAS_BUILTIN_THREAD_POINTER - -static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { - // Works on most Unix based platforms - return (uintptr_t)__builtin_thread_pointer(); -} - #else // otherwise use portable C, taking the address of a thread local variable (this is still very fast on most platforms). From 0d22807e91f3bc416cb70bcb0a39b0f05bc515eb Mon Sep 17 00:00:00 2001 From: Alon Zakai Date: Tue, 7 May 2024 13:22:13 -0700 Subject: [PATCH 109/119] Emscripten: Remove no-longer-needed minimum emmalloc alignment --- src/prim/emscripten/prim.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/prim/emscripten/prim.c b/src/prim/emscripten/prim.c index c0fa0f4a..8b011b4d 100644 --- a/src/prim/emscripten/prim.c +++ b/src/prim/emscripten/prim.c @@ -78,17 +78,10 @@ int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_la // That assumes no one else uses sbrk but us (they could go up, // scribble, and then down), but we could assert on that perhaps. *is_zero = false; - // emmalloc has some limitations on alignment size. - // TODO: Why does mimalloc ask for an align of 4MB? that ends up allocating - // 8, which wastes quite a lot for us in wasm. If that is unavoidable, - // we may want to improve emmalloc to support such alignment. See also - // https://github.com/emscripten-core/emscripten/issues/20645 + // emmalloc has a minimum alignment size. #define MIN_EMMALLOC_ALIGN 8 - #define MAX_EMMALLOC_ALIGN (1024*1024) if (try_alignment < MIN_EMMALLOC_ALIGN) { try_alignment = MIN_EMMALLOC_ALIGN; - } else if (try_alignment > MAX_EMMALLOC_ALIGN) { - try_alignment = MAX_EMMALLOC_ALIGN; } void* p = emmalloc_memalign(try_alignment, size); *addr = p; From 1ebc28a8ff169aa484c38a795b8f29ae6a983cdd Mon Sep 17 00:00:00 2001 From: Daan Date: Fri, 10 May 2024 15:58:37 -0700 Subject: [PATCH 110/119] update comment --- include/mimalloc/prim.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h index 89266817..4ee6d43f 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -130,8 +130,9 @@ void _mi_prim_thread_associate_default_heap(mi_heap_t* heap); // If you test on another platform and it works please send a PR :-) // see also https://akkadia.org/drepper/tls.pdf for more info on the TLS register. // -// Note: on most platforms this is not actually used anymore as we prefer `__builtin_thread_pointer()` nowadays. 
-// However, we do still use it with older clang compilers and Apple OS (as we use TLS slot for the default heap there). +// Note: we would like to prefer `__builtin_thread_pointer()` nowadays instead of using assembly, +// but unfortunately we can not detect support reliably (see issue #883) +// We also use it on Apple OS as we use a TLS slot for the default heap there. #if defined(__GNUC__) && ( \ (defined(__GLIBC__) && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \ || (defined(__APPLE__) && (defined(__x86_64__) || defined(__aarch64__) || defined(__POWERPC__))) \ From e5267a31b03b5ada98b544a421c2a0fbd082bf39 Mon Sep 17 00:00:00 2001 From: Daan Date: Fri, 10 May 2024 17:03:42 -0700 Subject: [PATCH 111/119] only override strdup/strndup if those are not macros (issue #885) --- src/alloc-override.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/alloc-override.c b/src/alloc-override.c index 75afc202..12837cdd 100644 --- a/src/alloc-override.c +++ b/src/alloc-override.c @@ -136,8 +136,11 @@ typedef void* mi_nothrow_t; mi_decl_export void* realloc(void* p, size_t newsize) MI_FORWARD2(mi_realloc, p, newsize) mi_decl_export void free(void* p) MI_FORWARD0(mi_free, p) // In principle we do not need to forward `strdup`/`strndup` but on some systems these do not use `malloc` internally (but a more primitive call) + // We only override if `strdup` is not a macro (as on some older libc's, see issue #885) + #if !defined(strdup) mi_decl_export char* strdup(const char* str) MI_FORWARD1(mi_strdup, str) - #if !defined(__APPLE__) || (defined(MAC_OS_X_VERSION_10_7) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_7) + #endif + #if !defined(strndup) && (!defined(__APPLE__) || (defined(MAC_OS_X_VERSION_10_7) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_7)) mi_decl_export char* strndup(const char* str, size_t n) MI_FORWARD2(mi_strndup, str, n) #endif #endif From 605c354bd43450679d617f56fcab0262ac58be11 Mon Sep 17 00:00:00 2001 From: Daan Date: Fri, 10 May 2024 17:31:00 -0700 Subject: [PATCH 112/119] increase segment map for asan builds (issue #881) --- src/segment-map.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/segment-map.c b/src/segment-map.c index a306ec67..1efb1e23 100644 --- a/src/segment-map.c +++ b/src/segment-map.c @@ -16,7 +16,9 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc/internal.h" #include "mimalloc/atomic.h" -#if (MI_INTPTR_SIZE==8) +#if (MI_INTPTR_SIZE>=8) && MI_TRACK_ASAN +#define MI_MAX_ADDRESS ((size_t)140 << 40) // 140TB (see issue #881) +#elif (MI_INTPTR_SIZE >= 8) #define MI_MAX_ADDRESS ((size_t)40 << 40) // 40TB (to include huge page areas) #else #define MI_MAX_ADDRESS ((size_t)2 << 30) // 2Gb From c70c1df16a48f214f877573ddfed68dc183ef113 Mon Sep 17 00:00:00 2001 From: Daan Date: Fri, 10 May 2024 20:19:17 -0700 Subject: [PATCH 113/119] better fast path for aligned allocation; check max alloc size correctly in the aligned fallback --- src/alloc-aligned.c | 33 ++++++++++++++++----------------- test/test-api.c | 17 +++++++++++++++++ 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c index b63c5e43..e5a42357 100644 --- a/src/alloc-aligned.c +++ b/src/alloc-aligned.c @@ -15,17 +15,23 @@ terms of the MIT license. 
A copy of the license can be found in the file // Aligned Allocation // ------------------------------------------------------ +static inline bool mi_is_naturally_aligned( size_t size, size_t alignment ) { + // objects up to `MI_MEDIUM_OBJ_SIZE_MAX` are allocated aligned to their size (see `segment.c:_mi_segment_page_start`). + // note: the size may not be not an actual bin-size but it turns out the test below is still correct for our + // powers of two bin spacing (see test-api.c:test-aligned13). + mi_assert_internal(_mi_is_power_of_two(alignment) && (alignment > 0)); + return (size <= MI_MEDIUM_OBJ_SIZE_MAX && alignment <= size && ((size + MI_PADDING_SIZE) & (alignment-1)) == 0); +} + + // Fallback primitive aligned allocation -- split out for better codegen static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept { - mi_assert_internal(size <= PTRDIFF_MAX); + mi_assert_internal(size <= (MI_MAX_ALLOC_SIZE - MI_PADDING_SIZE)); mi_assert_internal(alignment != 0 && _mi_is_power_of_two(alignment)); - const uintptr_t align_mask = alignment - 1; // for any x, `(x & align_mask) == (x % alignment)` - const size_t padsize = size + MI_PADDING_SIZE; - - // use regular allocation if it is guaranteed to fit the alignment constraints - if (offset == 0 && alignment <= padsize && padsize <= MI_MEDIUM_OBJ_SIZE_MAX && (padsize & align_mask) == 0) { + // use regular allocation if it is guaranteed to fit the alignment constraints. + if (offset == 0 && mi_is_naturally_aligned(size,alignment)) { void* p = _mi_heap_malloc_zero(heap, size, zero); mi_assert_internal(p == NULL || ((uintptr_t)p % alignment) == 0); return p; @@ -57,6 +63,7 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t* } // .. and align within the allocation + const uintptr_t align_mask = alignment - 1; // for any x, `(x & align_mask) == (x % alignment)` const uintptr_t poffset = ((uintptr_t)p + offset) & align_mask; const uintptr_t adjust = (poffset == 0 ? 
0 : alignment - poffset); mi_assert_internal(adjust < alignment); @@ -100,14 +107,14 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t return NULL; } - if mi_unlikely(size > PTRDIFF_MAX) { // we don't allocate more than PTRDIFF_MAX (see ) + if mi_unlikely(size > (MI_MAX_ALLOC_SIZE - MI_PADDING_SIZE)) { // we don't allocate more than MI_MAX_ALLOC_SIZE (see ) #if MI_DEBUG > 0 _mi_error_message(EOVERFLOW, "aligned allocation request is too large (size %zu, alignment %zu)\n", size, alignment); #endif return NULL; } const uintptr_t align_mask = alignment-1; // for any x, `(x & align_mask) == (x % alignment)` - const size_t padsize = size + MI_PADDING_SIZE; // note: cannot overflow due to earlier size > PTRDIFF_MAX check + const size_t padsize = size + MI_PADDING_SIZE; // note: cannot overflow due to earlier size check // try first if there happens to be a small block available with just the right alignment if mi_likely(padsize <= MI_SMALL_SIZE_MAX && alignment <= padsize) { @@ -140,15 +147,7 @@ mi_decl_nodiscard mi_decl_restrict void* mi_heap_malloc_aligned_at(mi_heap_t* he mi_decl_nodiscard mi_decl_restrict void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept { if (alignment == 0 || !_mi_is_power_of_two(alignment)) return NULL; - #if !MI_PADDING - // without padding, any small sized allocation is naturally aligned (see also `_mi_segment_page_start`) - if mi_likely(_mi_is_power_of_two(size) && size >= alignment && size <= MI_SMALL_SIZE_MAX) - #else - // with padding, we can only guarantee this for fixed alignments - if mi_likely((alignment == sizeof(void*) || (alignment == MI_MAX_ALIGN_SIZE && size > (MI_MAX_ALIGN_SIZE/2))) - && size <= MI_SMALL_SIZE_MAX) - #endif - { + if (size <= MI_SMALL_SIZE_MAX && mi_is_naturally_aligned(size,alignment)) { // fast path for common alignment and size return mi_heap_malloc_small(heap, size); } diff --git a/test/test-api.c b/test/test-api.c index 75955c49..34bfa0e6 100644 --- a/test/test-api.c +++ b/test/test-api.c @@ -230,6 +230,23 @@ int main(void) { result = (((uintptr_t)p % 0x100) == 0); // #602 mi_free(p); } + CHECK_BODY("mimalloc-aligned13") { + bool ok = true; + for( size_t size = 1; size <= MI_SMALL_SIZE_MAX && ok; size++ ) { + for(size_t align = 1; align <= size && ok; align *= 2 ) { + void* p = mi_malloc_aligned(size,align); + ok = (p != NULL && ((uintptr_t)p % align) == 0); + mi_free(p); + /* + if (ok && align <= size && ((size + MI_PADDING_SIZE) & (align-1)) == 0) { + size_t bsize = mi_good_size(size); + ok = (align <= bsize && ((bsize + MI_PADDING_SIZE) & (align-1)) == 0); + } + */ + } + } + result = ok; + } CHECK_BODY("malloc-aligned-at1") { void* p = mi_malloc_aligned_at(48,32,0); result = (p != NULL && ((uintptr_t)(p) + 0) % 32 == 0); mi_free(p); }; From 7128db7bbaf52dea5c68e88fd4f42c6ee02b9d60 Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 11 May 2024 06:43:52 -0700 Subject: [PATCH 114/119] simplified aligned allocation; improved codegen; fix mi_good_size with padding included; add MI_MAX_ALIGN_GUARANTEE --- include/mimalloc/internal.h | 4 +- include/mimalloc/types.h | 3 ++ src/alloc-aligned.c | 75 +++++++++++++++++++------------------ src/alloc.c | 22 +++++++---- src/page-queue.c | 4 +- src/page.c | 4 +- src/segment.c | 12 +++--- 7 files changed, 69 insertions(+), 55 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index c944a126..688dba0b 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -196,7 +196,9 @@ 
mi_msecs_t _mi_clock_end(mi_msecs_t start); mi_msecs_t _mi_clock_start(void); // "alloc.c" -void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept; // called from `_mi_malloc_generic` +void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept; // called from `_mi_malloc_generic` +void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept; // called from `_mi_heap_malloc_aligned` +void* _mi_page_malloc_zeroed(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept; // called from `_mi_heap_malloc_aligned` void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept; void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept; // called from `_mi_heap_malloc_aligned` void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept; diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 761c7278..bccd6115 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -200,6 +200,9 @@ typedef int32_t mi_ssize_t; #error "mimalloc internal: define more bins" #endif +// Maximum block size for which blocks are guarenteed to be block size aligned. (see `segment.c:_mi_segment_page_start`) +#define MI_MAX_ALIGN_GUARANTEE (MI_MEDIUM_OBJ_SIZE_MAX) + // Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in dedicated huge page segments #define MI_BLOCK_ALIGNMENT_MAX (MI_SEGMENT_SIZE >> 1) diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c index e5a42357..d80a6753 100644 --- a/src/alloc-aligned.c +++ b/src/alloc-aligned.c @@ -15,15 +15,15 @@ terms of the MIT license. A copy of the license can be found in the file // Aligned Allocation // ------------------------------------------------------ -static inline bool mi_is_naturally_aligned( size_t size, size_t alignment ) { - // objects up to `MI_MEDIUM_OBJ_SIZE_MAX` are allocated aligned to their size (see `segment.c:_mi_segment_page_start`). - // note: the size may not be not an actual bin-size but it turns out the test below is still correct for our - // powers of two bin spacing (see test-api.c:test-aligned13). +static bool mi_malloc_is_naturally_aligned( size_t size, size_t alignment ) { + // objects up to `MI_MAX_ALIGN_GUARANTEE` are allocated aligned to their size (see `segment.c:_mi_segment_page_start`). mi_assert_internal(_mi_is_power_of_two(alignment) && (alignment > 0)); - return (size <= MI_MEDIUM_OBJ_SIZE_MAX && alignment <= size && ((size + MI_PADDING_SIZE) & (alignment-1)) == 0); + if (alignment > size) return false; + if (alignment <= MI_MAX_ALIGN_SIZE) return true; + const size_t bsize = mi_good_size(size); + return (bsize <= MI_MAX_ALIGN_GUARANTEE && (bsize & (alignment-1)) == 0); } - // Fallback primitive aligned allocation -- split out for better codegen static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept { @@ -31,10 +31,18 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t* mi_assert_internal(alignment != 0 && _mi_is_power_of_two(alignment)); // use regular allocation if it is guaranteed to fit the alignment constraints. 
- if (offset == 0 && mi_is_naturally_aligned(size,alignment)) { + if (offset == 0 && mi_malloc_is_naturally_aligned(size,alignment)) { void* p = _mi_heap_malloc_zero(heap, size, zero); mi_assert_internal(p == NULL || ((uintptr_t)p % alignment) == 0); - return p; + const bool is_aligned_or_null = (((uintptr_t)p) & (alignment-1))==0; + if mi_likely(is_aligned_or_null) { + return p; + } + else { + // this should never happen if the `mi_malloc_is_naturally_aligned` check is correct.. + mi_assert(false); + mi_free(p); + } } void* p; @@ -106,33 +114,35 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t #endif return NULL; } + + // try first if there happens to be a small block available with just the right alignment + if mi_likely(size <= MI_SMALL_SIZE_MAX && alignment <= size) { + const uintptr_t align_mask = alignment-1; // for any x, `(x & align_mask) == (x % alignment)` + const size_t padsize = size + MI_PADDING_SIZE; + mi_page_t* page = _mi_heap_get_free_small_page(heap, padsize); + if mi_likely(page->free != NULL) { + const bool is_aligned = (((uintptr_t)page->free + offset) & align_mask)==0; + if mi_likely(is_aligned) + { + #if MI_STAT>1 + mi_heap_stat_increase(heap, malloc, size); + #endif + void* p = (zero ? _mi_page_malloc_zeroed(heap,page,padsize) : _mi_page_malloc(heap,page,padsize)); // call specific page malloc for better codegen + mi_assert_internal(p != NULL); + mi_assert_internal(((uintptr_t)p + offset) % alignment == 0); + mi_track_malloc(p,size,zero); + return p; + } + } + } + // fallback if mi_unlikely(size > (MI_MAX_ALLOC_SIZE - MI_PADDING_SIZE)) { // we don't allocate more than MI_MAX_ALLOC_SIZE (see ) #if MI_DEBUG > 0 _mi_error_message(EOVERFLOW, "aligned allocation request is too large (size %zu, alignment %zu)\n", size, alignment); #endif return NULL; } - const uintptr_t align_mask = alignment-1; // for any x, `(x & align_mask) == (x % alignment)` - const size_t padsize = size + MI_PADDING_SIZE; // note: cannot overflow due to earlier size check - - // try first if there happens to be a small block available with just the right alignment - if mi_likely(padsize <= MI_SMALL_SIZE_MAX && alignment <= padsize) { - mi_page_t* page = _mi_heap_get_free_small_page(heap, padsize); - const bool is_aligned = (((uintptr_t)page->free+offset) & align_mask)==0; - if mi_likely(page->free != NULL && is_aligned) - { - #if MI_STAT>1 - mi_heap_stat_increase(heap, malloc, size); - #endif - void* p = _mi_page_malloc(heap, page, padsize, zero); // TODO: inline _mi_page_malloc - mi_assert_internal(p != NULL); - mi_assert_internal(((uintptr_t)p + offset) % alignment == 0); - mi_track_malloc(p,size,zero); - return p; - } - } - // fallback return mi_heap_malloc_zero_aligned_at_fallback(heap, size, alignment, offset, zero); } @@ -146,14 +156,7 @@ mi_decl_nodiscard mi_decl_restrict void* mi_heap_malloc_aligned_at(mi_heap_t* he } mi_decl_nodiscard mi_decl_restrict void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept { - if (alignment == 0 || !_mi_is_power_of_two(alignment)) return NULL; - if (size <= MI_SMALL_SIZE_MAX && mi_is_naturally_aligned(size,alignment)) { - // fast path for common alignment and size - return mi_heap_malloc_small(heap, size); - } - else { - return mi_heap_malloc_aligned_at(heap, size, alignment, 0); - } + return mi_heap_malloc_aligned_at(heap, size, alignment, 0); } // ------------------------------------------------------ diff --git a/src/alloc.c b/src/alloc.c index 32175b0c..ab30fd53 100644 --- 
a/src/alloc.c +++ b/src/alloc.c @@ -28,7 +28,7 @@ terms of the MIT license. A copy of the license can be found in the file // Fast allocation in a page: just pop from the free list. // Fall back to generic allocation only if the list is empty. // Note: in release mode the (inlined) routine is about 7 instructions with a single test. -extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept +extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept { mi_assert_internal(page->block_size == 0 /* empty heap */ || mi_page_block_size(page) >= size); mi_block_t* const block = page->free; @@ -85,14 +85,14 @@ extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t siz #endif #if MI_PADDING // && !MI_TRACK_ENABLED - mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + mi_page_usable_block_size(page)); - ptrdiff_t delta = ((uint8_t*)padding - (uint8_t*)block - (size - MI_PADDING_SIZE)); + mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + mi_page_usable_block_size(page)); + ptrdiff_t delta = ((uint8_t*)padding - (uint8_t*)block - (size - MI_PADDING_SIZE)); #if (MI_DEBUG>=2) mi_assert_internal(delta >= 0 && mi_page_usable_block_size(page) >= (size - MI_PADDING_SIZE + delta)); #endif - mi_track_mem_defined(padding,sizeof(mi_padding_t)); // note: re-enable since mi_page_usable_block_size may set noaccess - padding->canary = (uint32_t)(mi_ptr_encode(page,block,page->keys)); - padding->delta = (uint32_t)(delta); + mi_track_mem_defined(padding,sizeof(mi_padding_t)); // note: re-enable since mi_page_usable_block_size may set noaccess + padding->canary = (uint32_t)(mi_ptr_encode(page,block,page->keys)); + padding->delta = (uint32_t)(delta); #if MI_PADDING_CHECK if (!mi_page_is_huge(page)) { uint8_t* fill = (uint8_t*)padding - delta; @@ -105,6 +105,14 @@ extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t siz return block; } +// extra entries for improved efficiency in `alloc-aligned.c`. 
+extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept { + return _mi_page_malloc_zero(heap,page,size,false); +} +extern inline void* _mi_page_malloc_zeroed(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept { + return _mi_page_malloc_zero(heap,page,size,true); +} + static inline mi_decl_restrict void* mi_heap_malloc_small_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept { mi_assert(heap != NULL); #if MI_DEBUG @@ -117,7 +125,7 @@ static inline mi_decl_restrict void* mi_heap_malloc_small_zero(mi_heap_t* heap, #endif mi_page_t* page = _mi_heap_get_free_small_page(heap, size + MI_PADDING_SIZE); - void* const p = _mi_page_malloc(heap, page, size + MI_PADDING_SIZE, zero); + void* const p = _mi_page_malloc_zero(heap, page, size + MI_PADDING_SIZE, zero); mi_track_malloc(p,size,zero); #if MI_STAT>1 diff --git a/src/page-queue.c b/src/page-queue.c index e4bfde14..02a8008d 100644 --- a/src/page-queue.c +++ b/src/page-queue.c @@ -113,10 +113,10 @@ size_t _mi_bin_size(uint8_t bin) { // Good size for allocation size_t mi_good_size(size_t size) mi_attr_noexcept { if (size <= MI_LARGE_OBJ_SIZE_MAX) { - return _mi_bin_size(mi_bin(size)); + return _mi_bin_size(mi_bin(size + MI_PADDING_SIZE)); } else { - return _mi_align_up(size,_mi_os_page_size()); + return _mi_align_up(size + MI_PADDING_SIZE,_mi_os_page_size()); } } diff --git a/src/page.c b/src/page.c index 7e188522..5a18b780 100644 --- a/src/page.c +++ b/src/page.c @@ -914,12 +914,12 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al // and try again, this time succeeding! (i.e. this should never recurse through _mi_page_malloc) if mi_unlikely(zero && page->block_size == 0) { // note: we cannot call _mi_page_malloc with zeroing for huge blocks; we zero it afterwards in that case. 
- void* p = _mi_page_malloc(heap, page, size, false); + void* p = _mi_page_malloc(heap, page, size); mi_assert_internal(p != NULL); _mi_memzero_aligned(p, mi_page_usable_block_size(page)); return p; } else { - return _mi_page_malloc(heap, page, size, zero); + return _mi_page_malloc_zero(heap, page, size, zero); } } diff --git a/src/segment.c b/src/segment.c index b3fc60ee..cfd6c1a3 100644 --- a/src/segment.c +++ b/src/segment.c @@ -426,15 +426,13 @@ uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* pa size_t psize; uint8_t* p = mi_segment_raw_page_start(segment, page, &psize); const size_t block_size = mi_page_block_size(page); - if (page->segment_idx == 0 && block_size > 0 && segment->page_kind <= MI_PAGE_MEDIUM) { + if (/*page->segment_idx == 0 &&*/ block_size > 0 && block_size <= MI_MAX_ALIGN_GUARANTEE) { // for small and medium objects, ensure the page start is aligned with the block size (PR#66 by kickunderscore) + mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM); size_t adjust = block_size - ((uintptr_t)p % block_size); - if (psize - adjust >= block_size) { - if (adjust < block_size) { - p += adjust; - psize -= adjust; - // if (pre_size != NULL) *pre_size = adjust; - } + if (adjust < block_size && psize >= block_size + adjust) { + p += adjust; + psize -= adjust; mi_assert_internal((uintptr_t)p % block_size == 0); } } From 0dcdc55bbd47e055a0402c7e26cfbbb884e4156d Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 11 May 2024 07:09:30 -0700 Subject: [PATCH 115/119] better aligned test --- test/test-api.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/test/test-api.c b/test/test-api.c index 34bfa0e6..76101980 100644 --- a/test/test-api.c +++ b/test/test-api.c @@ -232,15 +232,20 @@ int main(void) { } CHECK_BODY("mimalloc-aligned13") { bool ok = true; - for( size_t size = 1; size <= MI_SMALL_SIZE_MAX && ok; size++ ) { + for( size_t size = 1; size <= (MI_SMALL_SIZE_MAX * 2) && ok; size++ ) { for(size_t align = 1; align <= size && ok; align *= 2 ) { - void* p = mi_malloc_aligned(size,align); - ok = (p != NULL && ((uintptr_t)p % align) == 0); - mi_free(p); + void* p[10]; + for(int i = 0; i < 10 && ok; i++) { + p[i] = mi_malloc_aligned(size,align);; + ok = (p[i] != NULL && ((uintptr_t)(p[i]) % align) == 0); + } + for(int i = 0; i < 10 && ok; i++) { + mi_free(p[i]); + } /* if (ok && align <= size && ((size + MI_PADDING_SIZE) & (align-1)) == 0) { size_t bsize = mi_good_size(size); - ok = (align <= bsize && ((bsize + MI_PADDING_SIZE) & (align-1)) == 0); + ok = (align <= bsize && (bsize & (align-1)) == 0); } */ } From b1ec1d5e5a1120cb2210f71e7276fcb678c4123e Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 11 May 2024 07:22:56 -0700 Subject: [PATCH 116/119] refactor aligned allocation --- include/mimalloc/types.h | 4 +-- src/alloc-aligned.c | 62 ++++++++++++++++++++++++---------------- 2 files changed, 39 insertions(+), 27 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index bccd6115..35a3965e 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -200,13 +200,13 @@ typedef int32_t mi_ssize_t; #error "mimalloc internal: define more bins" #endif -// Maximum block size for which blocks are guarenteed to be block size aligned. (see `segment.c:_mi_segment_page_start`) +// Maximum block size for which blocks are guaranteed to be block size aligned. 
(see `segment.c:_mi_segment_page_start`) #define MI_MAX_ALIGN_GUARANTEE (MI_MEDIUM_OBJ_SIZE_MAX) // Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in dedicated huge page segments #define MI_BLOCK_ALIGNMENT_MAX (MI_SEGMENT_SIZE >> 1) -// we never allocate more than PTRDIFF_MAX (see also ) +// We never allocate more than PTRDIFF_MAX (see also ) #define MI_MAX_ALLOC_SIZE PTRDIFF_MAX // ------------------------------------------------------ diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c index d80a6753..20c36044 100644 --- a/src/alloc-aligned.c +++ b/src/alloc-aligned.c @@ -24,27 +24,12 @@ static bool mi_malloc_is_naturally_aligned( size_t size, size_t alignment ) { return (bsize <= MI_MAX_ALIGN_GUARANTEE && (bsize & (alignment-1)) == 0); } -// Fallback primitive aligned allocation -- split out for better codegen -static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept +// Fallback aligned allocation that over-allocates -- split out for better codegen +static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_overalloc(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept { mi_assert_internal(size <= (MI_MAX_ALLOC_SIZE - MI_PADDING_SIZE)); mi_assert_internal(alignment != 0 && _mi_is_power_of_two(alignment)); - // use regular allocation if it is guaranteed to fit the alignment constraints. - if (offset == 0 && mi_malloc_is_naturally_aligned(size,alignment)) { - void* p = _mi_heap_malloc_zero(heap, size, zero); - mi_assert_internal(p == NULL || ((uintptr_t)p % alignment) == 0); - const bool is_aligned_or_null = (((uintptr_t)p) & (alignment-1))==0; - if mi_likely(is_aligned_or_null) { - return p; - } - else { - // this should never happen if the `mi_malloc_is_naturally_aligned` check is correct.. - mi_assert(false); - mi_free(p); - } - } - void* p; size_t oversize; if mi_unlikely(alignment > MI_BLOCK_ALIGNMENT_MAX) { @@ -104,6 +89,39 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t* return aligned_p; } +// Generic primitive aligned allocation -- split out for better codegen +static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_generic(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept +{ + mi_assert_internal(alignment != 0 && _mi_is_power_of_two(alignment)); + // we don't allocate more than MI_MAX_ALLOC_SIZE (see ) + if mi_unlikely(size > (MI_MAX_ALLOC_SIZE - MI_PADDING_SIZE)) { + #if MI_DEBUG > 0 + _mi_error_message(EOVERFLOW, "aligned allocation request is too large (size %zu, alignment %zu)\n", size, alignment); + #endif + return NULL; + } + + // use regular allocation if it is guaranteed to fit the alignment constraints. + // this is important to try as the fast path in `mi_heap_malloc_zero_aligned` only works when there exist + // a page with the right block size, and if we always use the over-alloc fallback that would never happen. + if (offset == 0 && mi_malloc_is_naturally_aligned(size,alignment)) { + void* p = _mi_heap_malloc_zero(heap, size, zero); + mi_assert_internal(p == NULL || ((uintptr_t)p % alignment) == 0); + const bool is_aligned_or_null = (((uintptr_t)p) & (alignment-1))==0; + if mi_likely(is_aligned_or_null) { + return p; + } + else { + // this should never happen if the `mi_malloc_is_naturally_aligned` check is correct.. 
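// [Illustrative aside; not part of the patch] The refactor above splits aligned
// allocation into a fast path (when the natural block alignment already satisfies the
// request, per mi_malloc_is_naturally_aligned) and an over-allocating fallback. From the
// caller's side the observable contract is simply that the returned pointer is aligned;
// the check below uses the usual power-of-two mask trick and mirrors the updated loop in
// test-api.c. A small, hypothetical driver using the public mi_malloc_aligned API (the
// 64-byte object size and the 8..4096 alignment range are made up):

#include <mimalloc.h>
#include <stdint.h>
#include <stdio.h>

int check_aligned_allocs(void) {
  for (size_t align = 8; align <= 4096; align *= 2) {   // power-of-two alignments only
    void* p = mi_malloc_aligned(64, align);
    if (p == NULL || ((uintptr_t)p & (align - 1)) != 0) {
      fprintf(stderr, "alignment %zu was not honored\n", align);
      return 1;
    }
    mi_free(p);
  }
  return 0;
}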
+ mi_assert(false); + mi_free(p); + } + } + + // fall back to over-allocation + return mi_heap_malloc_zero_aligned_at_overalloc(heap,size,alignment,offset,zero); +} + // Primitive aligned allocation static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept { @@ -136,14 +154,8 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t } } - // fallback - if mi_unlikely(size > (MI_MAX_ALLOC_SIZE - MI_PADDING_SIZE)) { // we don't allocate more than MI_MAX_ALLOC_SIZE (see ) - #if MI_DEBUG > 0 - _mi_error_message(EOVERFLOW, "aligned allocation request is too large (size %zu, alignment %zu)\n", size, alignment); - #endif - return NULL; - } - return mi_heap_malloc_zero_aligned_at_fallback(heap, size, alignment, offset, zero); + // fallback to generic aligned allocation + return mi_heap_malloc_zero_aligned_at_generic(heap, size, alignment, offset, zero); } From 6c7cda592c2191c75ed53e940678e5d25ec1adb2 Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 11 May 2024 07:39:06 -0700 Subject: [PATCH 117/119] make page_malloc_zero externals not inline to avoid link errors in C++ mode --- src/alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/alloc.c b/src/alloc.c index ab30fd53..ce24b8ec 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -106,10 +106,10 @@ extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_ } // extra entries for improved efficiency in `alloc-aligned.c`. -extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept { +extern void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept { return _mi_page_malloc_zero(heap,page,size,false); } -extern inline void* _mi_page_malloc_zeroed(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept { +extern void* _mi_page_malloc_zeroed(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept { return _mi_page_malloc_zero(heap,page,size,true); } From bb3802801cc95fac50a969d05f554d9737429e08 Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 11 May 2024 07:53:12 -0700 Subject: [PATCH 118/119] clarify parameters for emmalloc_memalign --- src/prim/emscripten/prim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/prim/emscripten/prim.c b/src/prim/emscripten/prim.c index 8b011b4d..1f60a1bb 100644 --- a/src/prim/emscripten/prim.c +++ b/src/prim/emscripten/prim.c @@ -68,7 +68,7 @@ int _mi_prim_free(void* addr, size_t size) { // Allocation //--------------------------------------------- -extern void* emmalloc_memalign(size_t, size_t); +extern void* emmalloc_memalign(size_t alignment, size_t size); // Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned. 
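// [Illustrative aside; not part of the patch] The emmalloc_memalign change above only
// adds parameter names to the extern declaration; the ABI is unchanged. The names matter
// because both arguments are size_t, so a call with the operands swapped still compiles
// silently. A hedged sketch of the intended call shape (the 16/256 values and the helper
// are made up; emmalloc_memalign itself only links when building against Emscripten's
// emmalloc):

#include <stddef.h>

extern void* emmalloc_memalign(size_t alignment, size_t size);

static void* alloc_256_at_16(void) {
  return emmalloc_memalign(16, 256);   // alignment first, then size, per the declaration above
}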
int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) { From 0f6d8293c74796fa913e4b5eb4361f1e4734f7c6 Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 11 May 2024 08:08:03 -0700 Subject: [PATCH 119/119] use local dynamic tls for static MUSL builds (see issue #644) --- CMakeLists.txt | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a6c95dc4..2cc2fc46 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,6 +58,9 @@ set(mi_sources src/prim/prim.c) set(mi_cflags "") +set(mi_cflags_static "") # extra flags for a static library build +set(mi_cflags_dynamic "") # extra flags for a shared-object library build +set(mi_defines "") set(mi_libraries "") # ----------------------------------------------------------------------------- @@ -288,8 +291,7 @@ if(CMAKE_SYSTEM_NAME MATCHES "Linux|Android") endif() if(MI_LIBC_MUSL) - message(STATUS "Assume using musl libc (MI_LIBC_MUSL=ON) (this implies MI_LOCAL_DYNAMIC_TLS=ON)") - set(MI_LOCAL_DYNAMIC_TLS "ON") + message(STATUS "Assume using musl libc (MI_LIBC_MUSL=ON)") list(APPEND mi_defines MI_LIBC_MUSL=1) endif() @@ -318,7 +320,14 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM if(MI_LOCAL_DYNAMIC_TLS) list(APPEND mi_cflags -ftls-model=local-dynamic) else() - list(APPEND mi_cflags -ftls-model=initial-exec) + if(MI_LIBC_MUSL) + # with musl we use local-dynamic for the static build, see issue #644 + list(APPEND mi_cflags_static -ftls-model=local-dynamic) + list(APPEND mi_cflags_dynamic -ftls-model=initial-exec) + message(STATUS "Use local dynamic TLS for the static build (since MI_LIBC_MUSL=ON)") + else() + list(APPEND mi_cflags -ftls-model=initial-exec) + endif() endif() if(MI_OVERRIDE) list(APPEND mi_cflags -fno-builtin-malloc) @@ -426,7 +435,7 @@ if(MI_BUILD_SHARED) add_library(mimalloc SHARED ${mi_sources}) set_target_properties(mimalloc PROPERTIES VERSION ${mi_version} SOVERSION ${mi_version_major} OUTPUT_NAME ${mi_basename} ) target_compile_definitions(mimalloc PRIVATE ${mi_defines} MI_SHARED_LIB MI_SHARED_LIB_EXPORT) - target_compile_options(mimalloc PRIVATE ${mi_cflags}) + target_compile_options(mimalloc PRIVATE ${mi_cflags} ${mi_cflags_dynamic}) target_link_libraries(mimalloc PRIVATE ${mi_libraries}) target_include_directories(mimalloc PUBLIC $ @@ -456,7 +465,7 @@ if (MI_BUILD_STATIC) add_library(mimalloc-static STATIC ${mi_sources}) set_property(TARGET mimalloc-static PROPERTY POSITION_INDEPENDENT_CODE ON) target_compile_definitions(mimalloc-static PRIVATE ${mi_defines} MI_STATIC_LIB) - target_compile_options(mimalloc-static PRIVATE ${mi_cflags}) + target_compile_options(mimalloc-static PRIVATE ${mi_cflags} ${mi_cflags_static}) target_link_libraries(mimalloc-static PRIVATE ${mi_libraries}) target_include_directories(mimalloc-static PUBLIC $ @@ -488,7 +497,7 @@ if (MI_BUILD_OBJECT) add_library(mimalloc-obj OBJECT src/static.c) set_property(TARGET mimalloc-obj PROPERTY POSITION_INDEPENDENT_CODE ON) target_compile_definitions(mimalloc-obj PRIVATE ${mi_defines}) - target_compile_options(mimalloc-obj PRIVATE ${mi_cflags}) + target_compile_options(mimalloc-obj PRIVATE ${mi_cflags} ${mi_cflags_static}) target_include_directories(mimalloc-obj PUBLIC $ $
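# [Illustrative aside; not part of the patch] The block above switches the static library
# build to -ftls-model=local-dynamic when MI_LIBC_MUSL is set, while the shared library
# keeps initial-exec. A hedged C-level sketch of what that flag controls: the per-variable
# attribute below (GCC/Clang on ELF targets) is the source-level equivalent of the
# compiler flag and is shown only for illustration; 'tls_counter' and 'bump' are made up
# and not part of mimalloc.

static __thread int tls_counter __attribute__((tls_model("local-dynamic"))) = 0;

int bump(void) {
  // local-dynamic TLS is resolved through __tls_get_addr at run time rather than a fixed
  // static-TLS offset; per the commit message this is what static musl builds need
  // (see issue #644).
  return ++tls_counter;
}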