From 616db104a9147071a406a320fee6f51cf858ea74 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 1 Jun 2024 12:29:48 -0700 Subject: [PATCH 01/18] prevent UB in arena reservation --- include/mimalloc/internal.h | 8 ++++++++ src/arena.c | 8 +++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 2954eabd..2a21f34b 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -329,6 +329,14 @@ static inline uintptr_t _mi_divide_up(uintptr_t size, size_t divider) { return (divider == 0 ? size : ((size + divider - 1) / divider)); } + +// clamp an integer +static inline size_t _mi_clamp(size_t sz, size_t min, size_t max) { + if (sz < min) return min; + else if (sz > max) return max; + else return sz; +} + // Is memory zero initialized? static inline bool mi_mem_is_zero(const void* p, size_t size) { for (size_t i = 0; i < size; i++) { diff --git a/src/arena.c b/src/arena.c index 25ce56ec..445cc309 100644 --- a/src/arena.c +++ b/src/arena.c @@ -358,8 +358,14 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re arena_reserve = arena_reserve/4; // be conservative if virtual reserve is not supported (for WASM for example) } arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_BLOCK_SIZE); + arena_reserve = _mi_align_up(arena_reserve, MI_SEGMENT_SIZE); if (arena_count >= 8 && arena_count <= 128) { - arena_reserve = ((size_t)1<<(arena_count/8)) * arena_reserve; // scale up the arena sizes exponentially + // scale up the arena sizes exponentially every 8 entries (128 entries get to 589TiB) + const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16 ); + size_t reserve = 0; + if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) { + arena_reserve = reserve; + } } if (arena_reserve < req_size) return false; // should be able to at least handle the current allocation size From aeee7907a0324f6d7ee8b01b55721c4efe2dec7e Mon Sep 17 00:00:00 2001 
From: daanx Date: Sat, 1 Jun 2024 13:20:28 -0700 Subject: [PATCH 02/18] fix spelling --- src/arena.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/arena.c b/src/arena.c index 445cc309..83582bad 100644 --- a/src/arena.c +++ b/src/arena.c @@ -510,7 +510,7 @@ static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startidx, size_t bitidx = startidx; bool all_purged = false; while (bitidx < endidx) { - // count consequetive ones in the purge mask + // count consecutive ones in the purge mask size_t count = 0; while (bitidx + count < endidx && (purge & ((size_t)1 << (bitidx + count))) != 0) { count++; @@ -547,7 +547,7 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi if (purge != 0) { size_t bitidx = 0; while (bitidx < MI_BITMAP_FIELD_BITS) { - // find consequetive range of ones in the purge mask + // find consecutive range of ones in the purge mask size_t bitlen = 0; while (bitidx + bitlen < MI_BITMAP_FIELD_BITS && (purge & ((size_t)1 << (bitidx + bitlen))) != 0) { bitlen++; @@ -927,7 +927,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int arena->is_large = is_large; arena->purge_expire = 0; arena->search_idx = 0; - // consequetive bitmaps + // consecutive bitmaps arena->blocks_dirty = &arena->blocks_inuse[fields]; // just after inuse bitmap arena->blocks_abandoned = &arena->blocks_inuse[2 * fields]; // just after dirty bitmap arena->blocks_committed = (arena->memid.is_pinned ? 
NULL : &arena->blocks_inuse[3*fields]); // just after abandoned bitmap From f87a4c15b285a6d4c04c8813db2a26ebba807a4d Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 1 Jun 2024 13:41:13 -0700 Subject: [PATCH 03/18] increase max arenas --- src/arena.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/arena.c b/src/arena.c index 83582bad..d97bf628 100644 --- a/src/arena.c +++ b/src/arena.c @@ -36,7 +36,7 @@ The arena allocation needs to be thread safe and we use an atomic bitmap to allo typedef uintptr_t mi_block_info_t; #define MI_ARENA_BLOCK_SIZE (MI_SEGMENT_SIZE) // 64MiB (must be at least MI_SEGMENT_ALIGN) #define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2) // 32MiB -#define MI_MAX_ARENAS (112) // not more than 126 (since we use 7 bits in the memid and an arena index + 1) +#define MI_MAX_ARENAS (255) // Limited as the reservation exponentially increases (and takes up .bss) // A memory arena descriptor typedef struct mi_arena_s { @@ -552,6 +552,7 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi while (bitidx + bitlen < MI_BITMAP_FIELD_BITS && (purge & ((size_t)1 << (bitidx + bitlen))) != 0) { bitlen++; } + // temporarily claim the purge range as "in-use" to be thread-safe with allocation // try to claim the longest range of corresponding in_use bits const mi_bitmap_index_t bitmap_index = mi_bitmap_index_create(i, bitidx); while( bitlen > 0 ) { From d9aa19a7636d457f0b7b50e599649b86e8ade666 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 1 Jun 2024 15:57:18 -0700 Subject: [PATCH 04/18] add support for sub-processes (to supportpython/cpython#113717) --- include/mimalloc.h | 10 ++++++- include/mimalloc/internal.h | 9 ++++--- include/mimalloc/types.h | 20 ++++++++++++-- src/arena.c | 54 ++++++++++++++++++++----------------- src/free.c | 3 ++- src/init.c | 43 ++++++++++++++++++++++++++--- src/segment.c | 24 ++++++++++------- 7 files changed, 119 insertions(+), 44 deletions(-) diff --git a/include/mimalloc.h 
b/include/mimalloc.h index 0173a323..26bb849d 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -288,8 +288,16 @@ mi_decl_export bool mi_manage_os_memory_ex(void* start, size_t size, bool is_co mi_decl_nodiscard mi_decl_export mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id); #endif + +// Experimental: allow sub-processes whose memory segments stay separated (and no reclamation between them) +// Used for example for separate interpreter's in one process. +typedef void* mi_subproc_id_t; +mi_decl_export mi_subproc_id_t mi_subproc_new(void); +mi_decl_export void mi_subproc_delete(mi_subproc_id_t subproc); +mi_decl_export void mi_subproc_add_current_thread(mi_subproc_id_t subproc); // this should be called right after a thread is created (and no allocation has taken place yet) + // deprecated -mi_decl_export int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept; +mi_decl_export int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept; // ------------------------------------------------------ diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 2a21f34b..65cd3569 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -130,14 +130,17 @@ void _mi_arena_unsafe_destroy_all(mi_stats_t* stats); bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment); void _mi_arena_segment_mark_abandoned(mi_segment_t* segment); -size_t _mi_arena_segment_abandoned_count(void); -typedef struct mi_arena_field_cursor_s { // abstract +void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid); +void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size); + +typedef struct mi_arena_field_cursor_s { // abstract struct mi_arena_id_t start; int count; size_t bitmap_idx; + mi_subproc_t* subproc; } mi_arena_field_cursor_t; -void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_arena_field_cursor_t* current); +void 
_mi_arena_field_cursor_init(mi_heap_t* heap, mi_subproc_t* subproc, mi_arena_field_cursor_t* current); mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous); // "segment-map.c" diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index ed326c69..6b90bf5d 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -307,7 +307,7 @@ typedef struct mi_page_s { mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) uint16_t used; // number of blocks in use (including blocks in `thread_free`) uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`) - uint8_t heap_tag; // tag of the owning heap, used for separated heaps by object type + uint8_t heap_tag; // tag of the owning heap, used to separate heaps by object type // padding size_t block_size; // size available in each block (always `>0`) uint8_t* page_start; // start of the page area containing the blocks @@ -387,6 +387,7 @@ typedef struct mi_memid_s { // --------------------------------------------------------------- // Segments contain mimalloc pages // --------------------------------------------------------------- +typedef struct mi_subproc_s mi_subproc_t; // Segments are large allocated memory blocks (2MiB on 64 bit) from the OS. // Inside segments we allocated fixed size _pages_ that contain blocks. @@ -409,6 +410,7 @@ typedef struct mi_segment_s { size_t capacity; // count of available pages (`#free + used`) size_t segment_info_size;// space we are using from the first page for segment meta-data and possible guard pages. 
uintptr_t cookie; // verify addresses in secure mode: `_mi_ptr_cookie(segment) == segment->cookie` + mi_subproc_t* subproc; // segment belongs to sub process // layout like this to optimize access in `mi_free` _Atomic(mi_threadid_t) thread_id; // unique id of the thread owning this segment @@ -600,10 +602,23 @@ void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); #define mi_heap_stat_decrease(heap,stat,amount) mi_stat_decrease( (heap)->tld->stats.stat, amount) +// ------------------------------------------------------ +// Sub processes do not reclaim or visit segments +// from other sub processes +// ------------------------------------------------------ + +struct mi_subproc_s { + _Atomic(size_t) abandoned_count; // count of abandoned segments for this sup-process + mi_memid_t memid; // provenance +}; + +mi_subproc_t* mi_subproc_from_id(mi_subproc_id_t subproc_id); + // ------------------------------------------------------ // Thread Local data // ------------------------------------------------------ +// Milliseconds as in `int64_t` to avoid overflows typedef int64_t mi_msecs_t; // Queue of segments @@ -628,8 +643,9 @@ typedef struct mi_segments_tld_s { size_t current_size; // current size of all segments size_t peak_size; // peak size of all segments size_t reclaim_count;// number of reclaimed (abandoned) segments + mi_subproc_t* subproc; // sub-process this thread belongs to. 
mi_stats_t* stats; // points to tld stats - mi_os_tld_t* os; // points to os stats + mi_os_tld_t* os; // points to os tld } mi_segments_tld_t; // Thread local data diff --git a/src/arena.c b/src/arena.c index d97bf628..aeadd604 100644 --- a/src/arena.c +++ b/src/arena.c @@ -172,7 +172,7 @@ static void* mi_arena_static_zalloc(size_t size, size_t alignment, mi_memid_t* m return p; } -static void* mi_arena_meta_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* stats) { +void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid) { *memid = _mi_memid_none(); // try static @@ -180,7 +180,7 @@ static void* mi_arena_meta_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* st if (p != NULL) return p; // or fall back to the OS - p = _mi_os_alloc(size, memid, stats); + p = _mi_os_alloc(size, memid, &_mi_stats_main); if (p == NULL) return NULL; // zero the OS memory if needed @@ -191,9 +191,9 @@ static void* mi_arena_meta_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* st return p; } -static void mi_arena_meta_free(void* p, mi_memid_t memid, size_t size, mi_stats_t* stats) { +void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size) { if (mi_memkind_is_os(memid.memkind)) { - _mi_os_free(p, size, memid, stats); + _mi_os_free(p, size, memid, &_mi_stats_main); } else { mi_assert(memid.memkind == MI_MEM_STATIC); @@ -709,7 +709,7 @@ static void mi_arenas_unsafe_destroy(void) { else { new_max_arena = i; } - mi_arena_meta_free(arena, arena->meta_memid, arena->meta_size, &_mi_stats_main); + _mi_arena_meta_free(arena, arena->meta_memid, arena->meta_size); } } @@ -752,13 +752,6 @@ bool _mi_arena_contains(const void* p) { the arena bitmaps. ----------------------------------------------------------- */ -// Maintain a count of all abandoned segments -static mi_decl_cache_align _Atomic(size_t)abandoned_count; - -size_t _mi_arena_segment_abandoned_count(void) { - return mi_atomic_load_relaxed(&abandoned_count); -} - // reclaim a specific abandoned segment; `true` on success. 
// sets the thread_id. bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment ) @@ -768,7 +761,7 @@ bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment ) // but we need to still claim it atomically -- we use the thread_id for that. size_t expected = 0; if (mi_atomic_cas_strong_acq_rel(&segment->thread_id, &expected, _mi_thread_id())) { - mi_atomic_decrement_relaxed(&abandoned_count); + mi_atomic_decrement_relaxed(&segment->subproc->abandoned_count); return true; } else { @@ -785,7 +778,7 @@ bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment ) bool was_marked = _mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx); if (was_marked) { mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0); - mi_atomic_decrement_relaxed(&abandoned_count); + mi_atomic_decrement_relaxed(&segment->subproc->abandoned_count); mi_atomic_store_release(&segment->thread_id, _mi_thread_id()); } // mi_assert_internal(was_marked); @@ -802,9 +795,10 @@ void _mi_arena_segment_mark_abandoned(mi_segment_t* segment) mi_assert_internal(segment->used == segment->abandoned); if (segment->memid.memkind != MI_MEM_ARENA) { // not in an arena; count it as abandoned and return - mi_atomic_increment_relaxed(&abandoned_count); + mi_atomic_increment_relaxed(&segment->subproc->abandoned_count); return; } + // segment is in an arena size_t arena_idx; size_t bitmap_idx; mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx); @@ -812,17 +806,19 @@ void _mi_arena_segment_mark_abandoned(mi_segment_t* segment) mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); mi_assert_internal(arena != NULL); const bool was_unmarked = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL); - if (was_unmarked) { mi_atomic_increment_relaxed(&abandoned_count); } + if (was_unmarked) { mi_atomic_increment_relaxed(&segment->subproc->abandoned_count); } mi_assert_internal(was_unmarked); 
mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); } // start a cursor at a randomized arena -void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_arena_field_cursor_t* current) { +void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_subproc_t* subproc, mi_arena_field_cursor_t* current) { + mi_assert_internal(heap->tld->segments.subproc == subproc); const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); current->start = (max_arena == 0 ? 0 : (mi_arena_id_t)( _mi_heap_random_next(heap) % max_arena)); current->count = 0; current->bitmap_idx = 0; + current->subproc = subproc; } // reclaim abandoned segments @@ -830,7 +826,7 @@ void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_arena_field_cursor_t* curre mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous ) { const int max_arena = (int)mi_atomic_load_relaxed(&mi_arena_count); - if (max_arena <= 0 || mi_atomic_load_relaxed(&abandoned_count) == 0) return NULL; + if (max_arena <= 0 || mi_atomic_load_relaxed(&previous->subproc->abandoned_count) == 0) return NULL; int count = previous->count; size_t field_idx = mi_bitmap_index_field(previous->bitmap_idx); @@ -853,14 +849,24 @@ mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* pr mi_bitmap_index_t bitmap_idx = mi_bitmap_index_create(field_idx, bit_idx); // try to reclaim it atomically if (_mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx)) { - mi_atomic_decrement_relaxed(&abandoned_count); - previous->bitmap_idx = bitmap_idx; - previous->count = count; mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); mi_segment_t* segment = (mi_segment_t*)mi_arena_block_start(arena, bitmap_idx); mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0); - //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, 
bitmap_idx)); - return segment; + // check that belongs to our sub-process + if (segment->subproc != previous->subproc) { + // it is from another subprocess, re-mark it and continue searching + const bool was_zero = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL); + mi_assert_internal(was_zero); + } + else { + // success, we unabandoned a segment in our sub-process + mi_atomic_decrement_relaxed(&previous->subproc->abandoned_count); + previous->bitmap_idx = bitmap_idx; + previous->count = count; + + //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); + return segment; + } } } } @@ -911,7 +917,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int const size_t bitmaps = (memid.is_pinned ? 3 : 5); const size_t asize = sizeof(mi_arena_t) + (bitmaps*fields*sizeof(mi_bitmap_field_t)); mi_memid_t meta_memid; - mi_arena_t* arena = (mi_arena_t*)mi_arena_meta_zalloc(asize, &meta_memid, &_mi_stats_main); // TODO: can we avoid allocating from the OS? 
+ mi_arena_t* arena = (mi_arena_t*)_mi_arena_meta_zalloc(asize, &meta_memid); if (arena == NULL) return false; // already zero'd due to zalloc diff --git a/src/free.c b/src/free.c index c065d2f3..191ec9bf 100644 --- a/src/free.c +++ b/src/free.c @@ -240,7 +240,8 @@ static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_segment_t* seg { // the segment is abandoned, try to reclaim it into our heap if (_mi_segment_attempt_reclaim(mi_heap_get_default(), segment)) { - mi_assert_internal(_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); + mi_assert_internal(_mi_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); + mi_assert_internal(mi_heap_get_default()->tld->segments.subproc == segment->subproc); mi_free(block); // recursively free as now it will be a local free in our heap return; } diff --git a/src/init.c b/src/init.c index 62bb69dd..1922907b 100644 --- a/src/init.c +++ b/src/init.c @@ -125,18 +125,20 @@ mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty; extern mi_heap_t _mi_heap_main; -static mi_tld_t tld_main = { +static mi_decl_cache_align mi_subproc_t mi_subproc_default; + +static mi_decl_cache_align mi_tld_t tld_main = { 0, false, - &_mi_heap_main, &_mi_heap_main, + &_mi_heap_main, &_mi_heap_main, { { NULL, NULL }, {NULL ,NULL}, {NULL ,NULL, 0}, - 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, &mi_subproc_default, &tld_main.stats, &tld_main.os }, // segments { 0, &tld_main.stats }, // os { MI_STATS_NULL } // stats }; -mi_heap_t _mi_heap_main = { +mi_decl_cache_align mi_heap_t _mi_heap_main = { &tld_main, MI_ATOMIC_VAR_INIT(NULL), 0, // thread id @@ -179,6 +181,38 @@ mi_heap_t* _mi_heap_main_get(void) { } +/* ----------------------------------------------------------- + Sub process +----------------------------------------------------------- */ + +mi_subproc_id_t mi_subproc_new(void) { + mi_memid_t memid = _mi_memid_none(); + mi_subproc_t* subproc = (mi_subproc_t*)_mi_arena_meta_zalloc(sizeof(mi_subproc_t), &memid); 
+ if (subproc == NULL) return NULL; + subproc->memid = memid; + return subproc; +} + +mi_subproc_t* mi_subproc_from_id(mi_subproc_id_t subproc_id) { + return (subproc_id == NULL ? &mi_subproc_default : (mi_subproc_t*)subproc_id); +} + +void mi_subproc_delete(mi_subproc_id_t subproc_id) { + if (subproc_id == NULL) return; + mi_subproc_t* subproc = mi_subproc_from_id(subproc_id); + _mi_arena_meta_free(subproc, subproc->memid, sizeof(mi_subproc_t)); +} + +void mi_subproc_add_current_thread(mi_subproc_id_t subproc_id) { + mi_heap_t* heap = mi_heap_get_default(); + if (heap == NULL) return; + mi_assert(heap->tld->segments.subproc == &mi_subproc_default); + if (heap->tld->segments.subproc != &mi_subproc_default) return; + heap->tld->segments.subproc = mi_subproc_from_id(subproc_id); +} + + + /* ----------------------------------------------------------- Initialization and freeing of the thread local heaps ----------------------------------------------------------- */ @@ -295,6 +329,7 @@ void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) { _mi_memzero_aligned(tld,sizeof(mi_tld_t)); tld->heap_backing = bheap; tld->heaps = NULL; + tld->segments.subproc = &mi_subproc_default; tld->segments.stats = &tld->stats; tld->segments.os = &tld->os; tld->os.stats = &tld->stats; diff --git a/src/segment.c b/src/segment.c index fc13d2e7..205d8753 100644 --- a/src/segment.c +++ b/src/segment.c @@ -628,7 +628,8 @@ static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind, segment->page_shift = page_shift; segment->segment_info_size = pre_size; segment->thread_id = _mi_thread_id(); - segment->cookie = _mi_ptr_cookie(segment); + segment->cookie = _mi_ptr_cookie(segment); + segment->subproc = tld->subproc; // set protection mi_segment_protect(segment, true, tld->os); @@ -880,6 +881,7 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, if (right_page_reclaimed != NULL) { *right_page_reclaimed = false; } // can be 0 still with abandoned_next, 
or already a thread id for segments outside an arena that are reclaimed on a free. mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0 || mi_atomic_load_relaxed(&segment->thread_id) == _mi_thread_id()); + mi_assert_internal(segment->subproc == heap->tld->segments.subproc); // only reclaim within the same subprocess mi_atomic_store_release(&segment->thread_id, _mi_thread_id()); segment->abandoned_visits = 0; segment->was_reclaimed = true; @@ -899,12 +901,13 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, segment->abandoned--; mi_assert(page->next == NULL); _mi_stat_decrease(&tld->stats->pages_abandoned, 1); - // set the heap again and allow heap thread delayed free again. + // get the target heap for this thread which has a matching heap tag (so we reclaim into a matching heap) mi_heap_t* target_heap = _mi_heap_by_tag(heap, page->heap_tag); // allow custom heaps to separate objects if (target_heap == NULL) { target_heap = heap; - _mi_error_message(EINVAL, "page with tag %u cannot be reclaimed by a heap with the same tag (using %u instead)\n", page->heap_tag, heap->tag ); + _mi_error_message(EINVAL, "page with tag %u cannot be reclaimed by a heap with the same tag (using tag %u instead)\n", page->heap_tag, heap->tag ); } + // associate the heap with this page, and allow heap thread delayed free again. 
mi_page_set_heap(page, target_heap); _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, true); // override never (after heap is set) _mi_page_free_collect(page, false); // ensure used count is up to date @@ -944,7 +947,8 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, // attempt to reclaim a particular segment (called from multi threaded free `alloc.c:mi_free_block_mt`) bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment) { if (mi_atomic_load_relaxed(&segment->thread_id) != 0) return false; // it is not abandoned - // don't reclaim more from a free than half the current segments + if (segment->subproc != heap->tld->segments.subproc) return false; // only reclaim within the same subprocess + // don't reclaim more from a `free` call than half the current segments // this is to prevent a pure free-ing thread to start owning too many segments if (heap->tld->segments.reclaim_count * 2 > heap->tld->segments.count) return false; if (_mi_arena_segment_clear_abandoned(segment)) { // atomically unabandon @@ -957,17 +961,17 @@ bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment) { void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld) { mi_segment_t* segment; - mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap, ¤t); + mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap, tld->subproc, ¤t); while ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL) { mi_segment_reclaim(segment, heap, 0, NULL, tld); } } -static long mi_segment_get_reclaim_tries(void) { +static long mi_segment_get_reclaim_tries(mi_segments_tld_t* tld) { // limit the tries to 10% (default) of the abandoned segments with at least 8 and at most 1024 tries. 
const size_t perc = (size_t)mi_option_get_clamp(mi_option_max_segment_reclaim, 0, 100); if (perc <= 0) return 0; - const size_t total_count = _mi_arena_segment_abandoned_count(); + const size_t total_count = mi_atomic_load_relaxed(&tld->subproc->abandoned_count); if (total_count == 0) return 0; const size_t relative_count = (total_count > 10000 ? (total_count / 100) * perc : (total_count * perc) / 100); // avoid overflow long max_tries = (long)(relative_count <= 1 ? 1 : (relative_count > 1024 ? 1024 : relative_count)); @@ -978,13 +982,14 @@ static long mi_segment_get_reclaim_tries(void) { static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size, mi_page_kind_t page_kind, bool* reclaimed, mi_segments_tld_t* tld) { *reclaimed = false; - long max_tries = mi_segment_get_reclaim_tries(); + long max_tries = mi_segment_get_reclaim_tries(tld); if (max_tries <= 0) return NULL; mi_segment_t* segment; - mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap, ¤t); + mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap, tld->subproc, ¤t); while ((max_tries-- > 0) && ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL)) { + mi_assert(segment->subproc == heap->tld->segments.subproc); // cursor only visits segments in our sub-process segment->abandoned_visits++; // todo: should we respect numa affinity for abondoned reclaim? perhaps only for the first visit? // todo: an arena exclusive heap will potentially visit many abandoned unsuitable segments and use many tries @@ -1232,5 +1237,6 @@ mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t pag mi_assert_internal(page == NULL || (mi_segment_page_size(_mi_page_segment(page)) - (MI_SECURE == 0 ? 
0 : _mi_os_page_size())) >= block_size); // mi_segment_try_purge(tld); mi_assert_internal(page == NULL || mi_page_not_in_queue(page, tld)); + mi_assert_internal(page == NULL || _mi_page_segment(page)->subproc == tld->subproc); return page; } From 0b3cd5124999efc673afb26bab3f5a1c8eff4c22 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sat, 1 Jun 2024 16:45:20 -0700 Subject: [PATCH 05/18] add initial primitive api for locks --- include/mimalloc/atomic.h | 21 +++++++++------- include/mimalloc/internal.h | 5 ---- include/mimalloc/prim.h | 24 +++++++++++++++--- include/mimalloc/track.h | 8 ++---- src/alloc.c | 10 ++++---- src/prim/emscripten/prim.c | 49 ++++++++++++++++++++++++++++++++++++- src/prim/unix/prim.c | 46 ++++++++++++++++++++++++++++++++++ src/prim/wasi/prim.c | 48 +++++++++++++++++++++++++++++++++--- src/prim/windows/prim.c | 35 +++++++++++++++++++------- test/main-override.cpp | 4 +-- test/test-stress.c | 6 ++--- 11 files changed, 208 insertions(+), 48 deletions(-) diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index d5333dd9..2c313fdb 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -8,6 +8,17 @@ terms of the MIT license. A copy of the license can be found in the file #ifndef MIMALLOC_ATOMIC_H #define MIMALLOC_ATOMIC_H +// include windows.h or pthreads.h +#if defined(_WIN32) +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif +#include +#elif !defined(_WIN32) && (defined(__EMSCRIPTEN_SHARED_MEMORY__) || !defined(__wasi__)) +#define MI_USE_PTHREADS +#include +#endif + // -------------------------------------------------------------------------------------------- // Atomics // We need to be portable between C, C++, and MSVC. @@ -133,10 +144,6 @@ static inline void mi_atomic_maxi64_relaxed(volatile int64_t* p, int64_t x) { #elif defined(_MSC_VER) // Legacy MSVC plain C compilation wrapper that uses Interlocked operations to model C11 atomics. 
-#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif -#include #include #ifdef _WIN64 typedef LONG64 msc_intptr_t; @@ -306,7 +313,7 @@ typedef _Atomic(uintptr_t) mi_atomic_once_t; // Returns true only on the first invocation static inline bool mi_atomic_once( mi_atomic_once_t* once ) { - if (mi_atomic_load_relaxed(once) != 0) return false; // quick test + if (mi_atomic_load_relaxed(once) != 0) return false; // quick test uintptr_t expected = 0; return mi_atomic_cas_strong_acq_rel(once, &expected, (uintptr_t)1); // try to set to 1 } @@ -329,10 +336,6 @@ static inline void mi_atomic_yield(void) { std::this_thread::yield(); } #elif defined(_WIN32) -#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif -#include static inline void mi_atomic_yield(void) { YieldProcessor(); } diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 65cd3569..9046e3ad 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -53,11 +53,6 @@ terms of the MIT license. A copy of the license can be found in the file #define mi_decl_externc #endif -// pthreads -#if !defined(_WIN32) && !defined(__wasi__) -#define MI_USE_PTHREADS -#include -#endif // "options.c" void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message); diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h index 3f4574dd..ba305dc1 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -114,6 +114,24 @@ void _mi_prim_thread_done_auto_done(void); // Called when the default heap for a thread changes void _mi_prim_thread_associate_default_heap(mi_heap_t* heap); +// Locks are only used if abandoned segment visiting is permitted +#if defined(_WIN32) +#define mi_lock_t CRITICAL_SECTION +#elif defined(MI_USE_PTHREADS) +#define mi_lock_t pthread_mutex_t +#else +#define mi_lock_t _Atomic(uintptr_t) +#endif + +// Take a lock (blocking). Return `true` on success. 
+bool _mi_prim_lock(mi_lock_t* lock); + +// Try to take lock and return `true` if successful. +bool _mi_prim_try_lock(mi_lock_t* lock); + +// Release a lock. +void _mi_prim_unlock(mi_lock_t* lock); + //------------------------------------------------------------------- // Thread id: `_mi_prim_thread_id()` @@ -235,10 +253,6 @@ static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { #elif defined(_WIN32) -#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif -#include static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { // Windows: works on Intel and ARM in both 32- and 64-bit return (uintptr_t)NtCurrentTeb(); @@ -370,4 +384,6 @@ static inline mi_heap_t* mi_prim_get_default_heap(void) { + + #endif // MIMALLOC_PRIM_H diff --git a/include/mimalloc/track.h b/include/mimalloc/track.h index a659d940..4b5709e2 100644 --- a/include/mimalloc/track.h +++ b/include/mimalloc/track.h @@ -34,7 +34,7 @@ The corresponding `mi_track_free` still uses the block start pointer and origina The `mi_track_resize` is currently unused but could be called on reallocations within a block. `mi_track_init` is called at program start. 
-The following macros are for tools like asan and valgrind to track whether memory is +The following macros are for tools like asan and valgrind to track whether memory is defined, undefined, or not accessible at all: #define mi_track_mem_defined(p,size) @@ -82,10 +82,6 @@ defined, undefined, or not accessible at all: #define MI_TRACK_HEAP_DESTROY 1 #define MI_TRACK_TOOL "ETW" -#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif -#include #include "../src/prim/windows/etw.h" #define mi_track_init() EventRegistermicrosoft_windows_mimalloc(); @@ -96,7 +92,7 @@ defined, undefined, or not accessible at all: // no tracking #define MI_TRACK_ENABLED 0 -#define MI_TRACK_HEAP_DESTROY 0 +#define MI_TRACK_HEAP_DESTROY 0 #define MI_TRACK_TOOL "none" #define mi_track_malloc_size(p,reqsize,size,zero) diff --git a/src/alloc.c b/src/alloc.c index 6c9c5baf..5ba8bb33 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -28,7 +28,7 @@ terms of the MIT license. A copy of the license can be found in the file // Fast allocation in a page: just pop from the free list. // Fall back to generic allocation only if the list is empty. // Note: in release mode the (inlined) routine is about 7 instructions with a single test. 
-extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept +extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept { mi_assert_internal(page->block_size == 0 /* empty heap */ || mi_page_block_size(page) >= size); mi_block_t* const block = page->free; @@ -61,7 +61,7 @@ extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_ } else { _mi_memzero_aligned(block, page->block_size - MI_PADDING_SIZE); - } + } } #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN @@ -123,9 +123,9 @@ static inline mi_decl_restrict void* mi_heap_malloc_small_zero(mi_heap_t* heap, #if (MI_PADDING) if (size == 0) { size = sizeof(void*); } #endif - + mi_page_t* page = _mi_heap_get_free_small_page(heap, size + MI_PADDING_SIZE); - void* const p = _mi_page_malloc_zero(heap, page, size + MI_PADDING_SIZE, zero); + void* const p = _mi_page_malloc_zero(heap, page, size + MI_PADDING_SIZE, zero); mi_track_malloc(p,size,zero); #if MI_STAT>1 @@ -362,7 +362,7 @@ mi_decl_nodiscard mi_decl_restrict char* mi_strndup(const char* s, size_t n) mi_ #ifndef PATH_MAX #define PATH_MAX MAX_PATH #endif -#include + mi_decl_nodiscard mi_decl_restrict char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept { // todo: use GetFullPathNameW to allow longer file names char buf[PATH_MAX]; diff --git a/src/prim/emscripten/prim.c b/src/prim/emscripten/prim.c index f3797c9e..6b5aa452 100644 --- a/src/prim/emscripten/prim.c +++ b/src/prim/emscripten/prim.c @@ -200,7 +200,7 @@ bool _mi_prim_random_buf(void* buf, size_t buf_len) { // Thread init/done //---------------------------------------------------------------- -#ifdef __EMSCRIPTEN_SHARED_MEMORY__ +#if defined(MI_USE_PTHREADS) // use pthread local storage keys to detect thread ending // (and used with MI_TLS_PTHREADS for the default heap) @@ -242,3 +242,50 @@ void 
_mi_prim_thread_associate_default_heap(mi_heap_t* heap) { } #endif + +//---------------------------------------------------------------- +// Locks +//---------------------------------------------------------------- + +#if defined(MI_USE_PTHREADS) + +bool _mi_prim_lock(mi_lock_t* lock) { + return (pthread_mutex_lock(lock) == 0); +} + +bool _mi_prim_try_lock(mi_lock_t* lock) { + return (pthread_mutex_trylock(lock) == 0); +} + +void _mi_prim_unlock(mi_lock_t* lock) { + pthread_mutex_unlock(lock); +} + +#else + +#include + +// fall back to poor man's locks. +bool _mi_prim_lock(mi_lock_t* lock) { + for(int i = 0; i < 1000; i++) { // for at most 1 second? + if (_mi_prim_try_lock(lock)) return true; + if (i < 25) { + mi_atomic_yield(); // first yield a bit + } + else { + emscripten_sleep(1); // then sleep for 1ms intervals + } + } + return true; +} + +bool _mi_prim_try_lock(mi_lock_t* lock) { + uintptr_t expected = 0; + return mi_atomic_cas_strong_acq_rel(lock,&expected,(uintptr_t)1); +} + +void _mi_prim_unlock(mi_lock_t* lock) { + mi_atomic_store_release(lock,(uintptr_t)0); +} + +#endif \ No newline at end of file diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c index 99325d03..7935c1c6 100644 --- a/src/prim/unix/prim.c +++ b/src/prim/unix/prim.c @@ -880,3 +880,49 @@ void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { } #endif + + +//---------------------------------------------------------------- +// Locks +//---------------------------------------------------------------- + +#if defined(MI_USE_PTHREADS) + +bool _mi_prim_lock(mi_lock_t* lock) { + return (pthread_mutex_lock(lock) == 0); +} + +bool _mi_prim_try_lock(mi_lock_t* lock) { + return (pthread_mutex_trylock(lock) == 0); +} + +void _mi_prim_unlock(mi_lock_t* lock) { + pthread_mutex_unlock(lock); +} + +#else + +// fall back to poor man's locks. +bool _mi_prim_lock(mi_lock_t* lock) { + for(int i = 0; i < 1000; i++) { // for at most 1 second? 
+ if (_mi_prim_try_lock(lock)) return true; + if (i < 25) { + mi_atomic_yield(); // first yield a bit + } + else { + usleep(1000); // then sleep for 1ms intervals + } + } + return true; +} + +bool _mi_prim_try_lock(mi_lock_t* lock) { + uintptr_t expected = 0; + return mi_atomic_cas_strong_acq_rel(lock,&expected,(uintptr_t)1); +} + +void _mi_prim_unlock(mi_lock_t* lock) { + mi_atomic_store_release(lock,(uintptr_t)0); +} + +#endif diff --git a/src/prim/wasi/prim.c b/src/prim/wasi/prim.c index e95f67f5..3f3a2ea1 100644 --- a/src/prim/wasi/prim.c +++ b/src/prim/wasi/prim.c @@ -22,7 +22,7 @@ terms of the MIT license. A copy of the license can be found in the file void _mi_prim_mem_init( mi_os_mem_config_t* config ) { config->page_size = 64*MI_KiB; // WebAssembly has a fixed page size: 64KiB config->alloc_granularity = 16; - config->has_overcommit = false; + config->has_overcommit = false; config->has_partial_free = false; config->has_virtual_reserve = false; } @@ -134,7 +134,7 @@ int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_la //--------------------------------------------- int _mi_prim_commit(void* addr, size_t size, bool* is_zero) { - MI_UNUSED(addr); MI_UNUSED(size); + MI_UNUSED(addr); MI_UNUSED(size); *is_zero = false; return 0; } @@ -199,9 +199,9 @@ mi_msecs_t _mi_prim_clock_now(void) { // low resolution timer mi_msecs_t _mi_prim_clock_now(void) { #if !defined(CLOCKS_PER_SEC) || (CLOCKS_PER_SEC == 1000) || (CLOCKS_PER_SEC == 0) - return (mi_msecs_t)clock(); + return (mi_msecs_t)clock(); #elif (CLOCKS_PER_SEC < 1000) - return (mi_msecs_t)clock() * (1000 / (mi_msecs_t)CLOCKS_PER_SEC); + return (mi_msecs_t)clock() * (1000 / (mi_msecs_t)CLOCKS_PER_SEC); #else return (mi_msecs_t)clock() / ((mi_msecs_t)CLOCKS_PER_SEC / 1000); #endif @@ -278,3 +278,43 @@ void _mi_prim_thread_done_auto_done(void) { void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { MI_UNUSED(heap); } + 
+//---------------------------------------------------------------- +// Locks +//---------------------------------------------------------------- + +#if defined(MI_USE_PTHREADS) + +bool _mi_prim_lock(mi_lock_t* lock) { + return (pthread_mutex_lock(lock) == 0); +} + +bool _mi_prim_try_lock(mi_lock_t* lock) { + return (pthread_mutex_trylock(lock) == 0); +} + +void _mi_prim_unlock(mi_lock_t* lock) { + pthread_mutex_unlock(lock); +} + +#else + +// fall back to poor man's locks. +bool _mi_prim_lock(mi_lock_t* lock) { + for(int i = 0; i < 1000; i++) { // for at most 1 second? + if (_mi_prim_try_lock(lock)) return true; + mi_atomic_yield(); // this should never happen as wasi is single threaded? + } + return true; +} + +bool _mi_prim_try_lock(mi_lock_t* lock) { + uintptr_t expected = 0; + return mi_atomic_cas_strong_acq_rel(lock,&expected,(uintptr_t)1); +} + +void _mi_prim_unlock(mi_lock_t* lock) { + mi_atomic_store_release(lock,(uintptr_t)0); +} + +#endif \ No newline at end of file diff --git a/src/prim/windows/prim.c b/src/prim/windows/prim.c index 5074ad4c..760debb3 100644 --- a/src/prim/windows/prim.c +++ b/src/prim/windows/prim.c @@ -231,7 +231,7 @@ static void* win_virtual_alloc_prim(void* addr, size_t size, size_t try_alignmen else if (max_retry_msecs > 0 && (try_alignment <= 2*MI_SEGMENT_ALIGN) && (flags&MEM_COMMIT) != 0 && (flags&MEM_LARGE_PAGES) == 0 && win_is_out_of_memory_error(GetLastError())) { - // if committing regular memory and being out-of-memory, + // if committing regular memory and being out-of-memory, // keep trying for a bit in case memory frees up after all. See issue #894 _mi_warning_message("out-of-memory on OS allocation, try again... 
(attempt %lu, 0x%zx bytes, error code: 0x%x, address: %p, alignment: 0x%zx, flags: 0x%x)\n", tries, size, GetLastError(), addr, try_alignment, flags); long sleep_msecs = tries*40; // increasing waits @@ -316,7 +316,7 @@ int _mi_prim_commit(void* addr, size_t size, bool* is_zero) { return 0; } -int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit) { +int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit) { BOOL ok = VirtualFree(addr, size, MEM_DECOMMIT); *needs_recommit = true; // for safety, assume always decommitted even in the case of an error. return (ok ? 0 : (int)GetLastError()); @@ -468,7 +468,6 @@ mi_msecs_t _mi_prim_clock_now(void) { // Process Info //---------------------------------------------------------------- -#include #include static mi_msecs_t filetime_msecs(const FILETIME* ftime) { @@ -491,7 +490,7 @@ void _mi_prim_process_info(mi_process_info_t* pinfo) GetProcessTimes(GetCurrentProcess(), &ct, &et, &st, &ut); pinfo->utime = filetime_msecs(&ut); pinfo->stime = filetime_msecs(&st); - + // load psapi on demand if (pGetProcessMemoryInfo == NULL) { HINSTANCE hDll = LoadLibrary(TEXT("psapi.dll")); @@ -505,7 +504,7 @@ void _mi_prim_process_info(mi_process_info_t* pinfo) memset(&info, 0, sizeof(info)); if (pGetProcessMemoryInfo != NULL) { pGetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info)); - } + } pinfo->current_rss = (size_t)info.WorkingSetSize; pinfo->peak_rss = (size_t)info.PeakWorkingSetSize; pinfo->current_commit = (size_t)info.PagefileUsage; @@ -517,7 +516,7 @@ void _mi_prim_process_info(mi_process_info_t* pinfo) // Output //---------------------------------------------------------------- -void _mi_prim_out_stderr( const char* msg ) +void _mi_prim_out_stderr( const char* msg ) { // on windows with redirection, the C runtime cannot handle locale dependent output // after the main thread closes so we use direct console output. 
@@ -564,6 +563,23 @@ bool _mi_prim_getenv(const char* name, char* result, size_t result_size) { } +//---------------------------------------------------------------- +// Locks +//---------------------------------------------------------------- + +bool _mi_prim_lock(mi_lock_t* lock) { + EnterCriticalSection(lock); + return true; +} + +bool _mi_prim_try_lock(mi_lock_t* lock) { + return TryEnterCriticalSection(lock); +} + +void _mi_prim_unlock(mi_lock_t* lock) { + LeaveCriticalSection(lock); +} + //---------------------------------------------------------------- // Random @@ -600,7 +616,7 @@ bool _mi_prim_random_buf(void* buf, size_t buf_len) { } if (pBCryptGenRandom == NULL) return false; } - return (pBCryptGenRandom(NULL, (PUCHAR)buf, (ULONG)buf_len, BCRYPT_USE_SYSTEM_PREFERRED_RNG) >= 0); + return (pBCryptGenRandom(NULL, (PUCHAR)buf, (ULONG)buf_len, BCRYPT_USE_SYSTEM_PREFERRED_RNG) >= 0); } #endif // MI_USE_RTLGENRANDOM @@ -636,9 +652,9 @@ void _mi_prim_thread_init_auto_done(void) { } void _mi_prim_thread_done_auto_done(void) { - // call thread-done on all threads (except the main thread) to prevent + // call thread-done on all threads (except the main thread) to prevent // dangling callback pointer if statically linked with a DLL; Issue #208 - FlsFree(mi_fls_key); + FlsFree(mi_fls_key); } void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { @@ -661,3 +677,4 @@ void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { } #endif + diff --git a/test/main-override.cpp b/test/main-override.cpp index 64ea178b..fc7f70f0 100644 --- a/test/main-override.cpp +++ b/test/main-override.cpp @@ -19,7 +19,7 @@ #endif #ifdef _WIN32 -#include +#include static void msleep(unsigned long msecs) { Sleep(msecs); } #else #include @@ -43,7 +43,7 @@ static void test_stl_allocators(); int main() { // mi_stats_reset(); // ignore earlier allocations - + test_std_string(); // heap_thread_free_huge(); /* diff --git a/test/test-stress.c b/test/test-stress.c index 14b3c3ae..0368007a 
100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -200,7 +200,7 @@ static void test_stress(void) { #ifndef NDEBUG //mi_collect(false); //mi_debug_show_arenas(); - #endif + #endif #if !defined(NDEBUG) || defined(MI_TSAN) if ((n + 1) % 10 == 0) { printf("- iterations left: %3d\n", ITER - (n + 1)); } #endif @@ -232,7 +232,7 @@ static void test_leak(void) { int main(int argc, char** argv) { #ifndef USE_STD_MALLOC mi_stats_reset(); - #endif + #endif // > mimalloc-test-stress [THREADS] [SCALE] [ITER] if (argc >= 2) { @@ -285,7 +285,7 @@ static void (*thread_entry_fun)(intptr_t) = &stress; #ifdef _WIN32 -#include +#include static DWORD WINAPI thread_entry(LPVOID param) { thread_entry_fun((intptr_t)param); From f93fb900b7495d320b2cfae4e69f1091917d278d Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sat, 1 Jun 2024 17:25:45 -0700 Subject: [PATCH 06/18] move lock code to atomic.h --- include/mimalloc/atomic.h | 91 ++++++++++++++++++++++++++++++++++++++ include/mimalloc/prim.h | 17 ------- include/mimalloc/types.h | 2 - src/init.c | 2 +- src/prim/emscripten/prim.c | 47 -------------------- src/prim/unix/prim.c | 47 -------------------- src/prim/wasi/prim.c | 41 ----------------- src/prim/windows/prim.c | 19 -------- 8 files changed, 92 insertions(+), 174 deletions(-) diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index 2c313fdb..4e3250f9 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -309,6 +309,11 @@ static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub) { return (intptr_t)mi_atomic_addi(p, -sub); } + +// ---------------------------------------------------------------------- +// Once and Guard +// ---------------------------------------------------------------------- + typedef _Atomic(uintptr_t) mi_atomic_once_t; // Returns true only on the first invocation @@ -329,7 +334,9 @@ typedef _Atomic(uintptr_t) mi_atomic_guard_t; +// ---------------------------------------------------------------------- // 
Yield +// ---------------------------------------------------------------------- #if defined(__cplusplus) #include static inline void mi_atomic_yield(void) { @@ -393,4 +400,88 @@ static inline void mi_atomic_yield(void) { #endif +// ---------------------------------------------------------------------- +// Locks are only used for abandoned segment visiting +// ---------------------------------------------------------------------- +#if defined(_WIN32) + +#define mi_lock_t CRITICAL_SECTION + +static inline bool _mi_prim_lock(mi_lock_t* lock) { + EnterCriticalSection(lock); + return true; +} + +static inline bool _mi_prim_try_lock(mi_lock_t* lock) { + return TryEnterCriticalSection(lock); +} + +static inline void _mi_prim_unlock(mi_lock_t* lock) { + LeaveCriticalSection(lock); +} + + +#elif defined(MI_USE_PTHREADS) + +#define mi_lock_t pthread_mutex_t + +static inline bool _mi_prim_lock(mi_lock_t* lock) { + return (pthread_mutex_lock(lock) == 0); +} + +static inline bool _mi_prim_try_lock(mi_lock_t* lock) { + return (pthread_mutex_trylock(lock) == 0); +} + +static inline void _mi_prim_unlock(mi_lock_t* lock) { + pthread_mutex_unlock(lock); +} + +#elif defined(__cplusplus) + +#include +#define mi_lock_t std::mutex + +static inline bool _mi_prim_lock(mi_lock_t* lock) { + lock->lock(); + return true; +} + +static inline bool _mi_prim_try_lock(mi_lock_t* lock) { + return (lock->try_lock(); +} + +static inline void _mi_prim_unlock(mi_lock_t* lock) { + lock->unlock(); +} + +#else + +// fall back to poor man's locks. +// this should only be the case in a single-threaded environment (like __wasi__) + +#define mi_lock_t _Atomic(uintptr_t) + +static inline bool _mi_prim_try_lock(mi_lock_t* lock) { + uintptr_t expected = 0; + return mi_atomic_cas_strong_acq_rel(lock, &expected, (uintptr_t)1); +} + +static inline bool _mi_prim_lock(mi_lock_t* lock) { + for (int i = 0; i < 1000; i++) { // for at most 1000 tries? 
+ if (_mi_prim_try_lock(lock)) return true; + mi_atomic_yield(); + } + return true; +} + +static inline void _mi_prim_unlock(mi_lock_t* lock) { + mi_atomic_store_release(lock, (uintptr_t)0); +} + +#endif + + + + #endif // __MIMALLOC_ATOMIC_H diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h index ba305dc1..640c966f 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -114,23 +114,6 @@ void _mi_prim_thread_done_auto_done(void); // Called when the default heap for a thread changes void _mi_prim_thread_associate_default_heap(mi_heap_t* heap); -// Locks are only used if abandoned segment visiting is permitted -#if defined(_WIN32) -#define mi_lock_t CRITICAL_SECTION -#elif defined(MI_USE_PTHREADS) -#define mi_lock_t pthread_mutex_t -#else -#define mi_lock_t _Atomic(uintptr_t) -#endif - -// Take a lock (blocking). Return `true` on success. -bool _mi_prim_lock(mi_lock_t* lock); - -// Try to take lock and return `true` if successful. -bool _mi_prim_try_lock(mi_lock_t* lock); - -// Release a lock. -void _mi_prim_unlock(mi_lock_t* lock); //------------------------------------------------------------------- diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 6b90bf5d..f4ba6739 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -612,8 +612,6 @@ struct mi_subproc_s { mi_memid_t memid; // provenance }; -mi_subproc_t* mi_subproc_from_id(mi_subproc_id_t subproc_id); - // ------------------------------------------------------ // Thread Local data // ------------------------------------------------------ diff --git a/src/init.c b/src/init.c index 1922907b..01625891 100644 --- a/src/init.c +++ b/src/init.c @@ -193,7 +193,7 @@ mi_subproc_id_t mi_subproc_new(void) { return subproc; } -mi_subproc_t* mi_subproc_from_id(mi_subproc_id_t subproc_id) { +static mi_subproc_t* mi_subproc_from_id(mi_subproc_id_t subproc_id) { return (subproc_id == NULL ? 
&mi_subproc_default : (mi_subproc_t*)subproc_id); } diff --git a/src/prim/emscripten/prim.c b/src/prim/emscripten/prim.c index 6b5aa452..944c0cb4 100644 --- a/src/prim/emscripten/prim.c +++ b/src/prim/emscripten/prim.c @@ -242,50 +242,3 @@ void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { } #endif - -//---------------------------------------------------------------- -// Locks -//---------------------------------------------------------------- - -#if defined(MI_USE_PTHREADS) - -bool _mi_prim_lock(mi_lock_t* lock) { - return (pthread_mutex_lock(lock) == 0); -} - -bool _mi_prim_try_lock(mi_lock_t* lock) { - return (pthread_mutex_trylock(lock) == 0); -} - -void _mi_prim_unlock(mi_lock_t* lock) { - pthread_mutex_unlock(lock); -} - -#else - -#include - -// fall back to poor man's locks. -bool _mi_prim_lock(mi_lock_t* lock) { - for(int i = 0; i < 1000; i++) { // for at most 1 second? - if (_mi_prim_try_lock(lock)) return true; - if (i < 25) { - mi_atomic_yield(); // first yield a bit - } - else { - emscripten_sleep(1); // then sleep for 1ms intervals - } - } - return true; -} - -bool _mi_prim_try_lock(mi_lock_t* lock) { - uintptr_t expected = 0; - return mi_atomic_cas_strong_acq_rel(lock,&expected,(uintptr_t)1); -} - -void _mi_prim_unlock(mi_lock_t* lock) { - mi_atomic_store_release(lock,(uintptr_t)0); -} - -#endif \ No newline at end of file diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c index 7935c1c6..93785b22 100644 --- a/src/prim/unix/prim.c +++ b/src/prim/unix/prim.c @@ -22,7 +22,6 @@ terms of the MIT license. 
A copy of the license can be found in the file #include "mimalloc.h" #include "mimalloc/internal.h" -#include "mimalloc/atomic.h" #include "mimalloc/prim.h" #include // mmap @@ -880,49 +879,3 @@ void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { } #endif - - -//---------------------------------------------------------------- -// Locks -//---------------------------------------------------------------- - -#if defined(MI_USE_PTHREADS) - -bool _mi_prim_lock(mi_lock_t* lock) { - return (pthread_mutex_lock(lock) == 0); -} - -bool _mi_prim_try_lock(mi_lock_t* lock) { - return (pthread_mutex_trylock(lock) == 0); -} - -void _mi_prim_unlock(mi_lock_t* lock) { - pthread_mutex_unlock(lock); -} - -#else - -// fall back to poor man's locks. -bool _mi_prim_lock(mi_lock_t* lock) { - for(int i = 0; i < 1000; i++) { // for at most 1 second? - if (_mi_prim_try_lock(lock)) return true; - if (i < 25) { - mi_atomic_yield(); // first yield a bit - } - else { - usleep(1000); // then sleep for 1ms intervals - } - } - return true; -} - -bool _mi_prim_try_lock(mi_lock_t* lock) { - uintptr_t expected = 0; - return mi_atomic_cas_strong_acq_rel(lock,&expected,(uintptr_t)1); -} - -void _mi_prim_unlock(mi_lock_t* lock) { - mi_atomic_store_release(lock,(uintptr_t)0); -} - -#endif diff --git a/src/prim/wasi/prim.c b/src/prim/wasi/prim.c index 3f3a2ea1..5d7a8132 100644 --- a/src/prim/wasi/prim.c +++ b/src/prim/wasi/prim.c @@ -9,7 +9,6 @@ terms of the MIT license. 
A copy of the license can be found in the file #include "mimalloc.h" #include "mimalloc/internal.h" -#include "mimalloc/atomic.h" #include "mimalloc/prim.h" #include // fputs @@ -278,43 +277,3 @@ void _mi_prim_thread_done_auto_done(void) { void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { MI_UNUSED(heap); } - -//---------------------------------------------------------------- -// Locks -//---------------------------------------------------------------- - -#if defined(MI_USE_PTHREADS) - -bool _mi_prim_lock(mi_lock_t* lock) { - return (pthread_mutex_lock(lock) == 0); -} - -bool _mi_prim_try_lock(mi_lock_t* lock) { - return (pthread_mutex_trylock(lock) == 0); -} - -void _mi_prim_unlock(mi_lock_t* lock) { - pthread_mutex_unlock(lock); -} - -#else - -// fall back to poor man's locks. -bool _mi_prim_lock(mi_lock_t* lock) { - for(int i = 0; i < 1000; i++) { // for at most 1 second? - if (_mi_prim_try_lock(lock)) return true; - mi_atomic_yield(); // this should never happen as wasi is single threaded? - } - return true; -} - -bool _mi_prim_try_lock(mi_lock_t* lock) { - uintptr_t expected = 0; - return mi_atomic_cas_strong_acq_rel(lock,&expected,(uintptr_t)1); -} - -void _mi_prim_unlock(mi_lock_t* lock) { - mi_atomic_store_release(lock,(uintptr_t)0); -} - -#endif \ No newline at end of file diff --git a/src/prim/windows/prim.c b/src/prim/windows/prim.c index 760debb3..bd874f9b 100644 --- a/src/prim/windows/prim.c +++ b/src/prim/windows/prim.c @@ -9,7 +9,6 @@ terms of the MIT license. 
A copy of the license can be found in the file #include "mimalloc.h" #include "mimalloc/internal.h" -#include "mimalloc/atomic.h" #include "mimalloc/prim.h" #include // fputs, stderr @@ -563,24 +562,6 @@ bool _mi_prim_getenv(const char* name, char* result, size_t result_size) { } -//---------------------------------------------------------------- -// Locks -//---------------------------------------------------------------- - -bool _mi_prim_lock(mi_lock_t* lock) { - EnterCriticalSection(lock); - return true; -} - -bool _mi_prim_try_lock(mi_lock_t* lock) { - return TryEnterCriticalSection(lock); -} - -void _mi_prim_unlock(mi_lock_t* lock) { - LeaveCriticalSection(lock); -} - - //---------------------------------------------------------------- // Random //---------------------------------------------------------------- From 8f874555d5d42c4e1006bfc78f6cadfb167b1e30 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 2 Jun 2024 07:47:08 -0700 Subject: [PATCH 07/18] add initial support for visiting abandoned segments per subprocess, upstream for python/cpython#114133 --- include/mimalloc.h | 11 +++-- include/mimalloc/atomic.h | 83 ++++++++++++++++++++------------- include/mimalloc/internal.h | 10 ++-- src/arena.c | 93 +++++++++++++++++++++++++++---------- src/heap.c | 45 ++++++++++-------- src/init.c | 14 ++++-- src/options.c | 5 ++ src/segment.c | 33 ++++++++++++- 8 files changed, 206 insertions(+), 88 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index 26bb849d..9fc770cc 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -262,7 +262,7 @@ typedef struct mi_heap_area_s { typedef bool (mi_cdecl mi_block_visit_fun)(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg); -mi_decl_export bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_all_blocks, mi_block_visit_fun* visitor, void* arg); +mi_decl_export bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_blocks, mi_block_visit_fun* visitor, 
void* arg); // Experimental mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept; @@ -292,9 +292,13 @@ mi_decl_nodiscard mi_decl_export mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t a // Experimental: allow sub-processes whose memory segments stay separated (and no reclamation between them) // Used for example for separate interpreter's in one process. typedef void* mi_subproc_id_t; +mi_decl_export mi_subproc_id_t mi_subproc_main(void); mi_decl_export mi_subproc_id_t mi_subproc_new(void); -mi_decl_export void mi_subproc_delete(mi_subproc_id_t subproc); -mi_decl_export void mi_subproc_add_current_thread(mi_subproc_id_t subproc); // this should be called right after a thread is created (and no allocation has taken place yet) +mi_decl_export void mi_subproc_delete(mi_subproc_id_t subproc); +mi_decl_export void mi_subproc_add_current_thread(mi_subproc_id_t subproc); // this should be called right after a thread is created (and no allocation has taken place yet) + +// Experimental: visit abandoned heap areas (from threads that have been terminated) +mi_decl_export bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg); // deprecated mi_decl_export int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept; @@ -355,6 +359,7 @@ typedef enum mi_option_e { mi_option_abandoned_reclaim_on_free, // allow to reclaim an abandoned segment on a free (=1) mi_option_disallow_arena_alloc, // 1 = do not use arena's for allocation (except if using specific arena id's) mi_option_retry_on_oom, // retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. 
(only on windows) + mi_option_visit_abandoned, // allow visiting heap blocks from abandoned threads (=0) _mi_option_last, // legacy option names mi_option_large_os_pages = mi_option_allow_large_os_pages, diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index 4e3250f9..d2711019 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -14,7 +14,7 @@ terms of the MIT license. A copy of the license can be found in the file #define WIN32_LEAN_AND_MEAN #endif #include -#elif !defined(_WIN32) && (defined(__EMSCRIPTEN_SHARED_MEMORY__) || !defined(__wasi__)) +#elif !defined(__wasi__) && (!defined(__EMSCRIPTEN__) || defined(__EMSCRIPTEN_PTHREADS__)) #define MI_USE_PTHREADS #include #endif @@ -35,9 +35,9 @@ terms of the MIT license. A copy of the license can be found in the file #define mi_atomic(name) std::atomic_##name #define mi_memory_order(name) std::memory_order_##name #if (__cplusplus >= 202002L) // c++20, see issue #571 -#define MI_ATOMIC_VAR_INIT(x) x + #define MI_ATOMIC_VAR_INIT(x) x #elif !defined(ATOMIC_VAR_INIT) -#define MI_ATOMIC_VAR_INIT(x) x + #define MI_ATOMIC_VAR_INIT(x) x #else #define MI_ATOMIC_VAR_INIT(x) ATOMIC_VAR_INIT(x) #endif @@ -337,6 +337,7 @@ typedef _Atomic(uintptr_t) mi_atomic_guard_t; // ---------------------------------------------------------------------- // Yield // ---------------------------------------------------------------------- + #if defined(__cplusplus) #include static inline void mi_atomic_yield(void) { @@ -401,59 +402,73 @@ static inline void mi_atomic_yield(void) { // ---------------------------------------------------------------------- -// Locks are only used for abandoned segment visiting +// Locks are only used for abandoned segment visiting in `arena.c` // ---------------------------------------------------------------------- + #if defined(_WIN32) -#define mi_lock_t CRITICAL_SECTION +#define mi_lock_t CRITICAL_SECTION -static inline bool _mi_prim_lock(mi_lock_t* lock) { +static inline bool 
mi_lock_try_acquire(mi_lock_t* lock) { + return TryEnterCriticalSection(lock); +} +static inline bool mi_lock_acquire(mi_lock_t* lock) { EnterCriticalSection(lock); return true; } - -static inline bool _mi_prim_try_lock(mi_lock_t* lock) { - return TryEnterCriticalSection(lock); -} - -static inline void _mi_prim_unlock(mi_lock_t* lock) { +static inline void mi_lock_release(mi_lock_t* lock) { LeaveCriticalSection(lock); } +static inline void mi_lock_init(mi_lock_t* lock) { + InitializeCriticalSection(lock); +} +static inline void mi_lock_done(mi_lock_t* lock) { + DeleteCriticalSection(lock); +} #elif defined(MI_USE_PTHREADS) #define mi_lock_t pthread_mutex_t -static inline bool _mi_prim_lock(mi_lock_t* lock) { - return (pthread_mutex_lock(lock) == 0); -} - -static inline bool _mi_prim_try_lock(mi_lock_t* lock) { +static inline bool mi_lock_try_acquire(mi_lock_t* lock) { return (pthread_mutex_trylock(lock) == 0); } - -static inline void _mi_prim_unlock(mi_lock_t* lock) { +static inline bool mi_lock_acquire(mi_lock_t* lock) { + return (pthread_mutex_lock(lock) == 0); +} +static inline void mi_lock_release(mi_lock_t* lock) { pthread_mutex_unlock(lock); } +static inline void mi_lock_init(mi_lock_t* lock) { + (void)(lock); +} +static inline void mi_lock_done(mi_lock_t* lock) { + (void)(lock); +} + #elif defined(__cplusplus) #include #define mi_lock_t std::mutex -static inline bool _mi_prim_lock(mi_lock_t* lock) { +static inline bool mi_lock_try_acquire(mi_lock_t* lock) { + return lock->lock_try_acquire(); +} +static inline bool mi_lock_acquire(mi_lock_t* lock) { lock->lock(); return true; } - -static inline bool _mi_prim_try_lock(mi_lock_t* lock) { - return (lock->try_lock(); -} - -static inline void _mi_prim_unlock(mi_lock_t* lock) { +static inline void mi_lock_release(mi_lock_t* lock) { lock->unlock(); } +static inline void mi_lock_init(mi_lock_t* lock) { + (void)(lock); +} +static inline void mi_lock_done(mi_lock_t* lock) { + (void)(lock); +} #else @@ -462,22 +477,26 
@@ static inline void _mi_prim_unlock(mi_lock_t* lock) { #define mi_lock_t _Atomic(uintptr_t) -static inline bool _mi_prim_try_lock(mi_lock_t* lock) { +static inline bool mi_lock_try_acquire(mi_lock_t* lock) { uintptr_t expected = 0; return mi_atomic_cas_strong_acq_rel(lock, &expected, (uintptr_t)1); } - -static inline bool _mi_prim_lock(mi_lock_t* lock) { +static inline bool mi_lock_acquire(mi_lock_t* lock) { for (int i = 0; i < 1000; i++) { // for at most 1000 tries? - if (_mi_prim_try_lock(lock)) return true; + if (mi_lock_try_acquire(lock)) return true; mi_atomic_yield(); } return true; } - -static inline void _mi_prim_unlock(mi_lock_t* lock) { +static inline void mi_lock_release(mi_lock_t* lock) { mi_atomic_store_release(lock, (uintptr_t)0); } +static inline void mi_lock_init(mi_lock_t* lock) { + mi_lock_release(lock); +} +static inline void mi_lock_done(mi_lock_t* lock) { + (void)(lock); +} #endif diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 9046e3ad..89f04103 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -79,11 +79,12 @@ extern mi_decl_cache_align const mi_page_t _mi_page_empty; bool _mi_is_main_thread(void); size_t _mi_current_thread_count(void); bool _mi_preloading(void); // true while the C runtime is not initialized yet -mi_threadid_t _mi_thread_id(void) mi_attr_noexcept; -mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing heap void _mi_thread_done(mi_heap_t* heap); void _mi_thread_data_collect(void); void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap); +mi_threadid_t _mi_thread_id(void) mi_attr_noexcept; +mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing heap +mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id); // os.c void _mi_os_init(void); // called from process init @@ -136,7 +137,7 @@ typedef struct mi_arena_field_cursor_s { // abstract struct mi_subproc_t* subproc; } mi_arena_field_cursor_t; void 
_mi_arena_field_cursor_init(mi_heap_t* heap, mi_subproc_t* subproc, mi_arena_field_cursor_t* current); -mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous); +mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous, bool visit_all); // "segment-map.c" void _mi_segment_map_allocated_at(const mi_segment_t* segment); @@ -158,6 +159,7 @@ void _mi_segments_collect(bool force, mi_segments_tld_t* tld); void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld); void _mi_abandoned_await_readers(void); bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment); +bool _mi_segment_visit_blocks(mi_segment_t* segment, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg); // "page.c" void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept mi_attr_malloc; @@ -189,6 +191,8 @@ void _mi_heap_set_default_direct(mi_heap_t* heap); bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid); void _mi_heap_unsafe_destroy_all(void); mi_heap_t* _mi_heap_by_tag(mi_heap_t* heap, uint8_t tag); +void _mi_heap_area_init(mi_heap_area_t* area, mi_page_t* page); +bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_block_visit_fun* visitor, void* arg); // "stats.c" void _mi_stats_done(mi_stats_t* stats); diff --git a/src/arena.c b/src/arena.c index aeadd604..59514950 100644 --- a/src/arena.c +++ b/src/arena.c @@ -40,23 +40,24 @@ typedef uintptr_t mi_block_info_t; // A memory arena descriptor typedef struct mi_arena_s { - mi_arena_id_t id; // arena id; 0 for non-specific - mi_memid_t memid; // memid of the memory area - _Atomic(uint8_t*) start; // the start of the memory area - size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`) - size_t field_count; // number of bitmap fields (where `field_count * MI_BITMAP_FIELD_BITS >= block_count`) - size_t meta_size; // size of the 
arena structure itself (including its bitmaps) - mi_memid_t meta_memid; // memid of the arena structure itself (OS or static allocation) - int numa_node; // associated NUMA node - bool exclusive; // only allow allocations if specifically for this arena - bool is_large; // memory area consists of large- or huge OS pages (always committed) - _Atomic(size_t) search_idx; // optimization to start the search for free blocks - _Atomic(mi_msecs_t) purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`. - mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero? - mi_bitmap_field_t* blocks_committed; // are the blocks committed? (can be NULL for memory that cannot be decommitted) - mi_bitmap_field_t* blocks_purge; // blocks that can be (reset) decommitted. (can be NULL for memory that cannot be (reset) decommitted) - mi_bitmap_field_t* blocks_abandoned; // blocks that start with an abandoned segment. (This crosses API's but it is convenient to have here) - mi_bitmap_field_t blocks_inuse[1]; // in-place bitmap of in-use blocks (of size `field_count`) + mi_arena_id_t id; // arena id; 0 for non-specific + mi_memid_t memid; // memid of the memory area + _Atomic(uint8_t*) start; // the start of the memory area + size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`) + size_t field_count; // number of bitmap fields (where `field_count * MI_BITMAP_FIELD_BITS >= block_count`) + size_t meta_size; // size of the arena structure itself (including its bitmaps) + mi_memid_t meta_memid; // memid of the arena structure itself (OS or static allocation) + int numa_node; // associated NUMA node + bool exclusive; // only allow allocations if specifically for this arena + bool is_large; // memory area consists of large- or huge OS pages (always committed) + mi_lock_t abandoned_visit_lock; // lock is only used when abandoned segments are being visited + _Atomic(size_t) search_idx; // optimization to start the search 
for free blocks + _Atomic(mi_msecs_t) purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`. + mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero? + mi_bitmap_field_t* blocks_committed; // are the blocks committed? (can be NULL for memory that cannot be decommitted) + mi_bitmap_field_t* blocks_purge; // blocks that can be (reset) decommitted. (can be NULL for memory that cannot be (reset) decommitted) + mi_bitmap_field_t* blocks_abandoned; // blocks that start with an abandoned segment. (This crosses API's but it is convenient to have here) + mi_bitmap_field_t blocks_inuse[1]; // in-place bitmap of in-use blocks (of size `field_count`) // do not add further fields here as the dirty, committed, purged, and abandoned bitmaps follow the inuse bitmap fields. } mi_arena_t; @@ -65,7 +66,6 @@ typedef struct mi_arena_s { static mi_decl_cache_align _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS]; static mi_decl_cache_align _Atomic(size_t) mi_arena_count; // = 0 - //static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept; /* ----------------------------------------------------------- @@ -702,6 +702,7 @@ static void mi_arenas_unsafe_destroy(void) { for (size_t i = 0; i < max_arena; i++) { mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); if (arena != NULL) { + mi_lock_done(&arena->abandoned_visit_lock); if (arena->start != NULL && mi_memkind_is_os(arena->memid.memkind)) { mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); _mi_os_free(arena->start, mi_arena_size(arena), arena->memid, &_mi_stats_main); @@ -813,9 +814,9 @@ void _mi_arena_segment_mark_abandoned(mi_segment_t* segment) // start a cursor at a randomized arena void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_subproc_t* subproc, mi_arena_field_cursor_t* current) { - mi_assert_internal(heap->tld->segments.subproc == 
subproc); + mi_assert_internal(heap == NULL || heap->tld->segments.subproc == subproc); const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); - current->start = (max_arena == 0 ? 0 : (mi_arena_id_t)( _mi_heap_random_next(heap) % max_arena)); + current->start = (heap == NULL || max_arena == 0 ? 0 : (mi_arena_id_t)( _mi_heap_random_next(heap) % max_arena)); current->count = 0; current->bitmap_idx = 0; current->subproc = subproc; @@ -823,7 +824,7 @@ void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_subproc_t* subproc, mi_aren // reclaim abandoned segments // this does not set the thread id (so it appears as still abandoned) -mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous ) +mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous, bool visit_all ) { const int max_arena = (int)mi_atomic_load_relaxed(&mi_arena_count); if (max_arena <= 0 || mi_atomic_load_relaxed(&previous->subproc->abandoned_count) == 0) return NULL; @@ -831,18 +832,31 @@ mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* pr int count = previous->count; size_t field_idx = mi_bitmap_index_field(previous->bitmap_idx); size_t bit_idx = mi_bitmap_index_bit_in_field(previous->bitmap_idx) + 1; - // visit arena's (from previous) + // visit arena's (from the previous cursor) for (; count < max_arena; count++, field_idx = 0, bit_idx = 0) { mi_arena_id_t arena_idx = previous->start + count; if (arena_idx >= max_arena) { arena_idx = arena_idx % max_arena; } // wrap around mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); if (arena != NULL) { + bool has_lock = false; // visit the abandoned fields (starting at previous_idx) - for ( ; field_idx < arena->field_count; field_idx++, bit_idx = 0) { + for (; field_idx < arena->field_count; field_idx++, bit_idx = 0) { size_t field = mi_atomic_load_relaxed(&arena->blocks_abandoned[field_idx]); if mi_unlikely(field != 0) { // skip 
zero fields quickly + // we only take the arena lock if there are actually abandoned segments present + if (!has_lock && mi_option_is_enabled(mi_option_visit_abandoned)) { + has_lock = (visit_all ? mi_lock_acquire(&arena->abandoned_visit_lock) : mi_lock_try_acquire(&arena->abandoned_visit_lock)); + if (!has_lock) { + if (visit_all) { + _mi_error_message(EINVAL, "failed to visit all abandoned segments due to failure to acquire the visitor lock"); + } + // skip to next arena + break; + } + } + mi_assert_internal(has_lock || !mi_option_is_enabled(mi_option_visit_abandoned)); // visit each set bit in the field (todo: maybe use `ctz` here?) - for ( ; bit_idx < MI_BITMAP_FIELD_BITS; bit_idx++) { + for (; bit_idx < MI_BITMAP_FIELD_BITS; bit_idx++) { // pre-check if the bit is set size_t mask = ((size_t)1 << bit_idx); if mi_unlikely((field & mask) == mask) { @@ -852,7 +866,10 @@ mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* pr mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); mi_segment_t* segment = (mi_segment_t*)mi_arena_block_start(arena, bitmap_idx); mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0); - // check that belongs to our sub-process + // check that the segment belongs to our sub-process + // note: this is the reason we need a lock in the case abandoned visiting is enabled. + // without the lock an abandoned visit may otherwise fail to visit all segments. + // for regular reclaim it is fine to miss one sometimes so without abandoned visiting we don't need the arena lock. 
if (segment->subproc != previous->subproc) { // it is from another subprocess, re-mark it and continue searching const bool was_zero = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL); @@ -865,6 +882,7 @@ mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* pr previous->count = count; //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); + if (has_lock) { mi_lock_release(&arena->abandoned_visit_lock); } return segment; } } @@ -872,6 +890,7 @@ mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* pr } } } + if (has_lock) { mi_lock_release(&arena->abandoned_visit_lock); } } } // no more found @@ -881,6 +900,29 @@ mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* pr } +static bool mi_arena_visit_abandoned_blocks(mi_subproc_t* subproc, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { + mi_arena_field_cursor_t current; + _mi_arena_field_cursor_init(NULL, subproc, ¤t); + mi_segment_t* segment; + while ((segment = _mi_arena_segment_clear_abandoned_next(¤t, true /* visit all */)) != NULL) { + if (!_mi_segment_visit_blocks(segment, heap_tag, visit_blocks, visitor, arg)) return false; + } + return true; +} + +bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { + // (unfortunately) the visit_abandoned option must be enabled from the start. 
+ // This is to avoid taking locks if abandoned list visiting is not required (as for most programs) + if (!mi_option_is_enabled(mi_option_visit_abandoned)) { + mi_assert(false); + _mi_error_message(EINVAL, "internal error: can only visit abandoned blocks when MIMALLOC_VISIT_ABANDONED=ON"); + return false; + } + // visit abandoned segments in the arena's + return mi_arena_visit_abandoned_blocks(_mi_subproc_from_id(subproc_id), heap_tag, visit_blocks, visitor, arg); +} + + /* ----------------------------------------------------------- Add an arena. ----------------------------------------------------------- */ @@ -934,6 +976,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int arena->is_large = is_large; arena->purge_expire = 0; arena->search_idx = 0; + mi_lock_init(&arena->abandoned_visit_lock); // consecutive bitmaps arena->blocks_dirty = &arena->blocks_inuse[fields]; // just after inuse bitmap arena->blocks_abandoned = &arena->blocks_inuse[2 * fields]; // just after dirty bitmap diff --git a/src/heap.c b/src/heap.c index f6f23549..2cde5fb0 100644 --- a/src/heap.c +++ b/src/heap.c @@ -137,6 +137,7 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) { // the main thread is abandoned (end-of-program), try to reclaim all abandoned segments. // if all memory is freed by now, all segments should be freed. 
+ // note: this only collects in the current subprocess _mi_abandoned_reclaim_all(heap, &heap->tld->segments); } @@ -515,17 +516,21 @@ bool mi_check_owned(const void* p) { enable visiting all blocks of all heaps across threads ----------------------------------------------------------- */ -// Separate struct to keep `mi_page_t` out of the public interface -typedef struct mi_heap_area_ex_s { - mi_heap_area_t area; - mi_page_t* page; -} mi_heap_area_ex_t; +void _mi_heap_area_init(mi_heap_area_t* area, mi_page_t* page) { + const size_t bsize = mi_page_block_size(page); + const size_t ubsize = mi_page_usable_block_size(page); + area->reserved = page->reserved * bsize; + area->committed = page->capacity * bsize; + area->blocks = mi_page_start(page); + area->used = page->used; // number of blocks in use (#553) + area->block_size = ubsize; + area->full_block_size = bsize; +} -static bool mi_heap_area_visit_blocks(const mi_heap_area_ex_t* xarea, mi_block_visit_fun* visitor, void* arg) { - mi_assert(xarea != NULL); - if (xarea==NULL) return true; - const mi_heap_area_t* area = &xarea->area; - mi_page_t* page = xarea->page; + +bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_block_visit_fun* visitor, void* arg) { + mi_assert(area != NULL); + if (area==NULL) return true; mi_assert(page != NULL); if (page == NULL) return true; @@ -590,23 +595,23 @@ static bool mi_heap_area_visit_blocks(const mi_heap_area_ex_t* xarea, mi_block_v return true; } -typedef bool (mi_heap_area_visit_fun)(const mi_heap_t* heap, const mi_heap_area_ex_t* area, void* arg); +// Separate struct to keep `mi_page_t` out of the public interface +typedef struct mi_heap_area_ex_s { + mi_heap_area_t area; + mi_page_t* page; +} mi_heap_area_ex_t; + +typedef bool (mi_heap_area_visit_fun)(const mi_heap_t* heap, const mi_heap_area_ex_t* area, void* arg); + static bool mi_heap_visit_areas_page(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* vfun, void* arg) { 
MI_UNUSED(heap); MI_UNUSED(pq); mi_heap_area_visit_fun* fun = (mi_heap_area_visit_fun*)vfun; mi_heap_area_ex_t xarea; - const size_t bsize = mi_page_block_size(page); - const size_t ubsize = mi_page_usable_block_size(page); xarea.page = page; - xarea.area.reserved = page->reserved * bsize; - xarea.area.committed = page->capacity * bsize; - xarea.area.blocks = mi_page_start(page); - xarea.area.used = page->used; // number of blocks in use (#553) - xarea.area.block_size = ubsize; - xarea.area.full_block_size = bsize; + _mi_heap_area_init(&xarea.area, page); return fun(heap, &xarea, arg); } @@ -627,7 +632,7 @@ static bool mi_heap_area_visitor(const mi_heap_t* heap, const mi_heap_area_ex_t* mi_visit_blocks_args_t* args = (mi_visit_blocks_args_t*)arg; if (!args->visitor(heap, &xarea->area, NULL, xarea->area.block_size, args->arg)) return false; if (args->visit_blocks) { - return mi_heap_area_visit_blocks(xarea, args->visitor, args->arg); + return _mi_heap_area_visit_blocks(&xarea->area, xarea->page, args->visitor, args->arg); } else { return true; diff --git a/src/init.c b/src/init.c index 01625891..be8c16de 100644 --- a/src/init.c +++ b/src/init.c @@ -185,22 +185,30 @@ mi_heap_t* _mi_heap_main_get(void) { Sub process ----------------------------------------------------------- */ +static mi_decl_cache_align _Atomic(uintptr_t) mi_subproc_count; + +mi_subproc_id_t mi_subproc_main(void) { + return NULL; +} + mi_subproc_id_t mi_subproc_new(void) { mi_memid_t memid = _mi_memid_none(); mi_subproc_t* subproc = (mi_subproc_t*)_mi_arena_meta_zalloc(sizeof(mi_subproc_t), &memid); if (subproc == NULL) return NULL; + mi_atomic_increment_relaxed(&mi_subproc_count); subproc->memid = memid; return subproc; } -static mi_subproc_t* mi_subproc_from_id(mi_subproc_id_t subproc_id) { +mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id) { return (subproc_id == NULL ? 
&mi_subproc_default : (mi_subproc_t*)subproc_id); } void mi_subproc_delete(mi_subproc_id_t subproc_id) { if (subproc_id == NULL) return; - mi_subproc_t* subproc = mi_subproc_from_id(subproc_id); + mi_subproc_t* subproc = _mi_subproc_from_id(subproc_id); _mi_arena_meta_free(subproc, subproc->memid, sizeof(mi_subproc_t)); + mi_atomic_decrement_relaxed(&mi_subproc_count); } void mi_subproc_add_current_thread(mi_subproc_id_t subproc_id) { @@ -208,7 +216,7 @@ void mi_subproc_add_current_thread(mi_subproc_id_t subproc_id) { if (heap == NULL) return; mi_assert(heap->tld->segments.subproc == &mi_subproc_default); if (heap->tld->segments.subproc != &mi_subproc_default) return; - heap->tld->segments.subproc = mi_subproc_from_id(subproc_id); + heap->tld->segments.subproc = _mi_subproc_from_id(subproc_id); } diff --git a/src/options.c b/src/options.c index db6e040f..32fa212b 100644 --- a/src/options.c +++ b/src/options.c @@ -94,6 +94,11 @@ static mi_option_desc_t options[_mi_option_last] = { 1, UNINIT, MI_OPTION(abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free { 0, UNINIT, MI_OPTION(disallow_arena_alloc) }, // 1 = do not use arena's for allocation (except if using specific arena id's) { 400, UNINIT, MI_OPTION(retry_on_oom) }, // windows only: retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. +#if defined(MI_VISIT_ABANDONED) + { 1, INITIALIZED, MI_OPTION(visit_abandoned) }, // allow visiting heap blocks in abandonded segments; requires taking locks during reclaim. 
+#else + { 0, UNINIT, MI_OPTION(visit_abandoned) }, +#endif }; static void mi_option_init(mi_option_desc_t* desc); diff --git a/src/segment.c b/src/segment.c index 205d8753..dc82b89d 100644 --- a/src/segment.c +++ b/src/segment.c @@ -962,7 +962,7 @@ bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment) { void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld) { mi_segment_t* segment; mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap, tld->subproc, ¤t); - while ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL) { + while ((segment = _mi_arena_segment_clear_abandoned_next(¤t, true /* blocking */)) != NULL) { mi_segment_reclaim(segment, heap, 0, NULL, tld); } } @@ -987,7 +987,7 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size, mi_segment_t* segment; mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap, tld->subproc, ¤t); - while ((max_tries-- > 0) && ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL)) + while ((max_tries-- > 0) && ((segment = _mi_arena_segment_clear_abandoned_next(¤t, false /* non-blocking */)) != NULL)) { mi_assert(segment->subproc == heap->tld->segments.subproc); // cursor only visits segments in our sub-process segment->abandoned_visits++; @@ -1240,3 +1240,32 @@ mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t pag mi_assert_internal(page == NULL || _mi_page_segment(page)->subproc == tld->subproc); return page; } + + +/* ----------------------------------------------------------- + Visit blocks in a segment (only used for abandoned segments) +----------------------------------------------------------- */ + +static bool mi_segment_visit_page(mi_page_t* page, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { + mi_heap_area_t area; + _mi_heap_area_init(&area, page); + if (!visitor(NULL, &area, NULL, area.block_size, arg)) return false; + if (visit_blocks) { + return 
_mi_heap_area_visit_blocks(&area, page, visitor, arg); + } + else { + return true; + } +} + +bool _mi_segment_visit_blocks(mi_segment_t* segment, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { + for (size_t i = 0; i < segment->capacity; i++) { + mi_page_t* const page = &segment->pages[i]; + if (page->segment_in_use) { + if (heap_tag < 0 || (int)page->heap_tag == heap_tag) { + if (!mi_segment_visit_page(page, visit_blocks, visitor, arg)) return false; + } + } + } + return true; +} From 855e3b2549e0f2aa0277e43c4eeb8b1cbe1ea497 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 2 Jun 2024 09:41:12 -0700 Subject: [PATCH 08/18] add support to visit _all_ abandoned segment blocks per sub-process, upstream for python/cpython#114133 --- include/mimalloc/types.h | 7 +- src/arena.c | 138 ++++++++++++++++++++++++++++----------- src/init.c | 21 ++++-- 3 files changed, 121 insertions(+), 45 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index f4ba6739..2506d454 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -404,7 +404,7 @@ typedef struct mi_segment_s { bool was_reclaimed; // true if it was reclaimed (used to limit on-free reclamation) size_t abandoned; // abandoned pages (i.e. 
the original owning thread stopped) (`abandoned <= used`) - size_t abandoned_visits; // count how often this segment is visited in the abandoned list (to force reclaim if it is too long) + size_t abandoned_visits; // count how often this segment is visited for reclaiming (to force reclaim if it is too long) size_t used; // count of pages in use (`used <= capacity`) size_t capacity; // count of available pages (`#free + used`) @@ -412,6 +412,9 @@ typedef struct mi_segment_s { uintptr_t cookie; // verify addresses in secure mode: `_mi_ptr_cookie(segment) == segment->cookie` mi_subproc_t* subproc; // segment belongs to sub process + struct mi_segment_s* abandoned_os_next; // only used for abandoned segments outside arena's, and only if `mi_option_visit_abandoned` is enabled + struct mi_segment_s* abandoned_os_prev; + // layout like this to optimize access in `mi_free` _Atomic(mi_threadid_t) thread_id; // unique id of the thread owning this segment size_t page_shift; // `1 << page_shift` == the page sizes == `page->block_size * page->reserved` (unless the first page, then `-segment_info_size`). @@ -609,6 +612,8 @@ void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); struct mi_subproc_s { _Atomic(size_t) abandoned_count; // count of abandoned segments for this sup-process + mi_lock_t abandoned_os_lock; // lock for the abandoned segments outside of arena's + mi_segment_t* abandoned_os_list; // doubly-linked list of abandoned segments outside of arena's (in OS allocated memory) mi_memid_t memid; // provenance }; diff --git a/src/arena.c b/src/arena.c index 59514950..913a02a9 100644 --- a/src/arena.c +++ b/src/arena.c @@ -757,17 +757,34 @@ bool _mi_arena_contains(const void* p) { // sets the thread_id. bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment ) { - if (segment->memid.memkind != MI_MEM_ARENA) { - // not in an arena, consider it un-abandoned now. - // but we need to still claim it atomically -- we use the thread_id for that. 
+ if mi_unlikely(segment->memid.memkind != MI_MEM_ARENA) { + // not in an arena + // if abandoned visiting is allowed, we need to take a lock on the abandoned os list + bool has_lock = false; + if (mi_option_is_enabled(mi_option_visit_abandoned)) { + has_lock = mi_lock_try_acquire(&segment->subproc->abandoned_os_lock); + if (!has_lock) { + return false; // failed to acquire the lock, we just give up + } + } + // abandon it, but we need to still claim it atomically -- we use the thread_id for that. + bool reclaimed = false; size_t expected = 0; if (mi_atomic_cas_strong_acq_rel(&segment->thread_id, &expected, _mi_thread_id())) { + // reclaim mi_atomic_decrement_relaxed(&segment->subproc->abandoned_count); - return true; - } - else { - return false; + reclaimed = true; + // and remove from the abandoned os list (if needed) + mi_segment_t* const next = segment->abandoned_os_next; + mi_segment_t* const prev = segment->abandoned_os_prev; + if (prev != NULL) { prev->abandoned_os_next = next; } + else { segment->subproc->abandoned_os_list = next; } + if (next != NULL) { next->abandoned_os_prev = prev; } + segment->abandoned_os_next = NULL; + segment->abandoned_os_prev = NULL; } + if (has_lock) { mi_lock_release(&segment->subproc->abandoned_os_lock); } + return reclaimed; } // arena segment: use the blocks_abandoned bitmap. 
size_t arena_idx; @@ -794,12 +811,30 @@ void _mi_arena_segment_mark_abandoned(mi_segment_t* segment) { mi_atomic_store_release(&segment->thread_id, 0); mi_assert_internal(segment->used == segment->abandoned); - if (segment->memid.memkind != MI_MEM_ARENA) { - // not in an arena; count it as abandoned and return + if mi_unlikely(segment->memid.memkind != MI_MEM_ARENA) { + // not in an arena; count it as abandoned and return (these can be reclaimed on a `free`) mi_atomic_increment_relaxed(&segment->subproc->abandoned_count); + // if abandoned visiting is allowed, we need to take a lock on the abandoned os list to insert it + if (mi_option_is_enabled(mi_option_visit_abandoned)) { + if (!mi_lock_acquire(&segment->subproc->abandoned_os_lock)) { + _mi_error_message(EFAULT, "internal error: failed to acquire the abandoned (os) segment lock to mark abandonment"); + } + else { + // push on the front of the list + mi_segment_t* next = segment->subproc->abandoned_os_list; + mi_assert_internal(next == NULL || next->abandoned_os_prev == NULL); + mi_assert_internal(segment->abandoned_os_prev == NULL); + mi_assert_internal(segment->abandoned_os_next == NULL); + if (next != NULL) { next->abandoned_os_prev = segment; } + segment->abandoned_os_prev = NULL; + segment->abandoned_os_next = next; + segment->subproc->abandoned_os_list = segment; + mi_lock_release(&segment->subproc->abandoned_os_lock); + } + } return; } - // segment is in an arena + // segment is in an arena, mark it in the arena `blocks_abandoned` bitmap size_t arena_idx; size_t bitmap_idx; mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx); @@ -822,6 +857,29 @@ void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_subproc_t* subproc, mi_aren current->subproc = subproc; } +static mi_segment_t* mi_arena_segment_clear_abandoned_at(mi_arena_t* arena, mi_subproc_t* subproc, mi_bitmap_index_t bitmap_idx) { + // try to reclaim an abandoned segment in the arena atomically + if 
(!_mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx)) return NULL; + mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); + mi_segment_t* segment = (mi_segment_t*)mi_arena_block_start(arena, bitmap_idx); + mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0); + // check that the segment belongs to our sub-process + // note: this is the reason we need a lock in the case abandoned visiting is enabled. + // without the lock an abandoned visit may otherwise fail to visit all segments. + // for regular reclaim it is fine to miss one sometimes so without abandoned visiting we don't need the arena lock. + if (segment->subproc != subproc) { + // it is from another subprocess, re-mark it and continue searching + const bool was_zero = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL); + mi_assert_internal(was_zero); MI_UNUSED(was_zero); + return NULL; + } + else { + // success, we unabandoned a segment in our sub-process + mi_atomic_decrement_relaxed(&subproc->abandoned_count); + return segment; + } +} + // reclaim abandoned segments // this does not set the thread id (so it appears as still abandoned) mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous, bool visit_all ) @@ -848,7 +906,7 @@ mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* pr has_lock = (visit_all ? 
mi_lock_acquire(&arena->abandoned_visit_lock) : mi_lock_try_acquire(&arena->abandoned_visit_lock)); if (!has_lock) { if (visit_all) { - _mi_error_message(EINVAL, "failed to visit all abandoned segments due to failure to acquire the visitor lock"); + _mi_error_message(EFAULT, "internal error: failed to visit all abandoned segments due to failure to acquire the visitor lock"); } // skip to next arena break; @@ -860,31 +918,14 @@ mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* pr // pre-check if the bit is set size_t mask = ((size_t)1 << bit_idx); if mi_unlikely((field & mask) == mask) { - mi_bitmap_index_t bitmap_idx = mi_bitmap_index_create(field_idx, bit_idx); - // try to reclaim it atomically - if (_mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx)) { - mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); - mi_segment_t* segment = (mi_segment_t*)mi_arena_block_start(arena, bitmap_idx); - mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0); - // check that the segment belongs to our sub-process - // note: this is the reason we need a lock in the case abandoned visiting is enabled. - // without the lock an abandoned visit may otherwise fail to visit all segments. - // for regular reclaim it is fine to miss one sometimes so without abandoned visiting we don't need the arena lock. 
- if (segment->subproc != previous->subproc) { - // it is from another subprocess, re-mark it and continue searching - const bool was_zero = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL); - mi_assert_internal(was_zero); - } - else { - // success, we unabandoned a segment in our sub-process - mi_atomic_decrement_relaxed(&previous->subproc->abandoned_count); - previous->bitmap_idx = bitmap_idx; - previous->count = count; - - //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); - if (has_lock) { mi_lock_release(&arena->abandoned_visit_lock); } - return segment; - } + const mi_bitmap_index_t bitmap_idx = mi_bitmap_index_create(field_idx, bit_idx); + mi_segment_t* const segment = mi_arena_segment_clear_abandoned_at(arena, previous->subproc, bitmap_idx); + if (segment != NULL) { + previous->bitmap_idx = bitmap_idx; + previous->count = count; + //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); + if (has_lock) { mi_lock_release(&arena->abandoned_visit_lock); } + return segment; } } } @@ -910,16 +951,35 @@ static bool mi_arena_visit_abandoned_blocks(mi_subproc_t* subproc, int heap_tag, return true; } +static bool mi_subproc_visit_abandoned_os_blocks(mi_subproc_t* subproc, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { + if (!mi_lock_acquire(&subproc->abandoned_os_lock)) { + _mi_error_message(EFAULT, "internal error: failed to acquire abandoned (OS) segment lock"); + return false; + } + bool all_visited = true; + for (mi_segment_t* segment = subproc->abandoned_os_list; segment != NULL; segment = segment->abandoned_os_next) { + if (!_mi_segment_visit_blocks(segment, heap_tag, visit_blocks, visitor, arg)) { + all_visited = false; + break; + } + } + mi_lock_release(&subproc->abandoned_os_lock); + return all_visited; +} + bool 
mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { // (unfortunately) the visit_abandoned option must be enabled from the start. // This is to avoid taking locks if abandoned list visiting is not required (as for most programs) if (!mi_option_is_enabled(mi_option_visit_abandoned)) { - mi_assert(false); - _mi_error_message(EINVAL, "internal error: can only visit abandoned blocks when MIMALLOC_VISIT_ABANDONED=ON"); + _mi_error_message(EFAULT, "internal error: can only visit abandoned blocks when MIMALLOC_VISIT_ABANDONED=ON"); return false; } + mi_subproc_t* const subproc = _mi_subproc_from_id(subproc_id); // visit abandoned segments in the arena's - return mi_arena_visit_abandoned_blocks(_mi_subproc_from_id(subproc_id), heap_tag, visit_blocks, visitor, arg); + if (!mi_arena_visit_abandoned_blocks(subproc, heap_tag, visit_blocks, visitor, arg)) return false; + // and visit abandoned segments outside arena's (in OS allocated memory) + if (!mi_subproc_visit_abandoned_os_blocks(subproc, heap_tag, visit_blocks, visitor, arg)) return false; + return true; } diff --git a/src/init.c b/src/init.c index be8c16de..f2d99d9e 100644 --- a/src/init.c +++ b/src/init.c @@ -171,7 +171,8 @@ static void mi_heap_main_init(void) { #endif _mi_heap_main.cookie = _mi_heap_random_next(&_mi_heap_main); _mi_heap_main.keys[0] = _mi_heap_random_next(&_mi_heap_main); - _mi_heap_main.keys[1] = _mi_heap_random_next(&_mi_heap_main); + _mi_heap_main.keys[1] = _mi_heap_random_next(&_mi_heap_main); + mi_lock_init(&mi_subproc_default.abandoned_os_lock); } } @@ -185,8 +186,6 @@ mi_heap_t* _mi_heap_main_get(void) { Sub process ----------------------------------------------------------- */ -static mi_decl_cache_align _Atomic(uintptr_t) mi_subproc_count; - mi_subproc_id_t mi_subproc_main(void) { return NULL; } @@ -195,8 +194,9 @@ mi_subproc_id_t mi_subproc_new(void) { mi_memid_t memid = _mi_memid_none(); mi_subproc_t* subproc = 
(mi_subproc_t*)_mi_arena_meta_zalloc(sizeof(mi_subproc_t), &memid); if (subproc == NULL) return NULL; - mi_atomic_increment_relaxed(&mi_subproc_count); subproc->memid = memid; + subproc->abandoned_os_list = NULL; + mi_lock_init(&subproc->abandoned_os_lock); return subproc; } @@ -207,8 +207,19 @@ mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id) { void mi_subproc_delete(mi_subproc_id_t subproc_id) { if (subproc_id == NULL) return; mi_subproc_t* subproc = _mi_subproc_from_id(subproc_id); + // check if there are no abandoned segments still.. + bool safe_to_delete = false; + if (mi_lock_acquire(&subproc->abandoned_os_lock)) { + if (subproc->abandoned_os_list == NULL) { + safe_to_delete = true; + } + mi_lock_release(&subproc->abandoned_os_lock); + } + if (!safe_to_delete) return; + // safe to release + // todo: should we refcount subprocesses? + mi_lock_done(&subproc->abandoned_os_lock); _mi_arena_meta_free(subproc, subproc->memid, sizeof(mi_subproc_t)); - mi_atomic_decrement_relaxed(&mi_subproc_count); } void mi_subproc_add_current_thread(mi_subproc_id_t subproc_id) { From f7fe5bf20ea8a88f8a55f58549e21dfeadc5dc1f Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 2 Jun 2024 10:28:30 -0700 Subject: [PATCH 09/18] optimize heap walks, by Sam Gross, upstream of python/cpython#114133 --- src/heap.c | 98 ++++++++++++++++++++++++++++++++++------------ test/test-stress.c | 14 +++++++ 2 files changed, 87 insertions(+), 25 deletions(-) diff --git a/src/heap.c b/src/heap.c index 2cde5fb0..be2800c1 100644 --- a/src/heap.c +++ b/src/heap.c @@ -528,46 +528,83 @@ void _mi_heap_area_init(mi_heap_area_t* area, mi_page_t* page) { } +static void mi_get_fast_divisor(size_t divisor, uint64_t* magic, size_t* shift) { + mi_assert_internal(divisor > 0 && divisor <= UINT32_MAX); + *shift = 64 - mi_clz(divisor - 1); + *magic = ((((uint64_t)1 << 32) * (((uint64_t)1 << *shift) - divisor)) / divisor + 1); +} + +static size_t mi_fast_divide(size_t n, uint64_t magic, size_t shift) { + 
mi_assert_internal(n <= UINT32_MAX); + return ((((uint64_t)n * magic) >> 32) + n) >> shift; +} + bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_block_visit_fun* visitor, void* arg) { mi_assert(area != NULL); if (area==NULL) return true; mi_assert(page != NULL); if (page == NULL) return true; - _mi_page_free_collect(page,true); + _mi_page_free_collect(page,true); // collect both thread_delayed and local_free mi_assert_internal(page->local_free == NULL); if (page->used == 0) return true; - const size_t bsize = mi_page_block_size(page); - const size_t ubsize = mi_page_usable_block_size(page); // without padding - size_t psize; - uint8_t* pstart = _mi_segment_page_start(_mi_page_segment(page), page, &psize); + size_t psize; + uint8_t* const pstart = _mi_segment_page_start(_mi_page_segment(page), page, &psize); + mi_heap_t* const heap = mi_page_heap(page); + const size_t bsize = mi_page_block_size(page); + const size_t ubsize = mi_page_usable_block_size(page); // without padding + // optimize page with one block if (page->capacity == 1) { - // optimize page with one block mi_assert_internal(page->used == 1 && page->free == NULL); return visitor(mi_page_heap(page), area, pstart, ubsize, arg); } + mi_assert(bsize <= UINT32_MAX); + + // optimize full pages + if (page->used == page->capacity) { + uint8_t* block = pstart; + for (size_t i = 0; i < page->capacity; i++) { + if (!visitor(heap, area, block, ubsize, arg)) return false; + block += bsize; + } + return true; + } // create a bitmap of free blocks. 
#define MI_MAX_BLOCKS (MI_SMALL_PAGE_SIZE / sizeof(void*)) - uintptr_t free_map[MI_MAX_BLOCKS / sizeof(uintptr_t)]; - memset(free_map, 0, sizeof(free_map)); + uintptr_t free_map[MI_MAX_BLOCKS / MI_INTPTR_BITS]; + const uintptr_t bmapsize = _mi_divide_up(page->capacity, MI_INTPTR_BITS); + memset(free_map, 0, bmapsize * sizeof(intptr_t)); + if (page->capacity % MI_INTPTR_BITS != 0) { + // mark left-over bits at the end as free + size_t shift = (page->capacity % MI_INTPTR_BITS); + uintptr_t mask = (UINTPTR_MAX << shift); + free_map[bmapsize - 1] = mask; + } + + // fast repeated division by the block size + uint64_t magic; + size_t shift; + mi_get_fast_divisor(bsize, &magic, &shift); #if MI_DEBUG>1 size_t free_count = 0; #endif - for (mi_block_t* block = page->free; block != NULL; block = mi_block_next(page,block)) { + for (mi_block_t* block = page->free; block != NULL; block = mi_block_next(page, block)) { #if MI_DEBUG>1 free_count++; #endif mi_assert_internal((uint8_t*)block >= pstart && (uint8_t*)block < (pstart + psize)); size_t offset = (uint8_t*)block - pstart; mi_assert_internal(offset % bsize == 0); - size_t blockidx = offset / bsize; // Todo: avoid division? 
- mi_assert_internal( blockidx < MI_MAX_BLOCKS); - size_t bitidx = (blockidx / sizeof(uintptr_t)); - size_t bit = blockidx - (bitidx * sizeof(uintptr_t)); + mi_assert_internal(offset <= UINT32_MAX); + size_t blockidx = mi_fast_divide(offset, magic, shift); + mi_assert_internal(blockidx == offset / bsize); + mi_assert_internal(blockidx < MI_MAX_BLOCKS); + size_t bitidx = (blockidx / MI_INTPTR_BITS); + size_t bit = blockidx - (bitidx * MI_INTPTR_BITS); free_map[bitidx] |= ((uintptr_t)1 << bit); } mi_assert_internal(page->capacity == (free_count + page->used)); @@ -576,19 +613,30 @@ bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_ #if MI_DEBUG>1 size_t used_count = 0; #endif - for (size_t i = 0; i < page->capacity; i++) { - size_t bitidx = (i / sizeof(uintptr_t)); - size_t bit = i - (bitidx * sizeof(uintptr_t)); - uintptr_t m = free_map[bitidx]; - if (bit == 0 && m == UINTPTR_MAX) { - i += (sizeof(uintptr_t) - 1); // skip a run of free blocks + uint8_t* block = pstart; + for (size_t i = 0; i < bmapsize; i++) { + if (free_map[i] == 0) { + // every block is in use + for (size_t j = 0; j < MI_INTPTR_BITS; j++) { + #if MI_DEBUG>1 + used_count++; + #endif + if (!visitor(heap, area, block, ubsize, arg)) return false; + block += bsize; + } } - else if ((m & ((uintptr_t)1 << bit)) == 0) { - #if MI_DEBUG>1 - used_count++; - #endif - uint8_t* block = pstart + (i * bsize); - if (!visitor(mi_page_heap(page), area, block, ubsize, arg)) return false; + else { + // visit the used blocks in the mask + uintptr_t m = ~free_map[i]; + while (m != 0) { + #if MI_DEBUG>1 + used_count++; + #endif + size_t bitidx = mi_ctz(m); + if (!visitor(heap, area, block + (bitidx * bsize), ubsize, arg)) return false; + m &= m - 1; // clear least significant bit + } + block += bsize * MI_INTPTR_BITS; } } mi_assert_internal(page->used == used_count); diff --git a/test/test-stress.c b/test/test-stress.c index 0368007a..f82b9743 100644 --- a/test/test-stress.c +++ 
b/test/test-stress.c @@ -129,6 +129,16 @@ static void free_items(void* p) { custom_free(p); } +/* +static bool visit_blocks(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg) { + (void)(heap); (void)(area); + size_t* total = (size_t*)arg; + if (block != NULL) { + *total += block_size; + } + return true; +} +*/ static void stress(intptr_t tid) { //bench_start_thread(); @@ -173,6 +183,10 @@ static void stress(intptr_t tid) { data[data_idx] = q; } } + // walk the heap + // size_t total = 0; + // mi_heap_visit_blocks(mi_heap_get_default(), true, visit_blocks, &total); + // free everything that is left for (size_t i = 0; i < retain_top; i++) { free_items(retained[i]); From 635d626c82e636e89163da6601dfe1f02a57e4a9 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 2 Jun 2024 10:43:41 -0700 Subject: [PATCH 10/18] fix leak in abandoned block visiting --- src/arena.c | 4 +++- test/test-stress.c | 19 +++++++++++++++---- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/src/arena.c b/src/arena.c index 913a02a9..801475fd 100644 --- a/src/arena.c +++ b/src/arena.c @@ -946,7 +946,9 @@ static bool mi_arena_visit_abandoned_blocks(mi_subproc_t* subproc, int heap_tag, _mi_arena_field_cursor_init(NULL, subproc, &current); mi_segment_t* segment; while ((segment = _mi_arena_segment_clear_abandoned_next(&current, true /* visit all */)) != NULL) { - if (!_mi_segment_visit_blocks(segment, heap_tag, visit_blocks, visitor, arg)) return false; + bool ok = _mi_segment_visit_blocks(segment, heap_tag, visit_blocks, visitor, arg); + _mi_arena_segment_mark_abandoned(segment); + if (!ok) return false; } return true; } diff --git a/test/test-stress.c b/test/test-stress.c index f82b9743..c3afde9b 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -39,6 +39,10 @@ static int ITER = 50; // N full iterations destructing and re-creating a #define STRESS // undefine for leak test +#ifndef NDEBUG +#define HEAP_WALK // walk the heap objects?
+#endif + static bool allow_large_objects = true; // allow very large objects? (set to `true` if SCALE>100) static size_t use_one_size = 0; // use single object size of `N * sizeof(uintptr_t)`? @@ -129,7 +133,7 @@ static void free_items(void* p) { custom_free(p); } -/* +#ifdef HEAP_WALK static bool visit_blocks(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg) { (void)(heap); (void)(area); size_t* total = (size_t*)arg; @@ -138,7 +142,7 @@ static bool visit_blocks(const mi_heap_t* heap, const mi_heap_area_t* area, void } return true; } -*/ +#endif static void stress(intptr_t tid) { //bench_start_thread(); @@ -183,9 +187,12 @@ static void stress(intptr_t tid) { data[data_idx] = q; } } + + #ifdef HEAP_WALK // walk the heap - // size_t total = 0; - // mi_heap_visit_blocks(mi_heap_get_default(), true, visit_blocks, &total); + size_t total = 0; + mi_heap_visit_blocks(mi_heap_get_default(), true, visit_blocks, &total); + #endif // free everything that is left for (size_t i = 0; i < retain_top; i++) { @@ -205,6 +212,10 @@ static void test_stress(void) { uintptr_t r = rand(); for (int n = 0; n < ITER; n++) { run_os_threads(THREADS, &stress); + #ifdef HEAP_WALK + size_t total = 0; + mi_abandoned_visit_blocks(mi_subproc_main(), -1, true, visit_blocks, &total); + #endif for (int i = 0; i < TRANSFERS; i++) { if (chance(50, &r) || n + 1 == ITER) { // free all on last run, otherwise free half of the transfers void* p = atomic_exchange_ptr(&transfer[i], NULL); From 5501f59f6ce044b33149391132c3dd83b964e710 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 2 Jun 2024 13:16:20 -0700 Subject: [PATCH 11/18] only reclaim for exclusive heaps in their associated arena --- include/mimalloc.h | 5 +++++ include/mimalloc/internal.h | 4 ++-- src/arena.c | 35 +++++++++++++++++++++-------------- src/heap.c | 13 +++++++++---- src/segment.c | 3 ++- test/test-stress.c | 6 ++++++ 6 files changed, 45 insertions(+), 21 deletions(-) diff --git 
a/include/mimalloc.h b/include/mimalloc.h index 9fc770cc..0b4b182c 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -300,6 +300,11 @@ mi_decl_export void mi_subproc_add_current_thread(mi_subproc_id_t sub // Experimental: visit abandoned heap areas (from threads that have been terminated) mi_decl_export bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg); +// Experimental: create a new heap with a specified heap tag. Set `allow_destroy` to false to allow the thread +// to reclaim abandoned memory (with a compatible heap_tag and arena_id) but in that case `mi_heap_destroy` will +// fall back to `mi_heap_delete`. +mi_decl_export mi_decl_nodiscard mi_heap_t* mi_heap_new_ex(int heap_tag, bool allow_destroy, mi_arena_id_t arena_id); + // deprecated mi_decl_export int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept; diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 89f04103..0b6cf056 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -131,8 +131,8 @@ void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid); void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size); typedef struct mi_arena_field_cursor_s { // abstract struct - mi_arena_id_t start; - int count; + size_t start; + size_t end; size_t bitmap_idx; mi_subproc_t* subproc; } mi_arena_field_cursor_t; diff --git a/src/arena.c b/src/arena.c index 801475fd..095c5745 100644 --- a/src/arena.c +++ b/src/arena.c @@ -850,11 +850,20 @@ void _mi_arena_segment_mark_abandoned(mi_segment_t* segment) // start a cursor at a randomized arena void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_subproc_t* subproc, mi_arena_field_cursor_t* current) { mi_assert_internal(heap == NULL || heap->tld->segments.subproc == subproc); - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); - current->start = (heap == NULL || 
max_arena == 0 ? 0 : (mi_arena_id_t)( _mi_heap_random_next(heap) % max_arena)); - current->count = 0; - current->bitmap_idx = 0; + current->bitmap_idx = 0; current->subproc = subproc; + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + if (heap != NULL && heap->arena_id != _mi_arena_id_none()) { + // for a heap that is bound to one arena, only visit that arena + current->start = mi_arena_id_index(heap->arena_id); + current->end = current->start + 1; + } + else { + // otherwise visit all starting at a random location + current->start = (heap == NULL || max_arena == 0 ? 0 : (mi_arena_id_t)(_mi_heap_random_next(heap) % max_arena)); + current->end = current->start + max_arena; + } + mi_assert_internal(current->start < max_arena); } static mi_segment_t* mi_arena_segment_clear_abandoned_at(mi_arena_t* arena, mi_subproc_t* subproc, mi_bitmap_index_t bitmap_idx) { @@ -884,16 +893,15 @@ static mi_segment_t* mi_arena_segment_clear_abandoned_at(mi_arena_t* arena, mi_s // this does not set the thread id (so it appears as still abandoned) mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous, bool visit_all ) { - const int max_arena = (int)mi_atomic_load_relaxed(&mi_arena_count); + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); if (max_arena <= 0 || mi_atomic_load_relaxed(&previous->subproc->abandoned_count) == 0) return NULL; - int count = previous->count; size_t field_idx = mi_bitmap_index_field(previous->bitmap_idx); size_t bit_idx = mi_bitmap_index_bit_in_field(previous->bitmap_idx) + 1; // visit arena's (from the previous cursor) - for (; count < max_arena; count++, field_idx = 0, bit_idx = 0) { - mi_arena_id_t arena_idx = previous->start + count; - if (arena_idx >= max_arena) { arena_idx = arena_idx % max_arena; } // wrap around + for ( ; previous->start < previous->end; previous->start++, field_idx = 0, bit_idx = 0) { + // index wraps around + size_t arena_idx = (previous->start >= max_arena ? 
previous->start % max_arena : previous->start); mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); if (arena != NULL) { bool has_lock = false; @@ -918,11 +926,9 @@ mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* pr // pre-check if the bit is set size_t mask = ((size_t)1 << bit_idx); if mi_unlikely((field & mask) == mask) { - const mi_bitmap_index_t bitmap_idx = mi_bitmap_index_create(field_idx, bit_idx); - mi_segment_t* const segment = mi_arena_segment_clear_abandoned_at(arena, previous->subproc, bitmap_idx); + previous->bitmap_idx = mi_bitmap_index_create(field_idx, bit_idx); + mi_segment_t* const segment = mi_arena_segment_clear_abandoned_at(arena, previous->subproc, previous->bitmap_idx); if (segment != NULL) { - previous->bitmap_idx = bitmap_idx; - previous->count = count; //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); if (has_lock) { mi_lock_release(&arena->abandoned_visit_lock); } return segment; @@ -935,8 +941,9 @@ mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* pr } } // no more found + mi_assert(previous->start == previous->end); previous->bitmap_idx = 0; - previous->count = 0; + previous->start = previous->end = 0; return NULL; } diff --git a/src/heap.c b/src/heap.c index be2800c1..0049abc3 100644 --- a/src/heap.c +++ b/src/heap.c @@ -226,17 +226,22 @@ void _mi_heap_init(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id, bool heap->tld->heaps = heap; } -mi_decl_nodiscard mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id) { +mi_decl_nodiscard mi_heap_t* mi_heap_new_ex(int heap_tag, bool allow_destroy, mi_arena_id_t arena_id) { mi_heap_t* bheap = mi_heap_get_backing(); mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t); // todo: OS allocate in secure mode? 
if (heap == NULL) return NULL; - // don't reclaim abandoned pages or otherwise destroy is unsafe - _mi_heap_init(heap, bheap->tld, arena_id, true /* no reclaim */, 0 /* default tag */); + mi_assert(heap_tag >= 0 && heap_tag < 256); + _mi_heap_init(heap, bheap->tld, arena_id, allow_destroy /* no reclaim? */, (uint8_t)heap_tag /* heap tag */); return heap; } +mi_decl_nodiscard mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id) { + return mi_heap_new_ex(0 /* default heap tag */, false /* don't allow `mi_heap_destroy` */, arena_id); +} + mi_decl_nodiscard mi_heap_t* mi_heap_new(void) { - return mi_heap_new_in_arena(_mi_arena_id_none()); + // don't reclaim abandoned memory or otherwise destroy is unsafe + return mi_heap_new_ex(0 /* default heap tag */, true /* no reclaim */, _mi_arena_id_none()); } bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid) { diff --git a/src/segment.c b/src/segment.c index dc82b89d..8fccf18e 100644 --- a/src/segment.c +++ b/src/segment.c @@ -905,7 +905,7 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, mi_heap_t* target_heap = _mi_heap_by_tag(heap, page->heap_tag); // allow custom heaps to separate objects if (target_heap == NULL) { target_heap = heap; - _mi_error_message(EINVAL, "page with tag %u cannot be reclaimed by a heap with the same tag (using tag %u instead)\n", page->heap_tag, heap->tag ); + _mi_error_message(EINVAL, "page with tag %u cannot be reclaimed by a heap with the same tag (using heap tag %u instead)\n", page->heap_tag, heap->tag ); } // associate the heap with this page, and allow heap thread delayed free again. 
mi_page_set_heap(page, target_heap); @@ -948,6 +948,7 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment) { if (mi_atomic_load_relaxed(&segment->thread_id) != 0) return false; // it is not abandoned if (segment->subproc != heap->tld->segments.subproc) return false; // only reclaim within the same subprocess + if (!_mi_heap_memid_is_suitable(heap,segment->memid)) return false; // don't reclaim between exclusive and non-exclusive arena's // don't reclaim more from a `free` call than half the current segments // this is to prevent a pure free-ing thread to start owning too many segments if (heap->tld->segments.reclaim_count * 2 > heap->tld->segments.count) return false; diff --git a/test/test-stress.c b/test/test-stress.c index c3afde9b..599c6c2e 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -255,6 +255,12 @@ static void test_leak(void) { #endif int main(int argc, char** argv) { + #ifdef HEAP_WALK + mi_option_enable(mi_option_visit_abandoned); + #endif + #ifndef NDEBUG + mi_option_set(mi_option_arena_reserve, 32 * 1024 /* in kib = 32MiB */); + #endif #ifndef USE_STD_MALLOC mi_stats_reset(); #endif From a964322a21907206909798771ab90a9ccf27f8d8 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 2 Jun 2024 14:46:59 -0700 Subject: [PATCH 12/18] revise the segment map to only apply to OS allocated segments and reduce the .BSS footprint --- src/arena.c | 4 +- src/os.c | 15 ++-- src/segment-map.c | 166 +++++++++++++++++++-------------------------- test/test-stress.c | 2 +- 4 files changed, 82 insertions(+), 105 deletions(-) diff --git a/src/arena.c b/src/arena.c index 095c5745..24f1299c 100644 --- a/src/arena.c +++ b/src/arena.c @@ -36,7 +36,7 @@ The arena allocation needs to be thread safe and we use an atomic bitmap to allo typedef uintptr_t mi_block_info_t; #define MI_ARENA_BLOCK_SIZE (MI_SEGMENT_SIZE) // 64MiB (must be at least MI_SEGMENT_ALIGN) #define
MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2) // 32MiB -#define MI_MAX_ARENAS (255) // Limited as the reservation exponentially increases (and takes up .bss) +#define MI_MAX_ARENAS (132) // Limited as the reservation exponentially increases (and takes up .bss) // A memory arena descriptor typedef struct mi_arena_s { @@ -735,7 +735,7 @@ void _mi_arena_unsafe_destroy_all(mi_stats_t* stats) { bool _mi_arena_contains(const void* p) { const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); + mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_arena_block_size(arena->block_count) > (const uint8_t*)p) { return true; } diff --git a/src/os.c b/src/os.c index 88e7fcb3..4babd8da 100644 --- a/src/os.c +++ b/src/os.c @@ -157,7 +157,8 @@ static void mi_os_prim_free(void* addr, size_t size, bool still_committed, mi_st _mi_stat_decrease(&stats->reserved, size); } -void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t memid, mi_stats_t* tld_stats) { +void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t memid, mi_stats_t* stats) { + if (stats == NULL) stats = &_mi_stats_main; if (mi_memkind_is_os(memid.memkind)) { size_t csize = _mi_os_good_alloc_size(size); void* base = addr; @@ -171,10 +172,10 @@ void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t me // free it if (memid.memkind == MI_MEM_OS_HUGE) { mi_assert(memid.is_pinned); - mi_os_free_huge_os_pages(base, csize, tld_stats); + mi_os_free_huge_os_pages(base, csize, stats); } else { - mi_os_prim_free(base, csize, still_committed, tld_stats); + mi_os_prim_free(base, csize, still_committed, stats); } } else { @@ -183,8 +184,9 @@ void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t me } } -void _mi_os_free(void* 
p, size_t size, mi_memid_t memid, mi_stats_t* tld_stats) { - _mi_os_free_ex(p, size, true, memid, tld_stats); +void _mi_os_free(void* p, size_t size, mi_memid_t memid, mi_stats_t* stats) { + if (stats == NULL) stats = &_mi_stats_main; + _mi_os_free_ex(p, size, true, memid, stats); } @@ -299,6 +301,7 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* stats) { *memid = _mi_memid_none(); if (size == 0) return NULL; + if (stats == NULL) stats = &_mi_stats_main; size = _mi_os_good_alloc_size(size); bool os_is_large = false; bool os_is_zero = false; @@ -314,6 +317,7 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo MI_UNUSED(&_mi_os_get_aligned_hint); // suppress unused warnings *memid = _mi_memid_none(); if (size == 0) return NULL; + if (stats == NULL) stats = &_mi_stats_main; size = _mi_os_good_alloc_size(size); alignment = _mi_align_up(alignment, _mi_os_page_size()); @@ -342,6 +346,7 @@ void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t offse mi_assert(offset <= size); mi_assert((alignment % _mi_os_page_size()) == 0); *memid = _mi_memid_none(); + if (stats == NULL) stats = &_mi_stats_main; if (offset > MI_SEGMENT_SIZE) return NULL; if (offset == 0) { // regular aligned allocation diff --git a/src/segment-map.c b/src/segment-map.c index 1efb1e23..be461d7e 100644 --- a/src/segment-map.c +++ b/src/segment-map.c @@ -16,140 +16,112 @@ terms of the MIT license. 
A copy of the license can be found in the file #include "mimalloc/internal.h" #include "mimalloc/atomic.h" -#if (MI_INTPTR_SIZE>=8) && MI_TRACK_ASAN -#define MI_MAX_ADDRESS ((size_t)140 << 40) // 140TB (see issue #881) -#elif (MI_INTPTR_SIZE >= 8) -#define MI_MAX_ADDRESS ((size_t)40 << 40) // 40TB (to include huge page areas) +// Reduce total address space to reduce .bss (due to the `mi_segment_map`) +#if (MI_INTPTR_SIZE > 4) && MI_TRACK_ASAN +#define MI_SEGMENT_MAP_MAX_ADDRESS (128*1024ULL*MI_GiB) // 128 TiB (see issue #881) +#elif (MI_INTPTR_SIZE > 4) +#define MI_SEGMENT_MAP_MAX_ADDRESS (48*1024ULL*MI_GiB) // 48 TiB #else -#define MI_MAX_ADDRESS ((size_t)2 << 30) // 2Gb +#define MI_SEGMENT_MAP_MAX_ADDRESS (UINT32_MAX) #endif -#define MI_SEGMENT_MAP_BITS (MI_MAX_ADDRESS / MI_SEGMENT_SIZE) -#define MI_SEGMENT_MAP_SIZE (MI_SEGMENT_MAP_BITS / 8) -#define MI_SEGMENT_MAP_WSIZE (MI_SEGMENT_MAP_SIZE / MI_INTPTR_SIZE) +#define MI_SEGMENT_MAP_PART_SIZE (MI_INTPTR_SIZE*MI_KiB - 128) // 128 > sizeof(mi_memid_t) ! +#define MI_SEGMENT_MAP_PART_BITS (8*MI_SEGMENT_MAP_PART_SIZE) +#define MI_SEGMENT_MAP_PART_ENTRIES (MI_SEGMENT_MAP_PART_SIZE / MI_INTPTR_SIZE) +#define MI_SEGMENT_MAP_PART_BIT_SPAN (MI_SEGMENT_ALIGN) +#define MI_SEGMENT_MAP_PART_SPAN (MI_SEGMENT_MAP_PART_BITS * MI_SEGMENT_MAP_PART_BIT_SPAN) +#define MI_SEGMENT_MAP_MAX_PARTS ((MI_SEGMENT_MAP_MAX_ADDRESS / MI_SEGMENT_MAP_PART_SPAN) + 1) -static _Atomic(uintptr_t) mi_segment_map[MI_SEGMENT_MAP_WSIZE + 1]; // 2KiB per TB with 64MiB segments +// A part of the segment map. +typedef struct mi_segmap_part_s { + mi_memid_t memid; + _Atomic(uintptr_t) map[MI_SEGMENT_MAP_PART_ENTRIES]; +} mi_segmap_part_t; -static size_t mi_segment_map_index_of(const mi_segment_t* segment, size_t* bitidx) { +// Allocate parts on-demand to reduce .bss footprint +_Atomic(mi_segmap_part_t*) mi_segment_map[MI_SEGMENT_MAP_MAX_PARTS]; // = { NULL, .. 
} + + +static mi_segmap_part_t* mi_segment_map_index_of(const mi_segment_t* segment, bool create_on_demand, size_t* idx, size_t* bitidx) { // note: segment can be invalid or NULL. mi_assert_internal(_mi_ptr_segment(segment + 1) == segment); // is it aligned on MI_SEGMENT_SIZE? - if ((uintptr_t)segment >= MI_MAX_ADDRESS) { - *bitidx = 0; - return MI_SEGMENT_MAP_WSIZE; - } - else { - const uintptr_t segindex = ((uintptr_t)segment) / MI_SEGMENT_SIZE; - *bitidx = segindex % MI_INTPTR_BITS; - const size_t mapindex = segindex / MI_INTPTR_BITS; - mi_assert_internal(mapindex < MI_SEGMENT_MAP_WSIZE); - return mapindex; + *idx = 0; + *bitidx = 0; + if ((uintptr_t)segment >= MI_SEGMENT_MAP_MAX_ADDRESS) return NULL; + const uintptr_t segindex = ((uintptr_t)segment) / MI_SEGMENT_MAP_PART_SPAN; + if (segindex >= MI_SEGMENT_MAP_MAX_PARTS) return NULL; + mi_segmap_part_t* part = mi_atomic_load_ptr_relaxed(mi_segmap_part_t*, &mi_segment_map[segindex]); + + // allocate on demand to reduce .bss footprint + if (part == NULL) { + if (!create_on_demand) return NULL; + mi_memid_t memid; + part = (mi_segmap_part_t*)_mi_os_alloc(sizeof(mi_segmap_part_t), &memid, NULL); + if (part == NULL) return NULL; + mi_segmap_part_t* expected = NULL; + if (!mi_atomic_cas_ptr_strong_release(mi_segmap_part_t, &mi_segment_map[segindex], &expected, part)) { + _mi_os_free(part, sizeof(mi_segmap_part_t), memid, NULL); + part = expected; + if (part == NULL) return NULL; + } } + mi_assert(part != NULL); + const uintptr_t offset = ((uintptr_t)segment) % MI_SEGMENT_MAP_PART_SPAN; + const uintptr_t bitofs = offset / MI_SEGMENT_MAP_PART_BIT_SPAN; + *idx = bitofs / MI_INTPTR_BITS; + *bitidx = bitofs % MI_INTPTR_BITS; + return part; } void _mi_segment_map_allocated_at(const mi_segment_t* segment) { + if (segment->memid.memkind == MI_MEM_ARENA) return; // we lookup segments first in the arena's and don't need the segment map + size_t index; size_t bitidx; - size_t index = mi_segment_map_index_of(segment, &bitidx); - 
mi_assert_internal(index <= MI_SEGMENT_MAP_WSIZE); - if (index==MI_SEGMENT_MAP_WSIZE) return; - uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]); + mi_segmap_part_t* part = mi_segment_map_index_of(segment, true /* alloc map if needed */, &index, &bitidx); + if (part == NULL) return; // outside our address range.. + uintptr_t mask = mi_atomic_load_relaxed(&part->map[index]); uintptr_t newmask; do { newmask = (mask | ((uintptr_t)1 << bitidx)); - } while (!mi_atomic_cas_weak_release(&mi_segment_map[index], &mask, newmask)); + } while (!mi_atomic_cas_weak_release(&part->map[index], &mask, newmask)); } void _mi_segment_map_freed_at(const mi_segment_t* segment) { + if (segment->memid.memkind == MI_MEM_ARENA) return; + size_t index; size_t bitidx; - size_t index = mi_segment_map_index_of(segment, &bitidx); - mi_assert_internal(index <= MI_SEGMENT_MAP_WSIZE); - if (index == MI_SEGMENT_MAP_WSIZE) return; - uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]); + mi_segmap_part_t* part = mi_segment_map_index_of(segment, false /* don't alloc if not present */, &index, &bitidx); + if (part == NULL) return; // outside our address range.. + uintptr_t mask = mi_atomic_load_relaxed(&part->map[index]); uintptr_t newmask; do { newmask = (mask & ~((uintptr_t)1 << bitidx)); - } while (!mi_atomic_cas_weak_release(&mi_segment_map[index], &mask, newmask)); + } while (!mi_atomic_cas_weak_release(&part->map[index], &mask, newmask)); } // Determine the segment belonging to a pointer or NULL if it is not in a valid segment. 
static mi_segment_t* _mi_segment_of(const void* p) { if (p == NULL) return NULL; mi_segment_t* segment = _mi_ptr_segment(p); // segment can be NULL + size_t index; size_t bitidx; - size_t index = mi_segment_map_index_of(segment, &bitidx); - // fast path: for any pointer to valid small/medium/large object or first MI_SEGMENT_SIZE in huge - const uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]); + mi_segmap_part_t* part = mi_segment_map_index_of(segment, false /* dont alloc if not present */, &index, &bitidx); + if (part == NULL) return NULL; + const uintptr_t mask = mi_atomic_load_relaxed(&part->map[index]); if mi_likely((mask & ((uintptr_t)1 << bitidx)) != 0) { + bool cookie_ok = (_mi_ptr_cookie(segment) == segment->cookie); + mi_assert_internal(cookie_ok); MI_UNUSED(cookie_ok); return segment; // yes, allocated by us } - if (index==MI_SEGMENT_MAP_WSIZE) return NULL; - - // TODO: maintain max/min allocated range for efficiency for more efficient rejection of invalid pointers? - - // search downwards for the first segment in case it is an interior pointer - // could be slow but searches in MI_INTPTR_SIZE * MI_SEGMENT_SIZE (512MiB) steps trough - // valid huge objects - // note: we could maintain a lowest index to speed up the path for invalid pointers? - size_t lobitidx; - size_t loindex; - uintptr_t lobits = mask & (((uintptr_t)1 << bitidx) - 1); - if (lobits != 0) { - loindex = index; - lobitidx = mi_bsr(lobits); // lobits != 0 - } - else if (index == 0) { - return NULL; - } - else { - mi_assert_internal(index > 0); - uintptr_t lomask = mask; - loindex = index; - do { - loindex--; - lomask = mi_atomic_load_relaxed(&mi_segment_map[loindex]); - } while (lomask != 0 && loindex > 0); - if (lomask == 0) return NULL; - lobitidx = mi_bsr(lomask); // lomask != 0 - } - mi_assert_internal(loindex < MI_SEGMENT_MAP_WSIZE); - // take difference as the addresses could be larger than the MAX_ADDRESS space. 
- size_t diff = (((index - loindex) * (8*MI_INTPTR_SIZE)) + bitidx - lobitidx) * MI_SEGMENT_SIZE; - segment = (mi_segment_t*)((uint8_t*)segment - diff); - - if (segment == NULL) return NULL; - mi_assert_internal((void*)segment < p); - bool cookie_ok = (_mi_ptr_cookie(segment) == segment->cookie); - mi_assert_internal(cookie_ok); - if mi_unlikely(!cookie_ok) return NULL; - if (((uint8_t*)segment + mi_segment_size(segment)) <= (uint8_t*)p) return NULL; // outside the range - mi_assert_internal(p >= (void*)segment && (uint8_t*)p < (uint8_t*)segment + mi_segment_size(segment)); - return segment; + return NULL; } // Is this a valid pointer in our heap? -static bool mi_is_valid_pointer(const void* p) { - return ((_mi_segment_of(p) != NULL) || (_mi_arena_contains(p))); +static bool mi_is_valid_pointer(const void* p) { + // first check if it is in an arena, then check if it is OS allocated + return (_mi_arena_contains(p) || _mi_segment_of(p) != NULL); } mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { return mi_is_valid_pointer(p); } - -/* -// Return the full segment range belonging to a pointer -static void* mi_segment_range_of(const void* p, size_t* size) { - mi_segment_t* segment = _mi_segment_of(p); - if (segment == NULL) { - if (size != NULL) *size = 0; - return NULL; - } - else { - if (size != NULL) *size = segment->segment_size; - return segment; - } - mi_assert_expensive(page == NULL || mi_segment_is_valid(_mi_page_segment(page),tld)); - mi_assert_internal(page == NULL || (mi_segment_page_size(_mi_page_segment(page)) - (MI_SECURE == 0 ? 
0 : _mi_os_page_size())) >= block_size); - mi_reset_delayed(tld); - mi_assert_internal(page == NULL || mi_page_not_in_queue(page, tld)); - return page; -} -*/ diff --git a/test/test-stress.c b/test/test-stress.c index 599c6c2e..24dcf00f 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -258,7 +258,7 @@ int main(int argc, char** argv) { #ifdef HEAP_WALK mi_option_enable(mi_option_visit_abandoned); #endif - #ifndef NDBEBUG + #ifndef NDEBUG mi_option_set(mi_option_arena_reserve, 32 * 1024 /* in kib = 32MiB */); #endif #ifndef USE_STD_MALLOC From e8f4bdd1ea568b723da5c8362d5cdb092fa4cbc2 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 2 Jun 2024 14:59:37 -0700 Subject: [PATCH 13/18] fix cast; make segment map static --- src/segment-map.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/segment-map.c b/src/segment-map.c index be461d7e..8927a8bd 100644 --- a/src/segment-map.c +++ b/src/segment-map.c @@ -39,8 +39,7 @@ typedef struct mi_segmap_part_s { } mi_segmap_part_t; // Allocate parts on-demand to reduce .bss footprint -_Atomic(mi_segmap_part_t*) mi_segment_map[MI_SEGMENT_MAP_MAX_PARTS]; // = { NULL, .. } - +static _Atomic(mi_segmap_part_t*) mi_segment_map[MI_SEGMENT_MAP_MAX_PARTS]; // = { NULL, .. } static mi_segmap_part_t* mi_segment_map_index_of(const mi_segment_t* segment, bool create_on_demand, size_t* idx, size_t* bitidx) { // note: segment can be invalid or NULL. 
@@ -50,7 +49,7 @@ static mi_segmap_part_t* mi_segment_map_index_of(const mi_segment_t* segment, bo if ((uintptr_t)segment >= MI_SEGMENT_MAP_MAX_ADDRESS) return NULL; const uintptr_t segindex = ((uintptr_t)segment) / MI_SEGMENT_MAP_PART_SPAN; if (segindex >= MI_SEGMENT_MAP_MAX_PARTS) return NULL; - mi_segmap_part_t* part = mi_atomic_load_ptr_relaxed(mi_segmap_part_t*, &mi_segment_map[segindex]); + mi_segmap_part_t* part = mi_atomic_load_ptr_relaxed(mi_segmap_part_t, &mi_segment_map[segindex]); // allocate on demand to reduce .bss footprint if (part == NULL) { From f87ec74bb3103f68f4e8b6f34098e09cbd1b306d Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 2 Jun 2024 15:10:17 -0700 Subject: [PATCH 14/18] reduce delayed output from redirection to 16KiB to reduce the .bss size --- src/options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/options.c b/src/options.c index 32fa212b..462a7c71 100644 --- a/src/options.c +++ b/src/options.c @@ -200,7 +200,7 @@ static void mi_cdecl mi_out_stderr(const char* msg, void* arg) { // an output function is registered it is called immediately with // the output up to that point. 
#ifndef MI_MAX_DELAY_OUTPUT -#define MI_MAX_DELAY_OUTPUT ((size_t)(32*1024)) +#define MI_MAX_DELAY_OUTPUT ((size_t)(16*1024)) #endif static char out_buf[MI_MAX_DELAY_OUTPUT+1]; static _Atomic(size_t) out_len; From f9076a5cf83a4326cee17e70c0b11baa056a5e57 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 2 Jun 2024 15:54:49 -0700 Subject: [PATCH 15/18] use EFAULT if a target heap tag cannot be found on reclaim --- src/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/segment.c b/src/segment.c index 8fccf18e..e484a38f 100644 --- a/src/segment.c +++ b/src/segment.c @@ -905,7 +905,7 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, mi_heap_t* target_heap = _mi_heap_by_tag(heap, page->heap_tag); // allow custom heaps to separate objects if (target_heap == NULL) { target_heap = heap; - _mi_error_message(EINVAL, "page with tag %u cannot be reclaimed by a heap with the same tag (using heap tag %u instead)\n", page->heap_tag, heap->tag ); + _mi_error_message(EFAULT, "page with tag %u cannot be reclaimed by a heap with the same tag (using heap tag %u instead)\n", page->heap_tag, heap->tag ); } // associate the heap with this page, and allow heap thread delayed free again. mi_page_set_heap(page, target_heap); From e4c8f42bb6b4169ff329393621474612f4cce4f5 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 2 Jun 2024 16:10:08 -0700 Subject: [PATCH 16/18] always include sys/prctl.h on linux to disable THP if large_os_pages are not enabled --- src/prim/unix/prim.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c index 93785b22..63a36f25 100644 --- a/src/prim/unix/prim.c +++ b/src/prim/unix/prim.c @@ -30,9 +30,9 @@ terms of the MIT license. 
A copy of the license can be found in the file #if defined(__linux__) #include - #if defined(MI_NO_THP) - #include - #endif + //#if defined(MI_NO_THP) + #include // THP disable + //#endif #if defined(__GLIBC__) #include // linux mmap flags #else From 768872e4e0bdec168fe82358614e4dfbfde1c779 Mon Sep 17 00:00:00 2001 From: Daan Date: Sun, 2 Jun 2024 16:24:13 -0700 Subject: [PATCH 17/18] typo in stress test --- test/test-stress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test-stress.c b/test/test-stress.c index 24dcf00f..544c2838 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -138,7 +138,7 @@ static bool visit_blocks(const mi_heap_t* heap, const mi_heap_area_t* area, void (void)(heap); (void)(area); size_t* total = (size_t*)arg; if (block != NULL) { - total += block_size; + *total += block_size; } return true; } From 6b15342709241f278256d0392926752b130b9d5e Mon Sep 17 00:00:00 2001 From: Daan Date: Sun, 2 Jun 2024 16:41:07 -0700 Subject: [PATCH 18/18] fix pthread initialization of mutexes --- include/mimalloc/atomic.h | 7 ++++--- src/arena.c | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index d2711019..3a0d4892 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -441,13 +441,13 @@ static inline void mi_lock_release(mi_lock_t* lock) { pthread_mutex_unlock(lock); } static inline void mi_lock_init(mi_lock_t* lock) { - (void)(lock); + pthread_mutex_init(lock, NULL); } static inline void mi_lock_done(mi_lock_t* lock) { - (void)(lock); + pthread_mutex_destroy(lock); } - +/* #elif defined(__cplusplus) #include @@ -469,6 +469,7 @@ static inline void mi_lock_init(mi_lock_t* lock) { static inline void mi_lock_done(mi_lock_t* lock) { (void)(lock); } +*/ #else diff --git a/src/arena.c b/src/arena.c index 24f1299c..7d7eb089 100644 --- a/src/arena.c +++ b/src/arena.c @@ -863,7 +863,7 @@ void _mi_arena_field_cursor_init(mi_heap_t* heap, 
mi_subproc_t* subproc, mi_aren current->start = (heap == NULL || max_arena == 0 ? 0 : (mi_arena_id_t)(_mi_heap_random_next(heap) % max_arena)); current->end = current->start + max_arena; } - mi_assert_internal(current->start < max_arena); + mi_assert_internal(current->start <= max_arena); } static mi_segment_t* mi_arena_segment_clear_abandoned_at(mi_arena_t* arena, mi_subproc_t* subproc, mi_bitmap_index_t bitmap_idx) {