From f3d83e5fa62f9d5ec653d13db8eec2d814e72046 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Fri, 20 Dec 2024 13:55:31 -0800 Subject: [PATCH 01/16] insert full pages at the end of the queue; only override page candidate if the page is not too full --- ide/vs2022/mimalloc-test.vcxproj | 6 +++--- include/mimalloc/internal.h | 2 +- src/page-queue.c | 2 +- src/page.c | 3 ++- test/test-stress.c | 2 +- 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/ide/vs2022/mimalloc-test.vcxproj b/ide/vs2022/mimalloc-test.vcxproj index a8b36d5e..6e4576fd 100644 --- a/ide/vs2022/mimalloc-test.vcxproj +++ b/ide/vs2022/mimalloc-test.vcxproj @@ -272,14 +272,14 @@ Console + + + {abb5eae7-b3e6-432e-b636-333449892ea6} - - - diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 012ce4f0..8b22e1c6 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -571,7 +571,7 @@ static inline bool mi_page_immediate_available(const mi_page_t* page) { } // is more than 7/8th of a page in use? -static inline bool mi_page_mostly_used(const mi_page_t* page) { +static inline bool mi_page_is_mostly_used(const mi_page_t* page) { if (page==NULL) return true; uint16_t frac = page->reserved / 8U; return (page->reserved - page->used <= frac); diff --git a/src/page-queue.c b/src/page-queue.c index 9796f3dc..67b54650 100644 --- a/src/page-queue.c +++ b/src/page-queue.c @@ -343,7 +343,7 @@ static void mi_page_queue_enqueue_from(mi_page_queue_t* to, mi_page_queue_t* fro static void mi_page_queue_enqueue_from_full(mi_page_queue_t* to, mi_page_queue_t* from, mi_page_t* page) { // note: we could insert at the front to increase reuse, but it slows down certain benchmarks (like `alloc-test`) - mi_page_queue_enqueue_from_ex(to, from, false /* enqueue at the end of the `to` queue? */, page); + mi_page_queue_enqueue_from_ex(to, from, true /* enqueue at the end of the `to` queue? */, page); } // Only called from `mi_heap_absorb`. 
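// --- Illustrative sketch (not part of the patch): a self-contained version of the
// --- 7/8 "mostly used" test that this commit renames to `mi_page_is_mostly_used`
// --- and uses to decide whether a page may still replace the current allocation
// --- candidate. The `demo_` names and trimmed struct are hypothetical stand-ins
// --- for the real `mi_page_t` fields.
#include <stdbool.h>
#include <stdint.h>

typedef struct demo_page_s {
  uint16_t reserved;   // total blocks reserved in the page
  uint16_t used;       // blocks currently allocated from the page
} demo_page_t;

// true when more than 7/8 of the page's blocks are in use (or the page is NULL)
static inline bool demo_page_is_mostly_used(const demo_page_t* page) {
  if (page == NULL) return true;
  const uint16_t frac = page->reserved / 8U;      // one eighth of the capacity
  return (page->reserved - page->used <= frac);   // at most 1/8 of the blocks are still free
}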
diff --git a/src/page.c b/src/page.c index 4b25ed5d..8808c358 100644 --- a/src/page.c +++ b/src/page.c @@ -783,7 +783,8 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p page_candidate = page; candidate_count = 0; } - else if (!mi_page_mostly_used(page) && page->used >= page_candidate->used) { + // prefer to reuse fuller pages (in the hope the less used page gets freed) + else if (page->used >= page_candidate->used && !mi_page_is_mostly_used(page) && !mi_page_is_expandable(page)) { page_candidate = page; } // if we find a non-expandable candidate, or searched for N pages, return with the best candidate diff --git a/test/test-stress.c b/test/test-stress.c index 574d241b..6284ad39 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -319,7 +319,7 @@ int main(int argc, char** argv) { mi_collect(true); #endif #endif - //mi_stats_print(NULL); + mi_stats_print(NULL); //bench_end_program(); return 0; } From 7141d9f1642ff24f5d94e5ae3767f3212153f25f Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 20 Dec 2024 17:31:48 -0800 Subject: [PATCH 02/16] remove busy wait for arena reservation --- src/arena.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/arena.c b/src/arena.c index 44c909c1..74cd4977 100644 --- a/src/arena.c +++ b/src/arena.c @@ -453,7 +453,7 @@ static mi_decl_noinline void* mi_arena_try_alloc( mi_assert(slice_count <= MI_ARENA_MAX_OBJ_SLICES); mi_assert(alignment <= MI_ARENA_SLICE_ALIGN); void* p; -again: + // try to find free slices in the arena's p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, tseq, memid); if (p != NULL) return p; @@ -465,22 +465,25 @@ again: if (_mi_preloading()) return NULL; // otherwise, try to reserve a new arena -- but one thread at a time.. (todo: allow 2 or 4 to reduce contention?) - if (mi_lock_try_acquire(&mi_arena_reserve_lock)) { - mi_arena_id_t arena_id = 0; - bool ok = mi_arena_reserve(mi_size_of_slices(slice_count), allow_large, req_arena_id, &arena_id); + const size_t arena_count = mi_arena_get_count(); + if (mi_lock_acquire(&mi_arena_reserve_lock)) { + bool ok = true; + if (arena_count == mi_arena_get_count()) { + // we are the first to enter the lock, reserve a fresh arena + mi_arena_id_t arena_id = 0; + ok = mi_arena_reserve(mi_size_of_slices(slice_count), allow_large, req_arena_id, &arena_id); + } + else { + // another thread already reserved a new arena + } mi_lock_release(&mi_arena_reserve_lock); if (ok) { - // and try allocate in there + // try once more to allocate in the new arena mi_assert_internal(req_arena_id == _mi_arena_id_none()); p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, tseq, memid); if (p != NULL) return p; } } - else { - // if we are racing with another thread wait until the new arena is reserved (todo: a better yield?) 
- mi_atomic_yield(); - goto again; - } return NULL; } From 93e14344c7be10f186a39f7bee998db8adcead9b Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 20 Dec 2024 17:32:26 -0800 Subject: [PATCH 03/16] use srw lock on windows --- include/mimalloc/atomic.h | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index 3a0d4892..0c967896 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -402,28 +402,34 @@ static inline void mi_atomic_yield(void) { // ---------------------------------------------------------------------- -// Locks are only used for abandoned segment visiting in `arena.c` +// Locks +// These do not have to be recursive and should be light-weight +// in-process only locks. Only used for reserving arena's and to +// maintain the abandoned list. // ---------------------------------------------------------------------- +#if _MSC_VER +#pragma warning(disable:26110) // unlock with holding lock +#endif #if defined(_WIN32) -#define mi_lock_t CRITICAL_SECTION +#define mi_lock_t SRWLOCK // slim reader-writer lock static inline bool mi_lock_try_acquire(mi_lock_t* lock) { - return TryEnterCriticalSection(lock); + return TryAcquireSRWLockExclusive(lock); } static inline bool mi_lock_acquire(mi_lock_t* lock) { - EnterCriticalSection(lock); + AcquireSRWLockExclusive(lock); return true; } static inline void mi_lock_release(mi_lock_t* lock) { - LeaveCriticalSection(lock); + ReleaseSRWLockExclusive(lock); } static inline void mi_lock_init(mi_lock_t* lock) { - InitializeCriticalSection(lock); + InitializeSRWLock(lock); } static inline void mi_lock_done(mi_lock_t* lock) { - DeleteCriticalSection(lock); + // nothing } @@ -447,14 +453,13 @@ static inline void mi_lock_done(mi_lock_t* lock) { pthread_mutex_destroy(lock); } -/* #elif defined(__cplusplus) #include #define mi_lock_t std::mutex static inline bool mi_lock_try_acquire(mi_lock_t* lock) { - return lock->lock_try_acquire(); + return lock->try_lock(); } static inline bool mi_lock_acquire(mi_lock_t* lock) { lock->lock(); @@ -469,7 +474,6 @@ static inline void mi_lock_init(mi_lock_t* lock) { static inline void mi_lock_done(mi_lock_t* lock) { (void)(lock); } -*/ #else From a5b7d7f26461d0d241b6de41f215d63dbfa642cb Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 20 Dec 2024 21:38:31 -0800 Subject: [PATCH 04/16] subprocesses own arena's --- include/mimalloc.h | 2 +- include/mimalloc/atomic.h | 2 +- include/mimalloc/internal.h | 15 +- include/mimalloc/types.h | 56 +++---- src/alloc.c | 4 +- src/arena-meta.c | 6 +- src/arena.c | 315 +++++++++++++++++------------------- src/bitmap.c | 7 +- src/bitmap.h | 4 +- src/free.c | 6 +- src/heap.c | 7 +- src/init.c | 259 ++++++++++++++++------------- src/page.c | 2 +- 13 files changed, 351 insertions(+), 334 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index 24217fae..7a58e54c 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -279,7 +279,7 @@ mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_commit mi_decl_export void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) mi_attr_noexcept; // Experimental: heaps associated with specific memory arena's -typedef int mi_arena_id_t; +typedef void* mi_arena_id_t; mi_decl_export void* mi_arena_area(mi_arena_id_t arena_id, size_t* size); mi_decl_export int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) 
mi_attr_noexcept; mi_decl_export int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept; diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index 7dc492f6..ddb5a9a3 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -434,7 +434,7 @@ static inline void mi_lock_init(mi_lock_t* lock) { InitializeSRWLock(lock); } static inline void mi_lock_done(mi_lock_t* lock) { - // nothing + (void)(lock); } diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index a5ca3e27..24792f8c 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -101,8 +101,10 @@ bool _mi_is_main_thread(void); size_t _mi_current_thread_count(void); bool _mi_preloading(void); // true while the C runtime is not initialized yet void _mi_thread_done(mi_heap_t* heap); -mi_tld_t* _mi_tld(void); // current tld: `_mi_tld() == _mi_heap_get_default()->tld` +mi_tld_t* _mi_tld(void); // current tld: `_mi_tld() == _mi_heap_get_default()->tld` +mi_subproc_t* _mi_subproc(void); +mi_subproc_t* _mi_subproc_main(void); mi_threadid_t _mi_thread_id(void) mi_attr_noexcept; size_t _mi_thread_seq_id(void) mi_attr_noexcept; @@ -142,10 +144,11 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t m // arena.c mi_arena_id_t _mi_arena_id_none(void); -void _mi_arena_init(void); -void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid); -void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid); -bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id); +mi_arena_t* _mi_arena_from_id(mi_arena_id_t id); + +void* _mi_arena_alloc(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid); +void* _mi_arena_alloc_aligned(mi_subproc_t* subproc, size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid); +bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_t* request_arena); bool _mi_arena_contains(const void* p); void _mi_arenas_collect(bool force_purge); void _mi_arena_unsafe_destroy_all(void); @@ -524,7 +527,7 @@ static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { if (heap != NULL) { page->heap = heap; page->heap_tag = heap->tag; - mi_atomic_store_release(&page->xthread_id, heap->thread_id); + mi_atomic_store_release(&page->xthread_id, heap->tld->thread_id); } else { page->heap = NULL; diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 0cf909d0..4d43e887 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -243,9 +243,6 @@ typedef size_t mi_page_flags_t; // atomically in `free.c:mi_free_block_mt`. typedef uintptr_t mi_thread_free_t; -// Sub processes are used to keep memory separate between them (e.g. multiple interpreters in CPython) -typedef struct mi_subproc_s mi_subproc_t; - // A heap can serve only specific objects signified by its heap tag (e.g. various object types in CPython) typedef uint8_t mi_heaptag_t; @@ -299,7 +296,6 @@ typedef struct mi_page_s { mi_heap_t* heap; // heap this threads belong to. 
struct mi_page_s* next; // next page owned by the heap with the same `block_size` struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` - mi_subproc_t* subproc; // sub-process of this heap mi_memid_t memid; // provenance of the page memory } mi_page_t; @@ -380,7 +376,7 @@ typedef struct mi_random_cxt_s { // In debug mode there is a padding structure at the end of the blocks to check for buffer overflows -#if (MI_PADDING) +#if MI_PADDING typedef struct mi_padding_s { uint32_t canary; // encoded block value to check validity of the padding (in case of overflow) uint32_t delta; // padding bytes before the block. (mi_usable_size(p) - delta == exact allocated bytes) @@ -397,10 +393,8 @@ typedef struct mi_padding_s { // A heap owns a set of pages. struct mi_heap_s { - mi_tld_t* tld; - // _Atomic(mi_block_t*) thread_delayed_free; - mi_threadid_t thread_id; // thread this heap belongs too - mi_arena_id_t arena_id; // arena id if the heap belongs to a specific arena (or 0) + mi_tld_t* tld; // thread-local data + mi_arena_t* exclusive_arena; // if the heap belongs to a specific arena (or NULL) uintptr_t cookie; // random cookie to verify pointers (see `_mi_ptr_cookie`) uintptr_t keys[2]; // two random keys used to encode the `thread_delayed_free` list mi_random_ctx_t random; // random number context used for secure allocation @@ -408,7 +402,6 @@ struct mi_heap_s { size_t page_retired_min; // smallest retired index (retired pages are fully free, but still in the page queues) size_t page_retired_max; // largest retired index into the `pages` array. mi_heap_t* next; // list of heaps per thread - mi_memid_t memid; // provenance of the heap struct itseft (meta or os) long full_page_retain; // how many full pages can be retained per queue (before abondoning them) bool allow_page_reclaim; // `true` if this heap should not reclaim abandoned pages bool allow_page_abandon; // `true` if this heap can abandon pages to reduce memory footprint @@ -421,7 +414,8 @@ struct mi_heap_s { size_t guarded_sample_count; // current sample count (counting down to 0) #endif mi_page_t* pages_free_direct[MI_PAGES_DIRECT]; // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size. - mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin") + mi_page_queue_t pages[MI_BIN_COUNT]; // queue of pages for each size class (or "bin") + mi_memid_t memid; // provenance of the heap struct itself (meta or os) }; @@ -479,7 +473,7 @@ typedef struct mi_stats_s { mi_stat_counter_t arena_count; mi_stat_counter_t guarded_alloc_count; #if MI_STAT>1 - mi_stat_count_t normal_bins[MI_BIN_HUGE+1]; + mi_stat_count_t normal_bins[MI_BIN_COUNT]; #endif } mi_stats_t; @@ -513,19 +507,24 @@ void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); // ------------------------------------------------------ -// Sub processes do not reclaim or visit segments -// from other sub processes +// Sub processes use separate arena's and no heaps/pages/blocks +// are shared between sub processes. 
+// Each thread should also belong to one sub-process only // ------------------------------------------------------ -struct mi_subproc_s { - _Atomic(size_t) abandoned_count[MI_BIN_COUNT]; // count of abandoned pages for this sub-process - _Atomic(size_t) abandoned_os_list_count; // count of abandoned pages in the os-list - mi_lock_t abandoned_os_lock; // lock for the abandoned os pages list (outside of arena's) (this lock protect list operations) - mi_lock_t abandoned_os_visit_lock; // ensure only one thread per subproc visits the abandoned os list - mi_page_t* abandoned_os_list; // doubly-linked list of abandoned pages outside of arena's (in OS allocated memory) - mi_page_t* abandoned_os_list_tail; // the tail-end of the list - mi_memid_t memid; // provenance of this memory block -}; +#define MI_MAX_ARENAS (160) // Limited for now (and takes up .bss).. but arena's scale up exponentially (see `mi_arena_reserve`) + // 160 arenas is enough for ~2 TiB memory + +typedef struct mi_subproc_s { + _Atomic(size_t) arena_count; // current count of arena's + _Atomic(mi_arena_t*) arenas[MI_MAX_ARENAS]; // arena's of this sub-process + mi_lock_t arena_reserve_lock; // lock to ensure arena's get reserved one at a time + _Atomic(size_t) abandoned_count[MI_BIN_COUNT]; // total count of abandoned pages for this sub-process + mi_page_queue_t os_pages; // list of pages that OS allocated and not in an arena (only used if `mi_option_visit_abandoned` is on) + mi_lock_t os_pages_lock; // lock for the os pages list (this lock protects list operations) + mi_memid_t memid; // provenance of this memory block (meta or OS) +} mi_subproc_t; + // ------------------------------------------------------ // Thread Local data @@ -534,20 +533,21 @@ struct mi_subproc_s { // Milliseconds as in `int64_t` to avoid overflows typedef int64_t mi_msecs_t; - // Thread local data struct mi_tld_s { - unsigned long long heartbeat; // monotonic heartbeat count + mi_threadid_t thread_id; // thread id of this thread + size_t thread_seq; // thread sequence id (linear count of created threads) + mi_subproc_t* subproc; // sub-process this thread belongs to. mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted) mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates) - mi_subproc_t* subproc; // sub-process this thread belongs to. - size_t tseq; // thread sequence id - mi_memid_t memid; // provenance of the tld memory itself (meta or OS) + unsigned long long heartbeat; // monotonic heartbeat count bool recurse; // true if deferred was called; used to prevent infinite recursion. bool is_in_threadpool; // true if this thread is part of a threadpool (and can run arbitrary tasks) mi_stats_t stats; // statistics + mi_memid_t memid; // provenance of the tld memory itself (meta or OS) }; + /* ----------------------------------------------------------- Error codes passed to `_mi_fatal_error` All are recoverable but EFAULT is a serious error and aborts by default in secure mode. 
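// --- Illustrative sketch (not part of the patch): the double-checked reservation
// --- pattern that the new per-subprocess `arena_reserve_lock` above supports. A
// --- thread snapshots the arena count, takes the lock, and only reserves a fresh
// --- arena if no other thread added one in the meantime; either way it retries the
// --- free-slice search afterwards. All `demo_` names are hypothetical stand-ins
// --- (a plain pthread mutex and stub helpers), not mimalloc API.
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

typedef struct demo_subproc_s {
  size_t          arena_count;          // atomic in the real code
  pthread_mutex_t arena_reserve_lock;   // ensures arenas are reserved one at a time
} demo_subproc_t;

// stand-in for mi_arena_reserve: pretend a fresh arena was added
static bool demo_reserve_fresh_arena(demo_subproc_t* subproc) {
  subproc->arena_count++;
  return true;
}

// stand-in for the free-slice search over the subprocess' arenas
static void* demo_try_find_free(demo_subproc_t* subproc, size_t size) {
  (void)subproc; (void)size;
  return NULL;   // pretend no free slices were found
}

void* demo_alloc_with_reserve(demo_subproc_t* subproc, size_t size) {
  void* p = demo_try_find_free(subproc, size);
  if (p != NULL) return p;
  const size_t count = subproc->arena_count;          // snapshot before taking the lock
  pthread_mutex_lock(&subproc->arena_reserve_lock);
  bool ok = true;
  if (count == subproc->arena_count) {
    ok = demo_reserve_fresh_arena(subproc);           // we are first: reserve a fresh arena
  }                                                   // else: another thread already reserved one
  pthread_mutex_unlock(&subproc->arena_reserve_lock);
  if (ok) {
    p = demo_try_find_free(subproc, size);            // try once more in the (possibly new) arena
  }
  return p;
}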
diff --git a/src/alloc.c b/src/alloc.c index 25d6f62e..e5f2b8ae 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -134,7 +134,7 @@ static inline mi_decl_restrict void* mi_heap_malloc_small_zero(mi_heap_t* heap, mi_assert(size <= MI_SMALL_SIZE_MAX); #if MI_DEBUG const uintptr_t tid = _mi_thread_id(); - mi_assert(heap->thread_id == 0 || heap->thread_id == tid); // heaps are thread local + mi_assert(heap->tld->thread_id == 0 || heap->tld->thread_id == tid); // heaps are thread local #endif #if (MI_PADDING || MI_GUARDED) if (size == 0) { size = sizeof(void*); } @@ -188,7 +188,7 @@ extern inline void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool z else { // regular allocation mi_assert(heap!=NULL); - mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local + mi_assert(heap->tld->thread_id == 0 || heap->tld->thread_id == _mi_thread_id()); // heaps are thread local void* const p = _mi_malloc_generic(heap, size + MI_PADDING_SIZE, zero, huge_alignment); // note: size can overflow but it is detected in malloc_generic mi_track_malloc(p,size,zero); diff --git a/src/arena-meta.c b/src/arena-meta.c index ceda06ba..f28c50e9 100644 --- a/src/arena-meta.c +++ b/src/arena-meta.c @@ -64,10 +64,12 @@ static void* mi_meta_block_start( mi_meta_page_t* mpage, size_t block_idx ) { // allocate a fresh meta page and add it to the global list. static mi_meta_page_t* mi_meta_page_zalloc(void) { // allocate a fresh arena slice + // note: we always use subproc_main directly for the meta-data since at thread start the metadata for the + // tld and heap need to be (meta) allocated and at that time we cannot read the tld pointer (yet). mi_memid_t memid; - mi_meta_page_t* mpage = (mi_meta_page_t*)_mi_arena_alloc_aligned(MI_ARENA_SLICE_SIZE, MI_ARENA_SLICE_ALIGN, 0, + mi_meta_page_t* mpage = (mi_meta_page_t*)_mi_arena_alloc_aligned(_mi_subproc_main(), MI_ARENA_SLICE_SIZE, MI_ARENA_SLICE_ALIGN, 0, true /* commit*/, true /* allow large */, - _mi_arena_id_none(), 0 /* tseq */, &memid ); + NULL, 0 /* tseq */, &memid ); if (mpage == NULL) return NULL; mi_assert_internal(_mi_is_aligned(mpage,MI_META_PAGE_ALIGN)); if (!memid.initially_zero) { diff --git a/src/arena.c b/src/arena.c index 74cd4977..bb846da9 100644 --- a/src/arena.c +++ b/src/arena.c @@ -35,7 +35,7 @@ The arena allocation needs to be thread safe and we use an atomic bitmap to allo // A memory arena descriptor typedef struct mi_arena_s { mi_memid_t memid; // memid of the memory area - mi_arena_id_t id; // arena id (> 0 where `arena == arenas[arena->id - 1]`) + mi_subproc_t* subproc; // subprocess this arena belongs to (`this 'in' this->subproc->arenas`) size_t slice_count; // total size of the area in arena slices (of `MI_ARENA_SLICE_SIZE`) size_t info_slices; // initial slices reserved for the arena bitmaps @@ -64,64 +64,45 @@ typedef struct mi_purge_info_s { } mi_purge_info_t; -#define MI_MAX_ARENAS (160) // Limited for now (and takes up .bss).. 
but arena's scale up exponentially (see `mi_arena_reserve`) - // 160 arenas is enough for ~2 TiB memory - -// The available arenas -static mi_decl_cache_align _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS]; -static mi_decl_cache_align _Atomic(size_t) mi_arena_count; // = 0 - - -static mi_lock_t mi_arena_reserve_lock; - -void _mi_arena_init(void) { - mi_lock_init(&mi_arena_reserve_lock); -} /* ----------------------------------------------------------- Arena id's - id = arena_index + 1 ----------------------------------------------------------- */ -size_t mi_arena_id_index(mi_arena_id_t id) { - return (size_t)(id <= 0 ? MI_MAX_ARENAS : id - 1); -} - -static mi_arena_id_t mi_arena_id_create(size_t arena_index) { - mi_assert_internal(arena_index < MI_MAX_ARENAS); - return (int)arena_index + 1; +static mi_arena_id_t mi_arena_id_create(mi_arena_t* arena) { + return arena; } mi_arena_id_t _mi_arena_id_none(void) { - return 0; + return NULL; } -static bool mi_arena_id_is_suitable(mi_arena_id_t arena_id, bool arena_is_exclusive, mi_arena_id_t req_arena_id) { - return ((!arena_is_exclusive && req_arena_id == _mi_arena_id_none()) || - (arena_id == req_arena_id)); +mi_arena_t* _mi_arena_from_id(mi_arena_id_t id) { + return (mi_arena_t*)id; } -bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id) { + +static bool mi_arena_id_is_suitable(mi_arena_t* arena, mi_arena_t* req_arena) { + return ((arena == req_arena) || // they match, + (req_arena == NULL && !arena->is_exclusive)); // or the arena is not exclusive, and we didn't request a specific one +} + +bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_t* request_arena) { if (memid.memkind == MI_MEM_ARENA) { - const mi_arena_t* arena = memid.mem.arena.arena; - return mi_arena_id_is_suitable(arena->id, arena->is_exclusive, request_arena_id); + return mi_arena_id_is_suitable(memid.mem.arena.arena, request_arena); } else { - return mi_arena_id_is_suitable(_mi_arena_id_none(), false, request_arena_id); + return mi_arena_id_is_suitable(NULL, request_arena); } } -size_t mi_arena_get_count(void) { - return mi_atomic_load_relaxed(&mi_arena_count); +size_t mi_arenas_get_count(mi_subproc_t* subproc) { + return mi_atomic_load_relaxed(&subproc->arena_count); } -mi_arena_t* mi_arena_from_index(size_t idx) { - mi_assert_internal(idx < mi_arena_get_count()); - return mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[idx]); -} - -mi_arena_t* mi_arena_from_id(mi_arena_id_t id) { - return mi_arena_from_index(mi_arena_id_index(id)); +mi_arena_t* mi_arena_from_index(mi_subproc_t* subproc, size_t idx) { + mi_assert_internal(idx < mi_arenas_get_count(subproc)); + return mi_atomic_load_ptr_relaxed(mi_arena_t, &subproc->arenas[idx]); } static size_t mi_arena_info_slices(mi_arena_t* arena) { @@ -159,9 +140,7 @@ uint8_t* mi_arena_slice_start(mi_arena_t* arena, size_t slice_index) { // Arena area void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) { if (size != NULL) *size = 0; - const size_t arena_index = mi_arena_id_index(arena_id); - if (arena_index >= MI_MAX_ARENAS) return NULL; - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]); + mi_arena_t* arena = _mi_arena_from_id(arena_id); if (arena == NULL) return NULL; if (size != NULL) { *size = mi_size_of_slices(arena->slice_count); } return mi_arena_start(arena); @@ -297,12 +276,12 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( // try to reserve a fresh arena space -static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t 
req_arena_id, mi_arena_id_t* arena_id) +static bool mi_arena_reserve(mi_subproc_t* subproc, size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t* arena_id) { // if (_mi_preloading()) return false; // use OS only while pre loading if (req_arena_id != _mi_arena_id_none()) return false; - const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count); + const size_t arena_count = mi_arenas_get_count(subproc); if (arena_count > (MI_MAX_ARENAS - 4)) return false; // calc reserve @@ -368,32 +347,27 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re Arena iteration ----------------------------------------------------------- */ -static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_arena_id, int numa_node, bool allow_large) { +static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_t* req_arena, int numa_node, bool allow_large) { if (!allow_large && arena->is_large) return false; - if (!mi_arena_id_is_suitable(arena->id, arena->is_exclusive, req_arena_id)) return false; - if (req_arena_id == _mi_arena_id_none()) { // if not specific, check numa affinity + if (!mi_arena_id_is_suitable(arena, req_arena)) return false; + if (req_arena == NULL) { // if not specific, check numa affinity const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node); if (!numa_suitable) return false; } return true; } - -#define mi_forall_arenas(req_arena_id, tseq, name_arena) \ - { \ - const size_t _arena_count = mi_arena_get_count(); \ - if (_arena_count > 0) { \ - const size_t _arena_cycle = _arena_count - 1; /* first search the arenas below the last one */ \ - size_t _start; \ - if (req_arena_id == _mi_arena_id_none()) { \ - /* always start searching in the arena's below the max */ \ - _start = (_arena_cycle <= 1 ? 0 : (tseq % _arena_cycle)); \ +#define mi_forall_arenas(subproc, req_arena, tseq, name_arena) { \ + const size_t _arena_count = mi_arenas_get_count(subproc); \ + const size_t _arena_cycle = (_arena_count == 0 ? 0 : _arena_count - 1); /* first search the arenas below the last one */ \ + /* always start searching in the arena's below the max */ \ + size_t _start = (_arena_cycle <= 1 ? 
0 : (tseq % _arena_cycle)); \ + for (size_t _i = 0; _i < _arena_count; _i++) { \ + mi_arena_t* name_arena; \ + if (req_arena != NULL) { \ + name_arena = req_arena; /* if there is a specific req_arena, only search that one */\ } \ else { \ - _start = mi_arena_id_index(req_arena_id); \ - mi_assert_internal(_start < _arena_count); \ - } \ - for (size_t _i = 0; _i < _arena_count; _i++) { \ size_t _idx; \ if (_i < _arena_cycle) { \ _idx = _i + _start; \ @@ -402,19 +376,20 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are else { \ _idx = _i; /* remaining arena's */ \ } \ - mi_arena_t* const name_arena = mi_arena_from_index(_idx); \ - if (name_arena != NULL) \ - { + name_arena = mi_arena_from_index(subproc,_idx); \ + } \ + if (name_arena != NULL) \ + { #define mi_forall_arenas_end() \ - } \ - if (req_arena_id != _mi_arena_id_none()) break; \ } \ - }} + if (req_arena != NULL) break; \ + } \ + } -#define mi_forall_suitable_arenas(req_arena_id, tseq, allow_large, name_arena) \ - mi_forall_arenas(req_arena_id,tseq,name_arena) { \ - if (mi_arena_is_suitable(name_arena, req_arena_id, -1 /* todo: numa node */, allow_large)) { \ +#define mi_forall_suitable_arenas(subproc, req_arena, tseq, allow_large, name_arena) \ + mi_forall_arenas(subproc, req_arena,tseq,name_arena) { \ + if (mi_arena_is_suitable(name_arena, req_arena, -1 /* todo: numa node */, allow_large)) { \ #define mi_forall_suitable_arenas_end() \ }} \ @@ -425,17 +400,16 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are ----------------------------------------------------------- */ // allocate slices from the arenas -static mi_decl_noinline void* mi_arena_try_find_free( - size_t slice_count, size_t alignment, - bool commit, bool allow_large, - mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid) +static mi_decl_noinline void* mi_arenas_try_find_free( + mi_subproc_t* subproc, size_t slice_count, size_t alignment, + bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid) { mi_assert_internal(slice_count <= mi_slice_count_of_size(MI_ARENA_MAX_OBJ_SIZE)); mi_assert(alignment <= MI_ARENA_SLICE_ALIGN); if (alignment > MI_ARENA_SLICE_ALIGN) return NULL; // search arena's - mi_forall_suitable_arenas(req_arena_id, tseq, allow_large, arena) + mi_forall_suitable_arenas(subproc, req_arena, tseq, allow_large, arena) { void* p = mi_arena_try_alloc_at(arena, slice_count, commit, tseq, memid); if (p != NULL) return p; @@ -445,42 +419,43 @@ static mi_decl_noinline void* mi_arena_try_find_free( } // Allocate slices from the arena's -- potentially allocating a fresh arena -static mi_decl_noinline void* mi_arena_try_alloc( +static mi_decl_noinline void* mi_arenas_try_alloc( + mi_subproc_t* subproc, size_t slice_count, size_t alignment, bool commit, bool allow_large, - mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid) + mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid) { mi_assert(slice_count <= MI_ARENA_MAX_OBJ_SLICES); mi_assert(alignment <= MI_ARENA_SLICE_ALIGN); void* p; // try to find free slices in the arena's - p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, tseq, memid); + p = mi_arenas_try_find_free(subproc, slice_count, alignment, commit, allow_large, req_arena, tseq, memid); if (p != NULL) return p; // did we need a specific arena? - if (req_arena_id != _mi_arena_id_none()) return NULL; + if (req_arena != NULL) return NULL; // don't create arena's while preloading (todo: or should we?) 
if (_mi_preloading()) return NULL; // otherwise, try to reserve a new arena -- but one thread at a time.. (todo: allow 2 or 4 to reduce contention?) - const size_t arena_count = mi_arena_get_count(); - if (mi_lock_acquire(&mi_arena_reserve_lock)) { + const size_t arena_count = mi_arenas_get_count(subproc); + if (mi_lock_acquire(&subproc->arena_reserve_lock)) { bool ok = true; - if (arena_count == mi_arena_get_count()) { + if (arena_count == mi_arenas_get_count(subproc)) { // we are the first to enter the lock, reserve a fresh arena mi_arena_id_t arena_id = 0; - ok = mi_arena_reserve(mi_size_of_slices(slice_count), allow_large, req_arena_id, &arena_id); + ok = mi_arena_reserve(subproc, mi_size_of_slices(slice_count), allow_large, req_arena, &arena_id); } else { // another thread already reserved a new arena } - mi_lock_release(&mi_arena_reserve_lock); + mi_lock_release(&subproc->arena_reserve_lock); if (ok) { // try once more to allocate in the new arena - mi_assert_internal(req_arena_id == _mi_arena_id_none()); - p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, tseq, memid); + mi_assert_internal(req_arena == NULL); + p = mi_arenas_try_find_free(subproc, slice_count, alignment, commit, allow_large, req_arena, tseq, memid); if (p != NULL) return p; } } @@ -510,10 +485,10 @@ static void* mi_arena_os_alloc_aligned( // Allocate large sized memory -void* _mi_arena_alloc_aligned( +void* _mi_arena_alloc_aligned( mi_subproc_t* subproc, size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, - mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid) + mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid) { mi_assert_internal(memid != NULL); mi_assert_internal(size > 0); @@ -522,24 +497,24 @@ void* _mi_arena_alloc_aligned( // const int numa_node = _mi_os_numa_node(&tld->os); // current numa node // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) - if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) && // is arena allocation allowed? - req_arena_id == _mi_arena_id_none() && // not a specific arena? + if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) && // is arena allocation allowed? + req_arena == NULL && // not a specific arena? 
size >= MI_ARENA_MIN_OBJ_SIZE && size <= MI_ARENA_MAX_OBJ_SIZE && // and not too small/large alignment <= MI_ARENA_SLICE_ALIGN && align_offset == 0) // and good alignment { const size_t slice_count = mi_slice_count_of_size(size); - void* p = mi_arena_try_alloc(slice_count, alignment, commit, allow_large, req_arena_id, tseq, memid); + void* p = mi_arenas_try_alloc(subproc,slice_count, alignment, commit, allow_large, req_arena, tseq, memid); if (p != NULL) return p; } // fall back to the OS - void* p = mi_arena_os_alloc_aligned(size, alignment, align_offset, commit, allow_large, req_arena_id, memid); + void* p = mi_arena_os_alloc_aligned(size, alignment, align_offset, commit, allow_large, req_arena, memid); return p; } -void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid) +void* _mi_arena_alloc(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid) { - return _mi_arena_alloc_aligned(size, MI_ARENA_SLICE_SIZE, 0, commit, allow_large, req_arena_id, tseq, memid); + return _mi_arena_alloc_aligned(subproc, size, MI_ARENA_SLICE_SIZE, 0, commit, allow_large, req_arena, tseq, memid); } @@ -548,7 +523,7 @@ void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t Arena page allocation ----------------------------------------------------------- */ -static bool mi_arena_try_claim_abandoned(size_t slice_index, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag, bool* keep_abandoned) { +static bool mi_arena_try_claim_abandoned(size_t slice_index, mi_arena_t* arena, mi_heaptag_t heap_tag, bool* keep_abandoned) { // found an abandoned page of the right size mi_page_t* const page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); // can we claim ownership? @@ -560,9 +535,9 @@ static bool mi_arena_try_claim_abandoned(size_t slice_index, mi_arena_t* arena, *keep_abandoned = true; return false; } - if (subproc != page->subproc || heap_tag != page->heap_tag) { - // wrong sub-process or heap_tag.. we need to unown again - // note: this normally never happens unless subprocesses/heaptags are actually used. + if (heap_tag != page->heap_tag) { + // wrong heap_tag.. we need to unown again + // note: this normally never happens unless heaptags are actually used. // (an unown might free the page, and depending on that we can keep it in the abandoned map or not) // note: a minor wrinkle: the page will still be mapped but the abandoned map entry is (temporarily) clear at this point. // so we cannot check in `mi_arena_free` for this invariant to hold. @@ -570,31 +545,31 @@ static bool mi_arena_try_claim_abandoned(size_t slice_index, mi_arena_t* arena, *keep_abandoned = !freed; return false; } - // yes, we can reclaim it, keep the abandaned map entry clear + // yes, we can reclaim it, keep the abandoned map entry clear *keep_abandoned = false; return true; } -static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t block_size, mi_arena_id_t req_arena_id, mi_heaptag_t heaptag, mi_tld_t* tld) +static mi_page_t* mi_arena_page_try_find_abandoned(mi_subproc_t* subproc, size_t slice_count, size_t block_size, mi_arena_t* req_arena, mi_heaptag_t heaptag, size_t tseq) { MI_UNUSED(slice_count); const size_t bin = _mi_bin(block_size); mi_assert_internal(bin < MI_BIN_COUNT); // any abandoned in our size class? 
- mi_subproc_t* const subproc = tld->subproc; mi_assert_internal(subproc != NULL); - if (mi_atomic_load_relaxed(&subproc->abandoned_count[bin]) == 0) return NULL; + if (mi_atomic_load_relaxed(&subproc->abandoned_count[bin]) == 0) { + return NULL; + } // search arena's const bool allow_large = true; - size_t tseq = tld->tseq; - mi_forall_suitable_arenas(req_arena_id, tseq, allow_large, arena) + mi_forall_suitable_arenas(subproc, req_arena, tseq, allow_large, arena) { size_t slice_index; mi_bitmap_t* const bitmap = arena->pages_abandoned[bin]; - if (mi_bitmap_try_find_and_claim(bitmap, tseq, &slice_index, &mi_arena_try_claim_abandoned, arena, subproc, heaptag)) { + if (mi_bitmap_try_find_and_claim(bitmap, tseq, &slice_index, &mi_arena_try_claim_abandoned, arena, heaptag)) { // found an abandoned page of the right size // and claimed ownership. mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); @@ -621,8 +596,8 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl return NULL; } -static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_size, size_t block_alignment, - mi_arena_id_t req_arena_id, mi_tld_t* tld) +static mi_page_t* mi_arena_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_count, size_t block_size, size_t block_alignment, + mi_arena_t* req_arena, size_t tseq) { const bool allow_large = true; const bool commit = true; @@ -636,7 +611,7 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz !os_align && // not large alignment slice_count <= MI_ARENA_MAX_OBJ_SLICES) // and not too large { - page = (mi_page_t*)mi_arena_try_alloc(slice_count, page_alignment, commit, allow_large, req_arena_id, tld->tseq, &memid); + page = (mi_page_t*)mi_arenas_try_alloc(subproc, slice_count, page_alignment, commit, allow_large, req_arena, tseq, &memid); if (page != NULL) { mi_assert_internal(mi_bitmap_is_clearN(memid.mem.arena.arena->pages, memid.mem.arena.slice_index, memid.mem.arena.slice_count)); mi_bitmap_set(memid.mem.arena.arena->pages, memid.mem.arena.slice_index); @@ -648,10 +623,10 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz if (os_align) { // note: slice_count already includes the page mi_assert_internal(slice_count >= mi_slice_count_of_size(block_size) + mi_slice_count_of_size(page_alignment)); - page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), block_alignment, page_alignment /* align offset */, commit, allow_large, req_arena_id, &memid); + page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), block_alignment, page_alignment /* align offset */, commit, allow_large, req_arena, &memid); } else { - page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), page_alignment, 0 /* align offset */, commit, allow_large, req_arena_id, &memid); + page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), page_alignment, 0 /* align offset */, commit, allow_large, req_arena, &memid); } } @@ -724,17 +699,17 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz } static mi_page_t* mi_arena_page_allocN(mi_heap_t* heap, size_t slice_count, size_t block_size) { - const mi_arena_id_t req_arena_id = heap->arena_id; + mi_arena_t* req_arena = heap->exclusive_arena; mi_tld_t* const tld = heap->tld; // 1. 
look for an abandoned page - mi_page_t* page = mi_arena_page_try_find_abandoned(slice_count, block_size, req_arena_id, heap->tag, tld); + mi_page_t* page = mi_arena_page_try_find_abandoned(tld->subproc, slice_count, block_size, req_arena, heap->tag, tld->thread_seq); if (page != NULL) { return page; // return as abandoned } // 2. find a free block, potentially allocating a new arena - page = mi_arena_page_alloc_fresh(slice_count, block_size, 1, req_arena_id, tld); + page = mi_arena_page_alloc_fresh(tld->subproc, slice_count, block_size, 1, req_arena, tld->thread_seq); if (page != NULL) { mi_assert_internal(page->memid.memkind != MI_MEM_ARENA || page->memid.mem.arena.slice_count == slice_count); _mi_page_init(heap, page); @@ -746,13 +721,13 @@ static mi_page_t* mi_arena_page_allocN(mi_heap_t* heap, size_t slice_count, size static mi_page_t* mi_singleton_page_alloc(mi_heap_t* heap, size_t block_size, size_t block_alignment) { - const mi_arena_id_t req_arena_id = heap->arena_id; + mi_arena_t* req_arena = heap->exclusive_arena; mi_tld_t* const tld = heap->tld; const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN); const size_t info_size = (os_align ? MI_PAGE_ALIGN : mi_page_info_size()); const size_t slice_count = mi_slice_count_of_size(info_size + block_size); - mi_page_t* page = mi_arena_page_alloc_fresh(slice_count, block_size, block_alignment, req_arena_id, tld); + mi_page_t* page = mi_arena_page_alloc_fresh(tld->subproc, slice_count, block_size, block_alignment, req_arena, tld->thread_seq); if (page == NULL) return NULL; mi_assert(page != NULL); @@ -836,7 +811,6 @@ void _mi_arena_page_abandon(mi_page_t* page) { mi_assert_internal(!mi_page_all_free(page)); mi_assert_internal(page->next==NULL); - mi_subproc_t* subproc = page->subproc; if (page->memid.memkind==MI_MEM_ARENA && !mi_page_is_full(page)) { // make available for allocations size_t bin = _mi_bin(mi_page_block_size(page)); @@ -851,7 +825,7 @@ void _mi_arena_page_abandon(mi_page_t* page) { mi_page_set_abandoned_mapped(page); const bool wasclear = mi_bitmap_set(arena->pages_abandoned[bin], slice_index); MI_UNUSED(wasclear); mi_assert_internal(wasclear); - mi_atomic_increment_relaxed(&subproc->abandoned_count[bin]); + mi_atomic_increment_relaxed(&arena->subproc->abandoned_count[bin]); } else { // page is full (or a singleton), page is OS/externally allocated @@ -902,7 +876,7 @@ void _mi_arena_page_unabandon(mi_page_t* page) { // this busy waits until a concurrent reader (from alloc_abandoned) is done mi_bitmap_clear_once_set(arena->pages_abandoned[bin], slice_index); mi_page_clear_abandoned_mapped(page); - mi_atomic_decrement_relaxed(&page->subproc->abandoned_count[bin]); + mi_atomic_decrement_relaxed(&arena->subproc->abandoned_count[bin]); } else { // page is full (or a singleton), page is OS/nly allocated @@ -989,9 +963,10 @@ void _mi_arenas_collect(bool force_purge) { // Is a pointer inside any of our arenas? 
bool _mi_arena_contains(const void* p) { - const size_t max_arena = mi_arena_get_count(); + mi_subproc_t* subproc = _mi_subproc(); + const size_t max_arena = mi_arenas_get_count(subproc); for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &subproc->arenas[i]); if (arena != NULL && mi_arena_start(arena) <= (const uint8_t*)p && mi_arena_start(arena) + mi_size_of_slices(arena->slice_count) >(const uint8_t*)p) { return true; } @@ -1007,14 +982,14 @@ bool _mi_arena_contains(const void* p) { // destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` // for dynamic libraries that are unloaded and need to release all their allocated memory. -static void mi_arenas_unsafe_destroy(void) { - const size_t max_arena = mi_arena_get_count(); +static void mi_arenas_unsafe_destroy(mi_subproc_t* subproc) { + const size_t max_arena = mi_arenas_get_count(subproc); size_t new_max_arena = 0; for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &subproc->arenas[i]); if (arena != NULL) { // mi_lock_done(&arena->abandoned_visit_lock); - mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); + mi_atomic_store_ptr_release(mi_arena_t, &subproc->arenas[i], NULL); if (mi_memkind_is_os(arena->memid.memkind)) { _mi_os_free(mi_arena_start(arena), mi_arena_size(arena), arena->memid); } @@ -1023,14 +998,14 @@ static void mi_arenas_unsafe_destroy(void) { // try to lower the max arena. size_t expected = max_arena; - mi_atomic_cas_strong_acq_rel(&mi_arena_count, &expected, new_max_arena); + mi_atomic_cas_strong_acq_rel(&subproc->arena_count, &expected, new_max_arena); } // destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` // for dynamic libraries that are unloaded and need to release all their allocated memory. void _mi_arena_unsafe_destroy_all(void) { - mi_arenas_unsafe_destroy(); + mi_arenas_unsafe_destroy(_mi_subproc()); _mi_arenas_collect(true /* force purge */); // purge non-owned arenas } @@ -1039,40 +1014,36 @@ void _mi_arena_unsafe_destroy_all(void) { Add an arena. 
----------------------------------------------------------- */ -static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) { +static bool mi_arena_add(mi_subproc_t* subproc, mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) { mi_assert_internal(arena != NULL); mi_assert_internal(arena->slice_count > 0); - if (arena_id != NULL) { *arena_id = -1; } + if (arena_id != NULL) { *arena_id = NULL; } // first try to find a NULL entry - const size_t count = mi_arena_get_count(); + const size_t count = mi_arenas_get_count(subproc); size_t i; for (i = 0; i < count; i++) { - if (mi_arena_from_index(i) == NULL) { - arena->id = mi_arena_id_create(i); + if (mi_arena_from_index(subproc,i) == NULL) { mi_arena_t* expected = NULL; - if (mi_atomic_cas_ptr_strong_release(mi_arena_t, &mi_arenas[i], &expected, arena)) { + if (mi_atomic_cas_ptr_strong_release(mi_arena_t, &subproc->arenas[i], &expected, arena)) { // success - if (arena_id != NULL) { *arena_id = arena->id; } + if (arena_id != NULL) { *arena_id = arena; } return true; - } - else { - arena->id = _mi_arena_id_none(); - } + } } } // otherwise increase the max - i = mi_atomic_increment_acq_rel(&mi_arena_count); + i = mi_atomic_increment_acq_rel(&subproc->arena_count); if (i >= MI_MAX_ARENAS) { - mi_atomic_decrement_acq_rel(&mi_arena_count); + mi_atomic_decrement_acq_rel(&subproc->arena_count); + arena->subproc = NULL; return false; } _mi_stat_counter_increase(&stats->arena_count,1); - arena->id = mi_arena_id_create(i); - mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena); - if (arena_id != NULL) { *arena_id = arena->id; } + mi_atomic_store_ptr_release(mi_arena_t,&subproc->arenas[i], arena); + if (arena_id != NULL) { *arena_id = arena; } return true; } @@ -1099,7 +1070,7 @@ static mi_bitmap_t* mi_arena_bitmap_init(size_t slice_count, uint8_t** base) { } -static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept +static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept { mi_assert(!is_large || (memid.initially_committed && memid.is_pinned)); mi_assert(_mi_is_aligned(start,MI_ARENA_SLICE_SIZE)); @@ -1138,7 +1109,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int } // init - arena->id = _mi_arena_id_none(); + arena->subproc = subproc; arena->memid = memid; arena->is_exclusive = exclusive; arena->slice_count = slice_count; @@ -1176,7 +1147,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int mi_bitmap_setN(arena->slices_dirty, 0, info_slices, NULL); } - return mi_arena_add(arena, arena_id, &_mi_stats_main); + return mi_arena_add(subproc, arena, arena_id, &_mi_stats_main); } @@ -1187,7 +1158,7 @@ bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is memid.initially_committed = is_committed; memid.initially_zero = is_zero; memid.is_pinned = is_large; - return mi_manage_os_memory_ex2(start, size, is_large, numa_node, exclusive, memid, arena_id); + return mi_manage_os_memory_ex2(_mi_subproc(), start, size, is_large, numa_node, exclusive, memid, arena_id); } // Reserve a range of regular OS memory @@ -1198,7 +1169,7 @@ int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exc void* start = _mi_os_alloc_aligned(size, MI_ARENA_SLICE_ALIGN, commit, allow_large, 
&memid); if (start == NULL) return ENOMEM; const bool is_large = memid.is_pinned; // todo: use separate is_large field? - if (!mi_manage_os_memory_ex2(start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { + if (!mi_manage_os_memory_ex2(_mi_subproc(), start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { _mi_os_free_ex(start, size, commit, memid); _mi_verbose_message("failed to reserve %zu KiB memory\n", _mi_divide_up(size, 1024)); return ENOMEM; @@ -1307,16 +1278,18 @@ static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bi } void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) mi_attr_noexcept { - size_t max_arenas = mi_arena_get_count(); + mi_subproc_t* subproc = _mi_subproc(); + size_t max_arenas = mi_arenas_get_count(subproc); size_t free_total = 0; size_t slice_total = 0; //size_t abandoned_total = 0; size_t page_total = 0; for (size_t i = 0; i < max_arenas; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &subproc->arenas[i]); if (arena == NULL) break; + mi_assert(arena->subproc == subproc); slice_total += arena->slice_count; - _mi_output_message("arena %zu at %p: %zu slices (%zu MiB)%s\n", i, arena, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : "")); + _mi_output_message("arena %zu at %p: %zu slices (%zu MiB)%s, subproc: %p\n", i, arena, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : "", arena->subproc)); if (show_inuse) { free_total += mi_debug_show_bitmap("in-use slices", arena->slice_count, arena->slices_free, true, NULL); } @@ -1342,7 +1315,7 @@ void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) ----------------------------------------------------------- */ // reserve at a specific numa node int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { - if (arena_id != NULL) *arena_id = -1; + if (arena_id != NULL) *arena_id = NULL; if (pages==0) return 0; if (numa_node < -1) numa_node = -1; if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count(); @@ -1356,7 +1329,7 @@ int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_m } _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages); - if (!mi_manage_os_memory_ex2(p, hsize, true, numa_node, exclusive, memid, arena_id)) { + if (!mi_manage_os_memory_ex2(_mi_subproc(), p, hsize, true, numa_node, exclusive, memid, arena_id)) { _mi_os_free(p, hsize, memid); return ENOMEM; } @@ -1538,10 +1511,13 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) } -static void mi_arenas_try_purge(bool force, bool visit_all) { +static void mi_arenas_try_purge(bool force, bool visit_all) +{ if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled - const size_t max_arena = mi_arena_get_count(); + mi_tld_t* tld = _mi_tld(); + mi_subproc_t* subproc = tld->subproc; + const size_t max_arena = mi_arenas_get_count(subproc); if (max_arena == 0) return; // allow only one thread to purge at a time @@ -1549,12 +1525,12 @@ static void mi_arenas_try_purge(bool force, bool visit_all) { mi_atomic_guard(&purge_guard) { const mi_msecs_t now = _mi_clock_now(); - const size_t arena_start = 
_mi_tld()->tseq % max_arena; + const size_t arena_start = tld->thread_seq % max_arena; size_t max_purge_count = (visit_all ? max_arena : 1); for (size_t _i = 0; _i < max_arena; _i++) { size_t i = _i + arena_start; if (i >= max_arena) { i -= max_arena; } - mi_arena_t* arena = mi_arena_from_index(i); + mi_arena_t* arena = mi_arena_from_index(subproc,i); if (arena != NULL) { if (mi_arena_try_purge(arena, now, force)) { if (max_purge_count <= 1) break; @@ -1590,13 +1566,7 @@ static bool mi_arena_pages_reregister(mi_arena_t* arena) { } mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* accessed_size, size_t* full_size) { - const size_t count = mi_arena_get_count(); - const size_t arena_idx = mi_arena_id_index(arena_id); - if (count <= arena_idx) { - _mi_warning_message("arena id is invalid (%zu)\n", arena_id); - return false; - } - mi_arena_t* arena = mi_arena_from_id(arena_id); + mi_arena_t* arena = _mi_arena_from_id(arena_id); if (arena==NULL) { return false; } @@ -1627,10 +1597,17 @@ mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* _mi_page_map_unregister_range(arena, asize); // set the entry to NULL - mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[arena_idx], NULL); - if (arena_idx + 1 == count) { // try adjust the count? - size_t expected = count; - mi_atomic_cas_strong_acq_rel(&mi_arena_count, &expected, count-1); + mi_subproc_t* subproc = arena->subproc; + const size_t count = mi_arenas_get_count(subproc); + for(size_t i = 0; i < count; i++) { + if (mi_arena_from_index(subproc, i) == arena) { + mi_atomic_store_ptr_release(mi_arena_t, &subproc->arenas[i], NULL); + if (i + 1 == count) { // try adjust the count? + size_t expected = count; + mi_atomic_cas_strong_acq_rel(&subproc->arena_count, &expected, count-1); + } + break; + } } return true; } @@ -1662,8 +1639,8 @@ mi_decl_export bool mi_arena_reload(void* start, size_t size, bool is_committed, arena->memid.initially_zero = is_zero; arena->is_exclusive = true; arena->is_large = is_large; - arena->id = _mi_arena_id_none(); - if (!mi_arena_add(arena, arena_id, &_mi_stats_main)) { + arena->subproc = NULL; + if (!mi_arena_add(_mi_subproc(), arena, arena_id, &_mi_stats_main)) { return false; } mi_arena_pages_reregister(arena); diff --git a/src/bitmap.c b/src/bitmap.c index 6fae1ed6..6352e4ea 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1228,7 +1228,6 @@ bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t tseq, size_t n, typedef struct mi_claim_fun_data_s { mi_arena_t* arena; - mi_subproc_t* subproc; mi_heaptag_t heap_tag; } mi_claim_fun_data_t; @@ -1242,7 +1241,7 @@ static bool mi_bitmap_try_find_and_claim_visit(mi_bitmap_t* bitmap, size_t chunk const size_t slice_index = (chunk_idx * MI_BCHUNK_BITS) + cidx; mi_assert_internal(slice_index < mi_bitmap_max_bits(bitmap)); bool keep_set = true; - if ((*claim_fun)(slice_index, claim_data->arena, claim_data->subproc, claim_data->heap_tag, &keep_set)) { + if ((*claim_fun)(slice_index, claim_data->arena, claim_data->heap_tag, &keep_set)) { // success! mi_assert_internal(!keep_set); *pidx = slice_index; @@ -1267,9 +1266,9 @@ static bool mi_bitmap_try_find_and_claim_visit(mi_bitmap_t* bitmap, size_t chunk // Find a set bit in the bitmap and try to atomically clear it and claim it. // (Used to find pages in the pages_abandoned bitmaps.) 
mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, - mi_claim_fun_t* claim, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag) + mi_claim_fun_t* claim, mi_arena_t* arena, mi_heaptag_t heap_tag) { - mi_claim_fun_data_t claim_data = { arena, subproc, heap_tag }; + mi_claim_fun_data_t claim_data = { arena, heap_tag }; return mi_bitmap_find(bitmap, tseq, 1, pidx, &mi_bitmap_try_find_and_claim_visit, (void*)claim, &claim_data); } diff --git a/src/bitmap.h b/src/bitmap.h index 47c22025..16ecea07 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -208,13 +208,13 @@ mi_decl_nodiscard static inline bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* // Called once a bit is cleared to see if the memory slice can be claimed. -typedef bool (mi_claim_fun_t)(size_t slice_index, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag, bool* keep_set); +typedef bool (mi_claim_fun_t)(size_t slice_index, mi_arena_t* arena, mi_heaptag_t heap_tag, bool* keep_set); // Find a set bits in the bitmap, atomically clear it, and check if `claim` returns true. // If not claimed, continue on (potentially setting the bit again depending on `keep_set`). // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, - mi_claim_fun_t* claim, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag ); + mi_claim_fun_t* claim, mi_arena_t* arena, mi_heaptag_t heap_tag ); // Atomically clear a bit but only if it is set. Will block otherwise until the bit is set. diff --git a/src/free.c b/src/free.c index 14034593..770856da 100644 --- a/src/free.c +++ b/src/free.c @@ -210,7 +210,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { if (mi_page_all_free(page)) { // first remove it from the abandoned pages in the arena (if mapped, this waits for any readers to finish) - _mi_arena_page_unabandon(page); + _mi_arena_page_unabandon(page); // we can free the page directly _mi_arena_page_free(page); return; @@ -234,8 +234,8 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { mi_heap_t* const tagheap = _mi_heap_by_tag(heap, page->heap_tag); if ((tagheap != NULL) && // don't reclaim across heap object types (tagheap->allow_page_reclaim) && // we are allowed to reclaim abandoned pages - (page->subproc == tagheap->tld->subproc) && // don't reclaim across sub-processes; todo: make this check faster (integrate with _mi_heap_by_tag ? ) - (_mi_arena_memid_is_suitable(page->memid, tagheap->arena_id)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) + // (page->subproc == tagheap->tld->subproc) && // don't reclaim across sub-processes; todo: make this check faster (integrate with _mi_heap_by_tag ? ) + (_mi_arena_memid_is_suitable(page->memid, tagheap->exclusive_arena)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) 
) { if (mi_page_queue(tagheap, page->block_size)->first != NULL) { // don't reclaim for an block_size we don't use diff --git a/src/heap.c b/src/heap.c index dee404d2..e8743691 100644 --- a/src/heap.c +++ b/src/heap.c @@ -178,7 +178,7 @@ mi_heap_t* mi_heap_get_backing(void) { mi_assert_internal(heap!=NULL); mi_heap_t* bheap = heap->tld->heap_backing; mi_assert_internal(bheap!=NULL); - mi_assert_internal(bheap->thread_id == _mi_thread_id()); + mi_assert_internal(bheap->tld->thread_id == _mi_thread_id()); return bheap; } @@ -190,8 +190,7 @@ void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t)); heap->memid = memid; heap->tld = tld; // avoid reading the thread-local tld during initialization - heap->thread_id = _mi_thread_id(); - heap->arena_id = arena_id; + heap->exclusive_arena = _mi_arena_from_id(arena_id); heap->allow_page_reclaim = !noreclaim; heap->allow_page_abandon = (!noreclaim && mi_option_get(mi_option_full_page_retain) >= 0); heap->full_page_retain = mi_option_get_clamp(mi_option_full_page_retain, -1, 32); @@ -254,7 +253,7 @@ mi_decl_nodiscard mi_heap_t* mi_heap_new(void) { } bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid) { - return _mi_arena_memid_is_suitable(memid, heap->arena_id); + return _mi_arena_memid_is_suitable(memid, heap->exclusive_arena); } uintptr_t _mi_heap_random_next(mi_heap_t* heap) { diff --git a/src/init.c b/src/init.c index 9a26d56f..a15a9c6c 100644 --- a/src/init.c +++ b/src/init.c @@ -33,8 +33,7 @@ const mi_page_t _mi_page_empty = { { 0, 0 }, #endif NULL, // xheap - NULL, NULL, // next, prev - NULL, // subproc + NULL, NULL, // next, prev MI_MEMID_STATIC // memid }; @@ -96,27 +95,76 @@ const mi_page_t _mi_page_empty = { // may lead to allocation itself on some platforms) // -------------------------------------------------------- +static mi_decl_cache_align mi_subproc_t subproc_main; + +static mi_decl_cache_align mi_tld_t tld_empty = { + 0, // thread_id + 0, // thread_seq + &subproc_main, // subproc + NULL, // heap_backing + NULL, // heaps list + 0, // heartbeat + false, // recurse + false, // is_in_threadpool + { MI_STATS_NULL }, // stats + MI_MEMID_STATIC // memid +}; + mi_decl_cache_align const mi_heap_t _mi_heap_empty = { - NULL, - // MI_ATOMIC_VAR_INIT(NULL), // thread delayed free - 0, // thread_id - 0, // arena_id - 0, // cookie - { 0, 0 }, // keys - { {0}, {0}, 0, true }, // random - 0, // page count - MI_BIN_FULL, 0, // page retired min/max - NULL, // next - MI_MEMID_STATIC, // memid - 0, // full page retain - false, // can reclaim - true, // can eager abandon - 0, // tag + &tld_empty, // tld + NULL, // exclusive_arena + 0, // cookie + { 0, 0 }, // keys + { {0}, {0}, 0, true }, // random + 0, // page count + MI_BIN_FULL, 0, // page retired min/max + NULL, // next + 0, // full page retain + false, // can reclaim + true, // can eager abandon + 0, // tag #if MI_GUARDED - 0, 0, 0, 0, 1, // count is 1 so we never write to it (see `internal.h:mi_heap_malloc_use_guarded`) + 0, 0, 0, 0, 1, // count is 1 so we never write to it (see `internal.h:mi_heap_malloc_use_guarded`) #endif MI_SMALL_PAGES_EMPTY, - MI_PAGE_QUEUES_EMPTY + MI_PAGE_QUEUES_EMPTY, + MI_MEMID_STATIC +}; + +extern mi_heap_t heap_main; + +static mi_decl_cache_align mi_tld_t tld_main = { + 0, // thread_id + 0, // thread_seq + &subproc_main, // subproc + &heap_main, // heap_backing + &heap_main, // heaps list + 0, // heartbeat + false, // recurse + false, // is_in_threadpool + { MI_STATS_NULL 
}, // stats + MI_MEMID_STATIC // memid +}; + +mi_decl_cache_align mi_heap_t heap_main = { + &tld_main, // thread local data + 0, // initial cookie + 0, // arena id + { 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!) + { {0x846ca68b}, {0}, 0, true }, // random + 0, // page count + MI_BIN_FULL, 0, // page retired min/max + NULL, // next heap + 2, // full page retain + true, // allow page reclaim + true, // allow page abandon + 0, // tag + #if MI_GUARDED + 0, 0, 0, 0, 0, + #endif + MI_SMALL_PAGES_EMPTY, + MI_PAGE_QUEUES_EMPTY, + MI_MEMID_STATIC }; @@ -124,49 +172,9 @@ mi_threadid_t _mi_thread_id(void) mi_attr_noexcept { return _mi_prim_thread_id(); } - // the thread-local default heap for allocation mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty; -extern mi_heap_t _mi_heap_main; - -static mi_decl_cache_align mi_subproc_t mi_subproc_default; - -static mi_decl_cache_align mi_tld_t tld_main = { - 0, - &_mi_heap_main, // heap_backing - &_mi_heap_main, // heaps list - &mi_subproc_default, // subproc - 0, // tseq - MI_MEMID_STATIC, // memid - false, // recurse - false, // is_in_threadpool - { MI_STATS_NULL } // stats -}; - -mi_decl_cache_align mi_heap_t _mi_heap_main = { - &tld_main, - // MI_ATOMIC_VAR_INIT(NULL), // thread delayed free list - 0, // thread id - 0, // initial cookie - 0, // arena id - { 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!) - { {0x846ca68b}, {0}, 0, true }, // random - 0, // page count - MI_BIN_FULL, 0, // page retired min/max - NULL, // next heap - MI_MEMID_STATIC, // memid - 2, // full page retain - true, // allow page reclaim - true, // allow page abandon - 0, // tag - #if MI_GUARDED - 0, 0, 0, 0, 0, - #endif - MI_SMALL_PAGES_EMPTY, - MI_PAGE_QUEUES_EMPTY -}; - bool _mi_process_is_initialized = false; // set to `true` in `mi_process_init`. mi_stats_t _mi_stats_main = { MI_STATS_NULL }; @@ -210,30 +218,46 @@ void _mi_heap_guarded_init(mi_heap_t* heap) { } #endif - -static void mi_heap_main_init(void) { - if (_mi_heap_main.cookie == 0) { - _mi_heap_main.thread_id = _mi_thread_id(); - _mi_heap_main.cookie = 1; - #if defined(__APPLE__) || defined(_WIN32) && !defined(MI_SHARED_LIB) - _mi_random_init_weak(&_mi_heap_main.random); // prevent allocation failure during bcrypt dll initialization with static linking - #else - _mi_random_init(&_mi_heap_main.random); - #endif - _mi_heap_main.cookie = _mi_heap_random_next(&_mi_heap_main); - _mi_heap_main.keys[0] = _mi_heap_random_next(&_mi_heap_main); - _mi_heap_main.keys[1] = _mi_heap_random_next(&_mi_heap_main); - mi_lock_init(&mi_subproc_default.abandoned_os_lock); - mi_lock_init(&mi_subproc_default.abandoned_os_visit_lock); - _mi_heap_guarded_init(&_mi_heap_main); - _mi_heap_main.allow_page_abandon = (mi_option_get(mi_option_full_page_retain) >= 0); - _mi_heap_main.full_page_retain = mi_option_get_clamp(mi_option_full_page_retain, -1, 32); +// Initialize main subproc +static void mi_subproc_main_init(void) { + if (subproc_main.memid.memkind != MI_MEM_STATIC) { + subproc_main.memid = _mi_memid_create(MI_MEM_STATIC); + mi_lock_init(&subproc_main.os_pages_lock); + mi_lock_init(&subproc_main.arena_reserve_lock); } } -mi_heap_t* _mi_heap_main_get(void) { +// Initialize main tld +static void mi_tld_main_init(void) { + if (tld_main.thread_id == 0) { + tld_main.thread_id = _mi_prim_thread_id(); + } +} + +// Initialization of the (statically allocated) main heap, and the main tld and subproc. 
+static void mi_heap_main_init(void) { + if (heap_main.cookie == 0) { + mi_subproc_main_init(); + mi_tld_main_init(); + // heap + heap_main.cookie = 1; + #if defined(__APPLE__) || defined(_WIN32) && !defined(MI_SHARED_LIB) + _mi_random_init_weak(&heap_main.random); // prevent allocation failure during bcrypt dll initialization with static linking + #else + _mi_random_init(&heap_main.random); + #endif + heap_main.cookie = _mi_heap_random_next(&heap_main); + heap_main.keys[0] = _mi_heap_random_next(&heap_main); + heap_main.keys[1] = _mi_heap_random_next(&heap_main); + _mi_heap_guarded_init(&heap_main); + heap_main.allow_page_abandon = (mi_option_get(mi_option_full_page_retain) >= 0); + heap_main.full_page_retain = mi_option_get_clamp(mi_option_full_page_retain, -1, 32); + } +} + +mi_heap_t* heap_main_get(void) { mi_heap_main_init(); - return &_mi_heap_main; + return &heap_main; } @@ -265,8 +289,9 @@ static mi_tld_t* mi_tld_alloc(void) { tld->memid = memid; tld->heap_backing = NULL; tld->heaps = NULL; - tld->subproc = &mi_subproc_default; - tld->tseq = mi_atomic_add_acq_rel(&mi_tcount, 1); + tld->subproc = &subproc_main; + tld->thread_id = _mi_prim_thread_id(); + tld->thread_seq = mi_atomic_add_acq_rel(&mi_tcount, 1); tld->is_in_threadpool = _mi_prim_thread_is_in_threadpool(); return tld; } @@ -291,12 +316,24 @@ mi_decl_noinline mi_tld_t* _mi_tld(void) { return mi_tld; } +mi_subproc_t* _mi_subproc(void) { + if (_mi_is_main_thread()) { // during initialization we should not recurse over reading the _mi_tld + return &subproc_main; + } + else { + return _mi_tld()->subproc; + } +} /* ----------------------------------------------------------- Sub process ----------------------------------------------------------- */ +mi_subproc_t* _mi_subproc_main(void) { + return &subproc_main; +} + mi_subproc_id_t mi_subproc_main(void) { return NULL; } @@ -305,42 +342,41 @@ mi_subproc_id_t mi_subproc_new(void) { mi_memid_t memid; mi_subproc_t* subproc = (mi_subproc_t*)_mi_meta_zalloc(sizeof(mi_subproc_t),&memid); if (subproc == NULL) return NULL; - subproc->abandoned_os_list = NULL; subproc->memid = memid; - mi_lock_init(&subproc->abandoned_os_lock); - mi_lock_init(&subproc->abandoned_os_visit_lock); + mi_lock_init(&subproc->os_pages_lock); + mi_lock_init(&subproc->arena_reserve_lock); return subproc; } mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id) { - return (subproc_id == NULL ? &mi_subproc_default : (mi_subproc_t*)subproc_id); + return (subproc_id == NULL ? &subproc_main : (mi_subproc_t*)subproc_id); } void mi_subproc_delete(mi_subproc_id_t subproc_id) { if (subproc_id == NULL) return; mi_subproc_t* subproc = _mi_subproc_from_id(subproc_id); - // check if there are no abandoned segments still.. + // check if there are os pages still.. bool safe_to_delete = false; - if (mi_lock_acquire(&subproc->abandoned_os_lock)) { - if (subproc->abandoned_os_list == NULL) { + if (mi_lock_acquire(&subproc->os_pages_lock)) { + if (subproc->os_pages.first == NULL) { safe_to_delete = true; } - mi_lock_release(&subproc->abandoned_os_lock); + mi_lock_release(&subproc->os_pages_lock); } if (!safe_to_delete) return; // safe to release // todo: should we refcount subprocesses? 
- mi_lock_done(&subproc->abandoned_os_lock); - mi_lock_done(&subproc->abandoned_os_visit_lock); + mi_lock_done(&subproc->os_pages_lock); + mi_lock_done(&subproc->arena_reserve_lock); _mi_meta_free(subproc, sizeof(mi_subproc_t), subproc->memid); } void mi_subproc_add_current_thread(mi_subproc_id_t subproc_id) { - mi_heap_t* heap = mi_heap_get_default(); - if (heap == NULL) return; - mi_assert(heap->tld->subproc == &mi_subproc_default); - if (heap->tld->subproc != &mi_subproc_default) return; - heap->tld->subproc = _mi_subproc_from_id(subproc_id); + mi_tld_t* tld = _mi_tld(); + if (tld == NULL) return; + mi_assert(tld->subproc == &subproc_main); + if (tld->subproc != &subproc_main) return; + tld->subproc = _mi_subproc_from_id(subproc_id); } @@ -352,10 +388,10 @@ void mi_subproc_add_current_thread(mi_subproc_id_t subproc_id) { static bool _mi_thread_heap_init(void) { if (mi_heap_is_initialized(mi_prim_get_default_heap())) return true; if (_mi_is_main_thread()) { - // mi_assert_internal(_mi_heap_main.thread_id != 0); // can happen on freeBSD where alloc is called before any initialization + // mi_assert_internal(heap_main.thread_id != 0); // can happen on freeBSD where alloc is called before any initialization // the main heap is statically allocated mi_heap_main_init(); - _mi_heap_set_default_direct(&_mi_heap_main); + _mi_heap_set_default_direct(&heap_main); //mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_prim_get_default_heap()); } else { @@ -383,7 +419,7 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) { if (!mi_heap_is_initialized(heap)) return true; // reset default heap - _mi_heap_set_default_direct(_mi_is_main_thread() ? &_mi_heap_main : (mi_heap_t*)&_mi_heap_empty); + _mi_heap_set_default_direct(_mi_is_main_thread() ? &heap_main : (mi_heap_t*)&_mi_heap_empty); // switch to backing heap heap = heap->tld->heap_backing; @@ -403,7 +439,7 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) { mi_assert_internal(mi_heap_is_backing(heap)); // collect if not the main thread - if (heap != &_mi_heap_main) { + if (heap != &heap_main) { _mi_heap_collect_abandon(heap); } @@ -413,12 +449,12 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) { // free heap meta data _mi_meta_free(heap, sizeof(mi_heap_t), heap->memid); - if (heap == &_mi_heap_main) { + if (heap == &heap_main) { #if 0 // never free the main thread even in debug mode; if a dll is linked statically with mimalloc, // there may still be delete/free calls after the mi_fls_done is called. Issue #207 _mi_heap_destroy_pages(heap); - mi_assert_internal(heap->tld->heap_backing == &_mi_heap_main); + mi_assert_internal(heap->tld->heap_backing == &heap_main); #endif } @@ -449,12 +485,12 @@ static void mi_process_setup_auto_thread_done(void) { if (tls_initialized) return; tls_initialized = true; _mi_prim_thread_init_auto_done(); - _mi_heap_set_default_direct(&_mi_heap_main); + _mi_heap_set_default_direct(&heap_main); } bool _mi_is_main_thread(void) { - return (_mi_heap_main.thread_id==0 || _mi_heap_main.thread_id == _mi_thread_id()); + return (tld_main.thread_id==0 || tld_main.thread_id == _mi_thread_id()); } static _Atomic(size_t) thread_count = MI_ATOMIC_VAR_INIT(1); @@ -501,7 +537,7 @@ void _mi_thread_done(mi_heap_t* heap) _mi_stat_decrease(&_mi_stats_main.threads, 1); // check thread-id as on Windows shutdown with FLS the main (exit) thread may call this on thread-local heaps... 
- if (heap->thread_id != _mi_thread_id()) return; + if (heap->tld->thread_id != _mi_prim_thread_id()) return; // abandon the thread local heap _mi_thread_heap_done(heap); // returns true if already ran @@ -560,7 +596,7 @@ void _mi_process_load(void) { } // reseed random - _mi_random_reinit_if_weak(&_mi_heap_main.random); + _mi_random_reinit_if_weak(&heap_main.random); } #if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64)) @@ -587,7 +623,7 @@ void mi_process_init(void) mi_attr_noexcept { // ensure we are called once static mi_atomic_once_t process_init; #if _MSC_VER < 1920 - mi_heap_main_init(); // vs2017 can dynamically re-initialize _mi_heap_main + mi_heap_main_init(); // vs2017 can dynamically re-initialize heap_main #endif if (!mi_atomic_once(&process_init)) return; _mi_process_is_initialized = true; @@ -595,10 +631,11 @@ void mi_process_init(void) mi_attr_noexcept { mi_process_setup_auto_thread_done(); mi_detect_cpu_features(); + mi_subproc_main_init(); + mi_tld_main_init(); + mi_heap_main_init(); _mi_os_init(); _mi_page_map_init(); - _mi_arena_init(); - mi_heap_main_init(); #if MI_DEBUG _mi_verbose_message("debug level : %d\n", MI_DEBUG); #endif @@ -609,7 +646,7 @@ void mi_process_init(void) mi_attr_noexcept { #endif mi_thread_init(); - #if defined(_WIN32) + #if defined(_WIN32) && defined(MI_WIN_USE_FLS) // On windows, when building as a static lib the FLS cleanup happens to early for the main thread. // To avoid this, set the FLS value for the main thread to NULL so the fls cleanup // will not call _mi_thread_done on the (still executing) main thread. See issue #508. @@ -670,7 +707,7 @@ void mi_cdecl _mi_process_done(void) { mi_stats_print(NULL); } _mi_allocator_done(); - _mi_verbose_message("process done: 0x%zx\n", _mi_heap_main.thread_id); + _mi_verbose_message("process done: 0x%zx\n", tld_main.thread_id); os_preloading = true; // don't call the C runtime anymore } diff --git a/src/page.c b/src/page.c index d97537d1..0444b47e 100644 --- a/src/page.c +++ b/src/page.c @@ -591,7 +591,7 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page) { void _mi_page_init(mi_heap_t* heap, mi_page_t* page) { mi_assert(page != NULL); mi_page_set_heap(page, heap); - page->subproc = heap->tld->subproc; + size_t page_size; uint8_t* page_start = mi_page_area(page, &page_size); MI_UNUSED(page_start); mi_track_mem_noaccess(page_start,page_size); From daac75af3611710b9631434a25fbe9f30fd11414 Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 20 Dec 2024 22:13:58 -0800 Subject: [PATCH 05/16] fix lock recursion --- ide/vs2022/mimalloc-test-stress.vcxproj | 4 +- include/mimalloc/atomic.h | 27 +++++++++++-- src/arena.c | 15 ++++++-- src/init.c | 51 +++++++++++++------------ 4 files changed, 62 insertions(+), 35 deletions(-) diff --git a/ide/vs2022/mimalloc-test-stress.vcxproj b/ide/vs2022/mimalloc-test-stress.vcxproj index fd88cd8e..672cbb87 100644 --- a/ide/vs2022/mimalloc-test-stress.vcxproj +++ b/ide/vs2022/mimalloc-test-stress.vcxproj @@ -279,8 +279,8 @@ - - {abb5eae7-b3e6-432e-b636-333449892ea6} + + {abb5eae7-b3e6-432e-b636-333449892ea7} diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index ddb5a9a3..ab1e161d 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -408,9 +408,8 @@ static inline void mi_atomic_yield(void) { // ---------------------------------------------------------------------- // Locks -// These do not have to be recursive and should be light-weight -// in-process only locks. 
Only used for reserving arena's and to -// maintain the abandoned list. +// These should be light-weight in-process only locks. +// Only used for reserving arena's and to maintain the abandoned list. // ---------------------------------------------------------------------- #if _MSC_VER #pragma warning(disable:26110) // unlock with holding lock @@ -418,6 +417,26 @@ static inline void mi_atomic_yield(void) { #if defined(_WIN32) +#define mi_lock_t CRITICAL_SECTION + +static inline bool mi_lock_try_acquire(mi_lock_t* lock) { + return TryEnterCriticalSection(lock); +} +static inline bool mi_lock_acquire(mi_lock_t* lock) { + EnterCriticalSection(lock); + return true; +} +static inline void mi_lock_release(mi_lock_t* lock) { + LeaveCriticalSection(lock); +} +static inline void mi_lock_init(mi_lock_t* lock) { + InitializeCriticalSection(lock); +} +static inline void mi_lock_done(mi_lock_t* lock) { + DeleteCriticalSection(lock); +} + +#if 0 #define mi_lock_t SRWLOCK // slim reader-writer lock static inline bool mi_lock_try_acquire(mi_lock_t* lock) { @@ -436,7 +455,7 @@ static inline void mi_lock_init(mi_lock_t* lock) { static inline void mi_lock_done(mi_lock_t* lock) { (void)(lock); } - +#endif #elif defined(MI_USE_PTHREADS) diff --git a/src/arena.c b/src/arena.c index bb846da9..fd914f43 100644 --- a/src/arena.c +++ b/src/arena.c @@ -275,6 +275,8 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( } +static int mi_reserve_os_memory_ex2(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id); + // try to reserve a fresh arena space static bool mi_arena_reserve(mi_subproc_t* subproc, size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t* arena_id) { @@ -325,7 +327,7 @@ static bool mi_arena_reserve(mi_subproc_t* subproc, size_t req_size, bool allow_ const bool adjust = (overcommit && arena_commit); if (adjust) { _mi_stat_adjust_decrease(&_mi_stats_main.committed, arena_reserve, true /* on alloc */); } // and try to reserve the arena - int err = mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id); + int err = mi_reserve_os_memory_ex2(subproc, arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id); if (err != 0) { if (adjust) { _mi_stat_adjust_increase(&_mi_stats_main.committed, arena_reserve, true); } // roll back // failed, try a smaller size? @@ -1162,14 +1164,14 @@ bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is } // Reserve a range of regular OS memory -int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { +static int mi_reserve_os_memory_ex2(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) { if (arena_id != NULL) *arena_id = _mi_arena_id_none(); size = _mi_align_up(size, MI_ARENA_SLICE_SIZE); // at least one slice mi_memid_t memid; void* start = _mi_os_alloc_aligned(size, MI_ARENA_SLICE_ALIGN, commit, allow_large, &memid); if (start == NULL) return ENOMEM; const bool is_large = memid.is_pinned; // todo: use separate is_large field? 
- if (!mi_manage_os_memory_ex2(_mi_subproc(), start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { + if (!mi_manage_os_memory_ex2(subproc, start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { _mi_os_free_ex(start, size, commit, memid); _mi_verbose_message("failed to reserve %zu KiB memory\n", _mi_divide_up(size, 1024)); return ENOMEM; @@ -1180,6 +1182,11 @@ int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exc return 0; } +// Reserve a range of regular OS memory +int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { + return mi_reserve_os_memory_ex2(_mi_subproc(), size, commit, allow_large, exclusive, arena_id); +} + // Manage a range of regular OS memory bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept { return mi_manage_os_memory_ex(start, size, is_committed, is_large, is_zero, numa_node, false /* exclusive? */, NULL); @@ -1289,7 +1296,7 @@ void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) if (arena == NULL) break; mi_assert(arena->subproc == subproc); slice_total += arena->slice_count; - _mi_output_message("arena %zu at %p: %zu slices (%zu MiB)%s, subproc: %p\n", i, arena, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : "", arena->subproc)); + _mi_output_message("arena %zu at %p: %zu slices (%zu MiB)%s, subproc: %p\n", i, arena, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : ""), arena->subproc); if (show_inuse) { free_total += mi_debug_show_bitmap("in-use slices", arena->slice_count, arena->slices_free, true, NULL); } diff --git a/src/init.c b/src/init.c index a15a9c6c..177ca2bd 100644 --- a/src/init.c +++ b/src/init.c @@ -11,30 +11,31 @@ terms of the MIT license. 
A copy of the license can be found in the file #include // memcpy, memset #include // atexit -#define MI_MEMID_STATIC {{{NULL,0}}, MI_MEM_STATIC, true /* pinned */, true /* committed */, false /* zero */ } +#define MI_MEMID_INIT(kind) {{{NULL,0}}, kind, true /* pinned */, true /* committed */, false /* zero */ } +#define MI_MEMID_STATIC MI_MEMID_INIT(MI_MEM_STATIC) // Empty page used to initialize the small free pages array const mi_page_t _mi_page_empty = { - MI_ATOMIC_VAR_INIT(0), // xthread_id - NULL, // free - 0, // used - 0, // capacity - 0, // reserved capacity - 0, // block size shift - 0, // retire_expire - NULL, // local_free - MI_ATOMIC_VAR_INIT(0), // xthread_free - MI_ATOMIC_VAR_INIT(0), // xflags - 0, // block_size - NULL, // page_start - 0, // heap tag - false, // is_zero + MI_ATOMIC_VAR_INIT(0), // xthread_id + NULL, // free + 0, // used + 0, // capacity + 0, // reserved capacity + 0, // block size shift + 0, // retire_expire + NULL, // local_free + MI_ATOMIC_VAR_INIT(0), // xthread_free + MI_ATOMIC_VAR_INIT(0), // xflags + 0, // block_size + NULL, // page_start + 0, // heap tag + false, // is_zero #if (MI_PADDING || MI_ENCODE_FREELIST) - { 0, 0 }, + { 0, 0 }, // keys #endif - NULL, // xheap - NULL, NULL, // next, prev - MI_MEMID_STATIC // memid + NULL, // xheap + NULL, NULL, // next, prev + MI_MEMID_STATIC // memid }; #define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty) @@ -100,7 +101,7 @@ static mi_decl_cache_align mi_subproc_t subproc_main; static mi_decl_cache_align mi_tld_t tld_empty = { 0, // thread_id 0, // thread_seq - &subproc_main, // subproc + &subproc_main, // subproc NULL, // heap_backing NULL, // heaps list 0, // heartbeat @@ -111,7 +112,7 @@ static mi_decl_cache_align mi_tld_t tld_empty = { }; mi_decl_cache_align const mi_heap_t _mi_heap_empty = { - &tld_empty, // tld + &tld_empty, // tld NULL, // exclusive_arena 0, // cookie { 0, 0 }, // keys @@ -136,9 +137,9 @@ extern mi_heap_t heap_main; static mi_decl_cache_align mi_tld_t tld_main = { 0, // thread_id 0, // thread_seq - &subproc_main, // subproc - &heap_main, // heap_backing - &heap_main, // heaps list + &subproc_main, // subproc + &heap_main, // heap_backing + &heap_main, // heaps list 0, // heartbeat false, // recurse false, // is_in_threadpool @@ -147,7 +148,7 @@ static mi_decl_cache_align mi_tld_t tld_main = { }; mi_decl_cache_align mi_heap_t heap_main = { - &tld_main, // thread local data + &tld_main, // thread local data 0, // initial cookie 0, // arena id { 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!) 
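The patch above switches the Windows `mi_lock_t` from a slim reader-writer lock (SRWLOCK) to a CRITICAL_SECTION, which tolerates re-acquisition by the same thread. A minimal sketch of the difference, assuming the internal `mi_lock_*` wrappers from `include/mimalloc/atomic.h` as defined in this patch; `demo_lock`, `demo_inner`, and `demo_outer` are hypothetical names used only for illustration:

#include "mimalloc.h"
#include "mimalloc/internal.h"
#include "mimalloc/atomic.h"

/* hypothetical lock, used only for this illustration */
static mi_lock_t demo_lock;

static void demo_inner(void) {
  mi_lock_acquire(&demo_lock);   /* second acquire by the same thread */
  mi_lock_release(&demo_lock);
}

static void demo_outer(void) {
  mi_lock_init(&demo_lock);
  mi_lock_acquire(&demo_lock);
  demo_inner();                  /* would self-deadlock with a non-recursive SRWLOCK;
                                    completes with a (recursive) CRITICAL_SECTION */
  mi_lock_release(&demo_lock);
  mi_lock_done(&demo_lock);
}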
From dece8a587b5cb8642c28e0aa40c850da9c30ceb4 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 10:43:08 -0800 Subject: [PATCH 06/16] make stats part of a subproc --- ide/vs2022/mimalloc-test-stress.vcxproj | 4 +- include/mimalloc/atomic.h | 6 +- include/mimalloc/internal.h | 1 - include/mimalloc/types.h | 126 ++++++++++++++-------- src/alloc-aligned.c | 4 +- src/arena.c | 51 +++++---- src/bitmap.c | 4 +- src/free.c | 2 +- src/heap.c | 20 ++-- src/init.c | 89 +++++++++------- src/os.c | 30 +++--- src/page.c | 12 +-- src/stats.c | 136 +++++++++++++----------- test/test-stress.c | 8 +- 14 files changed, 274 insertions(+), 219 deletions(-) diff --git a/ide/vs2022/mimalloc-test-stress.vcxproj b/ide/vs2022/mimalloc-test-stress.vcxproj index 672cbb87..fd88cd8e 100644 --- a/ide/vs2022/mimalloc-test-stress.vcxproj +++ b/ide/vs2022/mimalloc-test-stress.vcxproj @@ -279,8 +279,8 @@ - - {abb5eae7-b3e6-432e-b636-333449892ea7} + + {abb5eae7-b3e6-432e-b636-333449892ea6} diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index ab1e161d..0c7fafe3 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -417,6 +417,8 @@ static inline void mi_atomic_yield(void) { #if defined(_WIN32) +#if 0 + #define mi_lock_t CRITICAL_SECTION static inline bool mi_lock_try_acquire(mi_lock_t* lock) { @@ -436,7 +438,8 @@ static inline void mi_lock_done(mi_lock_t* lock) { DeleteCriticalSection(lock); } -#if 0 +#else + #define mi_lock_t SRWLOCK // slim reader-writer lock static inline bool mi_lock_try_acquire(mi_lock_t* lock) { @@ -455,6 +458,7 @@ static inline void mi_lock_init(mi_lock_t* lock) { static inline void mi_lock_done(mi_lock_t* lock) { (void)(lock); } + #endif #elif defined(MI_USE_PTHREADS) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 24792f8c..7774b378 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -90,7 +90,6 @@ uintptr_t _mi_os_random_weak(uintptr_t extra_seed); static inline uintptr_t _mi_random_shuffle(uintptr_t x); // init.c -extern mi_decl_cache_align mi_stats_t _mi_stats_main; extern mi_decl_cache_align const mi_page_t _mi_page_empty; void _mi_process_load(void); void mi_cdecl _mi_process_done(void); diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 4d43e887..ca3913ad 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -293,7 +293,7 @@ typedef struct mi_page_s { uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`) or padding canary #endif - mi_heap_t* heap; // heap this threads belong to. + mi_heap_t* heap; // the heap owning this page (or NULL for abandoned pages) struct mi_page_s* next; // next page owned by the heap with the same `block_size` struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` mi_memid_t memid; // provenance of the page memory @@ -394,7 +394,7 @@ typedef struct mi_padding_s { // A heap owns a set of pages. 
struct mi_heap_s { mi_tld_t* tld; // thread-local data - mi_arena_t* exclusive_arena; // if the heap belongs to a specific arena (or NULL) + mi_arena_t* exclusive_arena; // if the heap should only allocate from a specific arena (or NULL) uintptr_t cookie; // random cookie to verify pointers (see `_mi_ptr_cookie`) uintptr_t keys[2]; // two random keys used to encode the `thread_delayed_free` list mi_random_ctx_t random; // random number context used for secure allocation @@ -444,18 +444,18 @@ typedef struct mi_stat_counter_s { } mi_stat_counter_t; typedef struct mi_stats_s { - mi_stat_count_t pages; - mi_stat_count_t reserved; - mi_stat_count_t committed; - mi_stat_count_t reset; - mi_stat_count_t purged; - mi_stat_count_t page_committed; - mi_stat_count_t pages_abandoned; - mi_stat_count_t threads; - mi_stat_count_t normal; - mi_stat_count_t huge; - mi_stat_count_t giant; - mi_stat_count_t malloc; + mi_stat_count_t pages; + mi_stat_count_t reserved; + mi_stat_count_t committed; + mi_stat_count_t reset; + mi_stat_count_t purged; + mi_stat_count_t page_committed; + mi_stat_count_t pages_abandoned; + mi_stat_count_t threads; + mi_stat_count_t normal; + mi_stat_count_t huge; + mi_stat_count_t giant; + mi_stat_count_t malloc; mi_stat_counter_t pages_extended; mi_stat_counter_t pages_reclaim_on_alloc; mi_stat_counter_t pages_reclaim_on_free; @@ -479,37 +479,72 @@ typedef struct mi_stats_s { // add to stat keeping track of the peak -void _mi_stat_increase(mi_stat_count_t* stat, size_t amount); -void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount); +void __mi_stat_increase(mi_stat_count_t* stat, size_t amount); +void __mi_stat_decrease(mi_stat_count_t* stat, size_t amount); +void __mi_stat_increase_mt(mi_stat_count_t* stat, size_t amount); +void __mi_stat_decrease_mt(mi_stat_count_t* stat, size_t amount); // adjust stat in special cases to compensate for double counting -void _mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount, bool on_alloc); -void _mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount, bool on_free); +void __mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount, bool on_alloc); +void __mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount, bool on_free); +void __mi_stat_adjust_increase_mt(mi_stat_count_t* stat, size_t amount, bool on_alloc); +void __mi_stat_adjust_decrease_mt(mi_stat_count_t* stat, size_t amount, bool on_free); // counters can just be increased -void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); +void __mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); +void __mi_stat_counter_increase_mt(mi_stat_counter_t* stat, size_t amount); #if (MI_STAT) -#define mi_stat_increase(stat,amount) _mi_stat_increase( &(stat), amount) -#define mi_stat_decrease(stat,amount) _mi_stat_decrease( &(stat), amount) -#define mi_stat_counter_increase(stat,amount) _mi_stat_counter_increase( &(stat), amount) -#define mi_stat_adjust_increase(stat,amnt,b) _mi_stat_adjust_increase( &(stat), amnt, b) -#define mi_stat_adjust_decrease(stat,amnt,b) _mi_stat_adjust_decrease( &(stat), amnt, b) +#define mi_debug_stat_increase(stat,amount) __mi_stat_increase( &(stat), amount) +#define mi_debug_stat_decrease(stat,amount) __mi_stat_decrease( &(stat), amount) +#define mi_debug_stat_counter_increase(stat,amount) __mi_stat_counter_increase( &(stat), amount) +#define mi_debug_stat_increase_mt(stat,amount) __mi_stat_increase_mt( &(stat), amount) +#define mi_debug_stat_decrease_mt(stat,amount) __mi_stat_decrease_mt( &(stat), amount) 
+#define mi_debug_stat_counter_increase_mt(stat,amount) __mi_stat_counter_increase_mt( &(stat), amount) +#define mi_debug_stat_adjust_increase_mt(stat,amnt,b) __mi_stat_adjust_increase_mt( &(stat), amnt, b) +#define mi_debug_stat_adjust_decrease_mt(stat,amnt,b) __mi_stat_adjust_decrease_mt( &(stat), amnt, b) #else -#define mi_stat_increase(stat,amount) ((void)0) -#define mi_stat_decrease(stat,amount) ((void)0) -#define mi_stat_counter_increase(stat,amount) ((void)0) -#define mi_stat_adjuct_increase(stat,amnt,b) ((void)0) -#define mi_stat_adjust_decrease(stat,amnt,b) ((void)0) +#define mi_debug_stat_increase(stat,amount) ((void)0) +#define mi_debug_stat_decrease(stat,amount) ((void)0) +#define mi_debug_stat_counter_increase(stat,amount) ((void)0) +#define mi_debug_stat_increase_mt(stat,amount) ((void)0) +#define mi_debug_stat_decrease_mt(stat,amount) ((void)0) +#define mi_debug_stat_counter_increase_mt(stat,amount) ((void)0) +#define mi_debug_stat_adjust_increase(stat,amnt,b) ((void)0) +#define mi_debug_stat_adjust_decrease(stat,amnt,b) ((void)0) #endif -#define mi_heap_stat_counter_increase(heap,stat,amount) mi_stat_counter_increase( (heap)->tld->stats.stat, amount) -#define mi_heap_stat_increase(heap,stat,amount) mi_stat_increase( (heap)->tld->stats.stat, amount) -#define mi_heap_stat_decrease(heap,stat,amount) mi_stat_decrease( (heap)->tld->stats.stat, amount) +#define mi_subproc_stat_counter_increase(subproc,stat,amount) __mi_stat_counter_increase_mt( &(subproc)->stats.stat, amount) +#define mi_subproc_stat_increase(subproc,stat,amount) __mi_stat_increase_mt( &(subproc)->stats.stat, amount) +#define mi_subproc_stat_decrease(subproc,stat,amount) __mi_stat_decrease_mt( &(subproc)->stats.stat, amount) +#define mi_subproc_stat_adjust_increase(subproc,stat,amnt,b) __mi_stat_adjust_increase_mt( &(subproc)->stats.stat, amnt, b) +#define mi_subproc_stat_adjust_decrease(subproc,stat,amnt,b) __mi_stat_adjust_decrease_mt( &(subproc)->stats.stat, amnt, b) + +#define mi_os_stat_counter_increase(stat,amount) mi_subproc_stat_counter_increase(_mi_subproc(),stat,amount) +#define mi_os_stat_increase(stat,amount) mi_subproc_stat_increase(_mi_subproc(),stat,amount) +#define mi_os_stat_decrease(stat,amount) mi_subproc_stat_decrease(_mi_subproc(),stat,amount) + +#define mi_tld_stat_counter_increase(tld,stat,amount) __mi_stat_counter_increase( &(tld)->stats.stat, amount) +#define mi_tld_stat_increase(tld,stat,amount) __mi_stat_increase( &(tld)->stats.stat, amount) +#define mi_tld_stat_decrease(tld,stat,amount) __mi_stat_decrease( &(tld)->stats.stat, amount) + +#define mi_debug_tld_stat_counter_increase(tld,stat,amount) mi_debug_stat_counter_increase( (tld)->stats.stat, amount) +#define mi_debug_tld_stat_increase(tld,stat,amount) mi_debug_stat_increase( (tld)->stats.stat, amount) +#define mi_debug_tld_stat_decrease(tld,stat,amount) mi_debug_stat_decrease( (tld)->stats.stat, amount) + +#define mi_heap_stat_counter_increase(heap,stat,amount) mi_tld_stat_counter_increase((heap)->tld, stat, amount) +#define mi_heap_stat_increase(heap,stat,amount) mi_tld_stat_increase( (heap)->tld, stat, amount) +#define mi_heap_stat_decrease(heap,stat,amount) mi_tld_stat_decrease( (heap)->tld, stat, amount) + +#define mi_debug_heap_stat_counter_increase(heap,stat,amount) mi_debug_tld_stat_counter_increase((heap)->tld, stat, amount) +#define mi_debug_heap_stat_increase(heap,stat,amount) mi_debug_tld_stat_increase( (heap)->tld, stat, amount) +#define mi_debug_heap_stat_decrease(heap,stat,amount) mi_debug_tld_stat_decrease( 
(heap)->tld, stat, amount) // ------------------------------------------------------ // Sub processes use separate arena's and no heaps/pages/blocks // are shared between sub processes. -// Each thread should also belong to one sub-process only +// The subprocess structure contains essentially all static variables (except per subprocess :-)) +// +// Each thread should belong to one sub-process only // ------------------------------------------------------ #define MI_MAX_ARENAS (160) // Limited for now (and takes up .bss).. but arena's scale up exponentially (see `mi_arena_reserve`) @@ -519,10 +554,13 @@ typedef struct mi_subproc_s { _Atomic(size_t) arena_count; // current count of arena's _Atomic(mi_arena_t*) arenas[MI_MAX_ARENAS]; // arena's of this sub-process mi_lock_t arena_reserve_lock; // lock to ensure arena's get reserved one at a time - _Atomic(size_t) abandoned_count[MI_BIN_COUNT]; // total count of abandoned pages for this sub-process + + _Atomic(size_t) abandoned_count[MI_BIN_COUNT]; // total count of abandoned pages for this sub-process mi_page_queue_t os_pages; // list of pages that OS allocated and not in an arena (only used if `mi_option_visit_abandoned` is on) mi_lock_t os_pages_lock; // lock for the os pages list (this lock protects list operations) + mi_memid_t memid; // provenance of this memory block (meta or OS) + mi_stats_t stats; // sub-process statistics (tld stats are merged in on thread termination) } mi_subproc_t; @@ -535,16 +573,16 @@ typedef int64_t mi_msecs_t; // Thread local data struct mi_tld_s { - mi_threadid_t thread_id; // thread id of this thread - size_t thread_seq; // thread sequence id (linear count of created threads) - mi_subproc_t* subproc; // sub-process this thread belongs to. - mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted) - mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates) - unsigned long long heartbeat; // monotonic heartbeat count - bool recurse; // true if deferred was called; used to prevent infinite recursion. - bool is_in_threadpool; // true if this thread is part of a threadpool (and can run arbitrary tasks) - mi_stats_t stats; // statistics - mi_memid_t memid; // provenance of the tld memory itself (meta or OS) + mi_threadid_t thread_id; // thread id of this thread + size_t thread_seq; // thread sequence id (linear count of created threads) + mi_subproc_t* subproc; // sub-process this thread belongs to. + mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted) + mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates) + unsigned long long heartbeat; // monotonic heartbeat count + bool recurse; // true if deferred was called; used to prevent infinite recursion. + bool is_in_threadpool; // true if this thread is part of a threadpool (and can run arbitrary tasks) + mi_stats_t stats; // statistics + mi_memid_t memid; // provenance of the tld memory itself (meta or OS) }; diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c index 14cbee45..5da9fc0c 100644 --- a/src/alloc-aligned.c +++ b/src/alloc-aligned.c @@ -193,9 +193,7 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t const bool is_aligned = (((uintptr_t)page->free + offset) & align_mask)==0; if mi_likely(is_aligned) { - #if MI_STAT>1 - mi_heap_stat_increase(heap, malloc, size); - #endif + mi_debug_heap_stat_increase(heap, malloc, size); void* p = (zero ? 
_mi_page_malloc_zeroed(heap,page,padsize) : _mi_page_malloc(heap,page,padsize)); // call specific page malloc for better codegen mi_assert_internal(p != NULL); mi_assert_internal(((uintptr_t)p + offset) % alignment == 0); diff --git a/src/arena.c b/src/arena.c index fd914f43..dcff8920 100644 --- a/src/arena.c +++ b/src/arena.c @@ -69,10 +69,6 @@ typedef struct mi_purge_info_s { Arena id's ----------------------------------------------------------- */ -static mi_arena_id_t mi_arena_id_create(mi_arena_t* arena) { - return arena; -} - mi_arena_id_t _mi_arena_id_none(void) { return NULL; } @@ -222,14 +218,14 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, &already_committed_count); // adjust the stats so we don't double count the commits if (already_committed_count > 0) { - _mi_stat_adjust_decrease(&_mi_stats_main.committed, mi_size_of_slices(already_committed_count), true /* on alloc */); + mi_subproc_stat_adjust_decrease(arena->subproc, committed, mi_size_of_slices(already_committed_count), true /* on alloc */); } // now actually commit bool commit_zero = false; if (!_mi_os_commit(p, mi_size_of_slices(slice_count), &commit_zero)) { // failed to commit (todo: give warning?) if (already_committed_count > 0) { - _mi_stat_increase(&_mi_stats_main.committed, mi_size_of_slices(already_committed_count)); + mi_subproc_stat_increase(arena->subproc, committed, mi_size_of_slices(already_committed_count)); } memid->initially_committed = false; } @@ -251,7 +247,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( // if the OS has overcommit, and this is the first time we access these pages, then // count the commit now (as at arena reserve we didn't count those commits as these are on-demand) if (_mi_os_has_overcommit() && touched_slices > 0) { - _mi_stat_increase(&_mi_stats_main.committed, mi_size_of_slices(touched_slices)); + mi_subproc_stat_increase( arena->subproc, committed, mi_size_of_slices(touched_slices)); } } // tool support @@ -325,18 +321,18 @@ static bool mi_arena_reserve(mi_subproc_t* subproc, size_t req_size, bool allow_ // on an OS with overcommit (Linux) we don't count the commit yet as it is on-demand. Once a slice // is actually allocated for the first time it will be counted. const bool adjust = (overcommit && arena_commit); - if (adjust) { _mi_stat_adjust_decrease(&_mi_stats_main.committed, arena_reserve, true /* on alloc */); } + if (adjust) { mi_subproc_stat_adjust_decrease( subproc, committed, arena_reserve, true /* on alloc */); } // and try to reserve the arena int err = mi_reserve_os_memory_ex2(subproc, arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id); if (err != 0) { - if (adjust) { _mi_stat_adjust_increase(&_mi_stats_main.committed, arena_reserve, true); } // roll back + if (adjust) { mi_subproc_stat_adjust_increase( subproc, committed, arena_reserve, true); } // roll back // failed, try a smaller size? const size_t small_arena_reserve = (MI_SIZE_BITS == 32 ? 128*MI_MiB : 1*MI_GiB); - if (adjust) { _mi_stat_adjust_decrease(&_mi_stats_main.committed, arena_reserve, true); } + if (adjust) { mi_subproc_stat_adjust_decrease( subproc, committed, arena_reserve, true); } if (arena_reserve > small_arena_reserve) { // try again err = mi_reserve_os_memory_ex(small_arena_reserve, arena_commit, allow_large, false /* exclusive? 
*/, arena_id); - if (err != 0 && adjust) { _mi_stat_adjust_increase(&_mi_stats_main.committed, arena_reserve, true); } // roll back + if (err != 0 && adjust) { mi_subproc_stat_adjust_increase( subproc, committed, arena_reserve, true); } // roll back } } return (err==0); @@ -579,8 +575,8 @@ static mi_page_t* mi_arena_page_try_find_abandoned(mi_subproc_t* subproc, size_t mi_assert_internal(mi_page_is_abandoned(page)); mi_assert_internal(mi_arena_has_page(arena,page)); mi_atomic_decrement_relaxed(&subproc->abandoned_count[bin]); - _mi_stat_decrease(&_mi_stats_main.pages_abandoned, 1); - _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_alloc, 1); + mi_subproc_stat_decrease( arena->subproc, pages_abandoned, 1); + mi_subproc_stat_counter_increase(arena->subproc, pages_reclaim_on_alloc, 1); _mi_page_free_collect(page, false); // update `used` count mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); @@ -828,12 +824,13 @@ void _mi_arena_page_abandon(mi_page_t* page) { const bool wasclear = mi_bitmap_set(arena->pages_abandoned[bin], slice_index); MI_UNUSED(wasclear); mi_assert_internal(wasclear); mi_atomic_increment_relaxed(&arena->subproc->abandoned_count[bin]); + mi_subproc_stat_increase(arena->subproc, pages_abandoned, 1); } else { // page is full (or a singleton), page is OS/externally allocated // leave as is; it will be reclaimed when an object is free'd in the page - } - _mi_stat_increase(&_mi_stats_main.pages_abandoned, 1); + mi_subproc_stat_increase(_mi_subproc(), pages_abandoned, 1); + } _mi_page_unown(page); } @@ -850,8 +847,9 @@ bool _mi_arena_page_try_reabandon_to_mapped(mi_page_t* page) { return false; } else { - _mi_stat_counter_increase(&_mi_stats_main.pages_reabandon_full, 1); - _mi_stat_adjust_decrease(&_mi_stats_main.pages_abandoned, 1, true /* on alloc */); // adjust as we are not abandoning fresh + mi_subproc_t* subproc = _mi_subproc(); + mi_subproc_stat_counter_increase( subproc, pages_reabandon_full, 1); + mi_subproc_stat_adjust_decrease( subproc, pages_abandoned, 1, true /* on alloc */); // adjust as we are not abandoning fresh _mi_arena_page_abandon(page); return true; } @@ -879,13 +877,14 @@ void _mi_arena_page_unabandon(mi_page_t* page) { mi_bitmap_clear_once_set(arena->pages_abandoned[bin], slice_index); mi_page_clear_abandoned_mapped(page); mi_atomic_decrement_relaxed(&arena->subproc->abandoned_count[bin]); + mi_subproc_stat_decrease(arena->subproc, pages_abandoned, 1); } else { - // page is full (or a singleton), page is OS/nly allocated + // page is full (or a singleton), page is OS allocated // nothing to do // TODO: maintain count of these as well? - } - _mi_stat_decrease(&_mi_stats_main.pages_abandoned, 1); + mi_subproc_stat_decrease(_mi_subproc(), pages_abandoned, 1); + } } void _mi_arena_reclaim_all_abandoned(mi_heap_t* heap) { @@ -1016,7 +1015,7 @@ void _mi_arena_unsafe_destroy_all(void) { Add an arena. 
----------------------------------------------------------- */ -static bool mi_arena_add(mi_subproc_t* subproc, mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) { +static bool mi_arena_add(mi_subproc_t* subproc, mi_arena_t* arena, mi_arena_id_t* arena_id) { mi_assert_internal(arena != NULL); mi_assert_internal(arena->slice_count > 0); if (arena_id != NULL) { *arena_id = NULL; } @@ -1043,7 +1042,7 @@ static bool mi_arena_add(mi_subproc_t* subproc, mi_arena_t* arena, mi_arena_id_t return false; } - _mi_stat_counter_increase(&stats->arena_count,1); + mi_subproc_stat_counter_increase(arena->subproc, arena_count, 1); mi_atomic_store_ptr_release(mi_arena_t,&subproc->arenas[i], arena); if (arena_id != NULL) { *arena_id = arena; } return true; @@ -1149,7 +1148,7 @@ static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t s mi_bitmap_setN(arena->slices_dirty, 0, info_slices, NULL); } - return mi_arena_add(subproc, arena, arena_id, &_mi_stats_main); + return mi_arena_add(subproc, arena, arena_id); } @@ -1414,7 +1413,7 @@ static bool mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_c // update committed bitmap if (needs_recommit) { - _mi_stat_adjust_decrease(&_mi_stats_main.committed, mi_size_of_slices(slice_count - already_committed), false /* on freed */); + mi_subproc_stat_adjust_decrease( arena->subproc, committed, mi_size_of_slices(slice_count - already_committed), false /* on freed */); mi_bitmap_clearN(arena->slices_committed, slice_index, slice_count); } return needs_recommit; @@ -1506,7 +1505,7 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) if (mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire_base, (mi_msecs_t)0)) { mi_atomic_storei64_release(&arena->purge_expire_extend, (mi_msecs_t)0); // and also reset the extend } - _mi_stat_counter_increase(&_mi_stats_main.arena_purges, 1); + mi_subproc_stat_counter_increase(arena->subproc, arena_purges, 1); // go through all purge info's (with max MI_BFIELD_BITS ranges at a time) // this also clears those ranges atomically (so any newly freed blocks will get purged next @@ -1647,7 +1646,7 @@ mi_decl_export bool mi_arena_reload(void* start, size_t size, bool is_committed, arena->is_exclusive = true; arena->is_large = is_large; arena->subproc = NULL; - if (!mi_arena_add(_mi_subproc(), arena, arena_id, &_mi_stats_main)) { + if (!mi_arena_add(_mi_subproc(), arena, arena_id)) { return false; } mi_arena_pages_reregister(arena); diff --git a/src/bitmap.c b/src/bitmap.c index 6352e4ea..e4a4cc2d 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -106,7 +106,9 @@ static inline void mi_bfield_atomic_clear_once_set(_Atomic(mi_bfield_t)*b, size_ do { if mi_unlikely((old&mask) == 0) { old = mi_atomic_load_acquire(b); - if ((old&mask)==0) { _mi_stat_counter_increase(&_mi_stats_main.pages_unabandon_busy_wait, 1); } + if ((old&mask)==0) { + mi_subproc_stat_counter_increase(_mi_subproc(), pages_unabandon_busy_wait, 1); + } while ((old&mask)==0) { // busy wait mi_atomic_yield(); old = mi_atomic_load_acquire(b); diff --git a/src/free.c b/src/free.c index 770856da..88f784c7 100644 --- a/src/free.c +++ b/src/free.c @@ -242,7 +242,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { // first remove it from the abandoned pages in the arena -- this waits for any readers to finish _mi_arena_page_unabandon(page); _mi_heap_page_reclaim(tagheap, page); - _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_free, 1); + mi_heap_stat_counter_increase(tagheap, 
pages_reclaim_on_free, 1); return; } } diff --git a/src/heap.c b/src/heap.c index e8743691..d82b383f 100644 --- a/src/heap.c +++ b/src/heap.c @@ -141,7 +141,7 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) // collect all pages owned by this thread mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL); - + // collect arenas (this is program wide so don't force purges on abandonment of threads) _mi_arenas_collect(collect == MI_FORCE /* force purge? */); } @@ -183,9 +183,9 @@ mi_heap_t* mi_heap_get_backing(void) { } // todo: make order of parameters consistent (but would that break compat with CPython?) -void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t heap_tag, mi_tld_t* tld) +void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t heap_tag, mi_tld_t* tld) { - mi_assert_internal(heap!=NULL); + mi_assert_internal(heap!=NULL); mi_memid_t memid = heap->memid; _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t)); heap->memid = memid; @@ -204,7 +204,7 @@ void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint heap->full_page_retain = heap->full_page_retain / 4; } } - + if (heap->tld->heap_backing == NULL) { heap->tld->heap_backing = heap; // first heap becomes the backing heap _mi_random_init(&heap->random); @@ -240,7 +240,7 @@ mi_heap_t* _mi_heap_create(int heap_tag, bool allow_destroy, mi_arena_id_t arena mi_decl_nodiscard mi_heap_t* mi_heap_new_ex(int heap_tag, bool allow_destroy, mi_arena_id_t arena_id) { mi_heap_t* bheap = mi_heap_get_backing(); mi_assert_internal(bheap != NULL); - return _mi_heap_create(heap_tag, allow_destroy, arena_id, bheap->tld); + return _mi_heap_create(heap_tag, allow_destroy, arena_id, bheap->tld); } mi_decl_nodiscard mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id) { @@ -333,17 +333,17 @@ static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_ if (bsize > MI_LARGE_MAX_OBJ_SIZE) { mi_heap_stat_decrease(heap, huge, bsize); } -#if (MI_STAT) + #if (MI_STAT) _mi_page_free_collect(page, false); // update used count const size_t inuse = page->used; if (bsize <= MI_LARGE_MAX_OBJ_SIZE) { mi_heap_stat_decrease(heap, normal, bsize * inuse); -#if (MI_STAT>1) + #if (MI_STAT>1) mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], inuse); -#endif + #endif } mi_heap_stat_decrease(heap, malloc, bsize * inuse); // todo: off for aligned blocks... 
-#endif + #endif /// pretend it is all free now mi_assert_internal(mi_page_thread_free(page) == NULL); @@ -460,7 +460,7 @@ void mi_heap_delete(mi_heap_t* heap) // transfer still used pages to the backing heap mi_heap_absorb(bheap, heap); } - else + else */ { // abandon all pages diff --git a/src/init.c b/src/init.c index 177ca2bd..5159941a 100644 --- a/src/init.c +++ b/src/init.c @@ -34,7 +34,7 @@ const mi_page_t _mi_page_empty = { { 0, 0 }, // keys #endif NULL, // xheap - NULL, NULL, // next, prev + NULL, NULL, // next, prev MI_MEMID_STATIC // memid }; @@ -103,7 +103,7 @@ static mi_decl_cache_align mi_tld_t tld_empty = { 0, // thread_seq &subproc_main, // subproc NULL, // heap_backing - NULL, // heaps list + NULL, // heaps list 0, // heartbeat false, // recurse false, // is_in_threadpool @@ -139,7 +139,7 @@ static mi_decl_cache_align mi_tld_t tld_main = { 0, // thread_seq &subproc_main, // subproc &heap_main, // heap_backing - &heap_main, // heaps list + &heap_main, // heaps list 0, // heartbeat false, // recurse false, // is_in_threadpool @@ -165,7 +165,7 @@ mi_decl_cache_align mi_heap_t heap_main = { #endif MI_SMALL_PAGES_EMPTY, MI_PAGE_QUEUES_EMPTY, - MI_MEMID_STATIC + MI_MEMID_STATIC }; @@ -237,7 +237,7 @@ static void mi_tld_main_init(void) { // Initialization of the (statically allocated) main heap, and the main tld and subproc. static void mi_heap_main_init(void) { - if (heap_main.cookie == 0) { + if (heap_main.cookie == 0) { mi_subproc_main_init(); mi_tld_main_init(); // heap @@ -249,7 +249,7 @@ static void mi_heap_main_init(void) { #endif heap_main.cookie = _mi_heap_random_next(&heap_main); heap_main.keys[0] = _mi_heap_random_next(&heap_main); - heap_main.keys[1] = _mi_heap_random_next(&heap_main); + heap_main.keys[1] = _mi_heap_random_next(&heap_main); _mi_heap_guarded_init(&heap_main); heap_main.allow_page_abandon = (mi_option_get(mi_option_full_page_retain) >= 0); heap_main.full_page_retain = mi_option_get_clamp(mi_option_full_page_retain, -1, 32); @@ -266,14 +266,21 @@ mi_heap_t* heap_main_get(void) { Thread local data ----------------------------------------------------------- */ -// Thread sequence number -static _Atomic(size_t) mi_tcount; +// Count current and total created threads +static _Atomic(size_t) thread_count = MI_ATOMIC_VAR_INIT(1); +static _Atomic(size_t) thread_total_count; + +size_t _mi_current_thread_count(void) { + return mi_atomic_load_relaxed(&thread_count); +} + // The mimalloc thread local data -mi_decl_thread mi_tld_t* mi_tld; +mi_decl_thread mi_tld_t* thread_tld = &tld_empty; // Allocate fresh tld static mi_tld_t* mi_tld_alloc(void) { + mi_atomic_increment_relaxed(&thread_count); if (_mi_is_main_thread()) { return &tld_main; } @@ -292,7 +299,7 @@ static mi_tld_t* mi_tld_alloc(void) { tld->heaps = NULL; tld->subproc = &subproc_main; tld->thread_id = _mi_prim_thread_id(); - tld->thread_seq = mi_atomic_add_acq_rel(&mi_tcount, 1); + tld->thread_seq = mi_atomic_add_acq_rel(&thread_total_count, 1); tld->is_in_threadpool = _mi_prim_thread_is_in_threadpool(); return tld; } @@ -301,28 +308,38 @@ static mi_tld_t* mi_tld_alloc(void) { #define MI_TLD_INVALID ((mi_tld_t*)1) mi_decl_noinline static void mi_tld_free(void) { - mi_tld_t* tld = _mi_tld(); - mi_tld = MI_TLD_INVALID; - _mi_meta_free(tld, sizeof(mi_tld_t), tld->memid); + mi_tld_t* tld = _mi_tld(); + if (tld != NULL && tld != MI_TLD_INVALID) { + _mi_stats_done(&tld->stats); + _mi_meta_free(tld, sizeof(mi_tld_t), tld->memid); + } + tld = MI_TLD_INVALID; + mi_atomic_decrement_relaxed(&thread_count); } 
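/* For context: a minimal usage sketch (illustration only, not part of this patch)
   of the public sub-process API that this per-thread tld/subproc plumbing supports.
   It assumes linking against mimalloc; the driver code itself is hypothetical. */
#include <mimalloc.h>

int main(void) {
  mi_subproc_id_t sub = mi_subproc_new();   /* create a fresh sub-process */
  mi_subproc_add_current_thread(sub);       /* bind this thread (still in the main sub-process) to it */
  void* p = mi_malloc(64);                  /* pages for this thread now stay within `sub` */
  mi_free(p);
  mi_subproc_delete(sub);                   /* deletion only proceeds once the sub-process holds no OS pages */
  return 0;
}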
mi_decl_noinline mi_tld_t* _mi_tld(void) { - if (mi_tld == MI_TLD_INVALID) { - _mi_error_message(EFAULT, "internal error: tld accessed after the thread terminated\n"); - mi_tld = NULL; + mi_tld_t* tld = thread_tld; + if (tld == MI_TLD_INVALID) { + _mi_error_message(EFAULT, "internal error: tld is accessed after the thread terminated\n"); + thread_tld = &tld_empty; } - if (mi_tld==NULL) { - mi_tld = mi_tld_alloc(); + if (tld==&tld_empty) { + thread_tld = tld = mi_tld_alloc(); } - return mi_tld; + return tld; } mi_subproc_t* _mi_subproc(void) { - if (_mi_is_main_thread()) { // during initialization we should not recurse over reading the _mi_tld - return &subproc_main; + // should work without doing initialization (as it may be called from `_mi_tld -> mi_tld_alloc ... -> os_alloc -> _mi_subproc()` + // todo: this will still fail on OS systems where the first access to a thread-local causes allocation. + // on such systems we can check for this with the _mi_prim_get_default_heap as those are protected (by being + // stored in a TLS slot for example) + mi_heap_t* heap = mi_prim_get_default_heap(); + if (heap == NULL || heap == &_mi_heap_empty) { + return _mi_subproc_main(); } else { - return _mi_tld()->subproc; + return thread_tld->subproc; // don't call `_mi_tld()` } } @@ -396,11 +413,11 @@ static bool _mi_thread_heap_init(void) { //mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_prim_get_default_heap()); } else { - // allocates tld data - // note: we cannot access thread-locals yet as that can cause (recursive) allocation + // allocates tld data + // note: we cannot access thread-locals yet as that can cause (recursive) allocation // (on macOS <= 14 for example where the loader allocates thread-local data on demand). - mi_tld_t* tld = mi_tld_alloc(); - + mi_tld_t* tld = mi_tld_alloc(); + // allocate and initialize the heap mi_heap_t* heap = _mi_heap_create(0 /* default tag */, false /* allow destroy? */, _mi_arena_id_none(), tld); @@ -409,7 +426,7 @@ static bool _mi_thread_heap_init(void) { _mi_heap_set_default_direct(heap); // now that the heap is set for this thread, we can set the thread-local tld. 
- mi_tld = tld; + thread_tld = tld; } return false; } @@ -444,9 +461,6 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) { _mi_heap_collect_abandon(heap); } - // merge stats - _mi_stats_done(&heap->tld->stats); - // free heap meta data _mi_meta_free(heap, sizeof(mi_heap_t), heap->memid); @@ -494,11 +508,6 @@ bool _mi_is_main_thread(void) { return (tld_main.thread_id==0 || tld_main.thread_id == _mi_thread_id()); } -static _Atomic(size_t) thread_count = MI_ATOMIC_VAR_INIT(1); - -size_t _mi_current_thread_count(void) { - return mi_atomic_load_relaxed(&thread_count); -} // This is called from the `mi_malloc_generic` void mi_thread_init(void) mi_attr_noexcept @@ -511,8 +520,7 @@ void mi_thread_init(void) mi_attr_noexcept // fiber/pthread key to a non-zero value, ensuring `_mi_thread_done` is called) if (_mi_thread_heap_init()) return; // returns true if already initialized - _mi_stat_increase(&_mi_stats_main.threads, 1); - mi_atomic_increment_relaxed(&thread_count); + mi_subproc_stat_increase(_mi_subproc_main(), threads, 1); //_mi_verbose_message("thread init: 0x%zx\n", _mi_thread_id()); } @@ -534,15 +542,14 @@ void _mi_thread_done(mi_heap_t* heap) } // adjust stats - mi_atomic_decrement_relaxed(&thread_count); - _mi_stat_decrease(&_mi_stats_main.threads, 1); + mi_subproc_stat_decrease(_mi_subproc_main(), threads, 1); // check thread-id as on Windows shutdown with FLS the main (exit) thread may call this on thread-local heaps... if (heap->tld->thread_id != _mi_prim_thread_id()) return; // abandon the thread local heap _mi_thread_heap_done(heap); // returns true if already ran - + // free thread local data mi_tld_free(); } @@ -654,7 +661,7 @@ void mi_process_init(void) mi_attr_noexcept { _mi_prim_thread_associate_default_heap(NULL); #endif - mi_stats_reset(); // only call stat reset *after* thread init (or the heap tld == NULL) + mi_stats_reset(); // only call stat reset *after* thread init (or the heap tld == NULL) mi_track_init(); if (mi_option_is_enabled(mi_option_reserve_huge_os_pages)) { diff --git a/src/os.c b/src/os.c index 86ecb16b..53e8f571 100644 --- a/src/os.c +++ b/src/os.c @@ -114,9 +114,9 @@ static void mi_os_prim_free(void* addr, size_t size, bool still_committed) { _mi_warning_message("unable to free OS memory (error: %d (0x%x), size: 0x%zx bytes, address: %p)\n", err, err, size, addr); } if (still_committed) { - _mi_stat_decrease(&os_stats->committed, size); + mi_os_stat_decrease(committed, size); } - _mi_stat_decrease(&os_stats->reserved, size); + mi_os_stat_decrease(reserved, size); } void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t memid) { @@ -171,11 +171,11 @@ static void* mi_os_prim_alloc_at(void* hint_addr, size_t size, size_t try_alignm _mi_warning_message("unable to allocate OS memory (error: %d (0x%x), addr: %p, size: 0x%zx bytes, align: 0x%zx, commit: %d, allow large: %d)\n", err, err, hint_addr, size, try_alignment, commit, allow_large); } - _mi_stat_counter_increase(&os_stats->mmap_calls, 1); + mi_os_stat_counter_increase(mmap_calls, 1); if (p != NULL) { - _mi_stat_increase(&os_stats->reserved, size); + mi_os_stat_increase(reserved, size); if (commit) { - _mi_stat_increase(&os_stats->committed, size); + mi_os_stat_increase(committed, size); // seems needed for asan (or `mimalloc-test-api` fails) #ifdef MI_TRACK_ASAN if (*is_zero) { mi_track_mem_defined(p,size); } @@ -290,7 +290,7 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo if (size == 0) return NULL; size = _mi_os_good_alloc_size(size); 
alignment = _mi_align_up(alignment, _mi_os_page_size()); - + bool os_is_large = false; bool os_is_zero = false; void* os_base = NULL; @@ -379,8 +379,8 @@ static void* mi_os_page_align_area_conservative(void* addr, size_t size, size_t* bool _mi_os_commit(void* addr, size_t size, bool* is_zero) { if (is_zero != NULL) { *is_zero = false; } - _mi_stat_increase(&os_stats->committed, size); // use size for precise commit vs. decommit - _mi_stat_counter_increase(&os_stats->commit_calls, 1); + mi_os_stat_increase(committed, size); // use size for precise commit vs. decommit + mi_os_stat_counter_increase(commit_calls, 1); // page align range size_t csize; @@ -408,7 +408,7 @@ bool _mi_os_commit(void* addr, size_t size, bool* is_zero) { static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit) { mi_assert_internal(needs_recommit!=NULL); - _mi_stat_decrease(&os_stats->committed, size); + mi_os_stat_decrease(committed, size); // page align size_t csize; @@ -440,8 +440,8 @@ bool _mi_os_reset(void* addr, size_t size) { size_t csize; void* start = mi_os_page_align_area_conservative(addr, size, &csize); if (csize == 0) return true; // || _mi_os_is_huge_reserved(addr) - _mi_stat_increase(&os_stats->reset, csize); - _mi_stat_counter_increase(&os_stats->reset_calls, 1); + mi_os_stat_increase(reset, csize); + mi_os_stat_counter_increase(reset_calls, 1); #if (MI_DEBUG>1) && !MI_SECURE && !MI_TRACK_ENABLED // && !MI_TSAN memset(start, 0, csize); // pretend it is eagerly reset @@ -460,8 +460,8 @@ bool _mi_os_reset(void* addr, size_t size) { bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset) { if (mi_option_get(mi_option_purge_delay) < 0) return false; // is purging allowed? - _mi_stat_counter_increase(&os_stats->purge_calls, 1); - _mi_stat_increase(&os_stats->purged, size); + mi_os_stat_counter_increase(purge_calls, 1); + mi_os_stat_increase(purged, size); if (mi_option_is_enabled(mi_option_purge_decommits) && // should decommit? !_mi_preloading()) // don't decommit during preloading (unsafe) @@ -595,8 +595,8 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse // success, record it page++; // increase before timeout check (see issue #711) - _mi_stat_increase(&os_stats->committed, MI_HUGE_OS_PAGE_SIZE); - _mi_stat_increase(&os_stats->reserved, MI_HUGE_OS_PAGE_SIZE); + mi_os_stat_increase(committed, MI_HUGE_OS_PAGE_SIZE); + mi_os_stat_increase(reserved, MI_HUGE_OS_PAGE_SIZE); // check for timeout if (max_msecs > 0) { diff --git a/src/page.c b/src/page.c index 0444b47e..31dbcc7d 100644 --- a/src/page.c +++ b/src/page.c @@ -387,9 +387,9 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept { const size_t bsize = mi_page_block_size(page); if mi_likely( /* bsize < MI_MAX_RETIRE_SIZE && */ !mi_page_queue_is_special(pq)) { // not full or huge queue? if (pq->last==page && pq->first==page) { // the only page in the queue? - mi_stat_counter_increase(_mi_stats_main.page_no_retire,1); - page->retire_expire = (bsize <= MI_SMALL_MAX_OBJ_SIZE ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4); mi_heap_t* heap = mi_page_heap(page); + mi_debug_heap_stat_counter_increase(heap, page_no_retire, 1); + page->retire_expire = (bsize <= MI_SMALL_MAX_OBJ_SIZE ? 
MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4); mi_assert_internal(pq >= heap->pages); const size_t index = pq - heap->pages; mi_assert_internal(index < MI_BIN_FULL && index < MI_BIN_HUGE); @@ -554,7 +554,7 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page) { size_t page_size; //uint8_t* page_start = mi_page_area(page, &page_size); - mi_heap_stat_counter_increase(heap, pages_extended, 1); + mi_debug_heap_stat_counter_increase(heap, pages_extended, 1); // calculate the extend count const size_t bsize = mi_page_block_size(page); @@ -583,7 +583,7 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page) { } // enable the new free list page->capacity += (uint16_t)extend; - mi_heap_stat_increase(heap, page_committed, extend * bsize); + mi_debug_heap_stat_increase(heap, page_committed, extend * bsize); mi_assert_expensive(mi_page_is_valid_init(page)); } @@ -709,8 +709,8 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m page = next; } // for each page - mi_heap_stat_counter_increase(heap, searches, count); - + mi_debug_heap_stat_counter_increase(heap, searches, count); + // set the page to the best candidate if (page_candidate != NULL) { page = page_candidate; diff --git a/src/stats.c b/src/stats.c index bb17b936..2a395ed5 100644 --- a/src/stats.c +++ b/src/stats.c @@ -19,88 +19,93 @@ terms of the MIT license. A copy of the license can be found in the file Statistics operations ----------------------------------------------------------- */ -static bool mi_is_in_main(void* stat) { - return ((uint8_t*)stat >= (uint8_t*)&_mi_stats_main - && (uint8_t*)stat < ((uint8_t*)&_mi_stats_main + sizeof(mi_stats_t))); +static void mi_stat_update_mt(mi_stat_count_t* stat, int64_t amount) { + if (amount == 0) return; + // add atomically + int64_t current = mi_atomic_addi64_relaxed(&stat->current, amount); + mi_atomic_maxi64_relaxed(&stat->peak, current + amount); + if (amount > 0) { + mi_atomic_addi64_relaxed(&stat->allocated, amount); + } + else { + mi_atomic_addi64_relaxed(&stat->freed, -amount); + } } static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) { if (amount == 0) return; - if mi_unlikely(mi_is_in_main(stat)) - { - // add atomically (for abandoned pages) - int64_t current = mi_atomic_addi64_relaxed(&stat->current, amount); - mi_atomic_maxi64_relaxed(&stat->peak, current + amount); - if (amount > 0) { - mi_atomic_addi64_relaxed(&stat->allocated,amount); - } - else { - mi_atomic_addi64_relaxed(&stat->freed, -amount); - } + // add thread local + stat->current += amount; + if (stat->current > stat->peak) stat->peak = stat->current; + if (amount > 0) { + stat->allocated += amount; } else { - // add thread local - stat->current += amount; - if (stat->current > stat->peak) stat->peak = stat->current; - if (amount > 0) { - stat->allocated += amount; - } - else { - stat->freed += -amount; - } + stat->freed += -amount; } } + // Adjust stats to compensate; for example before committing a range, // first adjust downwards with parts that were already committed so // we avoid double counting. +static void mi_stat_adjust_mt(mi_stat_count_t* stat, int64_t amount, bool on_alloc) { + if (amount == 0) return; + // adjust atomically + mi_atomic_addi64_relaxed(&stat->current, amount); + mi_atomic_addi64_relaxed((on_alloc ? 
&stat->allocated : &stat->freed), amount); +} + static void mi_stat_adjust(mi_stat_count_t* stat, int64_t amount, bool on_alloc) { if (amount == 0) return; - if mi_unlikely(mi_is_in_main(stat)) - { - // adjust atomically - mi_atomic_addi64_relaxed(&stat->current, amount); - mi_atomic_addi64_relaxed((on_alloc ? &stat->allocated : &stat->freed), amount); + stat->current += amount; + if (on_alloc) { + stat->allocated += amount; } else { - // don't affect the peak - stat->current += amount; - if (on_alloc) { - stat->allocated += amount; - } - else { - stat->freed += amount; - } + stat->freed += amount; } } -void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) { - if (mi_is_in_main(stat)) { - mi_atomic_addi64_relaxed( &stat->count, 1 ); - mi_atomic_addi64_relaxed( &stat->total, (int64_t)amount ); - } - else { - stat->count++; - stat->total += amount; - } +void __mi_stat_counter_increase_mt(mi_stat_counter_t* stat, size_t amount) { + mi_atomic_addi64_relaxed(&stat->count, 1); + mi_atomic_addi64_relaxed(&stat->total, (int64_t)amount); } -void _mi_stat_increase(mi_stat_count_t* stat, size_t amount) { +void __mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) { + stat->count++; + stat->total += amount; +} + +void __mi_stat_increase_mt(mi_stat_count_t* stat, size_t amount) { + mi_stat_update_mt(stat, (int64_t)amount); +} +void __mi_stat_increase(mi_stat_count_t* stat, size_t amount) { mi_stat_update(stat, (int64_t)amount); } -void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount) { +void __mi_stat_decrease_mt(mi_stat_count_t* stat, size_t amount) { + mi_stat_update_mt(stat, -((int64_t)amount)); +} +void __mi_stat_decrease(mi_stat_count_t* stat, size_t amount) { mi_stat_update(stat, -((int64_t)amount)); } -void _mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount, bool on_alloc) { +void __mi_stat_adjust_increase_mt(mi_stat_count_t* stat, size_t amount, bool on_alloc) { + mi_stat_adjust_mt(stat, (int64_t)amount, on_alloc); +} +void __mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount, bool on_alloc) { mi_stat_adjust(stat, (int64_t)amount, on_alloc); } -void _mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount, bool on_alloc) { +void __mi_stat_adjust_decrease_mt(mi_stat_count_t* stat, size_t amount, bool on_alloc) { + mi_stat_adjust_mt(stat, -((int64_t)amount), on_alloc); +} +void __mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount, bool on_alloc) { mi_stat_adjust(stat, -((int64_t)amount), on_alloc); } + // must be thread safe as it is called from stats_merge static void mi_stat_add(mi_stat_count_t* stat, const mi_stat_count_t* src, int64_t unit) { if (stat==src) return; @@ -401,27 +406,29 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) static mi_msecs_t mi_process_start; // = 0 -static mi_stats_t* mi_stats_get_default(void) { - mi_heap_t* heap = mi_heap_get_default(); - return &heap->tld->stats; +// return thread local stats +static mi_stats_t* mi_get_tld_stats(void) { + return &_mi_tld()->stats; } static void mi_stats_merge_from(mi_stats_t* stats) { - if (stats != &_mi_stats_main) { - mi_stats_add(&_mi_stats_main, stats); - memset(stats, 0, sizeof(mi_stats_t)); + mi_subproc_t* subproc = _mi_subproc(); + if (stats != &subproc->stats) { + mi_stats_add(&subproc->stats, stats); + _mi_memzero(stats, sizeof(mi_stats_t)); } } void mi_stats_reset(void) mi_attr_noexcept { - mi_stats_t* stats = mi_stats_get_default(); - if (stats != &_mi_stats_main) { memset(stats, 0, sizeof(mi_stats_t)); } - 
memset(&_mi_stats_main, 0, sizeof(mi_stats_t)); + mi_stats_t* stats = mi_get_tld_stats(); + mi_subproc_t* subproc = _mi_subproc(); + if (stats != &subproc->stats) { _mi_memzero(stats, sizeof(mi_stats_t)); } + _mi_memzero(&subproc->stats, sizeof(mi_stats_t)); if (mi_process_start == 0) { mi_process_start = _mi_clock_start(); }; } void mi_stats_merge(void) mi_attr_noexcept { - mi_stats_merge_from( mi_stats_get_default() ); + mi_stats_merge_from( mi_get_tld_stats() ); } void _mi_stats_done(mi_stats_t* stats) { // called from `mi_thread_done` @@ -429,8 +436,8 @@ void _mi_stats_done(mi_stats_t* stats) { // called from `mi_thread_done` } void mi_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept { - mi_stats_merge_from(mi_stats_get_default()); - _mi_stats_print(&_mi_stats_main, out, arg); + mi_stats_merge_from(mi_get_tld_stats()); + _mi_stats_print(&_mi_subproc()->stats, out, arg); } void mi_stats_print(void* out) mi_attr_noexcept { @@ -439,7 +446,7 @@ void mi_stats_print(void* out) mi_attr_noexcept { } void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept { - _mi_stats_print(mi_stats_get_default(), out, arg); + _mi_stats_print(mi_get_tld_stats(), out, arg); } @@ -473,11 +480,12 @@ mi_msecs_t _mi_clock_end(mi_msecs_t start) { mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) mi_attr_noexcept { + mi_subproc_t* subproc = _mi_subproc(); mi_process_info_t pinfo; _mi_memzero_var(pinfo); pinfo.elapsed = _mi_clock_end(mi_process_start); - pinfo.current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.current)); - pinfo.peak_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.peak)); + pinfo.current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)(&subproc->stats.committed.current))); + pinfo.peak_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)(&subproc->stats.committed.peak))); pinfo.current_rss = pinfo.current_commit; pinfo.peak_rss = pinfo.peak_commit; pinfo.utime = 0; diff --git a/test/test-stress.c b/test/test-stress.c index b35743df..0920a02e 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -48,10 +48,10 @@ static int ITER = 20; static int THREADS = 32; static int SCALE = 50; static int ITER = 50; -#elif 0 -static int THREADS = 64; -static int SCALE = 400; -static int ITER = 10; +#elif 1 +static int THREADS = 32; +static int SCALE = 25; +static int ITER = 50; #define ALLOW_LARGE true #else static int THREADS = 32; // more repeatable if THREADS <= #processors From 95aeda4cdda2431c20ed9fa3facb241b142ae773 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 10:53:34 -0800 Subject: [PATCH 07/16] merge subproc stats on delete --- include/mimalloc/internal.h | 1 + src/init.c | 4 ++++ src/stats.c | 23 +++++++++++------------ 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 7774b378..e316de94 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -203,6 +203,7 @@ void _mi_heap_page_reclaim(mi_heap_t* heap, mi_page_t* page); // "stats.c" void _mi_stats_done(mi_stats_t* stats); +void _mi_stats_merge_from(mi_stats_t* to, mi_stats_t* from); mi_msecs_t _mi_clock_now(void); mi_msecs_t _mi_clock_end(mi_msecs_t start); mi_msecs_t _mi_clock_start(void); diff --git a/src/init.c b/src/init.c index 
5159941a..3af4f4ef 100644 --- a/src/init.c +++ b/src/init.c @@ -382,6 +382,10 @@ void mi_subproc_delete(mi_subproc_id_t subproc_id) { mi_lock_release(&subproc->os_pages_lock); } if (!safe_to_delete) return; + + // merge stats back into the main subproc? + _mi_stats_merge_from(&_mi_subproc_main()->stats, &subproc->stats); + // safe to release // todo: should we refcount subprocesses? mi_lock_done(&subproc->os_pages_lock); diff --git a/src/stats.c b/src/stats.c index 2a395ed5..102373ec 100644 --- a/src/stats.c +++ b/src/stats.c @@ -411,14 +411,6 @@ static mi_stats_t* mi_get_tld_stats(void) { return &_mi_tld()->stats; } -static void mi_stats_merge_from(mi_stats_t* stats) { - mi_subproc_t* subproc = _mi_subproc(); - if (stats != &subproc->stats) { - mi_stats_add(&subproc->stats, stats); - _mi_memzero(stats, sizeof(mi_stats_t)); - } -} - void mi_stats_reset(void) mi_attr_noexcept { mi_stats_t* stats = mi_get_tld_stats(); mi_subproc_t* subproc = _mi_subproc(); @@ -427,16 +419,23 @@ void mi_stats_reset(void) mi_attr_noexcept { if (mi_process_start == 0) { mi_process_start = _mi_clock_start(); }; } -void mi_stats_merge(void) mi_attr_noexcept { - mi_stats_merge_from( mi_get_tld_stats() ); +void _mi_stats_merge_from(mi_stats_t* to, mi_stats_t* from) { + if (to != from) { + mi_stats_add(to, from); + _mi_memzero(from, sizeof(mi_stats_t)); + } } void _mi_stats_done(mi_stats_t* stats) { // called from `mi_thread_done` - mi_stats_merge_from(stats); + _mi_stats_merge_from(&_mi_subproc()->stats, stats); +} + +void mi_stats_merge(void) mi_attr_noexcept { + _mi_stats_done( mi_get_tld_stats() ); } void mi_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept { - mi_stats_merge_from(mi_get_tld_stats()); + mi_stats_merge(); _mi_stats_print(&_mi_subproc()->stats, out, arg); } From 4ad7fedd25e0869aa6fbca2aa24fe08dd4eebc39 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 11:35:30 -0800 Subject: [PATCH 08/16] track os abandoned pages in a list --- include/mimalloc/atomic.h | 25 ++++++++--------- include/mimalloc/types.h | 4 +-- src/arena-meta.c | 7 +++-- src/arena.c | 56 ++++++++++++++++++++++++++------------- src/init.c | 11 ++++---- 5 files changed, 61 insertions(+), 42 deletions(-) diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index 0c7fafe3..fcd9efba 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -415,6 +415,8 @@ static inline void mi_atomic_yield(void) { #pragma warning(disable:26110) // unlock with holding lock #endif +#define mi_lock(lock) for(bool _go = (mi_lock_acquire(lock),true); _go; (mi_lock_release(lock), _go=false) ) + #if defined(_WIN32) #if 0 @@ -424,9 +426,8 @@ static inline void mi_atomic_yield(void) { static inline bool mi_lock_try_acquire(mi_lock_t* lock) { return TryEnterCriticalSection(lock); } -static inline bool mi_lock_acquire(mi_lock_t* lock) { +static inline void mi_lock_acquire(mi_lock_t* lock) { EnterCriticalSection(lock); - return true; } static inline void mi_lock_release(mi_lock_t* lock) { LeaveCriticalSection(lock); @@ -445,9 +446,8 @@ static inline void mi_lock_done(mi_lock_t* lock) { static inline bool mi_lock_try_acquire(mi_lock_t* lock) { return TryAcquireSRWLockExclusive(lock); } -static inline bool mi_lock_acquire(mi_lock_t* lock) { +static inline void mi_lock_acquire(mi_lock_t* lock) { AcquireSRWLockExclusive(lock); - return true; } static inline void mi_lock_release(mi_lock_t* lock) { ReleaseSRWLockExclusive(lock); @@ -468,8 +468,11 @@ static inline void mi_lock_done(mi_lock_t* lock) { static inline bool 
mi_lock_try_acquire(mi_lock_t* lock) { return (pthread_mutex_trylock(lock) == 0); } -static inline bool mi_lock_acquire(mi_lock_t* lock) { - return (pthread_mutex_lock(lock) == 0); +static inline void mi_lock_acquire(mi_lock_t* lock) { + const int err = pthread_mutex_lock(lock); + if (err != 0) { + mi_error_message(EFAULT, "internal error: lock cannot be acquired\n"); + } } static inline void mi_lock_release(mi_lock_t* lock) { pthread_mutex_unlock(lock); @@ -489,9 +492,8 @@ static inline void mi_lock_done(mi_lock_t* lock) { static inline bool mi_lock_try_acquire(mi_lock_t* lock) { return lock->try_lock(); } -static inline bool mi_lock_acquire(mi_lock_t* lock) { +static inline void mi_lock_acquire(mi_lock_t* lock) { lock->lock(); - return true; } static inline void mi_lock_release(mi_lock_t* lock) { lock->unlock(); @@ -514,12 +516,11 @@ static inline bool mi_lock_try_acquire(mi_lock_t* lock) { uintptr_t expected = 0; return mi_atomic_cas_strong_acq_rel(lock, &expected, (uintptr_t)1); } -static inline bool mi_lock_acquire(mi_lock_t* lock) { +static inline void mi_lock_acquire(mi_lock_t* lock) { for (int i = 0; i < 1000; i++) { // for at most 1000 tries? - if (mi_lock_try_acquire(lock)) return true; + if (mi_lock_try_acquire(lock)) return; mi_atomic_yield(); - } - return true; + } } static inline void mi_lock_release(mi_lock_t* lock) { mi_atomic_store_release(lock, (uintptr_t)0); diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index ca3913ad..59393848 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -556,8 +556,8 @@ typedef struct mi_subproc_s { mi_lock_t arena_reserve_lock; // lock to ensure arena's get reserved one at a time _Atomic(size_t) abandoned_count[MI_BIN_COUNT]; // total count of abandoned pages for this sub-process - mi_page_queue_t os_pages; // list of pages that OS allocated and not in an arena (only used if `mi_option_visit_abandoned` is on) - mi_lock_t os_pages_lock; // lock for the os pages list (this lock protects list operations) + mi_page_t* os_abandoned_pages; // list of pages that OS allocated and not in an arena (only used if `mi_option_visit_abandoned` is on) + mi_lock_t os_abandoned_pages_lock; // lock for the os abandoned pages list (this lock protects list operations) mi_memid_t memid; // provenance of this memory block (meta or OS) mi_stats_t stats; // sub-process statistics (tld stats are merged in on thread termination) diff --git a/src/arena-meta.c b/src/arena-meta.c index f28c50e9..a5dc8e75 100644 --- a/src/arena-meta.c +++ b/src/arena-meta.c @@ -64,12 +64,11 @@ static void* mi_meta_block_start( mi_meta_page_t* mpage, size_t block_idx ) { // allocate a fresh meta page and add it to the global list. static mi_meta_page_t* mi_meta_page_zalloc(void) { // allocate a fresh arena slice - // note: we always use subproc_main directly for the meta-data since at thread start the metadata for the - // tld and heap need to be (meta) allocated and at that time we cannot read the tld pointer (yet). + // note: careful with _mi_subproc as it may recurse into mi_tld and meta_page_zalloc again.. 
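/* Illustrative sketch of the `mi_lock(lock) { ... }` statement macro added in
   include/mimalloc/atomic.h above: a for-loop that runs its body exactly once between
   acquire and release, so a critical section reads like a block. Standalone demo with
   hypothetical demo_* names, calling pthread directly instead of mi_lock_acquire/release: */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t demo_mutex = PTHREAD_MUTEX_INITIALIZER;
static long demo_counter = 0;

// same shape as the mimalloc macro: init locks, condition runs the body once, increment unlocks
#define demo_lock(lock) \
  for (bool _go = (pthread_mutex_lock(lock), true); _go; (pthread_mutex_unlock(lock), _go = false))

int main(void) {
  demo_lock(&demo_mutex) {
    demo_counter++;            // runs exactly once while holding the lock
  }
  printf("counter: %ld\n", demo_counter);
  return 0;
}
/* Design note: a `break` or `return` inside the body would skip the release step,
   so the macro is only suitable for straight-line critical sections. */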
mi_memid_t memid; - mi_meta_page_t* mpage = (mi_meta_page_t*)_mi_arena_alloc_aligned(_mi_subproc_main(), MI_ARENA_SLICE_SIZE, MI_ARENA_SLICE_ALIGN, 0, + mi_meta_page_t* mpage = (mi_meta_page_t*)_mi_arena_alloc_aligned(_mi_subproc(), MI_ARENA_SLICE_SIZE, MI_ARENA_SLICE_ALIGN, 0, true /* commit*/, true /* allow large */, - NULL, 0 /* tseq */, &memid ); + NULL /* req arena */, 0 /* thread_seq */, &memid); if (mpage == NULL) return NULL; mi_assert_internal(_mi_is_aligned(mpage,MI_META_PAGE_ALIGN)); if (!memid.initially_zero) { diff --git a/src/arena.c b/src/arena.c index dcff8920..c4b02cf6 100644 --- a/src/arena.c +++ b/src/arena.c @@ -439,24 +439,20 @@ static mi_decl_noinline void* mi_arenas_try_alloc( // otherwise, try to reserve a new arena -- but one thread at a time.. (todo: allow 2 or 4 to reduce contention?) const size_t arena_count = mi_arenas_get_count(subproc); - if (mi_lock_acquire(&subproc->arena_reserve_lock)) { - bool ok = true; + mi_lock(&subproc->arena_reserve_lock) { if (arena_count == mi_arenas_get_count(subproc)) { // we are the first to enter the lock, reserve a fresh arena mi_arena_id_t arena_id = 0; - ok = mi_arena_reserve(subproc, mi_size_of_slices(slice_count), allow_large, req_arena, &arena_id); + mi_arena_reserve(subproc, mi_size_of_slices(slice_count), allow_large, req_arena, &arena_id); } else { // another thread already reserved a new arena } - mi_lock_release(&subproc->arena_reserve_lock); - if (ok) { - // try once more to allocate in the new arena - mi_assert_internal(req_arena == NULL); - p = mi_arenas_try_find_free(subproc, slice_count, alignment, commit, allow_large, req_arena, tseq, memid); - if (p != NULL) return p; - } - } + } + // try once more to allocate in the new arena + mi_assert_internal(req_arena == NULL); + p = mi_arenas_try_find_free(subproc, slice_count, alignment, commit, allow_large, req_arena, tseq, memid); + if (p != NULL) return p; return NULL; } @@ -685,11 +681,13 @@ static mi_page_t* mi_arena_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_ else { page->block_size_shift = 0; } + // and own it + mi_page_try_claim_ownership(page); + + // register in the page map _mi_page_map_register(page); mi_assert_internal(_mi_ptr_page(page)==page); mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); - - mi_page_try_claim_ownership(page); mi_assert_internal(mi_page_block_size(page) == block_size); mi_assert_internal(mi_page_is_abandoned(page)); mi_assert_internal(mi_page_is_owned(page)); @@ -771,7 +769,8 @@ void _mi_arena_page_free(mi_page_t* page) { mi_assert_internal(_mi_ptr_page(page)==page); mi_assert_internal(mi_page_is_owned(page)); mi_assert_internal(mi_page_all_free(page)); - mi_assert_internal(page->next==NULL); + mi_assert_internal(mi_page_is_abandoned(page)); + mi_assert_internal(page->next==NULL && page->prev==NULL); #if MI_DEBUG>1 if (page->memid.memkind==MI_MEM_ARENA && !mi_page_is_full(page)) { @@ -790,6 +789,7 @@ void _mi_arena_page_free(mi_page_t* page) { } #endif + // unregister page _mi_page_map_unregister(page); if (page->memid.memkind == MI_MEM_ARENA) { mi_bitmap_clear(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index); @@ -807,7 +807,7 @@ void _mi_arena_page_abandon(mi_page_t* page) { mi_assert_internal(mi_page_is_owned(page)); mi_assert_internal(mi_page_is_abandoned(page)); mi_assert_internal(!mi_page_all_free(page)); - mi_assert_internal(page->next==NULL); + mi_assert_internal(page->next==NULL && page->prev == NULL); if (page->memid.memkind==MI_MEM_ARENA && !mi_page_is_full(page)) { // make 
available for allocations @@ -827,8 +827,19 @@ void _mi_arena_page_abandon(mi_page_t* page) { mi_subproc_stat_increase(arena->subproc, pages_abandoned, 1); } else { - // page is full (or a singleton), page is OS/externally allocated + // page is full (or a singleton), or the page is OS/externally allocated // leave as is; it will be reclaimed when an object is free'd in the page + mi_subproc_t* subproc = _mi_subproc(); + // but for non-arena pages, add to the subproc list so these can be visited + if (page->memid.memkind != MI_MEM_ARENA && mi_option_is_enabled(mi_option_visit_abandoned)) { + mi_lock(&subproc->os_abandoned_pages_lock) { + // push in front + page->prev = NULL; + page->next = subproc->os_abandoned_pages; + if (page->next != NULL) { page->next->prev = page; } + subproc->os_abandoned_pages = page; + } + } mi_subproc_stat_increase(_mi_subproc(), pages_abandoned, 1); } _mi_page_unown(page); @@ -881,9 +892,18 @@ void _mi_arena_page_unabandon(mi_page_t* page) { } else { // page is full (or a singleton), page is OS allocated - // nothing to do - // TODO: maintain count of these as well? + mi_subproc_t* subproc = _mi_subproc(); mi_subproc_stat_decrease(_mi_subproc(), pages_abandoned, 1); + // if not an arena page, remove from the subproc os pages list + if (page->memid.memkind != MI_MEM_ARENA && mi_option_is_enabled(mi_option_visit_abandoned)) { + mi_lock(&subproc->os_abandoned_pages_lock) { + if (page->prev != NULL) { page->prev->next = page->next; } + if (page->next != NULL) { page->next->prev = page->prev; } + if (subproc->os_abandoned_pages == page) { subproc->os_abandoned_pages = page->next; } + page->next = NULL; + page->prev = NULL; + } + } } } diff --git a/src/init.c b/src/init.c index 3af4f4ef..1968ef68 100644 --- a/src/init.c +++ b/src/init.c @@ -223,7 +223,7 @@ void _mi_heap_guarded_init(mi_heap_t* heap) { static void mi_subproc_main_init(void) { if (subproc_main.memid.memkind != MI_MEM_STATIC) { subproc_main.memid = _mi_memid_create(MI_MEM_STATIC); - mi_lock_init(&subproc_main.os_pages_lock); + mi_lock_init(&subproc_main.os_abandoned_pages_lock); mi_lock_init(&subproc_main.arena_reserve_lock); } } @@ -361,7 +361,7 @@ mi_subproc_id_t mi_subproc_new(void) { mi_subproc_t* subproc = (mi_subproc_t*)_mi_meta_zalloc(sizeof(mi_subproc_t),&memid); if (subproc == NULL) return NULL; subproc->memid = memid; - mi_lock_init(&subproc->os_pages_lock); + mi_lock_init(&subproc->os_abandoned_pages_lock); mi_lock_init(&subproc->arena_reserve_lock); return subproc; } @@ -375,11 +375,10 @@ void mi_subproc_delete(mi_subproc_id_t subproc_id) { mi_subproc_t* subproc = _mi_subproc_from_id(subproc_id); // check if there are os pages still.. bool safe_to_delete = false; - if (mi_lock_acquire(&subproc->os_pages_lock)) { - if (subproc->os_pages.first == NULL) { + mi_lock(&subproc->os_abandoned_pages_lock) { + if (subproc->os_abandoned_pages == NULL) { safe_to_delete = true; } - mi_lock_release(&subproc->os_pages_lock); } if (!safe_to_delete) return; @@ -388,7 +387,7 @@ void mi_subproc_delete(mi_subproc_id_t subproc_id) { // safe to release // todo: should we refcount subprocesses? 
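/* Illustrative sketch of the doubly-linked `os_abandoned_pages` list maintained in the
   hunks above: abandoning a non-arena page pushes it at the front (under the list lock),
   unabandoning unlinks it again. Minimal standalone version with hypothetical demo_*
   names; the locking that the real code wraps around these operations is omitted: */
#include <assert.h>
#include <stddef.h>

typedef struct demo_node_s {
  struct demo_node_s* prev;
  struct demo_node_s* next;
} demo_node_t;

static demo_node_t* demo_list = NULL;   // stand-in for subproc->os_abandoned_pages

// push in front (mirrors _mi_arena_page_abandon for non-arena pages)
static void demo_push_front(demo_node_t* n) {
  n->prev = NULL;
  n->next = demo_list;
  if (n->next != NULL) { n->next->prev = n; }
  demo_list = n;
}

// unlink from anywhere in the list (mirrors _mi_arena_page_unabandon)
static void demo_remove(demo_node_t* n) {
  if (n->prev != NULL) { n->prev->next = n->next; }
  if (n->next != NULL) { n->next->prev = n->prev; }
  if (demo_list == n)  { demo_list = n->next; }
  n->next = NULL;
  n->prev = NULL;
}

int main(void) {
  demo_node_t a = {0}, b = {0};
  demo_push_front(&a);
  demo_push_front(&b);     // list: b -> a
  demo_remove(&a);         // list: b
  assert(demo_list == &b && b.next == NULL && b.prev == NULL);
  return 0;
}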
- mi_lock_done(&subproc->os_pages_lock); + mi_lock_done(&subproc->os_abandoned_pages_lock); mi_lock_done(&subproc->arena_reserve_lock); _mi_meta_free(subproc, sizeof(mi_subproc_t), subproc->memid); } From 89b0d5a357af02809509544f83c92e7f5be11a3f Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 11:53:29 -0800 Subject: [PATCH 09/16] allocate heaps associated with an arena in that arena --- include/mimalloc/internal.h | 11 ++++++----- include/mimalloc/types.h | 21 ++++++--------------- src/arena-meta.c | 5 +---- src/arena.c | 6 ++---- src/heap.c | 14 +++++++++++--- src/init.c | 10 +++++----- 6 files changed, 31 insertions(+), 36 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index e316de94..208989e3 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -147,6 +147,7 @@ mi_arena_t* _mi_arena_from_id(mi_arena_id_t id); void* _mi_arena_alloc(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid); void* _mi_arena_alloc_aligned(mi_subproc_t* subproc, size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid); +void _mi_arena_free(void* p, size_t size, mi_memid_t memid); bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_t* request_arena); bool _mi_arena_contains(const void* p); void _mi_arenas_collect(bool force_purge); @@ -421,11 +422,11 @@ static inline bool mi_heap_is_initialized(mi_heap_t* heap) { return (heap != &_mi_heap_empty); } -static inline uintptr_t _mi_ptr_cookie(const void* p) { - extern mi_heap_t _mi_heap_main; - mi_assert_internal(_mi_heap_main.cookie != 0); - return ((uintptr_t)p ^ _mi_heap_main.cookie); -} +//static inline uintptr_t _mi_ptr_cookie(const void* p) { +// extern mi_heap_t _mi_heap_main; +// mi_assert_internal(_mi_heap_main.cookie != 0); +// return ((uintptr_t)p ^ _mi_heap_main.cookie); +//} /* ----------------------------------------------------------- diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 59393848..461b5393 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -396,7 +396,6 @@ struct mi_heap_s { mi_tld_t* tld; // thread-local data mi_arena_t* exclusive_arena; // if the heap should only allocate from a specific arena (or NULL) uintptr_t cookie; // random cookie to verify pointers (see `_mi_ptr_cookie`) - uintptr_t keys[2]; // two random keys used to encode the `thread_delayed_free` list mi_random_ctx_t random; // random number context used for secure allocation size_t page_count; // total number of pages in the `pages` queues. 
size_t page_retired_min; // smallest retired index (retired pages are fully free, but still in the page queues) @@ -522,21 +521,13 @@ void __mi_stat_counter_increase_mt(mi_stat_counter_t* stat, size_t amount); #define mi_os_stat_increase(stat,amount) mi_subproc_stat_increase(_mi_subproc(),stat,amount) #define mi_os_stat_decrease(stat,amount) mi_subproc_stat_decrease(_mi_subproc(),stat,amount) -#define mi_tld_stat_counter_increase(tld,stat,amount) __mi_stat_counter_increase( &(tld)->stats.stat, amount) -#define mi_tld_stat_increase(tld,stat,amount) __mi_stat_increase( &(tld)->stats.stat, amount) -#define mi_tld_stat_decrease(tld,stat,amount) __mi_stat_decrease( &(tld)->stats.stat, amount) +#define mi_heap_stat_counter_increase(heap,stat,amount) __mi_stat_counter_increase( &(heap)->tld->stats.stat, amount) +#define mi_heap_stat_increase(heap,stat,amount) __mi_stat_increase( &(heap)->tld->stats.stat, amount) +#define mi_heap_stat_decrease(heap,stat,amount) __mi_stat_decrease( &(heap)->tld->stats.stat, amount) -#define mi_debug_tld_stat_counter_increase(tld,stat,amount) mi_debug_stat_counter_increase( (tld)->stats.stat, amount) -#define mi_debug_tld_stat_increase(tld,stat,amount) mi_debug_stat_increase( (tld)->stats.stat, amount) -#define mi_debug_tld_stat_decrease(tld,stat,amount) mi_debug_stat_decrease( (tld)->stats.stat, amount) - -#define mi_heap_stat_counter_increase(heap,stat,amount) mi_tld_stat_counter_increase((heap)->tld, stat, amount) -#define mi_heap_stat_increase(heap,stat,amount) mi_tld_stat_increase( (heap)->tld, stat, amount) -#define mi_heap_stat_decrease(heap,stat,amount) mi_tld_stat_decrease( (heap)->tld, stat, amount) - -#define mi_debug_heap_stat_counter_increase(heap,stat,amount) mi_debug_tld_stat_counter_increase((heap)->tld, stat, amount) -#define mi_debug_heap_stat_increase(heap,stat,amount) mi_debug_tld_stat_increase( (heap)->tld, stat, amount) -#define mi_debug_heap_stat_decrease(heap,stat,amount) mi_debug_tld_stat_decrease( (heap)->tld, stat, amount) +#define mi_debug_heap_stat_counter_increase(heap,stat,amount) mi_debug_stat_counter_increase( (heap)->tld->stats.stat, amount) +#define mi_debug_heap_stat_increase(heap,stat,amount) mi_debug_stat_increase( (heap)->tld->stats.stat, amount) +#define mi_debug_heap_stat_decrease(heap,stat,amount) mi_debug_stat_decrease( (heap)->tld->stats.stat, amount) // ------------------------------------------------------ diff --git a/src/arena-meta.c b/src/arena-meta.c index a5dc8e75..065a1331 100644 --- a/src/arena-meta.c +++ b/src/arena-meta.c @@ -148,11 +148,8 @@ mi_decl_noinline void _mi_meta_free(void* p, size_t size, mi_memid_t memid) { _mi_memzero_aligned(mi_meta_block_start(mpage, block_idx), block_count*MI_META_BLOCK_SIZE); mi_bitmap_setN(&mpage->blocks_free, block_idx, block_count,NULL); } - else if (mi_memid_is_os(memid)) { - _mi_os_free(p, size, memid); - } else { - mi_assert_internal(mi_memid_needs_no_free(memid)); + _mi_arena_free(p,size,memid); } } diff --git a/src/arena.c b/src/arena.c index c4b02cf6..869cba49 100644 --- a/src/arena.c +++ b/src/arena.c @@ -762,8 +762,6 @@ mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t block return page; } -static void mi_arena_free(void* p, size_t size, mi_memid_t memid); - void _mi_arena_page_free(mi_page_t* page) { mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); @@ -794,7 +792,7 @@ void _mi_arena_page_free(mi_page_t* page) { if (page->memid.memkind == MI_MEM_ARENA) { 
mi_bitmap_clear(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index); } - mi_arena_free(page, mi_memid_size(page->memid), page->memid); + _mi_arena_free(page, mi_memid_size(page->memid), page->memid); } /* ----------------------------------------------------------- @@ -920,7 +918,7 @@ void _mi_arena_reclaim_all_abandoned(mi_heap_t* heap) { static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slices); static void mi_arenas_try_purge(bool force, bool visit_all); -static void mi_arena_free(void* p, size_t size, mi_memid_t memid) { +void _mi_arena_free(void* p, size_t size, mi_memid_t memid) { if (p==NULL) return; if (size==0) return; diff --git a/src/heap.c b/src/heap.c index d82b383f..f47aaad9 100644 --- a/src/heap.c +++ b/src/heap.c @@ -213,8 +213,8 @@ void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint _mi_random_split(&heap->tld->heap_backing->random, &heap->random); } heap->cookie = _mi_heap_random_next(heap) | 1; - heap->keys[0] = _mi_heap_random_next(heap); - heap->keys[1] = _mi_heap_random_next(heap); + //heap->keys[0] = _mi_heap_random_next(heap); + //heap->keys[1] = _mi_heap_random_next(heap);*/ _mi_heap_guarded_init(heap); // push on the thread local heaps list @@ -227,7 +227,15 @@ mi_heap_t* _mi_heap_create(int heap_tag, bool allow_destroy, mi_arena_id_t arena mi_assert(heap_tag >= 0 && heap_tag < 256); // allocate and initialize a heap mi_memid_t memid; - mi_heap_t* heap = (mi_heap_t*)_mi_meta_zalloc(sizeof(mi_heap_t), &memid); + mi_heap_t* heap; + if (arena_id == _mi_arena_id_none()) { + heap = (mi_heap_t*)_mi_meta_zalloc(sizeof(mi_heap_t), &memid); + } + else { + // heaps associated wita a specific arena are allocated in that arena + // note: takes up at least one slice which is quite wasteful... + heap = (mi_heap_t*)_mi_arena_alloc(_mi_subproc(), sizeof(mi_heap_t), true, true, _mi_arena_from_id(arena_id), tld->thread_seq, &memid); + } if (heap==NULL) { _mi_error_message(ENOMEM, "unable to allocate heap meta-data\n"); return NULL; diff --git a/src/init.c b/src/init.c index 1968ef68..2f147e55 100644 --- a/src/init.c +++ b/src/init.c @@ -115,7 +115,7 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = { &tld_empty, // tld NULL, // exclusive_arena 0, // cookie - { 0, 0 }, // keys + //{ 0, 0 }, // keys { {0}, {0}, 0, true }, // random 0, // page count MI_BIN_FULL, 0, // page retired min/max @@ -149,9 +149,9 @@ static mi_decl_cache_align mi_tld_t tld_main = { mi_decl_cache_align mi_heap_t heap_main = { &tld_main, // thread local data + NULL, // exclusive arena 0, // initial cookie - 0, // arena id - { 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!) + //{ 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!) 
{ {0x846ca68b}, {0}, 0, true }, // random 0, // page count MI_BIN_FULL, 0, // page retired min/max @@ -248,8 +248,8 @@ static void mi_heap_main_init(void) { _mi_random_init(&heap_main.random); #endif heap_main.cookie = _mi_heap_random_next(&heap_main); - heap_main.keys[0] = _mi_heap_random_next(&heap_main); - heap_main.keys[1] = _mi_heap_random_next(&heap_main); + //heap_main.keys[0] = _mi_heap_random_next(&heap_main); + //heap_main.keys[1] = _mi_heap_random_next(&heap_main); _mi_heap_guarded_init(&heap_main); heap_main.allow_page_abandon = (mi_option_get(mi_option_full_page_retain) >= 0); heap_main.full_page_retain = mi_option_get_clamp(mi_option_full_page_retain, -1, 32); From 7d46478a5f7c16b078b7955df95d3801eb1d585d Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 13:19:06 -0800 Subject: [PATCH 10/16] add initial load/unload for heaps --- include/mimalloc.h | 8 ++++- src/arena.c | 22 +++++++----- src/heap.c | 83 ++++++++++++++++++++++++++++++++++++---------- 3 files changed, 86 insertions(+), 27 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index 7a58e54c..b0a20e9e 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -326,7 +326,13 @@ mi_decl_export void mi_heap_guarded_set_size_bound(mi_heap_t* heap, size_t min, //mi_decl_export void mi_os_decommit(void* p, size_t size); mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* accessed_size, size_t* size); -mi_decl_export bool mi_arena_reload(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, mi_arena_id_t* arena_id); +mi_decl_export bool mi_arena_reload(void* start, size_t size, mi_arena_id_t* arena_id); +mi_decl_export bool mi_heap_reload(mi_heap_t* heap, mi_arena_id_t arena); +mi_decl_export void mi_heap_unload(mi_heap_t* heap); + +// Is a pointer contained in the given arena area? +mi_decl_export bool mi_arena_contains(mi_arena_id_t arena_id, const void* p); + // ------------------------------------------------------ // Convenience diff --git a/src/arena.c b/src/arena.c index 869cba49..aa3c9175 100644 --- a/src/arena.c +++ b/src/arena.c @@ -492,7 +492,6 @@ void* _mi_arena_alloc_aligned( mi_subproc_t* subproc, // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) && // is arena allocation allowed? - req_arena == NULL && // not a specific arena? size >= MI_ARENA_MIN_OBJ_SIZE && size <= MI_ARENA_MAX_OBJ_SIZE && // and not too small/large alignment <= MI_ARENA_SLICE_ALIGN && align_offset == 0) // and good alignment { @@ -980,13 +979,21 @@ void _mi_arenas_collect(bool force_purge) { mi_arenas_try_purge(force_purge, force_purge /* visit all? */); } + +// Is a pointer contained in the given arena area? +bool mi_arena_contains(mi_arena_id_t arena_id, const void* p) { + mi_arena_t* arena = _mi_arena_from_id(arena_id); + return (mi_arena_start(arena) <= (const uint8_t*)p && + mi_arena_start(arena) + mi_size_of_slices(arena->slice_count) >(const uint8_t*)p); +} + // Is a pointer inside any of our arenas? 
bool _mi_arena_contains(const void* p) { mi_subproc_t* subproc = _mi_subproc(); const size_t max_arena = mi_arenas_get_count(subproc); for (size_t i = 0; i < max_arena; i++) { mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &subproc->arenas[i]); - if (arena != NULL && mi_arena_start(arena) <= (const uint8_t*)p && mi_arena_start(arena) + mi_size_of_slices(arena->slice_count) >(const uint8_t*)p) { + if (arena != NULL && mi_arena_contains(arena,p)) { return true; } } @@ -1636,7 +1643,7 @@ mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* return true; } -mi_decl_export bool mi_arena_reload(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, mi_arena_id_t* arena_id) { +mi_decl_export bool mi_arena_reload(void* start, size_t size, mi_arena_id_t* arena_id) { // assume the memory area is already containing the arena if (arena_id != NULL) { *arena_id = _mi_arena_id_none(); } if (start == NULL || size == 0) return false; @@ -1658,13 +1665,10 @@ mi_decl_export bool mi_arena_reload(void* start, size_t size, bool is_committed, _mi_warning_message("the reloaded arena is not exclusive\n"); return false; } - arena->memid.is_pinned = is_large; - arena->memid.initially_committed = is_committed; - arena->memid.initially_zero = is_zero; + arena->is_exclusive = true; - arena->is_large = is_large; - arena->subproc = NULL; - if (!mi_arena_add(_mi_subproc(), arena, arena_id)) { + arena->subproc = _mi_subproc(); + if (!mi_arena_add(arena->subproc, arena, arena_id)) { return false; } mi_arena_pages_reregister(arena); diff --git a/src/heap.c b/src/heap.c index f47aaad9..03030b47 100644 --- a/src/heap.c +++ b/src/heap.c @@ -234,7 +234,7 @@ mi_heap_t* _mi_heap_create(int heap_tag, bool allow_destroy, mi_arena_id_t arena else { // heaps associated wita a specific arena are allocated in that arena // note: takes up at least one slice which is quite wasteful... - heap = (mi_heap_t*)_mi_arena_alloc(_mi_subproc(), sizeof(mi_heap_t), true, true, _mi_arena_from_id(arena_id), tld->thread_seq, &memid); + heap = (mi_heap_t*)_mi_arena_alloc(_mi_subproc(), _mi_align_up(sizeof(mi_heap_t),MI_ARENA_MIN_OBJ_SIZE), true, true, _mi_arena_from_id(arena_id), tld->thread_seq, &memid); } if (heap==NULL) { _mi_error_message(ENOMEM, "unable to allocate heap meta-data\n"); @@ -280,7 +280,7 @@ static void mi_heap_reset_pages(mi_heap_t* heap) { } // called from `mi_heap_destroy` and `mi_heap_delete` to free the internal heap resources. 
-static void mi_heap_free(mi_heap_t* heap) { +static void mi_heap_free(mi_heap_t* heap, bool do_free_mem) { mi_assert(heap != NULL); mi_assert_internal(mi_heap_is_initialized(heap)); if (heap==NULL || !mi_heap_is_initialized(heap)) return; @@ -307,7 +307,9 @@ static void mi_heap_free(mi_heap_t* heap) { mi_assert_internal(heap->tld->heaps != NULL); // and free the used memory - _mi_meta_free(heap, sizeof(*heap), heap->memid); + if (do_free_mem) { + _mi_meta_free(heap, sizeof(*heap), heap->memid); + } } // return a heap on the same thread as `heap` specialized for the specified tag (if it exists) @@ -403,7 +405,7 @@ void mi_heap_destroy(mi_heap_t* heap) { #endif // free all pages _mi_heap_destroy_pages(heap); - mi_heap_free(heap); + mi_heap_free(heap,true); } #endif } @@ -462,20 +464,11 @@ void mi_heap_delete(mi_heap_t* heap) mi_assert_expensive(mi_heap_is_valid(heap)); if (heap==NULL || !mi_heap_is_initialized(heap)) return; - /* - mi_heap_t* bheap = heap->tld->heap_backing; - if (bheap != heap && mi_heaps_are_compatible(bheap,heap)) { - // transfer still used pages to the backing heap - mi_heap_absorb(bheap, heap); - } - else - */ - { - // abandon all pages - _mi_heap_collect_abandon(heap); - } + // abandon all pages + _mi_heap_collect_abandon(heap); + mi_assert_internal(heap->page_count==0); - mi_heap_free(heap); + mi_heap_free(heap,true); } mi_heap_t* mi_heap_set_default(mi_heap_t* heap) { @@ -489,7 +482,63 @@ mi_heap_t* mi_heap_set_default(mi_heap_t* heap) { } +/* ----------------------------------------------------------- + Load/unload heaps +----------------------------------------------------------- */ +void mi_heap_unload(mi_heap_t* heap) { + mi_assert(mi_heap_is_initialized(heap)); + mi_assert_expensive(mi_heap_is_valid(heap)); + if (heap==NULL || !mi_heap_is_initialized(heap)) return; + if (heap->exclusive_arena == NULL) { + _mi_warning_message("cannot unload heaps that are not associated with an exclusive arena\n"); + return; + } + + // abandon all pages so all thread'id in the pages are cleared + _mi_heap_collect_abandon(heap); + mi_assert_internal(heap->page_count==0); + // remove from heap list + mi_heap_free(heap, false /* but don't actually free the memory */); + + // disassociate from the current thread-local and static state + heap->tld = NULL; + return; +} + +bool mi_heap_reload(mi_heap_t* heap, mi_arena_id_t arena_id) { + mi_assert(mi_heap_is_initialized(heap)); + if (heap==NULL || !mi_heap_is_initialized(heap)) return false; + if (heap->exclusive_arena == NULL) { + _mi_warning_message("cannot reload heaps that were not associated with an exclusive arena\n"); + return false; + } + if (heap->tld != NULL) { + _mi_warning_message("cannot reload heaps that were not unloaded first\n"); + return false; + } + mi_arena_t* arena = _mi_arena_from_id(arena_id); + if (heap->exclusive_arena != arena) { + _mi_warning_message("trying to reload a heap at a different arena address: %p vs %p\n", heap->exclusive_arena, arena); + return false; + } + + mi_assert_internal(heap->page_count==0); + + // re-associate from the current thread-local and static state + heap->tld = _mi_tld(); + + // reinit direct pages (as we may be in a different process) + mi_assert_internal(heap->page_count == 0); + for (int i = 0; i < MI_PAGES_DIRECT; i++) { + heap->pages_free_direct[i] = (mi_page_t*)&_mi_page_empty; + } + + // push on the thread local heaps list + heap->next = heap->tld->heaps; + heap->tld->heaps = heap; + return true; +} /* ----------------------------------------------------------- Analysis 
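Taken together, the new mi_arena_unload/mi_arena_reload and mi_heap_unload/mi_heap_reload entry points let an exclusive arena, and the heaps allocated inside it, be detached from one thread or process and re-attached later. The sketch below is one possible usage flow pieced together from the declarations added in this patch; it omits error handling, leaves the persistence/remap step abstract, and assumes the unloaded area is mapped back at the same base address (mi_heap_reload warns otherwise):

#include <mimalloc.h>

void demo_unload_reload(void) {
  // reserve an exclusive arena and a heap that allocates only from it
  mi_arena_id_t arena_id;
  mi_reserve_os_memory_ex(64 * 1024 * 1024, true /*commit*/, false /*allow large*/, true /*exclusive*/, &arena_id);
  mi_heap_t* heap = mi_heap_new_in_arena(arena_id);
  void* p = mi_heap_malloc(heap, 100);

  // detach: abandon the heap's pages, then unload the arena memory
  mi_heap_unload(heap);
  void* base; size_t accessed_size; size_t size;
  mi_arena_unload(arena_id, &base, &accessed_size, &size);

  // ... later, possibly in another process that mapped [base, base+size) at the same address ...
  mi_arena_id_t arena_id2;
  mi_arena_reload(base, size, &arena_id2);
  mi_heap_reload(heap, arena_id2);   // heap metadata lives inside the arena, so the pointer is still valid
  mi_free(p);
}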
From e3ebebb9902c56b6899f70f046cbcc8089674569 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 14:39:17 -0800 Subject: [PATCH 11/16] update lock primitive; fix arena exclusive allocation --- include/mimalloc/atomic.h | 31 ++++++++++++++++++++++++++++--- src/arena-abandon.c | 33 +++++++++++---------------------- src/arena.c | 5 +++-- src/init.c | 15 +++++++-------- 4 files changed, 49 insertions(+), 35 deletions(-) diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index 0c967896..733dbf42 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2023 Microsoft Research, Daan Leijen +Copyright (c) 2018-2024 Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -411,8 +411,11 @@ static inline void mi_atomic_yield(void) { #pragma warning(disable:26110) // unlock with holding lock #endif +#define mi_lock(lock) for(bool _go = (mi_lock_acquire(lock),true); _go; (mi_lock_release(lock), _go=false) ) + #if defined(_WIN32) +#if 1 #define mi_lock_t SRWLOCK // slim reader-writer lock static inline bool mi_lock_try_acquire(mi_lock_t* lock) { @@ -432,6 +435,30 @@ static inline void mi_lock_done(mi_lock_t* lock) { // nothing } +#else +#define mi_lock_t CRITICAL_SECTION + +static inline bool mi_lock_try_acquire(mi_lock_t* lock) { + return TryEnterCriticalSection(lock); + +} +static inline void mi_lock_acquire(mi_lock_t* lock) { + EnterCriticalSection(lock); + +} +static inline void mi_lock_release(mi_lock_t* lock) { + LeaveCriticalSection(lock); + +} +static inline void mi_lock_init(mi_lock_t* lock) { + InitializeCriticalSection(lock); + +} +static inline void mi_lock_done(mi_lock_t* lock) { + DeleteCriticalSection(lock); + +} +#endif #elif defined(MI_USE_PTHREADS) @@ -506,6 +533,4 @@ static inline void mi_lock_done(mi_lock_t* lock) { #endif - - #endif // __MIMALLOC_ATOMIC_H diff --git a/src/arena-abandon.c b/src/arena-abandon.c index 48e37794..460c80fc 100644 --- a/src/arena-abandon.c +++ b/src/arena-abandon.c @@ -120,11 +120,7 @@ static void mi_arena_segment_os_mark_abandoned(mi_segment_t* segment) { mi_assert(segment->memid.memkind != MI_MEM_ARENA); // not in an arena; we use a list of abandoned segments mi_subproc_t* const subproc = segment->subproc; - if (!mi_lock_acquire(&subproc->abandoned_os_lock)) { - _mi_error_message(EFAULT, "internal error: failed to acquire the abandoned (os) segment lock to mark abandonment"); - // we can continue but cannot visit/reclaim such blocks.. 
- } - else { + mi_lock(&subproc->abandoned_os_lock) { // push on the tail of the list (important for the visitor) mi_segment_t* prev = subproc->abandoned_os_list_tail; mi_assert_internal(prev == NULL || prev->abandoned_os_next == NULL); @@ -138,7 +134,6 @@ static void mi_arena_segment_os_mark_abandoned(mi_segment_t* segment) { mi_atomic_increment_relaxed(&subproc->abandoned_os_list_count); mi_atomic_increment_relaxed(&subproc->abandoned_count); // and release the lock - mi_lock_release(&subproc->abandoned_os_lock); } return; } @@ -251,7 +246,7 @@ static mi_segment_t* mi_arena_segment_clear_abandoned_next_field(mi_arena_field_ if mi_unlikely(field != 0) { // skip zero fields quickly // we only take the arena lock if there are actually abandoned segments present if (!has_lock && mi_option_is_enabled(mi_option_visit_abandoned)) { - has_lock = (previous->visit_all ? mi_lock_acquire(&arena->abandoned_visit_lock) : mi_lock_try_acquire(&arena->abandoned_visit_lock)); + has_lock = (previous->visit_all ? (mi_lock_acquire(&arena->abandoned_visit_lock),true) : mi_lock_try_acquire(&arena->abandoned_visit_lock)); if (!has_lock) { if (previous->visit_all) { _mi_error_message(EFAULT, "internal error: failed to visit all abandoned segments due to failure to acquire the visitor lock"); @@ -289,8 +284,8 @@ static mi_segment_t* mi_arena_segment_clear_abandoned_next_list(mi_arena_field_c // we only allow one thread per sub-process to do to visit guarded by the `abandoned_os_visit_lock`. // The lock is released when the cursor is released. if (!previous->hold_visit_lock) { - previous->hold_visit_lock = (previous->visit_all ? mi_lock_acquire(&previous->subproc->abandoned_os_visit_lock) - : mi_lock_try_acquire(&previous->subproc->abandoned_os_visit_lock)); + previous->hold_visit_lock = (previous->visit_all ? 
(mi_lock_acquire(&previous->subproc->abandoned_os_visit_lock),true) + : mi_lock_try_acquire(&previous->subproc->abandoned_os_visit_lock)); if (!previous->hold_visit_lock) { if (previous->visit_all) { _mi_error_message(EFAULT, "internal error: failed to visit all abandoned segments due to failure to acquire the OS visitor lock"); @@ -301,21 +296,15 @@ static mi_segment_t* mi_arena_segment_clear_abandoned_next_list(mi_arena_field_c // One list entry at a time while (previous->os_list_count > 0) { previous->os_list_count--; - const bool has_lock = mi_lock_acquire(&previous->subproc->abandoned_os_lock); // this could contend with concurrent OS block abandonment and reclaim from `free` - if (has_lock) { - mi_segment_t* segment = previous->subproc->abandoned_os_list; - // pop from head of the list, a subsequent mark will push at the end (and thus we iterate through os_list_count entries) - if (segment == NULL || mi_arena_segment_os_clear_abandoned(segment, false /* we already have the lock */)) { - mi_lock_release(&previous->subproc->abandoned_os_lock); - return segment; - } - // already abandoned, try again + mi_lock_acquire(&previous->subproc->abandoned_os_lock); // this could contend with concurrent OS block abandonment and reclaim from `free` + mi_segment_t* segment = previous->subproc->abandoned_os_list; + // pop from head of the list, a subsequent mark will push at the end (and thus we iterate through os_list_count entries) + if (segment == NULL || mi_arena_segment_os_clear_abandoned(segment, false /* we already have the lock */)) { mi_lock_release(&previous->subproc->abandoned_os_lock); + return segment; } - else { - _mi_error_message(EFAULT, "failed to acquire abandoned OS list lock during abandoned block visit\n"); - return NULL; - } + // already abandoned, try again + mi_lock_release(&previous->subproc->abandoned_os_lock); } // done mi_assert_internal(previous->os_list_count == 0); diff --git a/src/arena.c b/src/arena.c index 164f3116..86ac5955 100644 --- a/src/arena.c +++ b/src/arena.c @@ -394,8 +394,9 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset const int numa_node = _mi_os_numa_node(); // current numa node // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) - if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) || req_arena_id != _mi_arena_id_none()) { // is arena allocation allowed? - if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN && align_offset == 0) { + if (!mi_option_is_enabled(mi_option_disallow_arena_alloc)) { // is arena allocation allowed? 
+ if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN && align_offset == 0) + { void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, memid); if (p != NULL) return p; diff --git a/src/init.c b/src/init.c index 3e4da831..68a1d7e2 100644 --- a/src/init.c +++ b/src/init.c @@ -168,8 +168,8 @@ mi_stats_t _mi_stats_main = { MI_STATS_NULL }; #if MI_GUARDED mi_decl_export void mi_heap_guarded_set_sample_rate(mi_heap_t* heap, size_t sample_rate, size_t seed) { heap->guarded_sample_seed = seed; - if (heap->guarded_sample_seed == 0) { - heap->guarded_sample_seed = _mi_heap_random_next(heap); + if (heap->guarded_sample_seed == 0) { + heap->guarded_sample_seed = _mi_heap_random_next(heap); } heap->guarded_sample_rate = sample_rate; if (heap->guarded_sample_rate >= 1) { @@ -187,9 +187,9 @@ void _mi_heap_guarded_init(mi_heap_t* heap) { mi_heap_guarded_set_sample_rate(heap, (size_t)mi_option_get_clamp(mi_option_guarded_sample_rate, 0, LONG_MAX), (size_t)mi_option_get(mi_option_guarded_sample_seed)); - mi_heap_guarded_set_size_bound(heap, + mi_heap_guarded_set_size_bound(heap, (size_t)mi_option_get_clamp(mi_option_guarded_min, 0, LONG_MAX), - (size_t)mi_option_get_clamp(mi_option_guarded_max, 0, LONG_MAX) ); + (size_t)mi_option_get_clamp(mi_option_guarded_max, 0, LONG_MAX) ); } #else mi_decl_export void mi_heap_guarded_set_sample_rate(mi_heap_t* heap, size_t sample_rate, size_t seed) { @@ -257,11 +257,10 @@ void mi_subproc_delete(mi_subproc_id_t subproc_id) { mi_subproc_t* subproc = _mi_subproc_from_id(subproc_id); // check if there are no abandoned segments still.. bool safe_to_delete = false; - if (mi_lock_acquire(&subproc->abandoned_os_lock)) { + mi_lock(&subproc->abandoned_os_lock) { if (subproc->abandoned_os_list == NULL) { safe_to_delete = true; } - mi_lock_release(&subproc->abandoned_os_lock); } if (!safe_to_delete) return; // safe to release @@ -398,7 +397,7 @@ void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) { tld->heap_backing = bheap; tld->heaps = NULL; tld->segments.subproc = &mi_subproc_default; - tld->segments.stats = &tld->stats; + tld->segments.stats = &tld->stats; } // Free the thread local default heap (called from `mi_thread_done`) @@ -599,7 +598,7 @@ static void mi_detect_cpu_features(void) { } #else static void mi_detect_cpu_features(void) { - // nothing + // nothing } #endif From 108c84e858b7ee2aa2fd3f00de03afb879e89718 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 14:45:14 -0800 Subject: [PATCH 12/16] remove req_arena parameter to arena_reserve --- src/arena.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/arena.c b/src/arena.c index aa3c9175..af1f737e 100644 --- a/src/arena.c +++ b/src/arena.c @@ -274,11 +274,8 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( static int mi_reserve_os_memory_ex2(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id); // try to reserve a fresh arena space -static bool mi_arena_reserve(mi_subproc_t* subproc, size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t* arena_id) +static bool mi_arena_reserve(mi_subproc_t* subproc, size_t req_size, bool allow_large, mi_arena_id_t* arena_id) { - // if (_mi_preloading()) return false; // use OS only while pre loading - if (req_arena_id != _mi_arena_id_none()) return false; - const size_t arena_count = mi_arenas_get_count(subproc); if (arena_count > (MI_MAX_ARENAS - 4)) return false; @@ -443,7 +440,7 @@ static mi_decl_noinline void* 
mi_arenas_try_alloc( if (arena_count == mi_arenas_get_count(subproc)) { // we are the first to enter the lock, reserve a fresh arena mi_arena_id_t arena_id = 0; - mi_arena_reserve(subproc, mi_size_of_slices(slice_count), allow_large, req_arena, &arena_id); + mi_arena_reserve(subproc, mi_size_of_slices(slice_count), allow_large, &arena_id); } else { // another thread already reserved a new arena From 476d4699ff93380009ae35780c2261ae674e4200 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 15:24:46 -0800 Subject: [PATCH 13/16] limit purgeing to one purge cycle per purge delay --- include/mimalloc/atomic.h | 26 +++++++-------- src/arena.c | 69 +++++++++++++++++++++++++-------------- 2 files changed, 56 insertions(+), 39 deletions(-) diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index 733dbf42..c6083102 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -421,9 +421,8 @@ static inline void mi_atomic_yield(void) { static inline bool mi_lock_try_acquire(mi_lock_t* lock) { return TryAcquireSRWLockExclusive(lock); } -static inline bool mi_lock_acquire(mi_lock_t* lock) { +static inline void mi_lock_acquire(mi_lock_t* lock) { AcquireSRWLockExclusive(lock); - return true; } static inline void mi_lock_release(mi_lock_t* lock) { ReleaseSRWLockExclusive(lock); @@ -432,7 +431,7 @@ static inline void mi_lock_init(mi_lock_t* lock) { InitializeSRWLock(lock); } static inline void mi_lock_done(mi_lock_t* lock) { - // nothing + (void)(lock); } #else @@ -440,24 +439,20 @@ static inline void mi_lock_done(mi_lock_t* lock) { static inline bool mi_lock_try_acquire(mi_lock_t* lock) { return TryEnterCriticalSection(lock); - } static inline void mi_lock_acquire(mi_lock_t* lock) { EnterCriticalSection(lock); - } static inline void mi_lock_release(mi_lock_t* lock) { LeaveCriticalSection(lock); - } static inline void mi_lock_init(mi_lock_t* lock) { InitializeCriticalSection(lock); - } static inline void mi_lock_done(mi_lock_t* lock) { DeleteCriticalSection(lock); - } + #endif #elif defined(MI_USE_PTHREADS) @@ -467,8 +462,11 @@ static inline void mi_lock_done(mi_lock_t* lock) { static inline bool mi_lock_try_acquire(mi_lock_t* lock) { return (pthread_mutex_trylock(lock) == 0); } -static inline bool mi_lock_acquire(mi_lock_t* lock) { - return (pthread_mutex_lock(lock) == 0); +static inline void mi_lock_acquire(mi_lock_t* lock) { + const int err = pthread_mutex_lock(lock); + if (err != 0) { + mi_error_message(EFAULT, "internal error: lock cannot be acquired\n"); + } } static inline void mi_lock_release(mi_lock_t* lock) { pthread_mutex_unlock(lock); @@ -488,9 +486,8 @@ static inline void mi_lock_done(mi_lock_t* lock) { static inline bool mi_lock_try_acquire(mi_lock_t* lock) { return lock->try_lock(); } -static inline bool mi_lock_acquire(mi_lock_t* lock) { +static inline void mi_lock_acquire(mi_lock_t* lock) { lock->lock(); - return true; } static inline void mi_lock_release(mi_lock_t* lock) { lock->unlock(); @@ -513,12 +510,11 @@ static inline bool mi_lock_try_acquire(mi_lock_t* lock) { uintptr_t expected = 0; return mi_atomic_cas_strong_acq_rel(lock, &expected, (uintptr_t)1); } -static inline bool mi_lock_acquire(mi_lock_t* lock) { +static inline void mi_lock_acquire(mi_lock_t* lock) { for (int i = 0; i < 1000; i++) { // for at most 1000 tries? 
- if (mi_lock_try_acquire(lock)) return true; + if (mi_lock_try_acquire(lock)) return; mi_atomic_yield(); } - return true; } static inline void mi_lock_release(mi_lock_t* lock) { mi_atomic_store_release(lock, (uintptr_t)0); diff --git a/src/arena.c b/src/arena.c index 86ac5955..0ddb2936 100644 --- a/src/arena.c +++ b/src/arena.c @@ -33,7 +33,7 @@ The arena allocation needs to be thread safe and we use an atomic bitmap to allo typedef struct mi_arena_s { mi_arena_id_t id; // arena id; 0 for non-specific mi_memid_t memid; // memid of the memory area - _Atomic(uint8_t*)start; // the start of the memory area + _Atomic(uint8_t*) start; // the start of the memory area size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`) size_t field_count; // number of bitmap fields (where `field_count * MI_BITMAP_FIELD_BITS >= block_count`) size_t meta_size; // size of the arena structure itself (including its bitmaps) @@ -42,12 +42,13 @@ typedef struct mi_arena_s { bool exclusive; // only allow allocations if specifically for this arena bool is_large; // memory area consists of large- or huge OS pages (always committed) mi_lock_t abandoned_visit_lock; // lock is only used when abandoned segments are being visited - _Atomic(size_t)search_idx; // optimization to start the search for free blocks - _Atomic(mi_msecs_t)purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`. - mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero? - mi_bitmap_field_t* blocks_committed; // are the blocks committed? (can be NULL for memory that cannot be decommitted) - mi_bitmap_field_t* blocks_purge; // blocks that can be (reset) decommitted. (can be NULL for memory that cannot be (reset) decommitted) - mi_bitmap_field_t* blocks_abandoned; // blocks that start with an abandoned segment. (This crosses API's but it is convenient to have here) + _Atomic(size_t) search_idx; // optimization to start the search for free blocks + _Atomic(mi_msecs_t) purge_expire; // expiration time when blocks should be purged from `blocks_purge`. + + mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero? + mi_bitmap_field_t* blocks_committed; // are the blocks committed? (can be NULL for memory that cannot be decommitted) + mi_bitmap_field_t* blocks_purge; // blocks that can be (reset) decommitted. (can be NULL for memory that cannot be (reset) decommitted) + mi_bitmap_field_t* blocks_abandoned; // blocks that start with an abandoned segment. (This crosses API's but it is convenient to have here) mi_bitmap_field_t blocks_inuse[1]; // in-place bitmap of in-use blocks (of size `field_count`) // do not add further fields here as the dirty, committed, purged, and abandoned bitmaps follow the inuse bitmap fields. 
} mi_arena_t; @@ -60,6 +61,7 @@ typedef struct mi_arena_s { // The available arenas static mi_decl_cache_align _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS]; static mi_decl_cache_align _Atomic(size_t) mi_arena_count; // = 0 +static mi_decl_cache_align _Atomic(int64_t) mi_arenas_purge_expire; // set if there exist purgeable arenas #define MI_IN_ARENA_C #include "arena-abandon.c" @@ -349,11 +351,10 @@ static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, siz } // try to reserve a fresh arena space -static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t *arena_id) +static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t *arena_id) { if (_mi_preloading()) return false; // use OS only while pre loading - if (req_arena_id != _mi_arena_id_none()) return false; - + const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count); if (arena_count > (MI_MAX_ARENAS - 4)) return false; @@ -403,7 +404,7 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset // otherwise, try to first eagerly reserve a new arena if (req_arena_id == _mi_arena_id_none()) { mi_arena_id_t arena_id = 0; - if (mi_arena_reserve(size, allow_large, req_arena_id, &arena_id)) { + if (mi_arena_reserve(size, allow_large, &arena_id)) { // and try allocate in there mi_assert_internal(req_arena_id == _mi_arena_id_none()); p = mi_arena_try_alloc_at_id(arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid); @@ -497,13 +498,16 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t bitmap_idx, size_t mi_arena_purge(arena, bitmap_idx, blocks); } else { - // schedule decommit - mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); - if (expire != 0) { - mi_atomic_addi64_acq_rel(&arena->purge_expire, (mi_msecs_t)(delay/10)); // add smallish extra delay + // schedule purge + const mi_msecs_t expire = _mi_clock_now() + delay; + mi_msecs_t expire0 = 0; + if (mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire0, expire)) { + // expiration was not yet set + // maybe set the global arenas expire as well (if it wasn't set already) + mi_atomic_casi64_strong_acq_rel(&mi_arenas_purge_expire, &expire0, expire); } else { - mi_atomic_storei64_release(&arena->purge_expire, _mi_clock_now() + delay); + // already an expiration was set } _mi_bitmap_claim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx, NULL); } @@ -538,14 +542,16 @@ static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startidx, // returns true if anything was purged static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) { - if (arena->memid.is_pinned || arena->blocks_purge == NULL) return false; + // check pre-conditions + if (arena->memid.is_pinned) return false; + + // expired yet? 
mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); - if (expire == 0) return false; - if (!force && expire > now) return false; + if (!force && (expire == 0 || expire > now)) return false; // reset expire (if not already set concurrently) mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire, (mi_msecs_t)0); - + // potential purges scheduled, walk through the bitmap bool any_purged = false; bool full_purge = true; @@ -592,9 +598,15 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) return any_purged; } -static void mi_arenas_try_purge( bool force, bool visit_all ) { +static void mi_arenas_try_purge( bool force, bool visit_all ) +{ if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled + // check if any arena needs purging? + const mi_msecs_t now = _mi_clock_now(); + mi_msecs_t arenas_expire = mi_atomic_load_acquire(&mi_arenas_purge_expire); + if (!force && (arenas_expire == 0 || arenas_expire < now)) return; + const size_t max_arena = mi_atomic_load_acquire(&mi_arena_count); if (max_arena == 0) return; @@ -602,17 +614,26 @@ static void mi_arenas_try_purge( bool force, bool visit_all ) { static mi_atomic_guard_t purge_guard; mi_atomic_guard(&purge_guard) { - mi_msecs_t now = _mi_clock_now(); - size_t max_purge_count = (visit_all ? max_arena : 1); + // increase global expire: at most one purge per delay cycle + mi_atomic_store_release(&mi_arenas_purge_expire, now + mi_arena_purge_delay()); + size_t max_purge_count = (visit_all ? max_arena : 2); + bool all_visited = true; for (size_t i = 0; i < max_arena; i++) { mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); if (arena != NULL) { if (mi_arena_try_purge(arena, now, force)) { - if (max_purge_count <= 1) break; + if (max_purge_count <= 1) { + all_visited = false; + break; + } max_purge_count--; } } } + if (all_visited) { + // all arena's were visited and purged: reset global expire + mi_atomic_store_release(&mi_arenas_purge_expire, 0); + } } } From 825dd41769bc01984f7db515fe7df597a71547ab Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 15:29:39 -0800 Subject: [PATCH 14/16] fix build error --- include/mimalloc/atomic.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index c6083102..c4fac766 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -457,6 +457,8 @@ static inline void mi_lock_done(mi_lock_t* lock) { #elif defined(MI_USE_PTHREADS) +void _mi_error_message(int err, const char* fmt, ...); + #define mi_lock_t pthread_mutex_t static inline bool mi_lock_try_acquire(mi_lock_t* lock) { @@ -465,7 +467,7 @@ static inline bool mi_lock_try_acquire(mi_lock_t* lock) { static inline void mi_lock_acquire(mi_lock_t* lock) { const int err = pthread_mutex_lock(lock); if (err != 0) { - mi_error_message(EFAULT, "internal error: lock cannot be acquired\n"); + _mi_error_message(err, "internal error: lock cannot be acquired\n"); } } static inline void mi_lock_release(mi_lock_t* lock) { From 7085b6cec31641fddaca3d40932cda82e91baf07 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 15:38:27 -0800 Subject: [PATCH 15/16] limit candidate search to 4 --- src/page.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/page.c b/src/page.c index 8808c358..e1c07a93 100644 --- a/src/page.c +++ b/src/page.c @@ -732,7 +732,7 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi 
-------------------------------------------------------------*/ // search for a best next page to use for at most N pages (often cut short if immediate blocks are available) -#define MI_MAX_CANDIDATE_SEARCH (8) +#define MI_MAX_CANDIDATE_SEARCH (4) // is the page not yet used up to its reserved space? static bool mi_page_is_expandable(const mi_page_t* page) { From c138fba149d358465345ce0316c42d626afe1328 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 15:49:17 -0800 Subject: [PATCH 16/16] merge from dev --- src/arena-abandon.c | 346 -------------------------------------------- 1 file changed, 346 deletions(-) delete mode 100644 src/arena-abandon.c diff --git a/src/arena-abandon.c b/src/arena-abandon.c deleted file mode 100644 index 460c80fc..00000000 --- a/src/arena-abandon.c +++ /dev/null @@ -1,346 +0,0 @@ -/* ---------------------------------------------------------------------------- -Copyright (c) 2019-2024, Microsoft Research, Daan Leijen -This is free software; you can redistribute it and/or modify it under the -terms of the MIT license. A copy of the license can be found in the file -"LICENSE" at the root of this distribution. ------------------------------------------------------------------------------*/ - -#if !defined(MI_IN_ARENA_C) -#error "this file should be included from 'arena.c' (so mi_arena_t is visible)" -// add includes help an IDE -#include "mimalloc.h" -#include "mimalloc/internal.h" -#include "bitmap.h" -#endif - -// Minimal exports for arena-abandoned. -size_t mi_arena_id_index(mi_arena_id_t id); -mi_arena_t* mi_arena_from_index(size_t idx); -size_t mi_arena_get_count(void); -void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex); -bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index); - -/* ----------------------------------------------------------- - Abandoned blocks/segments: - - _mi_arena_segment_clear_abandoned - _mi_arena_segment_mark_abandoned - - This is used to atomically abandon/reclaim segments - (and crosses the arena API but it is convenient to have here). - - Abandoned segments still have live blocks; they get reclaimed - when a thread frees a block in it, or when a thread needs a fresh - segment. - - Abandoned segments are atomically marked in the `block_abandoned` - bitmap of arenas. Any segments allocated outside arenas are put - in the sub-process `abandoned_os_list`. This list is accessed - using locks but this should be uncommon and generally uncontended. - Reclaim and visiting either scan through the `block_abandoned` - bitmaps of the arena's, or visit the `abandoned_os_list` - - A potentially nicer design is to use arena's for everything - and perhaps have virtual arena's to map OS allocated memory - but this would lack the "density" of our current arena's. TBC. ------------------------------------------------------------ */ - - -// reclaim a specific OS abandoned segment; `true` on success. -// sets the thread_id. -static bool mi_arena_segment_os_clear_abandoned(mi_segment_t* segment, bool take_lock) { - mi_assert(segment->memid.memkind != MI_MEM_ARENA); - // not in an arena, remove from list of abandoned os segments - mi_subproc_t* const subproc = segment->subproc; - if (take_lock && !mi_lock_try_acquire(&subproc->abandoned_os_lock)) { - return false; // failed to acquire the lock, we just give up - } - // remove atomically from the abandoned os list (if possible!) 
- bool reclaimed = false; - mi_segment_t* const next = segment->abandoned_os_next; - mi_segment_t* const prev = segment->abandoned_os_prev; - if (next != NULL || prev != NULL || subproc->abandoned_os_list == segment) { - #if MI_DEBUG>3 - // find ourselves in the abandoned list (and check the count) - bool found = false; - size_t count = 0; - for (mi_segment_t* current = subproc->abandoned_os_list; current != NULL; current = current->abandoned_os_next) { - if (current == segment) { found = true; } - count++; - } - mi_assert_internal(found); - mi_assert_internal(count == mi_atomic_load_relaxed(&subproc->abandoned_os_list_count)); - #endif - // remove (atomically) from the list and reclaim - if (prev != NULL) { prev->abandoned_os_next = next; } - else { subproc->abandoned_os_list = next; } - if (next != NULL) { next->abandoned_os_prev = prev; } - else { subproc->abandoned_os_list_tail = prev; } - segment->abandoned_os_next = NULL; - segment->abandoned_os_prev = NULL; - mi_atomic_decrement_relaxed(&subproc->abandoned_count); - mi_atomic_decrement_relaxed(&subproc->abandoned_os_list_count); - if (take_lock) { // don't reset the thread_id when iterating - mi_atomic_store_release(&segment->thread_id, _mi_thread_id()); - } - reclaimed = true; - } - if (take_lock) { mi_lock_release(&segment->subproc->abandoned_os_lock); } - return reclaimed; -} - -// reclaim a specific abandoned segment; `true` on success. -// sets the thread_id. -bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment) { - if mi_unlikely(segment->memid.memkind != MI_MEM_ARENA) { - return mi_arena_segment_os_clear_abandoned(segment, true /* take lock */); - } - // arena segment: use the blocks_abandoned bitmap. - size_t arena_idx; - size_t bitmap_idx; - mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx); - mi_arena_t* arena = mi_arena_from_index(arena_idx); - mi_assert_internal(arena != NULL); - // reclaim atomically - bool was_marked = _mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx); - if (was_marked) { - mi_assert_internal(mi_atomic_load_acquire(&segment->thread_id) == 0); - mi_atomic_decrement_relaxed(&segment->subproc->abandoned_count); - mi_atomic_store_release(&segment->thread_id, _mi_thread_id()); - } - // mi_assert_internal(was_marked); - mi_assert_internal(!was_marked || _mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); - //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); - return was_marked; -} - - -// mark a specific OS segment as abandoned -static void mi_arena_segment_os_mark_abandoned(mi_segment_t* segment) { - mi_assert(segment->memid.memkind != MI_MEM_ARENA); - // not in an arena; we use a list of abandoned segments - mi_subproc_t* const subproc = segment->subproc; - mi_lock(&subproc->abandoned_os_lock) { - // push on the tail of the list (important for the visitor) - mi_segment_t* prev = subproc->abandoned_os_list_tail; - mi_assert_internal(prev == NULL || prev->abandoned_os_next == NULL); - mi_assert_internal(segment->abandoned_os_prev == NULL); - mi_assert_internal(segment->abandoned_os_next == NULL); - if (prev != NULL) { prev->abandoned_os_next = segment; } - else { subproc->abandoned_os_list = segment; } - subproc->abandoned_os_list_tail = segment; - segment->abandoned_os_prev = prev; - segment->abandoned_os_next = NULL; - mi_atomic_increment_relaxed(&subproc->abandoned_os_list_count); - mi_atomic_increment_relaxed(&subproc->abandoned_count); - 
// and release the lock - } - return; -} - -// mark a specific segment as abandoned -// clears the thread_id. -void _mi_arena_segment_mark_abandoned(mi_segment_t* segment) -{ - mi_assert_internal(segment->used == segment->abandoned); - mi_atomic_store_release(&segment->thread_id, (uintptr_t)0); // mark as abandoned for multi-thread free's - if mi_unlikely(segment->memid.memkind != MI_MEM_ARENA) { - mi_arena_segment_os_mark_abandoned(segment); - return; - } - // segment is in an arena, mark it in the arena `blocks_abandoned` bitmap - size_t arena_idx; - size_t bitmap_idx; - mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx); - mi_arena_t* arena = mi_arena_from_index(arena_idx); - mi_assert_internal(arena != NULL); - // set abandonment atomically - mi_subproc_t* const subproc = segment->subproc; // don't access the segment after setting it abandoned - const bool was_unmarked = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL); - if (was_unmarked) { mi_atomic_increment_relaxed(&subproc->abandoned_count); } - mi_assert_internal(was_unmarked); - mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); -} - - -/* ----------------------------------------------------------- - Iterate through the abandoned blocks/segments using a cursor. - This is used for reclaiming and abandoned block visiting. ------------------------------------------------------------ */ - -// start a cursor at a randomized arena -void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_subproc_t* subproc, bool visit_all, mi_arena_field_cursor_t* current) { - mi_assert_internal(heap == NULL || heap->tld->segments.subproc == subproc); - current->bitmap_idx = 0; - current->subproc = subproc; - current->visit_all = visit_all; - current->hold_visit_lock = false; - const size_t abandoned_count = mi_atomic_load_relaxed(&subproc->abandoned_count); - const size_t abandoned_list_count = mi_atomic_load_relaxed(&subproc->abandoned_os_list_count); - const size_t max_arena = mi_arena_get_count(); - if (heap != NULL && heap->arena_id != _mi_arena_id_none()) { - // for a heap that is bound to one arena, only visit that arena - current->start = mi_arena_id_index(heap->arena_id); - current->end = current->start + 1; - current->os_list_count = 0; - } - else { - // otherwise visit all starting at a random location - if (abandoned_count > abandoned_list_count && max_arena > 0) { - current->start = (heap == NULL || max_arena == 0 ? 
0 : (mi_arena_id_t)(_mi_heap_random_next(heap) % max_arena)); - current->end = current->start + max_arena; - } - else { - current->start = 0; - current->end = 0; - } - current->os_list_count = abandoned_list_count; // max entries to visit in the os abandoned list - } - mi_assert_internal(current->start <= max_arena); -} - -void _mi_arena_field_cursor_done(mi_arena_field_cursor_t* current) { - if (current->hold_visit_lock) { - mi_lock_release(¤t->subproc->abandoned_os_visit_lock); - current->hold_visit_lock = false; - } -} - -static mi_segment_t* mi_arena_segment_clear_abandoned_at(mi_arena_t* arena, mi_subproc_t* subproc, mi_bitmap_index_t bitmap_idx) { - // try to reclaim an abandoned segment in the arena atomically - if (!_mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx)) return NULL; - mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); - mi_segment_t* segment = (mi_segment_t*)mi_arena_block_start(arena, bitmap_idx); - mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0); - // check that the segment belongs to our sub-process - // note: this is the reason we need the `abandoned_visit` lock in the case abandoned visiting is enabled. - // without the lock an abandoned visit may otherwise fail to visit all abandoned segments in the sub-process. - // for regular reclaim it is fine to miss one sometimes so without abandoned visiting we don't need the `abandoned_visit` lock. - if (segment->subproc != subproc) { - // it is from another sub-process, re-mark it and continue searching - const bool was_zero = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL); - mi_assert_internal(was_zero); MI_UNUSED(was_zero); - return NULL; - } - else { - // success, we unabandoned a segment in our sub-process - mi_atomic_decrement_relaxed(&subproc->abandoned_count); - return segment; - } -} - -static mi_segment_t* mi_arena_segment_clear_abandoned_next_field(mi_arena_field_cursor_t* previous) { - const size_t max_arena = mi_arena_get_count(); - size_t field_idx = mi_bitmap_index_field(previous->bitmap_idx); - size_t bit_idx = mi_bitmap_index_bit_in_field(previous->bitmap_idx); - // visit arena's (from the previous cursor) - for (; previous->start < previous->end; previous->start++, field_idx = 0, bit_idx = 0) { - // index wraps around - size_t arena_idx = (previous->start >= max_arena ? previous->start % max_arena : previous->start); - mi_arena_t* arena = mi_arena_from_index(arena_idx); - if (arena != NULL) { - bool has_lock = false; - // visit the abandoned fields (starting at previous_idx) - for (; field_idx < arena->field_count; field_idx++, bit_idx = 0) { - size_t field = mi_atomic_load_relaxed(&arena->blocks_abandoned[field_idx]); - if mi_unlikely(field != 0) { // skip zero fields quickly - // we only take the arena lock if there are actually abandoned segments present - if (!has_lock && mi_option_is_enabled(mi_option_visit_abandoned)) { - has_lock = (previous->visit_all ? (mi_lock_acquire(&arena->abandoned_visit_lock),true) : mi_lock_try_acquire(&arena->abandoned_visit_lock)); - if (!has_lock) { - if (previous->visit_all) { - _mi_error_message(EFAULT, "internal error: failed to visit all abandoned segments due to failure to acquire the visitor lock"); - } - // skip to next arena - break; - } - } - mi_assert_internal(has_lock || !mi_option_is_enabled(mi_option_visit_abandoned)); - // visit each set bit in the field (todo: maybe use `ctz` here?) 
- for (; bit_idx < MI_BITMAP_FIELD_BITS; bit_idx++) { - // pre-check if the bit is set - size_t mask = ((size_t)1 << bit_idx); - if mi_unlikely((field & mask) == mask) { - mi_bitmap_index_t bitmap_idx = mi_bitmap_index_create(field_idx, bit_idx); - mi_segment_t* const segment = mi_arena_segment_clear_abandoned_at(arena, previous->subproc, bitmap_idx); - if (segment != NULL) { - //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); - if (has_lock) { mi_lock_release(&arena->abandoned_visit_lock); } - previous->bitmap_idx = mi_bitmap_index_create_ex(field_idx, bit_idx + 1); // start at next one for the next iteration - return segment; - } - } - } - } - } - if (has_lock) { mi_lock_release(&arena->abandoned_visit_lock); } - } - } - return NULL; -} - -static mi_segment_t* mi_arena_segment_clear_abandoned_next_list(mi_arena_field_cursor_t* previous) { - // go through the abandoned_os_list - // we only allow one thread per sub-process to do to visit guarded by the `abandoned_os_visit_lock`. - // The lock is released when the cursor is released. - if (!previous->hold_visit_lock) { - previous->hold_visit_lock = (previous->visit_all ? (mi_lock_acquire(&previous->subproc->abandoned_os_visit_lock),true) - : mi_lock_try_acquire(&previous->subproc->abandoned_os_visit_lock)); - if (!previous->hold_visit_lock) { - if (previous->visit_all) { - _mi_error_message(EFAULT, "internal error: failed to visit all abandoned segments due to failure to acquire the OS visitor lock"); - } - return NULL; // we cannot get the lock, give up - } - } - // One list entry at a time - while (previous->os_list_count > 0) { - previous->os_list_count--; - mi_lock_acquire(&previous->subproc->abandoned_os_lock); // this could contend with concurrent OS block abandonment and reclaim from `free` - mi_segment_t* segment = previous->subproc->abandoned_os_list; - // pop from head of the list, a subsequent mark will push at the end (and thus we iterate through os_list_count entries) - if (segment == NULL || mi_arena_segment_os_clear_abandoned(segment, false /* we already have the lock */)) { - mi_lock_release(&previous->subproc->abandoned_os_lock); - return segment; - } - // already abandoned, try again - mi_lock_release(&previous->subproc->abandoned_os_lock); - } - // done - mi_assert_internal(previous->os_list_count == 0); - return NULL; -} - - -// reclaim abandoned segments -// this does not set the thread id (so it appears as still abandoned) -mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous) { - if (previous->start < previous->end) { - // walk the arena - mi_segment_t* segment = mi_arena_segment_clear_abandoned_next_field(previous); - if (segment != NULL) { return segment; } - } - // no entries in the arena's anymore, walk the abandoned OS list - mi_assert_internal(previous->start == previous->end); - return mi_arena_segment_clear_abandoned_next_list(previous); -} - - -bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { - // (unfortunately) the visit_abandoned option must be enabled from the start. 
- // This is to avoid taking locks if abandoned list visiting is not required (as for most programs) - if (!mi_option_is_enabled(mi_option_visit_abandoned)) { - _mi_error_message(EFAULT, "internal error: can only visit abandoned blocks when MIMALLOC_VISIT_ABANDONED=ON"); - return false; - } - mi_arena_field_cursor_t current; - _mi_arena_field_cursor_init(NULL, _mi_subproc_from_id(subproc_id), true /* visit all (blocking) */, ¤t); - mi_segment_t* segment; - bool ok = true; - while (ok && (segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL) { - ok = _mi_segment_visit_blocks(segment, heap_tag, visit_blocks, visitor, arg); - _mi_arena_segment_mark_abandoned(segment); - } - _mi_arena_field_cursor_done(¤t); - return ok; -}
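Editor's note on the lock changes above: mi_lock_acquire becomes a void function that reports a failure instead of returning a bool, and mi_subproc_delete now uses the scoped mi_lock(&subproc->abandoned_os_lock) { ... } form. The following is a small standalone sketch of that pattern under a pthreads build; the WITH_LOCK macro, lock_acquire, and the error reporting are illustrative stand-ins, not mimalloc's actual definitions.

// Standalone sketch (not mimalloc code): a void-returning acquire that reports
// failure, plus a scoped-lock macro in the spirit of `mi_lock(...) { ... }`.
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

typedef pthread_mutex_t lock_t;

static inline void lock_acquire(lock_t* lock) {
  const int err = pthread_mutex_lock(lock);
  if (err != 0) {
    // callers no longer check a result; report instead of silently continuing
    fprintf(stderr, "internal error: lock cannot be acquired (err %d)\n", err);
  }
}

static inline void lock_release(lock_t* lock) {
  pthread_mutex_unlock(lock);
}

// run the following block with the lock held; release it when the block exits
#define WITH_LOCK(lock) \
  for (bool _once = (lock_acquire(lock), true); _once; (lock_release(lock), _once = false))

static lock_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static int list_len = 0;

int main(void) {
  bool safe_to_delete = false;
  WITH_LOCK(&list_lock) {   // mirrors the scoped form used in mi_subproc_delete
    if (list_len == 0) { safe_to_delete = true; }
  }                         // lock released here
  printf("safe_to_delete = %d\n", safe_to_delete);
  return 0;
}

Note that in this simplified macro a break or early return inside the block would skip the release; a production version has to guard against that.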
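Editor's note on patch 13 (limit purging to one purge cycle per purge delay): an expiration time is installed with a strong CAS only when none was pending, the purge walk is skipped while nothing is due, and the expiration is pushed one delay ahead when a cycle does run (and reset to zero once every arena has been visited). The sketch below shows the general shape of that throttle with C11 atomics; the names, the clock helper, and the single global expiration are assumptions for illustration and differ from the per-arena plus global bookkeeping in the patch.

// Standalone sketch (not mimalloc code): run at most one purge pass per delay.
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <time.h>

typedef int64_t msecs_t;

static atomic_llong global_purge_expire = 0;   // 0 = no purge scheduled
static const msecs_t purge_delay = 1000;       // e.g. one second

static msecs_t clock_now(void) {
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);
  return (msecs_t)ts.tv_sec * 1000 + ts.tv_nsec / 1000000;
}

// called when blocks become free: schedule a purge only if none is pending yet
static void schedule_purge(void) {
  long long expected = 0;
  const long long expire = clock_now() + purge_delay;
  // only the first caller in a cycle sets the expiration; later calls keep it
  atomic_compare_exchange_strong(&global_purge_expire, &expected, expire);
}

// called on allocation/free paths: do the purge work at most once per delay
static bool try_purge(bool force) {
  const msecs_t now = clock_now();
  const long long expire = atomic_load(&global_purge_expire);
  if (!force && (expire == 0 || expire > now)) return false;  // not due yet
  // push the expiration one delay ahead so concurrent callers skip this cycle
  atomic_store(&global_purge_expire, now + purge_delay);
  // ... walk the arenas and purge expired blocks here ...
  bool all_visited = true;
  if (all_visited) {
    atomic_store(&global_purge_expire, 0);  // nothing pending anymore
  }
  return true;
}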
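Editor's note on the deleted arena-abandon.c: it describes abandoning and reclaiming segments through an atomic blocks_abandoned bitmap, where marking sets a bit, reclaiming clears it, and only the thread that actually flips the bit wins ownership. A minimal standalone sketch of that claim/unclaim protocol, using plain C11 atomics rather than mimalloc's bitmap helpers:

// Standalone sketch (not mimalloc code): atomic claim/unclaim of one bit.
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

typedef _Atomic(uint64_t) bitmap_field_t;

// mark as abandoned: true if the bit was previously clear (we set it)
static bool bitmap_claim(bitmap_field_t* field, unsigned bit) {
  const uint64_t mask = (uint64_t)1 << bit;
  const uint64_t prev = atomic_fetch_or(field, mask);
  return (prev & mask) == 0;
}

// reclaim: true only for the single thread that clears a set bit
static bool bitmap_unclaim(bitmap_field_t* field, unsigned bit) {
  const uint64_t mask = (uint64_t)1 << bit;
  const uint64_t prev = atomic_fetch_and(field, ~mask);
  return (prev & mask) != 0;
}

This matches the property the removed code relied on: _mi_arena_segment_clear_abandoned only takes ownership (and restores the thread id) when the unclaim reports that the bit was indeed set, so concurrent reclaimers cannot both adopt the same segment.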