diff --git a/ide/vs2022/mimalloc-test-stress.vcxproj b/ide/vs2022/mimalloc-test-stress.vcxproj
index 672cbb87..fd88cd8e 100644
--- a/ide/vs2022/mimalloc-test-stress.vcxproj
+++ b/ide/vs2022/mimalloc-test-stress.vcxproj
@@ -279,8 +279,8 @@
     </ClCompile>
   </ItemGroup>
   <ItemGroup>
-    <ProjectReference Include="mimalloc-override.vcxproj">
-      <Project>{abb5eae7-b3e6-432e-b636-333449892ea7}</Project>
+    <ProjectReference Include="mimalloc.vcxproj">
+      <Project>{abb5eae7-b3e6-432e-b636-333449892ea6}</Project>
     </ProjectReference>
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h
index ab1e161d..0c7fafe3 100644
--- a/include/mimalloc/atomic.h
+++ b/include/mimalloc/atomic.h
@@ -417,6 +417,8 @@ static inline void mi_atomic_yield(void) {
 
 #if defined(_WIN32)
 
+#if 0
+
 #define mi_lock_t  CRITICAL_SECTION
 
 static inline bool mi_lock_try_acquire(mi_lock_t* lock) {
@@ -436,7 +438,8 @@ static inline void mi_lock_done(mi_lock_t* lock) {
   DeleteCriticalSection(lock);
 }
 
-#if 0
+#else
+
 #define mi_lock_t  SRWLOCK   // slim reader-writer lock
 
 static inline bool mi_lock_try_acquire(mi_lock_t* lock) {
@@ -455,6 +458,7 @@ static inline void mi_lock_init(mi_lock_t* lock) {
 static inline void mi_lock_done(mi_lock_t* lock) {
   (void)(lock);
 }
+
 #endif
 
 #elif defined(MI_USE_PTHREADS)
diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h
index 24792f8c..7774b378 100644
--- a/include/mimalloc/internal.h
+++ b/include/mimalloc/internal.h
@@ -90,7 +90,6 @@ uintptr_t   _mi_os_random_weak(uintptr_t extra_seed);
 static inline uintptr_t _mi_random_shuffle(uintptr_t x);
 
 // init.c
-extern mi_decl_cache_align mi_stats_t       _mi_stats_main;
 extern mi_decl_cache_align const mi_page_t  _mi_page_empty;
 void        _mi_process_load(void);
 void mi_cdecl _mi_process_done(void);
diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h
index 4d43e887..ca3913ad 100644
--- a/include/mimalloc/types.h
+++ b/include/mimalloc/types.h
@@ -293,7 +293,7 @@ typedef struct mi_page_s {
   uintptr_t                 keys[2];           // two random keys to encode the free lists (see `_mi_block_next`) or padding canary
   #endif
 
-  mi_heap_t*                heap;              // heap this threads belong to.
+  mi_heap_t*                heap;              // the heap owning this page (or NULL for abandoned pages)
   struct mi_page_s*         next;              // next page owned by the heap with the same `block_size`
   struct mi_page_s*         prev;              // previous page owned by the heap with the same `block_size`
   mi_memid_t                memid;             // provenance of the page memory
@@ -394,7 +394,7 @@ typedef struct mi_padding_s {
 // A heap owns a set of pages.
 struct mi_heap_s {
   mi_tld_t*             tld;                                 // thread-local data
-  mi_arena_t*           exclusive_arena;                     // if the heap belongs to a specific arena (or NULL)
+  mi_arena_t*           exclusive_arena;                     // if the heap should only allocate from a specific arena (or NULL)
   uintptr_t             cookie;                              // random cookie to verify pointers (see `_mi_ptr_cookie`)
   uintptr_t             keys[2];                             // two random keys used to encode the `thread_delayed_free` list
   mi_random_ctx_t       random;                              // random number context used for secure allocation
@@ -444,18 +444,18 @@ typedef struct mi_stat_counter_s {
 } mi_stat_counter_t;
 
 typedef struct mi_stats_s {
-  mi_stat_count_t pages;
-  mi_stat_count_t reserved;
-  mi_stat_count_t committed;
-  mi_stat_count_t reset;
-  mi_stat_count_t purged;
-  mi_stat_count_t page_committed;
-  mi_stat_count_t pages_abandoned;
-  mi_stat_count_t threads;
-  mi_stat_count_t normal;
-  mi_stat_count_t huge;
-  mi_stat_count_t giant;
-  mi_stat_count_t malloc;
+  mi_stat_count_t   pages;
+  mi_stat_count_t   reserved;
+  mi_stat_count_t   committed;
+  mi_stat_count_t   reset;
+  mi_stat_count_t   purged;
+  mi_stat_count_t   page_committed;
+  mi_stat_count_t   pages_abandoned;
+  mi_stat_count_t   threads;
+  mi_stat_count_t   normal;
+  mi_stat_count_t   huge;
+  mi_stat_count_t   giant;
+  mi_stat_count_t   malloc;
   mi_stat_counter_t pages_extended;
   mi_stat_counter_t pages_reclaim_on_alloc;
   mi_stat_counter_t pages_reclaim_on_free;
@@ -479,37 +479,72 @@ typedef struct mi_stats_s {
 
 
 // add to stat keeping track of the peak
-void _mi_stat_increase(mi_stat_count_t* stat, size_t amount);
-void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount);
+void __mi_stat_increase(mi_stat_count_t* stat, size_t amount);
+void __mi_stat_decrease(mi_stat_count_t* stat, size_t amount);
+void __mi_stat_increase_mt(mi_stat_count_t* stat, size_t amount);
+void __mi_stat_decrease_mt(mi_stat_count_t* stat, size_t amount);
 // adjust stat in special cases to compensate for double counting
-void _mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount, bool on_alloc);
-void _mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount, bool on_free);
+void __mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount, bool on_alloc);
+void __mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount, bool on_free);
+void __mi_stat_adjust_increase_mt(mi_stat_count_t* stat, size_t amount, bool on_alloc);
+void __mi_stat_adjust_decrease_mt(mi_stat_count_t* stat, size_t amount, bool on_free);
 // counters can just be increased
-void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount);
+void __mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount);
+void __mi_stat_counter_increase_mt(mi_stat_counter_t* stat, size_t amount);
 
 #if (MI_STAT)
-#define mi_stat_increase(stat,amount)         _mi_stat_increase( &(stat), amount)
-#define mi_stat_decrease(stat,amount)         _mi_stat_decrease( &(stat), amount)
-#define mi_stat_counter_increase(stat,amount) _mi_stat_counter_increase( &(stat), amount)
-#define mi_stat_adjust_increase(stat,amnt,b)  _mi_stat_adjust_increase( &(stat), amnt, b)
-#define mi_stat_adjust_decrease(stat,amnt,b)  _mi_stat_adjust_decrease( &(stat), amnt, b)
+#define mi_debug_stat_increase(stat,amount)                     __mi_stat_increase( &(stat), amount)
+#define mi_debug_stat_decrease(stat,amount)                     __mi_stat_decrease( &(stat), amount)
+#define mi_debug_stat_counter_increase(stat,amount)             __mi_stat_counter_increase( &(stat), amount)
+#define mi_debug_stat_increase_mt(stat,amount)                  __mi_stat_increase_mt( &(stat), amount)
+#define mi_debug_stat_decrease_mt(stat,amount)                  __mi_stat_decrease_mt( &(stat), amount)
+#define mi_debug_stat_counter_increase_mt(stat,amount)          __mi_stat_counter_increase_mt( &(stat), amount)
+#define mi_debug_stat_adjust_increase_mt(stat,amnt,b)           __mi_stat_adjust_increase_mt( &(stat), amnt, b)
+#define mi_debug_stat_adjust_decrease_mt(stat,amnt,b)           __mi_stat_adjust_decrease_mt( &(stat), amnt, b)
 #else
-#define mi_stat_increase(stat,amount)         ((void)0)
-#define mi_stat_decrease(stat,amount)         ((void)0)
-#define mi_stat_counter_increase(stat,amount) ((void)0)
-#define mi_stat_adjuct_increase(stat,amnt,b)  ((void)0)
-#define mi_stat_adjust_decrease(stat,amnt,b)  ((void)0)
+#define mi_debug_stat_increase(stat,amount)                     ((void)0)   // no-op variants used when MI_STAT is disabled
+#define mi_debug_stat_decrease(stat,amount)                     ((void)0)
+#define mi_debug_stat_counter_increase(stat,amount)             ((void)0)
+#define mi_debug_stat_increase_mt(stat,amount)                  ((void)0)
+#define mi_debug_stat_decrease_mt(stat,amount)                  ((void)0)
+#define mi_debug_stat_counter_increase_mt(stat,amount)          ((void)0)
+#define mi_debug_stat_adjust_increase_mt(stat,amnt,b)           ((void)0)   // same `_mt` names as the MI_STAT branch so both configurations define the same macros
+#define mi_debug_stat_adjust_decrease_mt(stat,amnt,b)           ((void)0)
 #endif
 
-#define mi_heap_stat_counter_increase(heap,stat,amount)  mi_stat_counter_increase( (heap)->tld->stats.stat, amount)
-#define mi_heap_stat_increase(heap,stat,amount)  mi_stat_increase( (heap)->tld->stats.stat, amount)
-#define mi_heap_stat_decrease(heap,stat,amount)  mi_stat_decrease( (heap)->tld->stats.stat, amount)
+#define mi_subproc_stat_counter_increase(subproc,stat,amount)   __mi_stat_counter_increase_mt( &(subproc)->stats.stat, amount)   // sub-process stats: use the `_mt` update functions and are not gated on MI_STAT
+#define mi_subproc_stat_increase(subproc,stat,amount)           __mi_stat_increase_mt( &(subproc)->stats.stat, amount)
+#define mi_subproc_stat_decrease(subproc,stat,amount)           __mi_stat_decrease_mt( &(subproc)->stats.stat, amount)
+#define mi_subproc_stat_adjust_increase(subproc,stat,amnt,b)    __mi_stat_adjust_increase_mt( &(subproc)->stats.stat, amnt, b)
+#define mi_subproc_stat_adjust_decrease(subproc,stat,amnt,b)    __mi_stat_adjust_decrease_mt( &(subproc)->stats.stat, amnt, b)
+// OS-level stats: recorded on the sub-process returned by `_mi_subproc()`
+#define mi_os_stat_counter_increase(stat,amount)                mi_subproc_stat_counter_increase(_mi_subproc(),stat,amount)
+#define mi_os_stat_increase(stat,amount)                        mi_subproc_stat_increase(_mi_subproc(),stat,amount)
+#define mi_os_stat_decrease(stat,amount)                        mi_subproc_stat_decrease(_mi_subproc(),stat,amount)
+// Thread-local stats: update `tld->stats` through the plain (non-`_mt`) functions; not gated on MI_STAT
+#define mi_tld_stat_counter_increase(tld,stat,amount)           __mi_stat_counter_increase( &(tld)->stats.stat, amount)
+#define mi_tld_stat_increase(tld,stat,amount)                   __mi_stat_increase( &(tld)->stats.stat, amount)
+#define mi_tld_stat_decrease(tld,stat,amount)                   __mi_stat_decrease( &(tld)->stats.stat, amount)
+// Debug variants of the thread-local stats: expand to `((void)0)` unless MI_STAT is set
+#define mi_debug_tld_stat_counter_increase(tld,stat,amount)     mi_debug_stat_counter_increase( (tld)->stats.stat, amount)
+#define mi_debug_tld_stat_increase(tld,stat,amount)             mi_debug_stat_increase( (tld)->stats.stat, amount)
+#define mi_debug_tld_stat_decrease(tld,stat,amount)             mi_debug_stat_decrease( (tld)->stats.stat, amount)
+// Heap stats: forwarded to the stats of the heap's tld
+#define mi_heap_stat_counter_increase(heap,stat,amount)         mi_tld_stat_counter_increase((heap)->tld, stat, amount)
+#define mi_heap_stat_increase(heap,stat,amount)                 mi_tld_stat_increase( (heap)->tld, stat, amount)
+#define mi_heap_stat_decrease(heap,stat,amount)                 mi_tld_stat_decrease( (heap)->tld, stat, amount)
+// Debug variants of the heap stats: forwarded to the debug tld macros
+#define mi_debug_heap_stat_counter_increase(heap,stat,amount)   mi_debug_tld_stat_counter_increase((heap)->tld, stat, amount)
+#define mi_debug_heap_stat_increase(heap,stat,amount)           mi_debug_tld_stat_increase( (heap)->tld, stat, amount)
+#define mi_debug_heap_stat_decrease(heap,stat,amount)           mi_debug_tld_stat_decrease( (heap)->tld, stat, amount)
 
 
 // ------------------------------------------------------
 // Sub processes use separate arena's and no heaps/pages/blocks
 // are shared between sub processes. 
-// Each thread should also belong to one sub-process only
+// The sub-process structure holds essentially all state that would otherwise live in static variables (but now kept per sub-process :-))
+//
+// Each thread should belong to one sub-process only
 // ------------------------------------------------------
 
 #define MI_MAX_ARENAS   (160)   // Limited for now (and takes up .bss).. but arena's scale up exponentially (see `mi_arena_reserve`)
@@ -519,10 +554,13 @@ typedef struct mi_subproc_s {
   _Atomic(size_t)       arena_count;                    // current count of arena's
   _Atomic(mi_arena_t*)  arenas[MI_MAX_ARENAS];          // arena's of this sub-process
   mi_lock_t             arena_reserve_lock;             // lock to ensure arena's get reserved one at a time
-  _Atomic(size_t)       abandoned_count[MI_BIN_COUNT];  // total count of abandoned pages for this sub-process
+
+  _Atomic(size_t)       abandoned_count[MI_BIN_COUNT];  // total count of abandoned pages for this sub-process  
   mi_page_queue_t       os_pages;                       // list of pages that OS allocated and not in an arena (only used if `mi_option_visit_abandoned` is on)
   mi_lock_t             os_pages_lock;                  // lock for the os pages list (this lock protects list operations)
+  
   mi_memid_t            memid;                          // provenance of this memory block (meta or OS)
+  mi_stats_t            stats;                          // sub-process statistics (tld stats are merged in on thread termination)
 } mi_subproc_t;
 
 
@@ -535,16 +573,16 @@ typedef int64_t  mi_msecs_t;
 
 // Thread local data
 struct mi_tld_s {
-  mi_threadid_t       thread_id;        // thread id of this thread
-  size_t              thread_seq;       // thread sequence id (linear count of created threads)
-  mi_subproc_t*       subproc;          // sub-process this thread belongs to.
-  mi_heap_t*          heap_backing;     // backing heap of this thread (cannot be deleted)
-  mi_heap_t*          heaps;            // list of heaps in this thread (so we can abandon all when the thread terminates)
-  unsigned long long  heartbeat;        // monotonic heartbeat count
-  bool                recurse;          // true if deferred was called; used to prevent infinite recursion.
-  bool                is_in_threadpool; // true if this thread is part of a threadpool (and can run arbitrary tasks)
-  mi_stats_t          stats;            // statistics
-  mi_memid_t          memid;            // provenance of the tld memory itself (meta or OS)
+  mi_threadid_t         thread_id;            // thread id of this thread
+  size_t                thread_seq;           // thread sequence id (linear count of created threads)
+  mi_subproc_t*         subproc;              // sub-process this thread belongs to.
+  mi_heap_t*            heap_backing;         // backing heap of this thread (cannot be deleted)
+  mi_heap_t*            heaps;                // list of heaps in this thread (so we can abandon all when the thread terminates)
+  unsigned long long    heartbeat;            // monotonic heartbeat count
+  bool                  recurse;              // true if deferred was called; used to prevent infinite recursion.
+  bool                  is_in_threadpool;     // true if this thread is part of a threadpool (and can run arbitrary tasks)
+  mi_stats_t            stats;                // statistics
+  mi_memid_t            memid;                // provenance of the tld memory itself (meta or OS)
 };
 
 
diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c
index 14cbee45..5da9fc0c 100644
--- a/src/alloc-aligned.c
+++ b/src/alloc-aligned.c
@@ -193,9 +193,7 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t
       const bool is_aligned = (((uintptr_t)page->free + offset) & align_mask)==0;
       if mi_likely(is_aligned)
       {
-        #if MI_STAT>1
-        mi_heap_stat_increase(heap, malloc, size);
-        #endif
+        mi_debug_heap_stat_increase(heap, malloc, size);
         void* p = (zero ? _mi_page_malloc_zeroed(heap,page,padsize) : _mi_page_malloc(heap,page,padsize)); // call specific page malloc for better codegen
         mi_assert_internal(p != NULL);
         mi_assert_internal(((uintptr_t)p + offset) % alignment == 0);
diff --git a/src/arena.c b/src/arena.c
index fd914f43..dcff8920 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -69,10 +69,6 @@ typedef struct mi_purge_info_s {
   Arena id's
 ----------------------------------------------------------- */
 
-static mi_arena_id_t mi_arena_id_create(mi_arena_t* arena) {
-  return arena;
-}
-
 mi_arena_id_t _mi_arena_id_none(void) {
   return NULL;
 }
@@ -222,14 +218,14 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(
       mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, &already_committed_count);
       // adjust the stats so we don't double count the commits
       if (already_committed_count > 0) {
-        _mi_stat_adjust_decrease(&_mi_stats_main.committed, mi_size_of_slices(already_committed_count), true /* on alloc */);
+        mi_subproc_stat_adjust_decrease(arena->subproc, committed, mi_size_of_slices(already_committed_count), true /* on alloc */);
       }
       // now actually commit
       bool commit_zero = false;
       if (!_mi_os_commit(p, mi_size_of_slices(slice_count), &commit_zero)) {
         // failed to commit (todo: give warning?)
         if (already_committed_count > 0) {
-          _mi_stat_increase(&_mi_stats_main.committed, mi_size_of_slices(already_committed_count));
+          mi_subproc_stat_increase(arena->subproc, committed, mi_size_of_slices(already_committed_count));
         }
         memid->initially_committed = false;
       }
@@ -251,7 +247,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(
       // if the OS has overcommit, and this is the first time we access these pages, then 
       // count the commit now (as at arena reserve we didn't count those commits as these are on-demand)
       if (_mi_os_has_overcommit() && touched_slices > 0) {
-        _mi_stat_increase(&_mi_stats_main.committed, mi_size_of_slices(touched_slices));
+        mi_subproc_stat_increase( arena->subproc, committed, mi_size_of_slices(touched_slices));
       }
     }
     // tool support
@@ -325,18 +321,18 @@ static bool mi_arena_reserve(mi_subproc_t* subproc, size_t req_size, bool allow_
   // on an OS with overcommit (Linux) we don't count the commit yet as it is on-demand. Once a slice
   // is actually allocated for the first time it will be counted.
   const bool adjust = (overcommit && arena_commit);
-  if (adjust) { _mi_stat_adjust_decrease(&_mi_stats_main.committed, arena_reserve, true /* on alloc */); }
+  if (adjust) { mi_subproc_stat_adjust_decrease( subproc, committed, arena_reserve, true /* on alloc */); }
   // and try to reserve the arena
   int err = mi_reserve_os_memory_ex2(subproc, arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id);
   if (err != 0) {
-    if (adjust) { _mi_stat_adjust_increase(&_mi_stats_main.committed, arena_reserve, true); } // roll back
+    if (adjust) { mi_subproc_stat_adjust_increase( subproc, committed, arena_reserve, true); } // roll back
     // failed, try a smaller size?
     const size_t small_arena_reserve = (MI_SIZE_BITS == 32 ? 128*MI_MiB : 1*MI_GiB);
-    if (adjust) { _mi_stat_adjust_decrease(&_mi_stats_main.committed, arena_reserve, true); }
+    if (adjust && arena_reserve > small_arena_reserve) { mi_subproc_stat_adjust_decrease( subproc, committed, small_arena_reserve, true); }  // only pre-adjust when a retry really happens, and by the size that is retried
     if (arena_reserve > small_arena_reserve) {
       // try again
       err = mi_reserve_os_memory_ex(small_arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id);
-      if (err != 0 && adjust) { _mi_stat_adjust_increase(&_mi_stats_main.committed, arena_reserve, true); } // roll back      
+      if (err != 0 && adjust) { mi_subproc_stat_adjust_increase( subproc, committed, small_arena_reserve, true); } // roll back the pre-adjustment of the retry
     }
   }
   return (err==0);
@@ -579,8 +575,8 @@ static mi_page_t* mi_arena_page_try_find_abandoned(mi_subproc_t* subproc, size_t
       mi_assert_internal(mi_page_is_abandoned(page));
       mi_assert_internal(mi_arena_has_page(arena,page));
       mi_atomic_decrement_relaxed(&subproc->abandoned_count[bin]);
-      _mi_stat_decrease(&_mi_stats_main.pages_abandoned, 1);
-      _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_alloc, 1);
+      mi_subproc_stat_decrease( arena->subproc, pages_abandoned, 1);
+      mi_subproc_stat_counter_increase(arena->subproc, pages_reclaim_on_alloc, 1);
 
       _mi_page_free_collect(page, false);  // update `used` count
       mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count));
@@ -828,12 +824,13 @@ void _mi_arena_page_abandon(mi_page_t* page) {
     const bool wasclear = mi_bitmap_set(arena->pages_abandoned[bin], slice_index);
     MI_UNUSED(wasclear); mi_assert_internal(wasclear);
     mi_atomic_increment_relaxed(&arena->subproc->abandoned_count[bin]);
+    mi_subproc_stat_increase(arena->subproc, pages_abandoned, 1);
   }
   else {
     // page is full (or a singleton), page is OS/externally allocated
     // leave as is; it will be reclaimed when an object is free'd in the page
-  }
-  _mi_stat_increase(&_mi_stats_main.pages_abandoned, 1);
+    mi_subproc_stat_increase(_mi_subproc(), pages_abandoned, 1);
+  }  
   _mi_page_unown(page);
 }
 
@@ -850,8 +847,9 @@ bool _mi_arena_page_try_reabandon_to_mapped(mi_page_t* page) {
     return false;
   }
   else {
-    _mi_stat_counter_increase(&_mi_stats_main.pages_reabandon_full, 1);
-    _mi_stat_adjust_decrease(&_mi_stats_main.pages_abandoned, 1, true /* on alloc */);  // adjust as we are not abandoning fresh
+    mi_subproc_t* subproc = _mi_subproc();
+    mi_subproc_stat_counter_increase( subproc, pages_reabandon_full, 1);
+    mi_subproc_stat_adjust_decrease( subproc, pages_abandoned, 1, true /* on alloc */);  // adjust as we are not abandoning fresh
     _mi_arena_page_abandon(page);
     return true;
   }
@@ -879,13 +877,14 @@ void _mi_arena_page_unabandon(mi_page_t* page) {
     mi_bitmap_clear_once_set(arena->pages_abandoned[bin], slice_index);
     mi_page_clear_abandoned_mapped(page);
     mi_atomic_decrement_relaxed(&arena->subproc->abandoned_count[bin]);
+    mi_subproc_stat_decrease(arena->subproc, pages_abandoned, 1);
   }
   else {
-    // page is full (or a singleton), page is OS/nly allocated
+    // page is full (or a singleton), page is OS allocated
     // nothing to do
     // TODO: maintain count of these as well?
-  }
-  _mi_stat_decrease(&_mi_stats_main.pages_abandoned, 1);
+    mi_subproc_stat_decrease(_mi_subproc(), pages_abandoned, 1);
+  }  
 }
 
 void _mi_arena_reclaim_all_abandoned(mi_heap_t* heap) {
@@ -1016,7 +1015,7 @@ void _mi_arena_unsafe_destroy_all(void) {
   Add an arena.
 ----------------------------------------------------------- */
 
-static bool mi_arena_add(mi_subproc_t* subproc, mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) {
+static bool mi_arena_add(mi_subproc_t* subproc, mi_arena_t* arena, mi_arena_id_t* arena_id) {
   mi_assert_internal(arena != NULL);
   mi_assert_internal(arena->slice_count > 0);
   if (arena_id != NULL) { *arena_id = NULL; }
@@ -1043,7 +1042,7 @@ static bool mi_arena_add(mi_subproc_t* subproc, mi_arena_t* arena, mi_arena_id_t
     return false;
   }
 
-  _mi_stat_counter_increase(&stats->arena_count,1);
+  mi_subproc_stat_counter_increase(subproc, arena_count, 1);  // count on the sub-process that receives the arena; `arena->subproc` is NULL when called from `mi_arena_reload`
   mi_atomic_store_ptr_release(mi_arena_t,&subproc->arenas[i], arena);
   if (arena_id != NULL) { *arena_id = arena; }
   return true;
@@ -1149,7 +1148,7 @@ static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t s
     mi_bitmap_setN(arena->slices_dirty, 0, info_slices, NULL);
   }
 
-  return mi_arena_add(subproc, arena, arena_id, &_mi_stats_main);
+  return mi_arena_add(subproc, arena, arena_id);
 }
 
 
@@ -1414,7 +1413,7 @@ static bool mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_c
 
   // update committed bitmap
   if (needs_recommit) {
-    _mi_stat_adjust_decrease(&_mi_stats_main.committed, mi_size_of_slices(slice_count - already_committed), false /* on freed */);
+    mi_subproc_stat_adjust_decrease( arena->subproc, committed, mi_size_of_slices(slice_count - already_committed), false /* on freed */);
     mi_bitmap_clearN(arena->slices_committed, slice_index, slice_count);
   }
   return needs_recommit;
@@ -1506,7 +1505,7 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force)
   if (mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire_base, (mi_msecs_t)0)) {
     mi_atomic_storei64_release(&arena->purge_expire_extend, (mi_msecs_t)0); // and also reset the extend
   }
-  _mi_stat_counter_increase(&_mi_stats_main.arena_purges, 1);
+  mi_subproc_stat_counter_increase(arena->subproc, arena_purges, 1);
 
   // go through all purge info's  (with max MI_BFIELD_BITS ranges at a time)
   // this also clears those ranges atomically (so any newly freed blocks will get purged next
@@ -1647,7 +1646,7 @@ mi_decl_export bool mi_arena_reload(void* start, size_t size, bool is_committed,
   arena->is_exclusive = true;
   arena->is_large = is_large;
   arena->subproc = NULL;
-  if (!mi_arena_add(_mi_subproc(), arena, arena_id, &_mi_stats_main)) {
+  if (!mi_arena_add(_mi_subproc(), arena, arena_id)) {
     return false;
   }
   mi_arena_pages_reregister(arena);
diff --git a/src/bitmap.c b/src/bitmap.c
index 6352e4ea..e4a4cc2d 100644
--- a/src/bitmap.c
+++ b/src/bitmap.c
@@ -106,7 +106,9 @@ static inline void mi_bfield_atomic_clear_once_set(_Atomic(mi_bfield_t)*b, size_
   do {
     if mi_unlikely((old&mask) == 0) {
       old = mi_atomic_load_acquire(b);
-      if ((old&mask)==0) { _mi_stat_counter_increase(&_mi_stats_main.pages_unabandon_busy_wait, 1); }
+      if ((old&mask)==0) { 
+        mi_subproc_stat_counter_increase(_mi_subproc(), pages_unabandon_busy_wait, 1); 
+      }
       while ((old&mask)==0) { // busy wait
         mi_atomic_yield();
         old = mi_atomic_load_acquire(b);
diff --git a/src/free.c b/src/free.c
index 770856da..88f784c7 100644
--- a/src/free.c
+++ b/src/free.c
@@ -242,7 +242,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) {
           // first remove it from the abandoned pages in the arena -- this waits for any readers to finish
           _mi_arena_page_unabandon(page);
           _mi_heap_page_reclaim(tagheap, page);
-          _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_free, 1);
+          mi_heap_stat_counter_increase(tagheap, pages_reclaim_on_free, 1);
           return;
         }
       }
diff --git a/src/heap.c b/src/heap.c
index e8743691..d82b383f 100644
--- a/src/heap.c
+++ b/src/heap.c
@@ -141,7 +141,7 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect)
 
   // collect all pages owned by this thread
   mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL);
-  
+
   // collect arenas (this is program wide so don't force purges on abandonment of threads)
   _mi_arenas_collect(collect == MI_FORCE /* force purge? */);
 }
@@ -183,9 +183,9 @@ mi_heap_t* mi_heap_get_backing(void) {
 }
 
 // todo: make order of parameters consistent (but would that break compat with CPython?)
-void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t heap_tag, mi_tld_t* tld) 
+void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t heap_tag, mi_tld_t* tld)
 {
-  mi_assert_internal(heap!=NULL);  
+  mi_assert_internal(heap!=NULL);
   mi_memid_t memid = heap->memid;
   _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t));
   heap->memid = memid;
@@ -204,7 +204,7 @@ void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint
       heap->full_page_retain = heap->full_page_retain / 4;
     }
   }
-  
+
   if (heap->tld->heap_backing == NULL) {
     heap->tld->heap_backing = heap;  // first heap becomes the backing heap
     _mi_random_init(&heap->random);
@@ -240,7 +240,7 @@ mi_heap_t* _mi_heap_create(int heap_tag, bool allow_destroy, mi_arena_id_t arena
 mi_decl_nodiscard mi_heap_t* mi_heap_new_ex(int heap_tag, bool allow_destroy, mi_arena_id_t arena_id) {
   mi_heap_t* bheap = mi_heap_get_backing();
   mi_assert_internal(bheap != NULL);
-  return _mi_heap_create(heap_tag, allow_destroy, arena_id, bheap->tld);  
+  return _mi_heap_create(heap_tag, allow_destroy, arena_id, bheap->tld);
 }
 
 mi_decl_nodiscard mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id) {
@@ -333,17 +333,17 @@ static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_
   if (bsize > MI_LARGE_MAX_OBJ_SIZE) {
     mi_heap_stat_decrease(heap, huge, bsize);
   }
-#if (MI_STAT)
+  #if (MI_STAT)
   _mi_page_free_collect(page, false);  // update used count
   const size_t inuse = page->used;
   if (bsize <= MI_LARGE_MAX_OBJ_SIZE) {
     mi_heap_stat_decrease(heap, normal, bsize * inuse);
-#if (MI_STAT>1)
+    #if (MI_STAT>1)
     mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], inuse);
-#endif
+    #endif
   }
   mi_heap_stat_decrease(heap, malloc, bsize * inuse);  // todo: off for aligned blocks...
-#endif
+  #endif
 
   /// pretend it is all free now
   mi_assert_internal(mi_page_thread_free(page) == NULL);
@@ -460,7 +460,7 @@ void mi_heap_delete(mi_heap_t* heap)
     // transfer still used pages to the backing heap
     mi_heap_absorb(bheap, heap);
   }
-  else 
+  else
   */
   {
     // abandon all pages
diff --git a/src/init.c b/src/init.c
index 177ca2bd..5159941a 100644
--- a/src/init.c
+++ b/src/init.c
@@ -34,7 +34,7 @@ const mi_page_t _mi_page_empty = {
   { 0, 0 },               // keys
   #endif
   NULL,                   // xheap
-  NULL, NULL,             // next, prev  
+  NULL, NULL,             // next, prev
   MI_MEMID_STATIC         // memid
 };
 
@@ -103,7 +103,7 @@ static mi_decl_cache_align mi_tld_t tld_empty = {
   0,                      // thread_seq
   &subproc_main,          // subproc
   NULL,                   // heap_backing
-  NULL,                   // heaps list  
+  NULL,                   // heaps list
   0,                      // heartbeat
   false,                  // recurse
   false,                  // is_in_threadpool
@@ -139,7 +139,7 @@ static mi_decl_cache_align mi_tld_t tld_main = {
   0,                      // thread_seq
   &subproc_main,          // subproc
   &heap_main,             // heap_backing
-  &heap_main,             // heaps list  
+  &heap_main,             // heaps list
   0,                      // heartbeat
   false,                  // recurse
   false,                  // is_in_threadpool
@@ -165,7 +165,7 @@ mi_decl_cache_align mi_heap_t heap_main = {
   #endif
   MI_SMALL_PAGES_EMPTY,
   MI_PAGE_QUEUES_EMPTY,
-  MI_MEMID_STATIC         
+  MI_MEMID_STATIC
 };
 
 
@@ -237,7 +237,7 @@ static void mi_tld_main_init(void) {
 
 // Initialization of the (statically allocated) main heap, and the main tld and subproc.
 static void mi_heap_main_init(void) {
-  if (heap_main.cookie == 0) {   
+  if (heap_main.cookie == 0) {
     mi_subproc_main_init();
     mi_tld_main_init();
     // heap
@@ -249,7 +249,7 @@ static void mi_heap_main_init(void) {
     #endif
     heap_main.cookie  = _mi_heap_random_next(&heap_main);
     heap_main.keys[0] = _mi_heap_random_next(&heap_main);
-    heap_main.keys[1] = _mi_heap_random_next(&heap_main);    
+    heap_main.keys[1] = _mi_heap_random_next(&heap_main);
     _mi_heap_guarded_init(&heap_main);
     heap_main.allow_page_abandon = (mi_option_get(mi_option_full_page_retain) >= 0);
     heap_main.full_page_retain   = mi_option_get_clamp(mi_option_full_page_retain, -1, 32);
@@ -266,14 +266,21 @@ mi_heap_t* heap_main_get(void) {
   Thread local data
 ----------------------------------------------------------- */
 
-// Thread sequence number
-static _Atomic(size_t) mi_tcount;
+// Count current and total created threads
+static _Atomic(size_t)  thread_count = MI_ATOMIC_VAR_INIT(1);   // incremented in `mi_tld_alloc`, decremented in `mi_tld_free` (initial 1 presumably accounts for the main thread -- TODO confirm)
+static _Atomic(size_t)  thread_total_count;                      // only ever increased; used in `mi_tld_alloc` to assign `tld->thread_seq`
+
+size_t  _mi_current_thread_count(void) {
+  return mi_atomic_load_relaxed(&thread_count);  // relaxed load of the current-thread counter
+}
+
 
 // The mimalloc thread local data
-mi_decl_thread mi_tld_t* mi_tld;
+mi_decl_thread mi_tld_t* thread_tld = &tld_empty;
 
 // Allocate fresh tld
 static mi_tld_t* mi_tld_alloc(void) {
+  mi_atomic_increment_relaxed(&thread_count);
   if (_mi_is_main_thread()) {
     return &tld_main;
   }
@@ -292,7 +299,7 @@ static mi_tld_t* mi_tld_alloc(void) {
     tld->heaps = NULL;
     tld->subproc = &subproc_main;
     tld->thread_id = _mi_prim_thread_id();
-    tld->thread_seq = mi_atomic_add_acq_rel(&mi_tcount, 1);
+    tld->thread_seq = mi_atomic_add_acq_rel(&thread_total_count, 1);
     tld->is_in_threadpool = _mi_prim_thread_is_in_threadpool();
     return tld;
   }
@@ -301,28 +308,38 @@ static mi_tld_t* mi_tld_alloc(void) {
 #define MI_TLD_INVALID  ((mi_tld_t*)1)
 
 mi_decl_noinline static void mi_tld_free(void) {
-  mi_tld_t* tld = _mi_tld();
-  mi_tld = MI_TLD_INVALID;
-  _mi_meta_free(tld, sizeof(mi_tld_t), tld->memid);
+  mi_tld_t* tld = _mi_tld();  // tld of the current thread
+  if (tld != NULL && tld != MI_TLD_INVALID) {
+    _mi_stats_done(&tld->stats);  // merge stats before the tld memory is released
+    _mi_meta_free(tld, sizeof(mi_tld_t), tld->memid);
+  }
+  thread_tld = MI_TLD_INVALID;  // poison the thread-local itself (not the local copy) so `_mi_tld()` detects use after termination
+  mi_atomic_decrement_relaxed(&thread_count);
 }
 
 mi_decl_noinline mi_tld_t* _mi_tld(void) {
-  if (mi_tld == MI_TLD_INVALID) {
-    _mi_error_message(EFAULT, "internal error: tld accessed after the thread terminated\n");
-    mi_tld = NULL;
+  mi_tld_t* tld = thread_tld;  // read the thread-local once and work on the copy
+  if (tld == MI_TLD_INVALID) {
+    _mi_error_message(EFAULT, "internal error: tld is accessed after the thread terminated\n");
+    thread_tld = tld = &tld_empty;  // reset the copy too, so a fresh tld is allocated below instead of returning the invalid marker
   }
-  if (mi_tld==NULL) {
-    mi_tld = mi_tld_alloc();
+  if (tld==&tld_empty) {  // `&tld_empty` is the initial value of `thread_tld`: no tld allocated yet
+    thread_tld = tld = mi_tld_alloc();
   }  
-  return mi_tld;
+  return tld;
 }
 
 mi_subproc_t* _mi_subproc(void) {
-  if (_mi_is_main_thread()) {  // during initialization we should not recurse over reading the _mi_tld
-    return &subproc_main;  
+  // Must work without doing initialization, as it may be called from `_mi_tld -> mi_tld_alloc ... -> os_alloc -> _mi_subproc()`.
+  // todo: this will still fail on systems where the first access to a thread-local causes allocation.
+  //       On such systems we can check for this with `mi_prim_get_default_heap` as those are protected (by being
+  //       stored in a TLS slot for example).
+  mi_heap_t* heap = mi_prim_get_default_heap();
+  if (heap == NULL || heap == &_mi_heap_empty) {  // no default heap set for this thread yet: use the main subprocess
+    return _mi_subproc_main();
   }
   else {
-    return _mi_tld()->subproc;
+    return thread_tld->subproc;  // don't call `_mi_tld()` (it can allocate and so recurse back into this function)
   }
 }
 
@@ -396,11 +413,11 @@ static bool _mi_thread_heap_init(void) {
     //mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_prim_get_default_heap());
   }
   else {
-    // allocates tld data 
-    // note: we cannot access thread-locals yet as that can cause (recursive) allocation 
+    // allocates tld data
+    // note: we cannot access thread-locals yet as that can cause (recursive) allocation
     // (on macOS <= 14 for example where the loader allocates thread-local data on demand).
-    mi_tld_t* tld = mi_tld_alloc();  
-    
+    mi_tld_t* tld = mi_tld_alloc();
+
     // allocate and initialize the heap
     mi_heap_t* heap = _mi_heap_create(0 /* default tag */, false /* allow destroy? */, _mi_arena_id_none(), tld);
 
@@ -409,7 +426,7 @@ static bool _mi_thread_heap_init(void) {
     _mi_heap_set_default_direct(heap);
 
     // now that the heap is set for this thread, we can set the thread-local tld.
-    mi_tld = tld;
+    thread_tld = tld;
   }
   return false;
 }
@@ -444,9 +461,6 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) {
     _mi_heap_collect_abandon(heap);
   }
 
-  // merge stats
-  _mi_stats_done(&heap->tld->stats);
-
   // free heap meta data
   _mi_meta_free(heap, sizeof(mi_heap_t), heap->memid);
 
@@ -494,11 +508,6 @@ bool _mi_is_main_thread(void) {
   return (tld_main.thread_id==0 || tld_main.thread_id == _mi_thread_id());
 }
 
-static _Atomic(size_t) thread_count = MI_ATOMIC_VAR_INIT(1);
-
-size_t  _mi_current_thread_count(void) {
-  return mi_atomic_load_relaxed(&thread_count);
-}
 
 // This is called from the `mi_malloc_generic`
 void mi_thread_init(void) mi_attr_noexcept
@@ -511,8 +520,7 @@ void mi_thread_init(void) mi_attr_noexcept
   //  fiber/pthread key to a non-zero value, ensuring `_mi_thread_done` is called)
   if (_mi_thread_heap_init()) return;  // returns true if already initialized
 
-  _mi_stat_increase(&_mi_stats_main.threads, 1);
-  mi_atomic_increment_relaxed(&thread_count);
+  mi_subproc_stat_increase(_mi_subproc_main(), threads, 1);  
   //_mi_verbose_message("thread init: 0x%zx\n", _mi_thread_id());
 }
 
@@ -534,15 +542,14 @@ void _mi_thread_done(mi_heap_t* heap)
   }
 
   // adjust stats
-  mi_atomic_decrement_relaxed(&thread_count);
-  _mi_stat_decrease(&_mi_stats_main.threads, 1);
+  mi_subproc_stat_decrease(_mi_subproc_main(), threads, 1);
 
   // check thread-id as on Windows shutdown with FLS the main (exit) thread may call this on thread-local heaps...
   if (heap->tld->thread_id != _mi_prim_thread_id()) return;
 
   // abandon the thread local heap
   _mi_thread_heap_done(heap);  // returns true if already ran
-
+  
   // free thread local data
   mi_tld_free();
 }
@@ -654,7 +661,7 @@ void mi_process_init(void) mi_attr_noexcept {
   _mi_prim_thread_associate_default_heap(NULL);
   #endif
 
-  mi_stats_reset();  // only call stat reset *after* thread init (or the heap tld == NULL)
+  mi_stats_reset();  // only call stat reset *after* thread init (or the heap tld == NULL)  
   mi_track_init();
 
   if (mi_option_is_enabled(mi_option_reserve_huge_os_pages)) {
diff --git a/src/os.c b/src/os.c
index 86ecb16b..53e8f571 100644
--- a/src/os.c
+++ b/src/os.c
@@ -114,9 +114,9 @@ static void mi_os_prim_free(void* addr, size_t size, bool still_committed) {
     _mi_warning_message("unable to free OS memory (error: %d (0x%x), size: 0x%zx bytes, address: %p)\n", err, err, size, addr);
   }
   if (still_committed) {
-    _mi_stat_decrease(&os_stats->committed, size);
+    mi_os_stat_decrease(committed, size);
   }
-  _mi_stat_decrease(&os_stats->reserved, size);
+  mi_os_stat_decrease(reserved, size);
 }
 
 void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t memid) {
@@ -171,11 +171,11 @@ static void* mi_os_prim_alloc_at(void* hint_addr, size_t size, size_t try_alignm
     _mi_warning_message("unable to allocate OS memory (error: %d (0x%x), addr: %p, size: 0x%zx bytes, align: 0x%zx, commit: %d, allow large: %d)\n", err, err, hint_addr, size, try_alignment, commit, allow_large);
   }
 
-  _mi_stat_counter_increase(&os_stats->mmap_calls, 1);
+  mi_os_stat_counter_increase(mmap_calls, 1);
   if (p != NULL) {
-    _mi_stat_increase(&os_stats->reserved, size);
+    mi_os_stat_increase(reserved, size);
     if (commit) {
-      _mi_stat_increase(&os_stats->committed, size);
+      mi_os_stat_increase(committed, size);
       // seems needed for asan (or `mimalloc-test-api` fails)
       #ifdef MI_TRACK_ASAN
       if (*is_zero) { mi_track_mem_defined(p,size); }
@@ -290,7 +290,7 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo
   if (size == 0) return NULL;
   size = _mi_os_good_alloc_size(size);
   alignment = _mi_align_up(alignment, _mi_os_page_size());
-  
+
   bool os_is_large = false;
   bool os_is_zero  = false;
   void* os_base = NULL;
@@ -379,8 +379,8 @@ static void* mi_os_page_align_area_conservative(void* addr, size_t size, size_t*
 
 bool _mi_os_commit(void* addr, size_t size, bool* is_zero) {
   if (is_zero != NULL) { *is_zero = false; }
-  _mi_stat_increase(&os_stats->committed, size);  // use size for precise commit vs. decommit
-  _mi_stat_counter_increase(&os_stats->commit_calls, 1);
+  mi_os_stat_increase(committed, size);  // use size for precise commit vs. decommit
+  mi_os_stat_counter_increase(commit_calls, 1);
 
   // page align range
   size_t csize;
@@ -408,7 +408,7 @@ bool _mi_os_commit(void* addr, size_t size, bool* is_zero) {
 
 static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit) {
   mi_assert_internal(needs_recommit!=NULL);
-  _mi_stat_decrease(&os_stats->committed, size);
+  mi_os_stat_decrease(committed, size);
 
   // page align
   size_t csize;
@@ -440,8 +440,8 @@ bool _mi_os_reset(void* addr, size_t size) {
   size_t csize;
   void* start = mi_os_page_align_area_conservative(addr, size, &csize);
   if (csize == 0) return true;  // || _mi_os_is_huge_reserved(addr)
-  _mi_stat_increase(&os_stats->reset, csize);
-  _mi_stat_counter_increase(&os_stats->reset_calls, 1);
+  mi_os_stat_increase(reset, csize);
+  mi_os_stat_counter_increase(reset_calls, 1);
 
   #if (MI_DEBUG>1) && !MI_SECURE && !MI_TRACK_ENABLED // && !MI_TSAN
   memset(start, 0, csize); // pretend it is eagerly reset
@@ -460,8 +460,8 @@ bool _mi_os_reset(void* addr, size_t size) {
 bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset)
 {
   if (mi_option_get(mi_option_purge_delay) < 0) return false;  // is purging allowed?
-  _mi_stat_counter_increase(&os_stats->purge_calls, 1);
-  _mi_stat_increase(&os_stats->purged, size);
+  mi_os_stat_counter_increase(purge_calls, 1);
+  mi_os_stat_increase(purged, size);
 
   if (mi_option_is_enabled(mi_option_purge_decommits) &&   // should decommit?
     !_mi_preloading())                                     // don't decommit during preloading (unsafe)
@@ -595,8 +595,8 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse
 
     // success, record it
     page++;  // increase before timeout check (see issue #711)
-    _mi_stat_increase(&os_stats->committed, MI_HUGE_OS_PAGE_SIZE);
-    _mi_stat_increase(&os_stats->reserved, MI_HUGE_OS_PAGE_SIZE);
+    mi_os_stat_increase(committed, MI_HUGE_OS_PAGE_SIZE);
+    mi_os_stat_increase(reserved, MI_HUGE_OS_PAGE_SIZE);
 
     // check for timeout
     if (max_msecs > 0) {
diff --git a/src/page.c b/src/page.c
index 0444b47e..31dbcc7d 100644
--- a/src/page.c
+++ b/src/page.c
@@ -387,9 +387,9 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept {
   const size_t bsize = mi_page_block_size(page);
   if mi_likely( /* bsize < MI_MAX_RETIRE_SIZE && */ !mi_page_queue_is_special(pq)) {  // not full or huge queue?
     if (pq->last==page && pq->first==page) { // the only page in the queue?
-      mi_stat_counter_increase(_mi_stats_main.page_no_retire,1);
-      page->retire_expire = (bsize <= MI_SMALL_MAX_OBJ_SIZE ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4);
       mi_heap_t* heap = mi_page_heap(page);
+      mi_debug_heap_stat_counter_increase(heap, page_no_retire, 1);
+      page->retire_expire = (bsize <= MI_SMALL_MAX_OBJ_SIZE ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4);
       mi_assert_internal(pq >= heap->pages);
       const size_t index = pq - heap->pages;
       mi_assert_internal(index < MI_BIN_FULL && index < MI_BIN_HUGE);
@@ -554,7 +554,7 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page) {
   size_t page_size;
   //uint8_t* page_start =
   mi_page_area(page, &page_size);
-  mi_heap_stat_counter_increase(heap, pages_extended, 1);
+  mi_debug_heap_stat_counter_increase(heap, pages_extended, 1);
 
   // calculate the extend count
   const size_t bsize = mi_page_block_size(page);
@@ -583,7 +583,7 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page) {
   }
   // enable the new free list
   page->capacity += (uint16_t)extend;
-  mi_heap_stat_increase(heap, page_committed, extend * bsize);
+  mi_debug_heap_stat_increase(heap, page_committed, extend * bsize);
   mi_assert_expensive(mi_page_is_valid_init(page));
 }
 
@@ -709,8 +709,8 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m
     page = next;
   } // for each page
 
-  mi_heap_stat_counter_increase(heap, searches, count);
-
+  mi_debug_heap_stat_counter_increase(heap, searches, count);
+  
   // set the page to the best candidate
   if (page_candidate != NULL) {
     page = page_candidate;
diff --git a/src/stats.c b/src/stats.c
index bb17b936..2a395ed5 100644
--- a/src/stats.c
+++ b/src/stats.c
@@ -19,88 +19,93 @@ terms of the MIT license. A copy of the license can be found in the file
   Statistics operations
 ----------------------------------------------------------- */
 
-static bool mi_is_in_main(void* stat) {
-  return ((uint8_t*)stat >= (uint8_t*)&_mi_stats_main
-         && (uint8_t*)stat < ((uint8_t*)&_mi_stats_main + sizeof(mi_stats_t)));
+static void mi_stat_update_mt(mi_stat_count_t* stat, int64_t amount) {
+  if (amount == 0) return;
+  // add atomically (relaxed ordering); positive amounts count as allocated, negative ones as freed
+  int64_t current = mi_atomic_addi64_relaxed(&stat->current, amount);  // NOTE(review): presumably yields the value before the add -- confirm
+  mi_atomic_maxi64_relaxed(&stat->peak, current + amount);             // candidate peak is the value after the add
+  if (amount > 0) {
+    mi_atomic_addi64_relaxed(&stat->allocated, amount);
+  }
+  else {
+    mi_atomic_addi64_relaxed(&stat->freed, -amount);
+  }
 }
 
 static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) {
   if (amount == 0) return;
-  if mi_unlikely(mi_is_in_main(stat))
-  {
-    // add atomically (for abandoned pages)
-    int64_t current = mi_atomic_addi64_relaxed(&stat->current, amount);
-    mi_atomic_maxi64_relaxed(&stat->peak, current + amount);
-    if (amount > 0) {
-      mi_atomic_addi64_relaxed(&stat->allocated,amount);
-    }
-    else {
-      mi_atomic_addi64_relaxed(&stat->freed, -amount);
-    }
+  // add thread local: plain (non-atomic) updates; `mi_stat_update_mt` is the atomic counterpart
+  stat->current += amount;
+  if (stat->current > stat->peak) stat->peak = stat->current;
+  if (amount > 0) {
+    stat->allocated += amount;
   }
   else {
-    // add thread local
-    stat->current += amount;
-    if (stat->current > stat->peak) stat->peak = stat->current;
-    if (amount > 0) {
-      stat->allocated += amount;
-    }
-    else {
-      stat->freed += -amount;
-    }
+    stat->freed += -amount;
   }
 }
 
+
 // Adjust stats to compensate; for example before committing a range,
 // first adjust downwards with parts that were already committed so 
 // we avoid double counting.
+static void mi_stat_adjust_mt(mi_stat_count_t* stat, int64_t amount, bool on_alloc) {
+  if (amount == 0) return;
+  // adjust atomically; the peak is left untouched
+  mi_atomic_addi64_relaxed(&stat->current, amount);
+  mi_atomic_addi64_relaxed((on_alloc ? &stat->allocated : &stat->freed), amount);  // `on_alloc` selects which total takes the correction
+}
+
 static void mi_stat_adjust(mi_stat_count_t* stat, int64_t amount, bool on_alloc) {
   if (amount == 0) return;
-  if mi_unlikely(mi_is_in_main(stat))
-  {
-    // adjust atomically 
-    mi_atomic_addi64_relaxed(&stat->current, amount);
-    mi_atomic_addi64_relaxed((on_alloc ? &stat->allocated : &stat->freed), amount);
+  stat->current += amount;  // don't affect the peak
+  if (on_alloc) {
+    stat->allocated += amount;
   }
   else {
-    // don't affect the peak
-    stat->current += amount;    
-    if (on_alloc) {
-      stat->allocated += amount;
-    }
-    else {
-      stat->freed += amount;
-    }
+    stat->freed += amount;
   }
 }
 
-void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) {
-  if (mi_is_in_main(stat)) {
-    mi_atomic_addi64_relaxed( &stat->count, 1 );
-    mi_atomic_addi64_relaxed( &stat->total, (int64_t)amount );
-  }
-  else {
-    stat->count++;
-    stat->total += amount;
-  }
+void __mi_stat_counter_increase_mt(mi_stat_counter_t* stat, size_t amount) {
+  mi_atomic_addi64_relaxed(&stat->count, 1);                // one more event, added atomically
+  mi_atomic_addi64_relaxed(&stat->total, (int64_t)amount);  // accumulate the amount, added atomically
 }
 
-void _mi_stat_increase(mi_stat_count_t* stat, size_t amount) {
+void __mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) {
+  stat->count++;          // plain (non-atomic) counterpart of `__mi_stat_counter_increase_mt`
+  stat->total += amount;  // accumulate the amount
+}
+
+void __mi_stat_increase_mt(mi_stat_count_t* stat, size_t amount) {   // increase via the atomic `mi_stat_update_mt`
+  mi_stat_update_mt(stat, (int64_t)amount);
+}
+void __mi_stat_increase(mi_stat_count_t* stat, size_t amount) {      // increase via the plain `mi_stat_update`
   mi_stat_update(stat, (int64_t)amount);
 }
 
-void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount) {
+void __mi_stat_decrease_mt(mi_stat_count_t* stat, size_t amount) {   // decrease via the atomic `mi_stat_update_mt`
+  mi_stat_update_mt(stat, -((int64_t)amount));
+}
+void __mi_stat_decrease(mi_stat_count_t* stat, size_t amount) {      // decrease via the plain `mi_stat_update`
   mi_stat_update(stat, -((int64_t)amount));
 }
 
-void _mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount, bool on_alloc) {
+void __mi_stat_adjust_increase_mt(mi_stat_count_t* stat, size_t amount, bool on_alloc) {   // upward correction via the atomic `mi_stat_adjust_mt`
+  mi_stat_adjust_mt(stat, (int64_t)amount, on_alloc);
+}
+void __mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount, bool on_alloc) {      // upward correction via the plain `mi_stat_adjust`
   mi_stat_adjust(stat, (int64_t)amount, on_alloc);
 }
 
-void _mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount, bool on_alloc) {
+void __mi_stat_adjust_decrease_mt(mi_stat_count_t* stat, size_t amount, bool on_alloc) {   // downward correction via the atomic `mi_stat_adjust_mt`
+  mi_stat_adjust_mt(stat, -((int64_t)amount), on_alloc);
+}
+void __mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount, bool on_alloc) {      // downward correction via the plain `mi_stat_adjust`
   mi_stat_adjust(stat, -((int64_t)amount), on_alloc);
 }
 
+
 // must be thread safe as it is called from stats_merge
 static void mi_stat_add(mi_stat_count_t* stat, const mi_stat_count_t* src, int64_t unit) {
   if (stat==src) return;
@@ -401,27 +406,29 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0)
 
 static mi_msecs_t mi_process_start; // = 0
 
-static mi_stats_t* mi_stats_get_default(void) {
-  mi_heap_t* heap = mi_heap_get_default();
-  return &heap->tld->stats;
+// Return the statistics stored in the calling thread's tld (obtained through `_mi_tld()`).
+static mi_stats_t* mi_get_tld_stats(void) {
+  return &_mi_tld()->stats;
 }
 
 static void mi_stats_merge_from(mi_stats_t* stats) {
-  if (stats != &_mi_stats_main) {
-    mi_stats_add(&_mi_stats_main, stats);
-    memset(stats, 0, sizeof(mi_stats_t));
+  mi_subproc_t* subproc = _mi_subproc();   // destination: the stats of the current subprocess
+  if (stats != &subproc->stats) {          // never merge the subprocess stats into themselves
+    mi_stats_add(&subproc->stats, stats);
+    _mi_memzero(stats, sizeof(mi_stats_t));  // clear the source so a later merge does not count it again
   }
 }
 
 void mi_stats_reset(void) mi_attr_noexcept {
-  mi_stats_t* stats = mi_stats_get_default();
-  if (stats != &_mi_stats_main) { memset(stats, 0, sizeof(mi_stats_t)); }
-  memset(&_mi_stats_main, 0, sizeof(mi_stats_t));
+  mi_stats_t* stats = mi_get_tld_stats();   // stats of the calling thread
+  mi_subproc_t* subproc = _mi_subproc();    // stats of the current subprocess
+  if (stats != &subproc->stats) { _mi_memzero(stats, sizeof(mi_stats_t)); }  // skip if both are the same object (it is cleared on the next line)
+  _mi_memzero(&subproc->stats, sizeof(mi_stats_t));
   if (mi_process_start == 0) { mi_process_start = _mi_clock_start(); };
 }
 
 void mi_stats_merge(void) mi_attr_noexcept {
-  mi_stats_merge_from( mi_stats_get_default() );
+  mi_stats_merge_from( mi_get_tld_stats() );  // fold the calling thread's stats into the subprocess stats
 }
 
 void _mi_stats_done(mi_stats_t* stats) {  // called from `mi_thread_done`
@@ -429,8 +436,8 @@ void _mi_stats_done(mi_stats_t* stats) {  // called from `mi_thread_done`
 }
 
 void mi_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept {
-  mi_stats_merge_from(mi_stats_get_default());
-  _mi_stats_print(&_mi_stats_main, out, arg);
+  mi_stats_merge_from(mi_get_tld_stats());           // first merge the calling thread's stats ...
+  _mi_stats_print(&_mi_subproc()->stats, out, arg);  // ... then print the stats of the current subprocess
 }
 
 void mi_stats_print(void* out) mi_attr_noexcept {
@@ -439,7 +446,7 @@ void mi_stats_print(void* out) mi_attr_noexcept {
 }
 
 void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept {
-  _mi_stats_print(mi_stats_get_default(), out, arg);
+  _mi_stats_print(mi_get_tld_stats(), out, arg);  // prints only the calling thread's stats (no merge)
 }
 
 
@@ -473,11 +480,12 @@ mi_msecs_t _mi_clock_end(mi_msecs_t start) {
 
 mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) mi_attr_noexcept
 {
+  mi_subproc_t* subproc = _mi_subproc();
   mi_process_info_t pinfo;
   _mi_memzero_var(pinfo);
   pinfo.elapsed        = _mi_clock_end(mi_process_start);
-  pinfo.current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.current));
-  pinfo.peak_commit    = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.peak));
+  pinfo.current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)(&subproc->stats.committed.current)));
+  pinfo.peak_commit    = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)(&subproc->stats.committed.peak)));
   pinfo.current_rss    = pinfo.current_commit;
   pinfo.peak_rss       = pinfo.peak_commit;
   pinfo.utime          = 0;
diff --git a/test/test-stress.c b/test/test-stress.c
index b35743df..0920a02e 100644
--- a/test/test-stress.c
+++ b/test/test-stress.c
@@ -48,10 +48,10 @@ static int ITER    = 20;
 static int THREADS = 32;
 static int SCALE   = 50;
 static int ITER    = 50;
-#elif 0
-static int THREADS = 64;
-static int SCALE = 400;
-static int ITER = 10;
+#elif 1
+static int THREADS = 32;
+static int SCALE   = 25;
+static int ITER    = 50;
 #define ALLOW_LARGE true
 #else
 static int THREADS = 32;      // more repeatable if THREADS <= #processors