diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h
index 041e7653..e98a37f5 100644
--- a/include/mimalloc/internal.h
+++ b/include/mimalloc/internal.h
@@ -101,7 +101,6 @@ size_t _mi_current_thread_count(void);
 bool _mi_preloading(void);           // true while the C runtime is not initialized yet
 void _mi_thread_done(mi_heap_t* heap);
-mi_tld_t* _mi_tld(void);             // current tld: `_mi_tld() == _mi_heap_get_default()->tld`
 mi_subproc_t* _mi_subproc(void);
 mi_subproc_t* _mi_subproc_main(void);
 mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id);
@@ -148,8 +147,8 @@ void* _mi_arenas_alloc(mi_subproc_t* subproc, size_t size, bool commit,
 void* _mi_arenas_alloc_aligned(mi_subproc_t* subproc, size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid);
 void _mi_arenas_free(void* p, size_t size, mi_memid_t memid);
 bool _mi_arenas_contain(const void* p);
-void _mi_arenas_collect(bool force_purge);
-void _mi_arenas_unsafe_destroy_all(void);
+void _mi_arenas_collect(bool force_purge, mi_tld_t* tld);
+void _mi_arenas_unsafe_destroy_all(mi_tld_t* tld);
 mi_page_t* _mi_arenas_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment);
 void _mi_arenas_page_free(mi_page_t* page);
diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h
index 99791585..2d681062 100644
--- a/include/mimalloc/prim.h
+++ b/include/mimalloc/prim.h
@@ -207,6 +207,20 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce
   #endif
 }
 
+#elif 0 && _MSC_VER && _WIN32
+// On Windows, using a fixed TLS slot has better codegen than a thread-local,
+// but it might clash with an application trying to use the same slot (so we disable this by default).
+#include <windows.h>
+
+#define MI_HAS_TLS_SLOT
+#define MI_TLS_SLOT  63   // last available slot
+
+static inline void* mi_prim_tls_slot(size_t slot) mi_attr_noexcept {
+  return NtCurrentTeb()->TlsSlots[slot];
+}
+static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexcept {
+  NtCurrentTeb()->TlsSlots[slot] = value;
+}
 #endif
 
 // Do we have __builtin_thread_pointer? This would be the preferred way to get a unique thread id
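Note on the prim.h hunk above (illustration, not part of the patch): once the disabled `#elif` branch is enabled, the two primitives give direct access to TEB slot 63. A minimal sketch of how a per-thread pointer could be cached through them; `my_cache_heap` and `my_get_cached_heap` are hypothetical names, not mimalloc API:

```c
// Hypothetical helpers built on the fixed-slot primitives defined in the hunk
// above (MI_TLS_SLOT == 63). Reading the slot is a single indexed load from
// the TEB, with no thread-local initialization check.
static inline void my_cache_heap(mi_heap_t* heap) {
  mi_prim_tls_slot_set(MI_TLS_SLOT, heap);
}
static inline mi_heap_t* my_get_cached_heap(void) {
  return (mi_heap_t*)mi_prim_tls_slot(MI_TLS_SLOT);
}
```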
diff --git a/src/arena.c b/src/arena.c
index 0f6388a9..306d9e5a 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -352,6 +352,7 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_t* req_arena
   mi_arena_t* name_arena; \
   if (req_arena != NULL) { \
     name_arena = req_arena; /* if there is a specific req_arena, only search that one */\
+    if (_i > 0) break;      /* only once */ \
   } \
   else { \
     size_t _idx; \
@@ -369,7 +370,6 @@
 
 #define mi_forall_arenas_end() \
   } \
-  if (req_arena != NULL) break; \
   } \
   }
 
@@ -923,7 +923,7 @@ void _mi_arenas_page_unabandon(mi_page_t* page) {
   Arena free
----------------------------------------------------------- */
 
 static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slices);
-static void mi_arenas_try_purge(bool force, bool visit_all);
+static void mi_arenas_try_purge(bool force, bool visit_all, mi_tld_t* tld);
 
 void _mi_arenas_free(void* p, size_t size, mi_memid_t memid) {
   if (p==NULL) return;
@@ -979,12 +979,12 @@ void _mi_arenas_free(void* p, size_t size, mi_memid_t memid) {
   }
 
   // try to purge expired decommits
-  mi_arenas_try_purge(false, false);
+  // mi_arenas_try_purge(false, false, NULL);
 }
 
 // Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired
-void _mi_arenas_collect(bool force_purge) {
-  mi_arenas_try_purge(force_purge, force_purge /* visit all? */);
+void _mi_arenas_collect(bool force_purge, mi_tld_t* tld) {
+  mi_arenas_try_purge(force_purge, force_purge /* visit all? */, tld);
 }
 
@@ -1038,9 +1038,9 @@ static void mi_arenas_unsafe_destroy(mi_subproc_t* subproc) {
 
 // destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit`
 // for dynamic libraries that are unloaded and need to release all their allocated memory.
-void _mi_arenas_unsafe_destroy_all(void) {
+void _mi_arenas_unsafe_destroy_all(mi_tld_t* tld) {
   mi_arenas_unsafe_destroy(_mi_subproc());
-  _mi_arenas_collect(true /* force purge */);  // purge non-owned arenas
+  _mi_arenas_collect(true /* force purge */, tld);  // purge non-owned arenas
 }
 
@@ -1584,13 +1584,12 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force)
 }
 
-static void mi_arenas_try_purge(bool force, bool visit_all)
+static void mi_arenas_try_purge(bool force, bool visit_all, mi_tld_t* tld)
 {
   const long delay = mi_arena_purge_delay();
   if (_mi_preloading() || delay <= 0) return;  // nothing will be scheduled
 
   // check if any arena needs purging?
-  mi_tld_t* tld = _mi_tld();
   mi_subproc_t* subproc = tld->subproc;
   const mi_msecs_t now = _mi_clock_now();
   mi_msecs_t arenas_expire = mi_atomic_load_acquire(&subproc->purge_expire);
@@ -1628,10 +1627,71 @@ static void mi_arenas_try_purge(bool force, bool visit_all)
   }
 }
 
+/* -----------------------------------------------------------
+  Visit abandoned pages
+----------------------------------------------------------- */
+
+typedef struct mi_abandoned_page_visit_info_s {
+  int heap_tag;
+  mi_block_visit_fun* visitor;
+  void* arg;
+  bool visit_blocks;
+} mi_abandoned_page_visit_info_t;
+
+static bool abandoned_page_visit(mi_page_t* page, mi_abandoned_page_visit_info_t* vinfo) {
+  if (page->heap_tag != vinfo->heap_tag) { return true; }  // continue
+  mi_heap_area_t area;
+  _mi_heap_area_init(&area, page);
+  if (!vinfo->visitor(NULL, &area, NULL, area.block_size, vinfo->arg)) {
+    return false;
+  }
+  if (vinfo->visit_blocks) {
+    return _mi_heap_area_visit_blocks(&area, page, vinfo->visitor, vinfo->arg);
+  }
+  else {
+    return true;
+  }
+}
+
+static bool abandoned_page_visit_at(size_t slice_index, size_t slice_count, mi_arena_t* arena, void* arg) {
+  MI_UNUSED(slice_count);
+  mi_abandoned_page_visit_info_t* vinfo = (mi_abandoned_page_visit_info_t*)arg;
+  mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index);
+  mi_assert_internal(mi_page_is_abandoned_mapped(page));
+  return abandoned_page_visit(page, vinfo);
+}
+
+// Visit all abandoned pages in this subproc.
 bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) {
+  mi_abandoned_page_visit_info_t visit_info = { heap_tag, visitor, arg, visit_blocks };
   MI_UNUSED(subproc_id); MI_UNUSED(heap_tag); MI_UNUSED(visit_blocks); MI_UNUSED(visitor); MI_UNUSED(arg);
-  _mi_error_message(EINVAL, "implement mi_abandoned_visit_blocks\n");
-  return false;
+
+  // visit abandoned pages in the arenas
+  // we don't have to claim because we assume we are the only thread running (in this subproc).
+  // (but we could atomically claim as well by first doing abandoned_reclaim and afterwards reabandoning).
+  bool ok = true;
+  mi_subproc_t* subproc = _mi_subproc_from_id(subproc_id);
+  mi_forall_arenas(subproc, NULL, 0, arena) {
+    mi_assert_internal(arena->subproc == subproc);
+    for (size_t bin = 0; ok && bin < MI_BIN_COUNT; bin++) {
+      // todo: if we had a single abandoned page map as well, this could be faster.
+      if (mi_atomic_load_relaxed(&subproc->abandoned_count[bin]) > 0) {
+        ok = _mi_bitmap_forall_set(arena->pages_abandoned[bin], &abandoned_page_visit_at, arena, &visit_info);
+      }
+    }
+  }
+  mi_forall_arenas_end();
+  if (!ok) return false;
+
+  // visit abandoned pages in OS allocated memory
+  // (technically we don't need the lock as we assume we are the only thread running in this subproc)
+  mi_lock(&subproc->os_abandoned_pages_lock) {
+    for (mi_page_t* page = subproc->os_abandoned_pages; ok && page != NULL; page = page->next) {
+      ok = abandoned_page_visit(page, &visit_info);
+    }
+  }
+
+  return ok;
 }
 
@@ -1731,3 +1791,4 @@ mi_decl_export bool mi_arena_reload(void* start, size_t size, mi_arena_id_t* are
   return true;
 }
+
diff --git a/src/heap.c b/src/heap.c
index feb4b2a4..7c475bc5 100644
--- a/src/heap.c
+++ b/src/heap.c
@@ -122,7 +122,7 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect)
   mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL);
 
   // collect arenas (this is program wide so don't force purges on abandonment of threads)
-  _mi_arenas_collect(collect == MI_FORCE /* force purge? */);
+  _mi_arenas_collect(collect == MI_FORCE /* force purge? */, heap->tld);
 }
 
 void _mi_heap_collect_abandon(mi_heap_t* heap) {
@@ -210,7 +210,7 @@ mi_heap_t* _mi_heap_create(int heap_tag, bool allow_destroy, mi_arena_id_t arena
   mi_assert(heap_tag >= 0 && heap_tag < 256);
   // allocate and initialize a heap
   mi_memid_t memid;
-  mi_heap_t* heap; 
+  mi_heap_t* heap;
   if (arena_id == _mi_arena_id_none()) {
     heap = (mi_heap_t*)_mi_meta_zalloc(sizeof(mi_heap_t), &memid);
   }
@@ -450,7 +450,7 @@ void mi_heap_delete(mi_heap_t* heap)
 
   // abandon all pages
   _mi_heap_collect_abandon(heap);
-  
+
   mi_assert_internal(heap->page_count==0);
   mi_heap_free(heap,true);
 }
@@ -477,7 +477,7 @@ void mi_heap_unload(mi_heap_t* heap) {
     _mi_warning_message("cannot unload heaps that are not associated with an exclusive arena\n");
     return;
   }
-  
+
   // abandon all pages so all thread ids in the pages are cleared
   _mi_heap_collect_abandon(heap);
   mi_assert_internal(heap->page_count==0);
@@ -491,7 +491,7 @@ void mi_heap_unload(mi_heap_t* heap) {
 }
 
 bool mi_heap_reload(mi_heap_t* heap, mi_arena_id_t arena_id) {
-  mi_assert(mi_heap_is_initialized(heap)); 
+  mi_assert(mi_heap_is_initialized(heap));
   if (heap==NULL || !mi_heap_is_initialized(heap)) return false;
   if (heap->exclusive_arena == NULL) {
     _mi_warning_message("cannot reload heaps that were not associated with an exclusive arena\n");
@@ -509,8 +509,8 @@ bool mi_heap_reload(mi_heap_t* heap, mi_arena_id_t arena_id) {
 
   mi_assert_internal(heap->page_count==0);
 
-  // re-associate from the current thread-local and static state
-  heap->tld = _mi_tld();
+  // re-associate with the current thread-local and static state
+  heap->tld = mi_heap_get_default()->tld;
 
   // reinit direct pages (as we may be in a different process)
   mi_assert_internal(heap->page_count == 0);
diff --git a/src/init.c b/src/init.c
index 98c1d7c9..1fc00404 100644
--- a/src/init.c
+++ b/src/init.c
@@ -310,17 +310,21 @@ static mi_tld_t* mi_tld_alloc(void) {
 
 #define MI_TLD_INVALID ((mi_tld_t*)1)
 
-mi_decl_noinline static void mi_tld_free(void) {
-  mi_tld_t* tld = _mi_tld();
+mi_decl_noinline static void mi_tld_free(mi_tld_t* tld) {
   if (tld != NULL && tld != MI_TLD_INVALID) {
     _mi_stats_done(&tld->stats);
     _mi_meta_free(tld, sizeof(mi_tld_t), tld->memid);
   }
-  tld = MI_TLD_INVALID;
+  #if 0
+  // do not read/write `thread_tld` on older macOS (<= 14) as that will re-initialize the thread local storage
+  // (since we are calling this during pthread shutdown)
+  // (and this could happen on other systems as well, so let's never do it)
+  thread_tld = MI_TLD_INVALID;
+  #endif
   mi_atomic_decrement_relaxed(&thread_count);
 }
 
-mi_decl_noinline mi_tld_t* _mi_tld(void) {
+static mi_tld_t* mi_tld(void) {
   mi_tld_t* tld = thread_tld;
   if (tld == MI_TLD_INVALID) {
     _mi_error_message(EFAULT, "internal error: tld is accessed after the thread terminated\n");
@@ -338,11 +342,11 @@ mi_subproc_t* _mi_subproc(void) {
   // on such systems we can check for this with the _mi_prim_get_default_heap as those are protected (by being
   // stored in a TLS slot for example)
   mi_heap_t* heap = mi_prim_get_default_heap();
-  if (heap == NULL || heap == &_mi_heap_empty) {
+  if (heap == NULL) {
     return _mi_subproc_main();
   }
   else {
-    return thread_tld->subproc;  // don't call `_mi_tld()`
+    return heap->tld->subproc;   // avoid using thread local storage (`thread_tld`)
   }
 }
 
@@ -396,7 +400,7 @@ void mi_subproc_delete(mi_subproc_id_t subproc_id) {
 }
 
 void mi_subproc_add_current_thread(mi_subproc_id_t subproc_id) {
-  mi_tld_t* tld = _mi_tld();
+  mi_tld_t* tld = mi_tld();
   if (tld == NULL) return;
   mi_assert(tld->subproc == &subproc_main);
   if (tld->subproc != &subproc_main) return;
@@ -554,10 +558,12 @@ void _mi_thread_done(mi_heap_t* heap)
   if (heap->tld->thread_id != _mi_prim_thread_id()) return;
 
   // abandon the thread local heap
+  // note: we store the tld as we should avoid reading `thread_tld` at this point (to avoid re-initializing the thread local storage)
+  mi_tld_t* tld = heap->tld;
   _mi_thread_heap_done(heap);  // returns true if already ran
 
   // free thread local data
-  mi_tld_free();
+  mi_tld_free(tld);
 }
 
 void _mi_heap_set_default_direct(mi_heap_t* heap) {
@@ -714,7 +720,7 @@ void mi_cdecl _mi_process_done(void) {
   if (mi_option_is_enabled(mi_option_destroy_on_exit)) {
     mi_collect(true /* force */);
     _mi_heap_unsafe_destroy_all();  // forcefully release all memory held by all heaps (of this thread only!)
-    _mi_arenas_unsafe_destroy_all();
+    _mi_arenas_unsafe_destroy_all(&tld_main);
   }
 
   if (mi_option_is_enabled(mi_option_show_stats) || mi_option_is_enabled(mi_option_verbose)) {
diff --git a/src/page.c b/src/page.c
index 7c8429a9..239d5d6e 100644
--- a/src/page.c
+++ b/src/page.c
@@ -252,7 +252,7 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) {
   else {
     mi_page_queue_remove(pq, page);
     mi_page_set_heap(page, NULL);
-    _mi_arenas_page_abandon(page); 
+    _mi_arenas_page_abandon(page);
   }
 }
 
@@ -356,8 +356,10 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq) {
   mi_page_queue_remove(pq, page);
 
   // and free it
+  mi_heap_t* heap = page->heap;
   mi_page_set_heap(page,NULL);
   _mi_arenas_page_free(page);
+  _mi_arenas_collect(false, heap->tld);  // allow purging
 }
 
 #define MI_MAX_RETIRE_SIZE  MI_LARGE_OBJ_SIZE_MAX  // should be less than size for MI_BIN_HUGE
diff --git a/src/stats.c b/src/stats.c
index 102373ec..057dc093 100644
--- a/src/stats.c
+++ b/src/stats.c
@@ -47,11 +47,11 @@ static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) {
 
 // Adjust stats to compensate; for example before committing a range,
-// first adjust downwards with parts that were already committed so 
+// first adjust downwards with parts that were already committed so
 // we avoid double counting.
 static void mi_stat_adjust_mt(mi_stat_count_t* stat, int64_t amount, bool on_alloc) {
   if (amount == 0) return;
-  // adjust atomically 
+  // adjust atomically
   mi_atomic_addi64_relaxed(&stat->current, amount);
   mi_atomic_addi64_relaxed((on_alloc ? &stat->allocated : &stat->freed), amount);
 }
 
@@ -74,7 +74,7 @@ void __mi_stat_counter_increase_mt(mi_stat_counter_t* stat, size_t amount) {
 
 void __mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) {
   stat->count++;
-  stat->total += amount; 
+  stat->total += amount;
 }
 
 void __mi_stat_increase_mt(mi_stat_count_t* stat, size_t amount) {
@@ -150,7 +150,7 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) {
   mi_stat_counter_add(&stats->page_no_retire, &src->page_no_retire, 1);
   mi_stat_counter_add(&stats->searches, &src->searches, 1);
   mi_stat_counter_add(&stats->normal_count, &src->normal_count, 1);
-  mi_stat_counter_add(&stats->huge_count, &src->huge_count, 1); 
+  mi_stat_counter_add(&stats->huge_count, &src->huge_count, 1);
   mi_stat_counter_add(&stats->guarded_alloc_count, &src->guarded_alloc_count, 1);
 #if MI_STAT>1
   for (size_t i = 0; i <= MI_BIN_HUGE; i++) {
@@ -347,7 +347,7 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0)
 #endif
 #if MI_STAT
   mi_stat_print(&stats->normal, "normal", (stats->normal_count.count == 0 ? 1 : -(stats->normal.allocated / stats->normal_count.count)), out, arg);
-  mi_stat_print(&stats->huge, "huge", (stats->huge_count.count == 0 ? 1 : -(stats->huge.allocated / stats->huge_count.count)), out, arg); 
+  mi_stat_print(&stats->huge, "huge", (stats->huge_count.count == 0 ? 1 : -(stats->huge.allocated / stats->huge_count.count)), out, arg);
   mi_stat_count_t total = { 0,0,0,0 };
   mi_stat_add(&total, &stats->normal, 1);
   mi_stat_add(&total, &stats->huge, 1);
@@ -408,7 +408,7 @@ static mi_msecs_t mi_process_start; // = 0
 
 // return thread local stats
 static mi_stats_t* mi_get_tld_stats(void) {
-  return &_mi_tld()->stats;
+  return &mi_heap_get_default()->tld->stats;
 }
 
 void mi_stats_reset(void) mi_attr_noexcept {
@@ -492,7 +492,7 @@ mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, s
   pinfo.page_faults = 0;
 
   _mi_prim_process_info(&pinfo);
-  
+
   if (elapsed_msecs!=NULL) *elapsed_msecs = (pinfo.elapsed < 0 ? 0 : (pinfo.elapsed < (mi_msecs_t)PTRDIFF_MAX ? (size_t)pinfo.elapsed : PTRDIFF_MAX));
   if (user_msecs!=NULL)    *user_msecs    = (pinfo.utime < 0 ? 0 : (pinfo.utime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)pinfo.utime : PTRDIFF_MAX));
   if (system_msecs!=NULL)  *system_msecs  = (pinfo.stime < 0 ? 0 : (pinfo.stime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)pinfo.stime : PTRDIFF_MAX));
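Note (not part of the patch): with the arena.c changes above, `mi_abandoned_visit_blocks` is now functional. A minimal usage sketch, assuming the public `mi_block_visit_fun` signature from `mimalloc.h` and heap tag `0` for default heaps; per the implementation, the visitor is first called once per page area with `block == NULL`, then once per block when `visit_blocks` is true:

```c
#include <stdio.h>
#include <mimalloc.h>

// Count blocks in abandoned pages; returning false would stop the visit early.
static bool count_abandoned(const mi_heap_t* heap, const mi_heap_area_t* area,
                            void* block, size_t block_size, void* arg) {
  (void)heap; (void)area; (void)block_size;
  if (block != NULL) { (*(size_t*)arg)++; }  // skip the per-area call (block == NULL)
  return true;
}

int main(void) {
  size_t count = 0;
  // the implementation assumes no other thread is running in this subprocess
  if (mi_abandoned_visit_blocks(mi_subproc_main(), 0 /* heap_tag */,
                                true /* visit_blocks */, &count_abandoned, &count)) {
    printf("abandoned blocks: %zu\n", count);
  }
  return 0;
}
```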