diff --git a/src/arena.c b/src/arena.c index bc885ef8..19815616 100644 --- a/src/arena.c +++ b/src/arena.c @@ -199,7 +199,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( void* p = mi_arena_slice_start(arena, slice_index); *memid = mi_memid_create_arena(arena->id, arena->exclusive, slice_index, slice_count); memid->is_pinned = arena->memid.is_pinned; - + // set the dirty bits if (arena->memid.initially_zero) { // size_t dirty_count = 0; @@ -239,7 +239,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( memid->initially_zero = false; } } - #endif + #endif size_t already_committed_count = 0; mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, &already_committed_count); if (already_committed_count < slice_count) { @@ -247,7 +247,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( mi_stat_decrease(_mi_stats_main.committed, mi_size_of_slices(already_committed_count)); } } - } + } } else { // no need to commit, but check if already fully committed @@ -282,8 +282,8 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_SLICE_SIZE); if (arena_count >= 1 && arena_count <= 128) { - // scale up the arena sizes exponentially every 8 entries - const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16); + // scale up the arena sizes exponentially every 8 entries + const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16); size_t reserve = 0; if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) { arena_reserve = reserve; @@ -399,7 +399,7 @@ again: if (mi_lock_try_acquire(&mi_arena_reserve_lock)) { mi_arena_id_t arena_id = 0; bool ok = mi_arena_reserve(mi_size_of_slices(slice_count), allow_large, req_arena_id, &arena_id); - mi_lock_release(&mi_arena_reserve_lock); + mi_lock_release(&mi_arena_reserve_lock); if (ok) { // and try allocate in there mi_assert_internal(req_arena_id == _mi_arena_id_none()); @@ -476,6 +476,19 @@ void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t Arena page allocation ----------------------------------------------------------- */ +static bool mi_arena_claim_abandoned(size_t slice_index, void* arg1, void* arg2) { + mi_arena_t* arena = (mi_arena_t*)arg1; + mi_subproc_t* subproc = (mi_subproc_t*)arg2; + + // found an abandoned page of the right size + // it is set busy for now so we can read safely even with concurrent mi_free reclaiming + // try to claim ownership atomically + mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); + if (subproc != page->subproc) return false; + if (!mi_page_try_claim_ownership(page)) return false; + return true; +} + static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t block_size, mi_arena_id_t req_arena_id, mi_tld_t* tld) { MI_UNUSED(slice_count); @@ -493,38 +506,29 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl { size_t slice_index; mi_pairmap_t* const pairmap = &arena->pages_abandoned[bin]; - while (mi_pairmap_try_find_and_set_busy(pairmap, tseq, &slice_index)) { // todo: don't restart from scratch if we fail for some entry? - // found an abandoned page of the right size - // it is set busy for now so we can read safely even with concurrent mi_free reclaiming - // try to claim ownership atomically - mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); - if (!mi_page_try_claim_ownership(page)) { - // a concurrent free already grabbed the page. 
- // Restore the abandoned_map to make it available again (unblocking busy waiters) - mi_pairmap_set(pairmap, slice_index); - } - else { - // we got ownership, clear the abandoned entry (unblocking busy waiters) - mi_pairmap_clear(pairmap, slice_index); - mi_atomic_decrement_relaxed(&subproc->abandoned_count[bin]); - _mi_stat_decrease(&_mi_stats_main.pages_abandoned, 1); - _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_alloc, 1); - _mi_page_free_collect(page, false); // update `used` count - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); - mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); - mi_assert_internal(_mi_ptr_page(page)==page); - mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); - mi_assert_internal(mi_page_block_size(page) == block_size); - mi_assert_internal(mi_page_is_abandoned(page)); - mi_assert_internal(mi_page_is_owned(page)); - mi_assert_internal(!mi_page_is_full(page)); - return page; - } - } + if (mi_pairmap_try_find_and_set_busy(pairmap, tseq, &slice_index, &mi_arena_claim_abandoned, arena, subproc)) { + // found an abandoned page of the right size + // and claimed ownership. + mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); + mi_assert_internal(mi_page_is_owned(page)); + mi_assert_internal(mi_page_is_abandoned(page)); + mi_atomic_decrement_relaxed(&subproc->abandoned_count[bin]); + _mi_stat_decrease(&_mi_stats_main.pages_abandoned, 1); + _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_alloc, 1); + + _mi_page_free_collect(page, false); // update `used` count + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); + mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); + mi_assert_internal(_mi_ptr_page(page)==page); + mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); + mi_assert_internal(mi_page_block_size(page) == block_size); + mi_assert_internal(!mi_page_is_full(page)); + return page; + } } mi_forall_arenas_end(); return NULL; @@ -565,8 +569,8 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz mi_assert_internal(!os_align || _mi_is_aligned((uint8_t*)page + page_alignment, block_alignment)); // claimed free slices: initialize the page partly - if (!memid.initially_zero) { - _mi_memzero_aligned(page, sizeof(*page)); + if (!memid.initially_zero) { + _mi_memzero_aligned(page, sizeof(*page)); } #if MI_DEBUG > 1 else { @@ -779,7 +783,7 @@ void _mi_arena_page_unabandon(mi_page_t* page) { mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); // this busy waits until a concurrent reader (from alloc_abandoned) is done - mi_pairmap_clear_while_not_busy(&arena->pages_abandoned[bin], slice_index); + mi_pairmap_clear_once_not_busy(&arena->pages_abandoned[bin], slice_index); mi_page_clear_abandoned_mapped(page); mi_atomic_decrement_relaxed(&page->subproc->abandoned_count[bin]); } @@ -999,7 +1003,7 @@ static bool mi_arena_add(mi_arena_t* 
arena, mi_arena_id_t* arena_id, mi_stats_t* mi_atomic_decrement_acq_rel(&mi_arena_count); return false; } - + _mi_stat_counter_increase(&stats->arena_count,1); arena->id = mi_arena_id_create(i); mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena); @@ -1049,7 +1053,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int // todo: allow larger areas (either by splitting it up in arena's or having larger arena's) _mi_warning_message("cannot use OS memory since it is too large (size %zu MiB, maximum is %zu MiB)", size/MI_MiB, mi_size_of_slices(MI_BITMAP_MAX_BIT_COUNT)/MI_MiB); return false; - } + } size_t bitmap_base; const size_t info_slices = mi_arena_info_slices_needed(slice_count, &bitmap_base); if (slice_count < info_slices+1) { diff --git a/src/bitmap.c b/src/bitmap.c index 2dbba52d..1aa0a822 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -995,13 +995,13 @@ mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, s // Set/clear a sequence of 2 bits that were on an even `idx` in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). // `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! -static bool mi_bitmap_xset_pair(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { +static bool mi_bitmap_xset_pair(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal((idx%2)==0); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; mi_assert_internal(cidx + 2 <= MI_BITMAP_CHUNK_BITS); mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - + if (set) { // first set the chunkmap since it is a conservative approximation (increases epoch) mi_bitmap_chunkmap_set(bitmap, chunk_idx); @@ -1066,7 +1066,7 @@ static inline bool mi_bitmap_is_xset2(mi_xset_t set, mi_bitmap_t* bitmap, size_t mi_assert_internal(idx + 2 <= mi_bitmap_max_bits(bitmap)); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - mi_assert_internal(cidx + 2 <= MI_BITMAP_CHUNK_BITS); + mi_assert_internal(cidx + 2 <= MI_BITMAP_CHUNK_BITS); mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); return mi_bitmap_chunk_is_xset2(set, &bitmap->chunks[chunk_idx], cidx); } @@ -1091,13 +1091,13 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n /* -------------------------------------------------------------------------------- bitmap try_find_and_clear -------------------------------------------------------------------------------- */ - +/* typedef bool (mi_bitmap_find_fun_t)(mi_bitmap_t* bitmap, size_t n, size_t chunk_idx, mi_epoch_t epoch, size_t* pidx); static inline bool mi_bitmap_try_find(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx, mi_bitmap_find_fun_t* find_fun) { if (n == 0 || n > MI_BITMAP_CHUNK_BITS) return false; - + // start chunk index -- todo: can depend on the tseq to decrease contention between threads MI_UNUSED(tseq); const size_t chunk_start = 0; @@ -1105,7 +1105,7 @@ static inline bool mi_bitmap_try_find(mi_bitmap_t* bitmap, size_t n, size_t tseq const size_t chunk_map_start_idx = chunk_start % MI_CHUNKMAP_BITS; // for each chunkmap entry `i` - for( size_t _i = 0; _i < bitmap->chunk_map_count; _i++) + for( size_t _i = 0; _i < bitmap->chunk_map_count; _i++) { size_t i = (_i + chunk_map_start); if (i > bitmap->chunk_map_count) i -= bitmap->chunk_map_count; // adjust for the start position @@ -1122,50 +1122,106 @@ static inline bool 
mi_bitmap_try_find(mi_bitmap_t* bitmap, size_t n, size_t tseq if (_i == 0) { cmap_idx = (cmap_idx + chunk_map_start_idx) % MI_CHUNKMAP_BITS; } // set the chunk idx const size_t chunk_idx = chunk_idx0 + cmap_idx + cmap_idx_shift; - + // try to find and clear N bits in that chunk if (chunk_idx < mi_bitmap_chunk_count(bitmap)) { // we can have less chunks than in the chunkmap.. if ((*find_fun)(bitmap, n, chunk_idx, epoch, pidx)) { return true; } } - + // skip to the next bit cmap_idx_shift += cmap_idx+1; cmap >>= cmap_idx; // skip scanned bits (and avoid UB for `cmap_idx+1`) cmap >>= 1; } } - + return false; } +*/ -static bool mi_bitmap_try_find_and_clearN_at(mi_bitmap_t* bitmap, size_t n, size_t chunk_idx, mi_epoch_t epoch, size_t* pidx) { - size_t cidx; - if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) { - *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; - mi_assert_internal(*pidx <= mi_bitmap_max_bits(bitmap) - n); - return true; - } - else { - // we may find that all are cleared only on a second iteration but that is ok as - // the chunkmap is a conservative approximation. - if (epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); - } - return false; - } -} +#define mi_bitmap_forall_chunks(bitmap, tseq, name_epoch, name_chunk_idx) \ + { \ + /* start chunk index -- todo: can depend on the tseq to decrease contention between threads */ \ + MI_UNUSED(tseq); \ + const size_t chunk_start = 0; \ + const size_t chunk_map_start = chunk_start / MI_CHUNKMAP_BITS; \ + const size_t chunk_map_start_idx = chunk_start % MI_CHUNKMAP_BITS; \ + /* for each chunkmap entry `i` */ \ + for (size_t _i = 0; _i < bitmap->chunk_map_count; _i++) { \ + size_t i = (_i + chunk_map_start); \ + if (i > bitmap->chunk_map_count) i -= bitmap->chunk_map_count; /* adjust for the start position */ \ + \ + const size_t chunk_idx0 = i*MI_CHUNKMAP_BITS; \ + mi_epoch_t name_epoch; \ + mi_cmap_t cmap = mi_bitmap_chunkmap(bitmap, chunk_idx0, &name_epoch); \ + if (_i == 0) { cmap = mi_rotr32(cmap, chunk_map_start_idx); } /* rotate right for the start position (on the first iteration) */ \ + \ + uint32_t cmap_idx; /* one bit set of each chunk that may have bits set */ \ + size_t cmap_idx_shift = 0; /* shift through the cmap */ \ + while (mi_bsf32(cmap, &cmap_idx)) { /* find least bit that is set */ \ + /* adjust for the start position again */ \ + if (_i == 0) { cmap_idx = (cmap_idx + chunk_map_start_idx) % MI_CHUNKMAP_BITS; } \ + /* set the chunk idx */ \ + const size_t name_chunk_idx = chunk_idx0 + cmap_idx + cmap_idx_shift; \ + /* try to find and clear N bits in that chunk */ \ + if (name_chunk_idx < mi_bitmap_chunk_count(bitmap)) { /* we can have less chunks than in the chunkmap.. 
*/ + +#define mi_bitmap_forall_chunks_end() \ + } \ + /* skip to the next bit */ \ + cmap_idx_shift += cmap_idx+1; \ + cmap >>= cmap_idx; /* skip scanned bits (and avoid UB for `cmap_idx+1`) */ \ + cmap >>= 1; \ + } \ + }} + +//static bool mi_bitmap_try_find_and_clearN_at(mi_bitmap_t* bitmap, size_t n, size_t chunk_idx, mi_epoch_t epoch, size_t* pidx) { +// size_t cidx; +// if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) { +// *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; +// mi_assert_internal(*pidx <= mi_bitmap_max_bits(bitmap) - n); +// return true; +// } +// else { +// // we may find that all are cleared only on a second iteration but that is ok as +// // the chunkmap is a conservative approximation. +// if (epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { +// mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); +// } +// return false; +// } +//} // Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) { - return mi_bitmap_try_find(bitmap, n, tseq, pidx, &mi_bitmap_try_find_and_clearN_at); + // return mi_bitmap_try_find(bitmap, n, tseq, pidx, &mi_bitmap_try_find_and_clearN_at); + mi_bitmap_forall_chunks(bitmap, tseq, epoch, chunk_idx) + { + size_t cidx; + if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) { + *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; + mi_assert_internal(*pidx <= mi_bitmap_max_bits(bitmap) - n); + return true; + } + else { + // we may find that all are cleared only on a second iteration but that is ok as + // the chunkmap is a conservative approximation. + if (epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); + } + // continue + } + } + mi_bitmap_forall_chunks_end(); + return false; } /* -------------------------------------------------------------------------------- - pairmap + pairmap -------------------------------------------------------------------------------- */ void mi_pairmap_init(mi_pairmap_t* pairmap, mi_bitmap_t* bm1, mi_bitmap_t* bm2) { @@ -1215,10 +1271,10 @@ bool mi_pairmap_is_clear(mi_pairmap_t* pairmap, size_t pair_idx) { pairmap clear while not busy -------------------------------------------------------------------------------- */ -static inline bool mi_bfield_atomic_clear2_while_not_busy(_Atomic(mi_bfield_t)*b, size_t idx) { - mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 01 (busy), and 11 (set). +static inline bool mi_bfield_atomic_clear2_once_not_busy(_Atomic(mi_bfield_t)*b, size_t idx) { + mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 10 (busy), and 11 (set). 
mi_assert_internal(idx < MI_BFIELD_BITS-1); - const mi_bfield_t mask = ((mi_bfield_t)0x03 << idx); + const mi_bfield_t mask = ((mi_bfield_t)MI_PAIR_SET << idx); const mi_bfield_t mask_busy = ((mi_bfield_t)MI_PAIR_BUSY << idx); mi_bfield_t bnew; mi_bfield_t old = mi_atomic_load_relaxed(b); @@ -1238,32 +1294,32 @@ static inline bool mi_bfield_atomic_clear2_while_not_busy(_Atomic(mi_bfield_t)*b return ((old&mask) == mask); } -static inline bool mi_bitmap_chunk_clear2_while_not_busy(mi_bitmap_chunk_t* chunk, size_t cidx) { +static inline bool mi_bitmap_chunk_clear2_once_not_busy(mi_bitmap_chunk_t* chunk, size_t cidx) { mi_assert_internal(cidx < MI_BITMAP_CHUNK_BITS); const size_t i = cidx / MI_BFIELD_BITS; const size_t idx = cidx % MI_BFIELD_BITS; - return mi_bfield_atomic_clear2_while_not_busy(&chunk->bfields[i], idx); + return mi_bfield_atomic_clear2_once_not_busy(&chunk->bfields[i], idx); } -static bool mi_bitmap_clear2_while_not_busy(mi_bitmap_t* bitmap, size_t idx) { +static bool mi_bitmap_clear2_once_not_busy(mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal((idx%2)==0); mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); const mi_epoch_t epoch = mi_bitmap_chunkmap_epoch(bitmap, chunk_idx); - bool cleared = mi_bitmap_chunk_clear2_while_not_busy(&bitmap->chunks[chunk_idx], cidx); + bool cleared = mi_bitmap_chunk_clear2_once_not_busy(&bitmap->chunks[chunk_idx], cidx); if (cleared && epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); - } + } return cleared; } -void mi_pairmap_clear_while_not_busy(mi_pairmap_t* pairmap, size_t pair_idx) { +void mi_pairmap_clear_once_not_busy(mi_pairmap_t* pairmap, size_t pair_idx) { mi_bitmap_t* bitmap; size_t idx; mi_pairmap_from_pair_idx(pairmap, pair_idx, &bitmap, &idx); - mi_bitmap_clear2_while_not_busy(bitmap, idx); + mi_bitmap_clear2_once_not_busy(bitmap, idx); } @@ -1274,9 +1330,9 @@ void mi_pairmap_clear_while_not_busy(mi_pairmap_t* pairmap, size_t pair_idx) { // Atomically go from set to busy, or return false otherwise and leave the bit field as-is. static inline bool mi_bfield_atomic_try_set_busy(_Atomic(mi_bfield_t)*b, size_t idx) { - mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 01 (busy), and 11 (set). + mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 10 (busy), and 11 (set). mi_assert_internal(idx < MI_BFIELD_BITS-1); - const mi_bfield_t mask = ((mi_bfield_t)0x03 << idx); + const mi_bfield_t mask = ((mi_bfield_t)MI_PAIR_SET << idx); const mi_bfield_t mask_busy = ((mi_bfield_t)MI_PAIR_BUSY << idx); mi_bfield_t old; mi_bfield_t bnew; @@ -1290,49 +1346,57 @@ static inline bool mi_bfield_atomic_try_set_busy(_Atomic(mi_bfield_t)*b, size_t static inline bool mi_bitmap_chunk_try_find_and_set_busy(mi_bitmap_chunk_t* chunk, size_t* pidx) { for (int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { - size_t idx; - if mi_unlikely(mi_bfield_find_least_bit(chunk->bfields[i], &idx)) { // find least 1-bit, it may be set or busy - mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 01 (busy), and 11 (set). 
- if mi_likely(mi_bfield_atomic_try_set_busy(&chunk->bfields[i], idx)) { - *pidx = (i*MI_BFIELD_BITS) + idx; - mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS-1); - return true; + while (true) { + const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]) & MI_BFIELD_LO_BIT2; // only keep MI_PAIR_SET bits + size_t idx; + if (!mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit + break; // not found: continue with the next field + } + else { + mi_assert_internal((idx%2)==0); + if mi_likely(mi_bfield_atomic_try_set_busy(&chunk->bfields[i], idx)) { + *pidx = (i*MI_BFIELD_BITS) + idx; + mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS-1); + return true; + } + // else: try this word once again } } } return false; } -static bool mi_bitmap_try_find_and_set_busy_at(mi_bitmap_t* bitmap, size_t n, size_t chunk_idx, mi_epoch_t epoch, size_t* pidx) { - MI_UNUSED(epoch); MI_UNUSED(n); - mi_assert_internal(n==2); - size_t cidx; - if mi_likely(mi_bitmap_chunk_try_find_and_set_busy(&bitmap->chunks[chunk_idx], &cidx)) { - *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; - mi_assert_internal(*pidx <= mi_bitmap_max_bits(bitmap) - n); - return true; - } - else { - return false; - } -} -static bool mi_bitmap_try_find_and_set_busy(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) { - return mi_bitmap_try_find(bitmap, n, tseq, pidx, &mi_bitmap_try_find_and_set_busy_at); +static bool mi_bitmap_try_find_and_set_busy(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t idx_offset, size_t* ppair_idx, + mi_bitmap_claim_while_busy_fun_t* claim, void* arg1, void* arg2) +{ + mi_bitmap_forall_chunks(bitmap, tseq, epoch, chunk_idx) + { + MI_UNUSED(epoch); MI_UNUSED(n); + mi_assert_internal(n==2); + size_t cidx; + if mi_likely(mi_bitmap_chunk_try_find_and_set_busy(&bitmap->chunks[chunk_idx], &cidx)) { + const size_t idx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; + mi_assert_internal((idx%2)==0); + const size_t pair_idx = (idx + idx_offset)/2; + if (claim(pair_idx, arg1, arg2)) { // while busy, the claim function can read from the page + mi_bitmap_xset_pair(MI_BIT_CLEAR, bitmap, idx); // claimed, clear the entry + *ppair_idx = pair_idx; + return true; + } + else { + mi_bitmap_xset_pair(MI_BIT_SET, bitmap, idx); // not claimed, reset the entry + // and continue + } + } + } + mi_bitmap_forall_chunks_end(); + return false; } // Used to find an abandoned page, and transition from set to busy. 
-mi_decl_nodiscard bool mi_pairmap_try_find_and_set_busy(mi_pairmap_t* pairmap, size_t tseq, size_t* pidx) { - size_t idx = 0; - if (!mi_bitmap_try_find_and_set_busy(pairmap->bitmap1, 2, tseq, &idx)) { - if (!mi_bitmap_try_find_and_set_busy(pairmap->bitmap2, 2, tseq, &idx)) { - return false; - } - else { - idx += mi_bitmap_max_bits(pairmap->bitmap1); - } - } - mi_assert_internal((idx%2)==0); - *pidx = idx/2; - return true; +mi_decl_nodiscard bool mi_pairmap_try_find_and_set_busy(mi_pairmap_t* pairmap, size_t tseq, size_t* pair_idx, + mi_bitmap_claim_while_busy_fun_t* claim, void* arg1, void* arg2 ) { + if (mi_bitmap_try_find_and_set_busy(pairmap->bitmap1, 2, tseq, 0, pair_idx, claim, arg1, arg2)) return true; + return mi_bitmap_try_find_and_set_busy(pairmap->bitmap2, 2, tseq, mi_bitmap_max_bits(pairmap->bitmap1), pair_idx, claim, arg1, arg2); } diff --git a/src/bitmap.h b/src/bitmap.h index d73ee98a..ca62735b 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -13,9 +13,47 @@ Concurrent bitmap that can set/reset sequences of bits atomically #define MI_BITMAP_H /* -------------------------------------------------------------------------------- - Definitions --------------------------------------------------------------------------------- */ + Atomic bitmaps: + `mi_bfield_t`: a single machine word that can efficiently be bit counted (usually `size_t`); + each bit usually represents a single slice of MI_ARENA_SLICE_SIZE in an arena (64 KiB). + We need 16K bits to represent a 1GiB arena. + + `mi_bitmap_chunk_t`: a chunk of bfields with a total of MI_BITMAP_CHUNK_BITS (= 512) bits; + allocations never span across chunks -- so MI_ARENA_MAX_OBJ_SIZE is the number + of bits in a chunk times the MI_ARENA_SLICE_SIZE (512 * 64KiB = 32 MiB). + These chunks are cache-aligned and we can use AVX2/AVX512/SVE/SVE2/etc. instructions + to scan for bits (perhaps) more efficiently. + + `mi_chunkmap_t`: for each chunk we track if it has (potentially) any bit set. + The chunkmap has 1 bit per chunk that is set if the chunk potentially has a bit set. + This is used to avoid scanning every chunk (and is thus strictly an optimization). + It is conservative: it is fine to set a bit in the chunk map even if the chunk turns out + to have no bits set. + + When we (potentially) set a bit in a chunk, we first update the chunkmap. + However, when we clear a bit in a chunk, and the chunk is indeed all clear, we + cannot safely clear the bit corresponding to the chunk in the chunkmap since it + may race with another thread setting a bit in the same chunk (and we may clear the + bit even though a bit is set in the chunk, which is not allowed). + + To fix this, the chunkmap contains 32 bits for chunks, and a 32-bit "epoch" + counter that is increased every time a bit is set. We only clear a bit if the epoch + stayed the same over our clear operation (so we know no other thread in the mean + time set a bit in any of the chunks corresponding to the chunkmap). + Since increasing the epoch and setting a bit must be atomic, we use only half-word + bits (32) (we could use 128-bit atomics if needed since modern hardware supports this) + + `mi_bitmap_t`: a bitmap with N chunks. A bitmap always has MI_BITMAP_MAX_CHUNKMAPS (=16) + and can support arenas from a few chunks up to 16 chunkmaps = 16 * 32 chunks = 16 GiB. + The `chunk_count` can be anything from 1 to the max supported by the chunkmaps, but + each chunk is always complete (512 bits, so 512 * 64KiB = 32 MiB memory areas).
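+
+    As a worked example of the sizes above (illustrative): with 64 KiB slices, one chunk covers
+    512 * 64 KiB = 32 MiB, one chunkmap covers 32 * 32 MiB = 1 GiB, and the maximum of 16
+    chunkmaps covers 16 * 1 GiB = 16 GiB (MI_BITMAP_MAX_BIT_COUNT).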
+ + For now, the implementation assumes MI_HAS_FAST_BITSCAN and uses trailing-zero-count + and pop-count (but we think it can be adapted work reasonably well on older hardware too) +--------------------------------------------------------------------------------------------- */ + +// A word-size bit field. typedef size_t mi_bfield_t; #define MI_BFIELD_BITS_SHIFT (MI_SIZE_SHIFT+3) @@ -29,16 +67,18 @@ typedef size_t mi_bfield_t; #define MI_BITMAP_CHUNK_FIELDS (MI_BITMAP_CHUNK_BITS / MI_BFIELD_BITS) #define MI_BITMAP_CHUNK_BITS_MOD_MASK (MI_BITMAP_CHUNK_BITS - 1) -// 512 bits on 64_bit +// A bitmap chunk contains 512 bits of bfields on 64_bit (256 on 32-bit) typedef mi_decl_align(MI_BITMAP_CHUNK_SIZE) struct mi_bitmap_chunk_s { _Atomic(mi_bfield_t) bfields[MI_BITMAP_CHUNK_FIELDS]; } mi_bitmap_chunk_t; + // for now 32-bit epoch + 32-bit bit-set (note: with ABA instructions we can double this) typedef uint64_t mi_chunkmap_t; typedef uint32_t mi_epoch_t; typedef uint32_t mi_cmap_t; + #define MI_CHUNKMAP_BITS (32) // 1 chunkmap tracks 32 chunks #define MI_BITMAP_MAX_CHUNKMAPS (16) @@ -48,15 +88,18 @@ typedef uint32_t mi_cmap_t; #define MI_BITMAP_MAX_BIT_COUNT (MI_BITMAP_MAX_CHUNK_COUNT * MI_BITMAP_CHUNK_BITS) // 16 GiB arena #define MI_BITMAP_MIN_BIT_COUNT (MI_BITMAP_MIN_CHUNK_COUNT * MI_BITMAP_CHUNK_BITS) // 1 GiB arena + +// An atomic bitmap typedef mi_decl_align(MI_BITMAP_CHUNK_SIZE) struct mi_bitmap_s { - _Atomic(size_t) chunk_map_count; - _Atomic(size_t) chunk_count; + _Atomic(size_t) chunk_map_count; // valid chunk_map's + _Atomic(size_t) chunk_count; // total count of chunks size_t padding[MI_BITMAP_CHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc _Atomic(mi_chunkmap_t) chunk_maps[MI_BITMAP_MAX_CHUNKMAPS]; - + mi_bitmap_chunk_t chunks[MI_BITMAP_MIN_BIT_COUNT]; // or more, up to MI_BITMAP_MAX_CHUNK_COUNT } mi_bitmap_t; + static inline size_t mi_bitmap_chunk_map_count(const mi_bitmap_t* bitmap) { return mi_atomic_load_relaxed(&bitmap->chunk_map_count); } @@ -72,17 +115,19 @@ static inline size_t mi_bitmap_max_bits(const mi_bitmap_t* bitmap) { /* -------------------------------------------------------------------------------- - Atomic bitmap + Atomic bitmap operations -------------------------------------------------------------------------------- */ +// Many operations are generic over setting or clearing the bit sequence: we use `mi_xset_t` for this (true if setting, false if clearing) typedef bool mi_xset_t; #define MI_BIT_SET (true) #define MI_BIT_CLEAR (false) +// Required size of a bitmap to represent `bit_count` bits. size_t mi_bitmap_size(size_t bit_count, size_t* chunk_count); -// initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true +// Initialize a bitmap to all clear; avoid a mem_zero if `already_zero` is true // returns the size of the bitmap. size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero); @@ -134,56 +179,46 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t - -// Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) -// and false otherwise leaving the bitmask as is. 
-//mi_decl_nodiscard bool mi_bitmap_try_xset(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx); -// -//static inline bool mi_bitmap_try_set(mi_bitmap_t* bitmap, size_t idx) { -// return mi_bitmap_try_xset(MI_BIT_SET, bitmap, idx); -//} -// -//static inline bool mi_bitmap_try_clear(mi_bitmap_t* bitmap, size_t idx) { -// return mi_bitmap_try_xset(MI_BIT_CLEAR, bitmap, idx); -//} - - -// Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) -// and false otherwise leaving the bitmask as is. -//mi_decl_nodiscard bool mi_bitmap_try_xset8(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx); -// -//static inline bool mi_bitmap_try_set8(mi_bitmap_t* bitmap, size_t idx) { -// return mi_bitmap_try_xset8(MI_BIT_SET, bitmap, idx); -//} -// -//static inline bool mi_bitmap_try_clear8(mi_bitmap_t* bitmap, size_t idx) { -// return mi_bitmap_try_xset8(MI_BIT_CLEAR, bitmap, idx); -//} - - /* -------------------------------------------------------------------------------- - Atomic bitmap for a pair of bits + Atomic bitmap for a pair of bits. + + The valid pairs are CLEAR (0), SET (3), or BUSY (2). + + These bit pairs are used in the abandoned pages maps: when set, the entry has + an available page. When we scan for an available abandoned page and find an entry SET, + we first set it to BUSY, and try to claim the page atomically (since it can race + with a concurrent `mi_free` which also tries to claim the page). However, unlike `mi_free`, + we cannot be sure that a concurrent `mi_free` also didn't free (and decommit) the page + just when we got the entry. Therefore, a page can only be freed after `mi_arena_unabandon` + which (busy) waits until the BUSY flag is cleared to ensure all readers are done. + (and pair-bit operations must therefore be release_acquire). 
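+
+    As an illustrative sketch (not part of the original comment), an entry typically moves through:
+      CLEAR -> SET    when a page is abandoned into the map (`mi_pairmap_set`),
+      SET   -> BUSY   when an allocating thread finds it (`mi_pairmap_try_find_and_set_busy`),
+      BUSY  -> CLEAR  if the claim callback succeeds (the entry is taken), or
+      BUSY  -> SET    if the claim fails (the entry is restored),
+      SET   -> CLEAR  on un-abandon via `mi_pairmap_clear_once_not_busy` (which first waits for
+                      any concurrent BUSY reader to finish).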
-------------------------------------------------------------------------------- */ #define MI_PAIR_CLEAR (0) -#define MI_PAIR_BUSY (1) -#define MI_PAIR_UNUSED (2) // should never occur +#define MI_PAIR_UNUSED (1) // should never occur +#define MI_PAIR_BUSY (2) #define MI_PAIR_SET (3) +// 0b....0101010101010101 +#define MI_BFIELD_LO_BIT2 ((MI_BFIELD_LO_BIT8 << 6)|(MI_BFIELD_LO_BIT8 << 4)|(MI_BFIELD_LO_BIT8 << 2)|MI_BFIELD_LO_BIT8) + +// A pairmap manipulates pairs of bits (and consists of 2 bitmaps) typedef struct mi_pairmap_s { mi_bitmap_t* bitmap1; - mi_bitmap_t* bitmap2; + mi_bitmap_t* bitmap2; } mi_pairmap_t; - - -// initialize a pairmap to all unset; avoid a mem_zero if `already_zero` is true +// initialize a pairmap to all clear; avoid a mem_zero if `already_zero` is true void mi_pairmap_init(mi_pairmap_t* pairmap, mi_bitmap_t* bm1, mi_bitmap_t* bm2); bool mi_pairmap_set(mi_pairmap_t* pairmap, size_t pair_idx); bool mi_pairmap_clear(mi_pairmap_t* pairmap, size_t pair_idx); bool mi_pairmap_is_clear(mi_pairmap_t* pairmap, size_t pair_idx); -void mi_pairmap_clear_while_not_busy(mi_pairmap_t* pairmap, size_t pair_idx); -mi_decl_nodiscard bool mi_pairmap_try_find_and_set_busy(mi_pairmap_t* pairmap, size_t tseq, size_t* pidx); +void mi_pairmap_clear_once_not_busy(mi_pairmap_t* pairmap, size_t pair_idx); + +typedef bool (mi_bitmap_claim_while_busy_fun_t)(size_t pair_index, void* arg1, void* arg2); +mi_decl_nodiscard bool mi_pairmap_try_find_and_set_busy(mi_pairmap_t* pairmap, size_t tseq, size_t* pidx, + mi_bitmap_claim_while_busy_fun_t* claim, void* arg1 ,void* arg2 + ); -#endif // MI_XBITMAP_H +#endif // MI_BITMAP_H diff --git a/src/free.c b/src/free.c index 70ef5d8a..1e07dbd2 100644 --- a/src/free.c +++ b/src/free.c @@ -148,15 +148,44 @@ void mi_free(void* p) mi_attr_noexcept } - // ------------------------------------------------------ // Multi-threaded Free (`_mt`) // ------------------------------------------------------ +static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page); + +// Push a block that is owned by another thread (or abandoned) on its page-local thread free list. +static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block) +{ + // adjust stats (after padding check and potentially recursive `mi_free` above) + mi_stat_free(page, block); // stat_free may access the padding + mi_track_free_size(block, mi_page_usable_size_of(page, block)); + + // _mi_padding_shrink(page, block, sizeof(mi_block_t)); + #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading + size_t dbgsize = mi_usable_size(block); + if (dbgsize > MI_MiB) { dbgsize = MI_MiB; } + _mi_memset_aligned(block, MI_DEBUG_FREED, dbgsize); + #endif + + // push atomically on the page thread free list + mi_thread_free_t tf_new; + mi_thread_free_t tf_old = mi_atomic_load_relaxed(&page->xthread_free); + do { + mi_block_set_next(page, block, mi_tf_block(tf_old)); + tf_new = mi_tf_create(block, true /* always owned: try to claim it if abandoned */); + } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_old, tf_new)); + + // and atomically reclaim the page if it was abandoned + bool reclaimed = !mi_tf_is_owned(tf_old); + if (reclaimed) { + mi_free_try_reclaim_mt(page); + } +} + static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page) { mi_assert_internal(mi_page_is_owned(page)); mi_assert_internal(mi_page_is_abandoned(page)); -#if 1 // we own the page now.. 
// safe to collect the thread atomic free list _mi_page_free_collect(page, false); // update `used` count @@ -209,237 +238,8 @@ static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page) { // not reclaimed or free'd, unown again _mi_page_unown(page); - -#else - if (!mi_page_is_abandoned_mapped(page)) { - // singleton or OS allocated - if (mi_page_is_singleton(page)) { - // free singleton pages - #if MI_DEBUG>1 - _mi_page_free_collect(page, false); // update `used` count - mi_assert_internal(mi_page_all_free(page)); - #endif - // we can free the page directly - _mi_arena_page_free(page); - return; - } - else { - const bool was_full = mi_page_is_full(page); - _mi_page_free_collect(page,false); // update used - if (mi_page_all_free(page)) { - // no need to unabandon as it is unmapped - _mi_arena_page_free(page); - return; - } - else if (was_full && _mi_arena_page_reabandon_full(page)) { - return; - } - else if (!mi_page_is_mostly_used(page) && _mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0) { - // the page has still some blocks in use (but not too many) - // reclaim in our heap if compatible, or otherwise abandon again - // todo: optimize this check further? - // note: don't use `mi_heap_get_default()` as we may just have terminated this thread and we should - // not reinitialize the heap for this thread. (can happen due to thread-local destructors for example -- issue #944) - mi_heap_t* const heap = mi_prim_get_default_heap(); - if (heap != (mi_heap_t*)&_mi_heap_empty) { // we did not already terminate our thread (can this happen? - mi_heap_t* const tagheap = _mi_heap_by_tag(heap, page->heap_tag); - if ((tagheap != NULL) && // don't reclaim across heap object types - (page->subproc == tagheap->tld->subproc) && // don't reclaim across sub-processes; todo: make this check faster (integrate with _mi_heap_by_tag ? ) - (_mi_arena_memid_is_suitable(page->memid, tagheap->arena_id)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) - ) - { - _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_free, 1); - // make it part of our heap (no need to unabandon as is unmapped) - _mi_heap_page_reclaim(tagheap, page); - return; - } - } - } - } - } - else { - // don't reclaim pages that can be found for fresh page allocations - } - - // not reclaimed or free'd, unown again - _mi_page_unown(page); -#endif } -/* -// we own the page now.. -// safe to collect the thread atomic free list -_mi_page_free_collect(page, false); // update `used` count -if (mi_page_is_singleton(page)) { mi_assert_internal(mi_page_all_free(page)); } - -if (mi_page_all_free(page)) { - // first remove it from the abandoned pages in the arena -- this waits for any readers to finish - _mi_arena_page_unabandon(page); // this must be before free'ing - // we can free the page directly - _mi_arena_page_free(page); - return; -} -else if (!mi_page_is_mostly_used(page)) { - // the page has still some blocks in use (but not too many) - // reclaim in our heap if compatible, or otherwise abandon again - // todo: optimize this check further? - // note: don't use `mi_heap_get_default()` as we may just have terminated this thread and we should - // not reinitialize the heap for this thread. 
(can happen due to thread-local destructors for example -- issue #944) - mi_heap_t* const heap = mi_prim_get_default_heap(); - - if ((_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0) && // only if reclaim on free is allowed - (heap != (mi_heap_t*)&_mi_heap_empty)) // we did not already terminate our thread (can this happen? - { - mi_heap_t* const tagheap = _mi_heap_by_tag(heap, page->heap_tag); - if ((tagheap != NULL) && // don't reclaim across heap object types - (page->subproc == tagheap->tld->subproc) && // don't reclaim across sub-processes; todo: make this check faster (integrate with _mi_heap_by_tag ? ) - (_mi_arena_memid_is_suitable(page->memid, tagheap->arena_id)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) - ) - { - // first remove it from the abandoned pages in the arena -- this waits for any readers to finish - _mi_arena_page_unabandon(page); - _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_free, 1); - // make it part of our heap - _mi_heap_page_reclaim(tagheap, page); - return; - } - } -} - -// we cannot reclaim this page.. leave it abandoned -// todo: should re-abandon or otherwise a partly used page could never be re-used if the -// objects in it are not freed explicitly. -_mi_page_unown(page); -*/ - - -// Push a block that is owned by another thread (or abandoned) on its page-local thread free list. -static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block) -{ - // adjust stats (after padding check and potentially recursive `mi_free` above) - mi_stat_free(page, block); // stat_free may access the padding - mi_track_free_size(block, mi_page_usable_size_of(page, block)); - - // _mi_padding_shrink(page, block, sizeof(mi_block_t)); - #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading - size_t dbgsize = mi_usable_size(block); - if (dbgsize > MI_MiB) { dbgsize = MI_MiB; } - _mi_memset_aligned(block, MI_DEBUG_FREED, dbgsize); - #endif - - // push atomically on the page thread free list - mi_thread_free_t tf_new; - mi_thread_free_t tf_old = mi_atomic_load_relaxed(&page->xthread_free); - do { - mi_block_set_next(page, block, mi_tf_block(tf_old)); - tf_new = mi_tf_create(block, true /* always owned: try to claim it if abandoned */); - } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_old, tf_new)); - - // and atomically reclaim the page if it was abandoned - bool reclaimed = !mi_tf_is_owned(tf_old); - if (reclaimed) { - mi_free_try_reclaim_mt(page); - } -} - - /* - // Try to put the block on either the page-local thread free list, - // or the heap delayed free list (if this is the first non-local free in that page) - mi_thread_free_t tfreex; - bool use_delayed; - mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free); - do { - use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE); - if mi_unlikely(use_delayed) { - // unlikely: this only happens on the first concurrent free in a page that is in the full list - tfreex = mi_tf_set_delayed(tfree,MI_DELAYED_FREEING); - } - else { - // usual: directly add to page thread_free list - mi_block_set_next(page, block, mi_tf_block(tfree)); - tfreex = mi_tf_set_block(tfree,block); - } - } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); - - // If this was the first non-local free, we need to push it on the heap delayed free list instead - if mi_unlikely(use_delayed) { - // racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see 
`mi_heap_delete` and `mi_heap_collect_abandon`) - mi_heap_t* const heap = (mi_heap_t*)(mi_atomic_load_acquire(&page->xheap)); //mi_page_heap(page); - mi_assert_internal(heap != NULL); - if (heap != NULL) { - // add to the delayed free list of this heap. (do this atomically as the lock only protects heap memory validity) - mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); - do { - mi_block_set_nextx(heap,block,dfree, heap->keys); - } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block)); - } - - // and reset the MI_DELAYED_FREEING flag - tfree = mi_atomic_load_relaxed(&page->xthread_free); - do { - tfreex = tfree; - mi_assert_internal(mi_tf_delayed(tfree) == MI_DELAYED_FREEING); - tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE); - } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); - } -} - -// Multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) -static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block) -{ - // first see if the page was abandoned and if we can reclaim it into our thread - if (mi_page_is_abandoned(page)) { - if (_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0 || - mi_page_is_singleton(page)) { // only one block, and we are free-ing it - if (mi_prim_get_default_heap() != (mi_heap_t*)&_mi_heap_empty) // and we did not already exit this thread (without this check, a fresh heap will be initalized (issue #944)) - { - // the page is abandoned, try to reclaim it into our heap - if (_mi_arena_try_reclaim(mi_heap_get_default(), page)) { // TODO: avoid putting it in the full free queue - mi_assert_internal(_mi_thread_id() == mi_page_thread_id(page)); - // mi_assert_internal(mi_heap_get_default()->tld->subproc == page->subproc); - mi_free(block); // recursively free as now it will be a local free in our heap - return; - } - else { - if (mi_page_is_abandoned(page)) { - // mi_assert(false); - } - // mi_assert_internal(!mi_page_is_singleton(page)); // we should have succeeded on singleton pages - } - } - } - } - - - // The padding check may access the non-thread-owned page for the key values. - // that is safe as these are constant and the page won't be freed (as the block is not freed yet). - mi_check_padding(page, block); - - // adjust stats (after padding check and potentially recursive `mi_free` above) - mi_stat_free(page, block); // stat_free may access the padding - mi_track_free_size(block, mi_page_usable_size_of(page,block)); - - // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection - _mi_padding_shrink(page, block, sizeof(mi_block_t)); - - if (mi_page_is_huge(page)) { - mi_assert_internal(mi_page_is_singleton(page)); - // huge pages are special as they occupy the entire segment - // as these are large we reset the memory occupied by the page so it is available to other threads - // (as the owning thread needs to actually free the memory later). 
- _mi_os_reset(mi_page_start(page), mi_page_block_size(page), NULL); // resets conservatively - } - else { - #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading - memset(block, MI_DEBUG_FREED, mi_usable_size(block)); - #endif - } - - // and finally free the actual block by pushing it on the owning heap - // thread_delayed free list (or heap delayed free list) - mi_free_block_delayed_mt(page,block); -} -*/ // ------------------------------------------------------ // Usable size diff --git a/src/page.c b/src/page.c index e5e3f972..faef2f48 100644 --- a/src/page.c +++ b/src/page.c @@ -44,7 +44,7 @@ static size_t mi_page_list_count(mi_page_t* page, mi_block_t* head) { mi_assert_internal(_mi_ptr_page(page) == page); size_t count = 0; while (head != NULL) { - mi_assert_internal((uint8_t*)head - (uint8_t*)page > MI_LARGE_PAGE_SIZE || page == _mi_ptr_page(head)); + mi_assert_internal((uint8_t*)head - (uint8_t*)page > (ptrdiff_t)MI_LARGE_PAGE_SIZE || page == _mi_ptr_page(head)); count++; head = mi_block_next(page, head); }
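Reviewer note (not part of the patch): as an aid for reviewing the new pair-bit protocol, below is a minimal stand-alone sketch of the SET -> BUSY -> CLEAR transitions that `mi_bfield_atomic_try_set_busy` and `mi_bfield_atomic_clear2_once_not_busy` implement above, written with plain C11 atomics. The `pair_*` names, the `uint64_t` bit field, and the exact memory orders are assumptions made for illustration only.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

// Pair values, matching the MI_PAIR_* encoding in bitmap.h: 00 clear, 10 busy, 11 set.
#define PAIR_CLEAR 0u
#define PAIR_BUSY  2u
#define PAIR_SET   3u

// Atomically transition the pair at even bit-index `idx` from SET to BUSY.
// Returns false (leaving the field unchanged) if the pair is not SET.
static bool pair_try_set_busy(_Atomic(uint64_t)* field, size_t idx) {
  const uint64_t mask      = (uint64_t)PAIR_SET  << idx;
  const uint64_t mask_busy = (uint64_t)PAIR_BUSY << idx;
  uint64_t old = atomic_load_explicit(field, memory_order_relaxed);
  do {
    if ((old & mask) != mask) return false;   // not SET (clear, or already busy)
  } while (!atomic_compare_exchange_weak_explicit(field, &old, (old & ~mask) | mask_busy,
                                                  memory_order_acq_rel, memory_order_acquire));
  return true;   // we "own" the entry while it is BUSY; a claim callback can safely read the page
}

// Wait until the pair is no longer BUSY, then clear it (the un-abandon path).
static void pair_clear_once_not_busy(_Atomic(uint64_t)* field, size_t idx) {
  const uint64_t mask      = (uint64_t)PAIR_SET  << idx;
  const uint64_t mask_busy = (uint64_t)PAIR_BUSY << idx;
  uint64_t old = atomic_load_explicit(field, memory_order_relaxed);
  for (;;) {
    if ((old & mask) == mask_busy) {          // a concurrent reader holds the entry: spin
      old = atomic_load_explicit(field, memory_order_acquire);
      continue;
    }
    if (atomic_compare_exchange_weak_explicit(field, &old, old & ~mask,
                                              memory_order_acq_rel, memory_order_acquire)) {
      return;                                 // pair is now CLEAR
    }
  }
}

A successful `pair_try_set_busy` corresponds to the scanner in `mi_bitmap_try_find_and_set_busy`: after the claim callback runs, the scanner either clears the pair (claim succeeded) or restores it to SET (claim failed).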