diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5fc1808e..5cb05840 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -55,6 +55,7 @@ set(mi_sources
src/options.c
src/os.c
src/page.c
+ src/page-map.c
src/random.c
src/segment.c
src/segment-map.c
diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj
index 138acf39..3dd7326f 100644
--- a/ide/vs2022/mimalloc.vcxproj
+++ b/ide/vs2022/mimalloc.vcxproj
  [hunks not recoverable: the MSBuild XML elements were stripped during extraction]
diff --git a/ide/vs2022/mimalloc.vcxproj.filters b/ide/vs2022/mimalloc.vcxproj.filters
index 48958be1..2eed7e90 100644
--- a/ide/vs2022/mimalloc.vcxproj.filters
+++ b/ide/vs2022/mimalloc.vcxproj.filters
  [hunks not recoverable: the XML elements were stripped during extraction, leaving only the <Filter>Sources</Filter> values]
diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h
index 642f0f9c..ad7ea3e6 100644
--- a/include/mimalloc/bits.h
+++ b/include/mimalloc/bits.h
@@ -100,6 +100,10 @@ typedef int32_t mi_ssize_t;
#define __BMI1__ 1
#endif
+// Define big endian if needed
+// #define MI_BIG_ENDIAN 1
+
+
/* --------------------------------------------------------------------------------
Builtin's
-------------------------------------------------------------------------------- */
@@ -310,4 +314,6 @@ static inline size_t mi_rotl(size_t x, size_t r) {
#endif
}
+
+
#endif // MI_BITS_H
diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h
index b997099e..2713c0ac 100644
--- a/include/mimalloc/internal.h
+++ b/include/mimalloc/internal.h
@@ -108,6 +108,7 @@ size_t _mi_os_page_size(void);
size_t _mi_os_good_alloc_size(size_t size);
bool _mi_os_has_overcommit(void);
bool _mi_os_has_virtual_reserve(void);
+size_t _mi_os_virtual_address_bits(void);
bool _mi_os_reset(void* addr, size_t size, mi_stats_t* tld_stats);
bool _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
@@ -136,12 +137,11 @@ bool _mi_arena_contains(const void* p);
void _mi_arenas_collect(bool force_purge, mi_stats_t* stats);
void _mi_arena_unsafe_destroy_all(mi_stats_t* stats);
-bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment);
-void _mi_arena_segment_mark_abandoned(mi_segment_t* segment);
void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid);
void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size);
+/*
typedef struct mi_arena_field_cursor_s { // abstract struct
size_t os_list_count; // max entries to visit in the OS abandoned list
size_t start; // start arena idx (may need to be wrapped)
@@ -154,27 +154,12 @@ typedef struct mi_arena_field_cursor_s { // abstract struct
void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_subproc_t* subproc, bool visit_all, mi_arena_field_cursor_t* current);
mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous);
void _mi_arena_field_cursor_done(mi_arena_field_cursor_t* current);
+*/
-// "segment-map.c"
-void _mi_segment_map_allocated_at(const mi_segment_t* segment);
-void _mi_segment_map_freed_at(const mi_segment_t* segment);
+// "page-map.c"
+void _mi_page_map_register(mi_page_t* page);
+void _mi_page_map_unregister(mi_page_t* page);
-// "segment.c"
-mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld, mi_os_tld_t* os_tld);
-void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld);
-void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld);
-uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size);
-
-#if MI_HUGE_PAGE_ABANDON
-void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block);
-#else
-void _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_block_t* block);
-#endif
-
-void _mi_segments_collect(bool force, mi_segments_tld_t* tld);
-void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld);
-bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment);
-bool _mi_segment_visit_blocks(mi_segment_t* segment, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg);
// "page.c"
void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept mi_attr_malloc;
@@ -226,7 +211,7 @@ void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, siz
void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept;
mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p);
bool _mi_free_delayed_block(mi_block_t* block);
-void _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept; // for runtime integration
+// void _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept; // for runtime integration
void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size);
// "libc.c"
@@ -338,8 +323,8 @@ static inline uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) {
// Align a pointer upwards
-static inline void* mi_align_up_ptr(void* p, size_t alignment) {
- return (void*)_mi_align_up((uintptr_t)p, alignment);
+static inline uint8_t* _mi_align_up_ptr(void* p, size_t alignment) {
+ return (uint8_t*)_mi_align_up((uintptr_t)p, alignment);
}
@@ -445,68 +430,44 @@ static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t si
return heap->pages_free_direct[idx];
}
-// Segment that contains the pointer
-// Large aligned blocks may be aligned at N*MI_SEGMENT_SIZE (inside a huge segment > MI_SEGMENT_SIZE),
-// and we need align "down" to the segment info which is `MI_SEGMENT_SIZE` bytes before it;
-// therefore we align one byte before `p`.
-// We check for NULL afterwards on 64-bit systems to improve codegen for `mi_free`.
-static inline mi_segment_t* _mi_ptr_segment(const void* p) {
- mi_segment_t* const segment = (mi_segment_t*)(((uintptr_t)p - 1) & ~MI_SEGMENT_MASK);
- #if MI_INTPTR_SIZE <= 4
- return (p==NULL ? NULL : segment);
- #else
- return ((intptr_t)segment <= 0 ? NULL : segment);
+
+extern signed char* _mi_page_map;
+
+#define MI_PAGE_PTR_INVALID ((mi_page_t*)(1))
+
+static inline mi_page_t* _mi_ptr_page(const void* p) {
+ const uintptr_t up = ((uintptr_t)p) >> MI_ARENA_BLOCK_SHIFT;
+ const ptrdiff_t ofs = _mi_page_map[up];
+ #if MI_DEBUG
+ if mi_unlikely(ofs==0) return MI_PAGE_PTR_INVALID;
#endif
+ return (mi_page_t*)((up + ofs - 1) << MI_ARENA_BLOCK_SHIFT);
}
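
The new `_mi_ptr_page` above replaces the segment lookup: a global byte map, indexed by the address shifted by `MI_ARENA_BLOCK_SHIFT`, stores how far back (in arena blocks) the owning page starts, presumably sized from the new `_mi_os_virtual_address_bits()` so it can cover the whole address space. Since `src/page-map.c` is not shown in this excerpt, the standalone sketch below only illustrates the encoding that `_mi_ptr_page` decodes; the constant and helper names are assumptions, not mimalloc API.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

#define BLOCK_SHIFT 16                    // assume 64 KiB arena blocks (stand-in for MI_ARENA_BLOCK_SHIFT)

static signed char page_map[1 << 12];     // one signed byte per block; covers a small demo address range

// For every block covered by a page, store the distance back to the page start,
// encoded so that `block + entry - 1` recovers the page's first block.
static void demo_register(uintptr_t page_addr, size_t block_count) {
  const uintptr_t start = page_addr >> BLOCK_SHIFT;
  for (size_t i = 0; i < block_count; i++) {
    page_map[start + i] = (signed char)(1 - (ptrdiff_t)i);   // 1, 0, -1, -2, ...
  }
}

// Same arithmetic as `_mi_ptr_page`: map any interior pointer to its page start.
static uintptr_t demo_ptr_page(uintptr_t p) {
  const uintptr_t up  = p >> BLOCK_SHIFT;
  const ptrdiff_t ofs = page_map[up];
  return (up + ofs - 1) << BLOCK_SHIFT;
}

int main(void) {
  demo_register(0x30000, 4);                                   // a page spanning 4 blocks at 0x30000
  printf("0x%lx\n", (unsigned long)demo_ptr_page(0x4abc0));    // prints 0x30000
  return 0;
}
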
-// Segment belonging to a page
-static inline mi_segment_t* _mi_page_segment(const mi_page_t* page) {
- mi_assert_internal(page!=NULL);
- mi_segment_t* segment = _mi_ptr_segment(page);
- mi_assert_internal(segment == NULL || page == &segment->pages[page->segment_idx]);
- return segment;
-}
-// used internally
-static inline size_t _mi_segment_page_idx_of(const mi_segment_t* segment, const void* p) {
- // if (segment->page_size > MI_SEGMENT_SIZE) return &segment->pages[0]; // huge pages
- ptrdiff_t diff = (uint8_t*)p - (uint8_t*)segment;
- mi_assert_internal(diff >= 0 && (size_t)diff <= MI_SEGMENT_SIZE /* for huge alignment it can be equal */);
- size_t idx = (size_t)diff >> segment->page_shift;
- mi_assert_internal(idx < segment->capacity);
- mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM || idx == 0);
- return idx;
-}
-
-// Get the page containing the pointer
-static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const void* p) {
- size_t idx = _mi_segment_page_idx_of(segment, p);
- return &((mi_segment_t*)segment)->pages[idx];
-}
-
-// Quick page start for initialized pages
-static inline uint8_t* mi_page_start(const mi_page_t* page) {
- mi_assert_internal(page->page_start != NULL);
- mi_assert_expensive(_mi_segment_page_start(_mi_page_segment(page),page,NULL) == page->page_start);
- return page->page_start;
-}
-
-// Get the page containing the pointer
-static inline mi_page_t* _mi_ptr_page(void* p) {
- mi_assert_internal(p!=NULL);
- return _mi_segment_page_of(_mi_ptr_segment(p), p);
-}
-
-// Get the block size of a page (special case for huge objects)
+// Get the block size of a page
static inline size_t mi_page_block_size(const mi_page_t* page) {
mi_assert_internal(page->block_size > 0);
return page->block_size;
}
-static inline bool mi_page_is_huge(const mi_page_t* page) {
- mi_assert_internal((page->is_huge && _mi_page_segment(page)->page_kind == MI_PAGE_HUGE) ||
- (!page->is_huge && _mi_page_segment(page)->page_kind != MI_PAGE_HUGE));
- return page->is_huge;
+// Page start
+static inline uint8_t* mi_page_start(const mi_page_t* page) {
+ mi_assert(sizeof(mi_page_t) <= MI_PAGE_INFO_SIZE);
+ return (uint8_t*)page + MI_PAGE_INFO_SIZE;
+}
+
+static inline uint8_t* mi_page_area(const mi_page_t* page, size_t* size) {
+ if (size) { *size = mi_page_block_size(page) * page->reserved; }
+ return mi_page_start(page);
+}
+
+static inline bool mi_page_is_in_arena(const mi_page_t* page) {
+ return (page->memid.memkind == MI_MEM_ARENA);
+}
+
+static inline bool mi_page_is_singleton(const mi_page_t* page) {
+ return (page->reserved == 1);
}
// Get the usable block size of a page without fixed padding.
@@ -515,11 +476,6 @@ static inline size_t mi_page_usable_block_size(const mi_page_t* page) {
return mi_page_block_size(page) - MI_PADDING_SIZE;
}
-// size of a segment
-static inline size_t mi_segment_size(mi_segment_t* segment) {
- return segment->segment_size;
-}
-
// Thread free access
static inline mi_block_t* mi_page_thread_free(const mi_page_t* page) {
return (mi_block_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free) & ~3);
@@ -534,10 +490,20 @@ static inline mi_heap_t* mi_page_heap(const mi_page_t* page) {
return (mi_heap_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xheap));
}
+static inline mi_threadid_t mi_page_thread_id(const mi_page_t* page) {
+ return mi_atomic_load_relaxed(&page->xthread_id);
+}
+
static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) {
mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING);
mi_atomic_store_release(&page->xheap,(uintptr_t)heap);
- if (heap != NULL) { page->heap_tag = heap->tag; }
+ if (heap != NULL) {
+ page->heap_tag = heap->tag;
+ mi_atomic_store_release(&page->xthread_id, heap->thread_id);
+ }
+ else {
+ mi_atomic_store_release(&page->xthread_id,0);
+ }
}
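
With `xthread_id` mirrored onto the page (and reset to 0 on abandonment), ownership tests no longer need to find a segment. A hedged sketch of the kind of check the free path can now make; `my_page_is_local` is made up, while `_mi_thread_id()` is the existing mimalloc primitive:

// hypothetical: does the current thread own this page? (an abandoned page, id 0, never matches)
static inline bool my_page_is_local(const mi_page_t* page) {
  return (mi_page_thread_id(page) == _mi_thread_id());
}
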
// Thread free flag helpers
@@ -576,6 +542,21 @@ static inline bool mi_page_immediate_available(const mi_page_t* page) {
return (page->free != NULL);
}
+
+// is the page not yet used up to its reserved space?
+static inline bool mi_page_is_expandable(const mi_page_t* page) {
+ mi_assert_internal(page != NULL);
+ mi_assert_internal(page->capacity <= page->reserved);
+ return (page->capacity < page->reserved);
+}
+
+
+static inline bool mi_page_is_full(mi_page_t* page) {
+ bool full = (page->reserved == page->used);
+ mi_assert_internal(!full || page->free == NULL);
+ return full;
+}
+
// is more than 7/8th of a page in use?
static inline bool mi_page_mostly_used(const mi_page_t* page) {
if (page==NULL) return true;
@@ -583,6 +564,15 @@ static inline bool mi_page_mostly_used(const mi_page_t* page) {
return (page->reserved - page->used <= frac);
}
+static inline bool mi_page_is_abandoned(mi_page_t* page) {
+ return (mi_page_thread_id(page) == 0);
+}
+
+static inline bool mi_page_is_huge(mi_page_t* page) {
+ return (page->block_size > MI_LARGE_MAX_OBJ_SIZE);
+}
+
+
static inline mi_page_queue_t* mi_page_queue(const mi_heap_t* heap, size_t size) {
return &((mi_heap_t*)heap)->pages[_mi_bin(size)];
}
@@ -667,17 +657,8 @@ We also pass a separate `null` value to be used as `NULL` or otherwise
  [hunk body not recoverable]
diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h
--- a/include/mimalloc/types.h
+++ b/include/mimalloc/types.h
-#if (MI_LARGE_OBJ_WSIZE_MAX >= 655360)
-#error "mimalloc internal: define more bins"
-#endif
-
-// Maximum block size for which blocks are guaranteed to be block size aligned. (see `segment.c:_mi_segment_page_start`)
-#define MI_MAX_ALIGN_GUARANTEE (MI_MEDIUM_OBJ_SIZE_MAX)
-
-// Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in dedicated huge page segments
-#define MI_BLOCK_ALIGNMENT_MAX (MI_SEGMENT_SIZE >> 1)
+// Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in dedicated orphan pages
+#define MI_BLOCK_ALIGNMENT_MAX (MI_ARENA_BLOCK_ALIGN)
 // We never allocate more than PTRDIFF_MAX (see also <https://sourceware.org/ml/libc-announce/2019/msg00001.html>)
-#define MI_MAX_ALLOC_SIZE PTRDIFF_MAX
+#define MI_MAX_ALLOC_SIZE PTRDIFF_MAX
+
+
+// ---------------------------------------------------------------
+// a memory id tracks the provenance of arena/OS allocated memory
+// ---------------------------------------------------------------
+
+// Memory can reside in arenas, be allocated directly from the OS, or be statically allocated. The memid keeps track of this.
+typedef enum mi_memkind_e {
+ MI_MEM_NONE, // not allocated
+ MI_MEM_EXTERNAL, // not owned by mimalloc but provided externally (via `mi_manage_os_memory` for example)
+ MI_MEM_STATIC, // allocated in a static area and should not be freed (for arena meta data for example)
+ MI_MEM_OS, // allocated from the OS
+ MI_MEM_OS_HUGE, // allocated as huge OS pages (usually 1GiB, pinned to physical memory)
+  MI_MEM_OS_REMAP,      // allocated in a remappable area (i.e. using `mremap`)
+ MI_MEM_ARENA // allocated from an arena (the usual case)
+} mi_memkind_t;
+
+static inline bool mi_memkind_is_os(mi_memkind_t memkind) {
+ return (memkind >= MI_MEM_OS && memkind <= MI_MEM_OS_REMAP);
+}
+
+typedef struct mi_memid_os_info {
+ void* base; // actual base address of the block (used for offset aligned allocations)
+ size_t alignment; // alignment at allocation
+} mi_memid_os_info_t;
+
+typedef struct mi_memid_arena_info {
+ size_t block_index; // index in the arena
+ mi_arena_id_t id; // arena id (>= 1)
+ bool is_exclusive; // this arena can only be used for specific arena allocations
+} mi_memid_arena_info_t;
+
+typedef struct mi_memid_s {
+ union {
+ mi_memid_os_info_t os; // only used for MI_MEM_OS
+ mi_memid_arena_info_t arena; // only used for MI_MEM_ARENA
+ } mem;
+  bool          is_pinned;              // `true` if we cannot decommit/reset/protect in this memory (e.g. when allocated using large (2MiB) or huge (1GiB) OS pages)
+ bool initially_committed;// `true` if the memory was originally allocated as committed
+ bool initially_zero; // `true` if the memory was originally zero initialized
+ mi_memkind_t memkind;
+} mi_memid_t;
+
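
The memid records where each block of memory came from, and the free path dispatches on exactly that. A hedged sketch of the dispatch (it mirrors `_mi_arena_free` further down in this diff; the helper name is made up):

// hypothetical dispatch on provenance when releasing memory
static void my_release_by_provenance(void* p, size_t size, mi_memid_t memid, mi_stats_t* stats) {
  if (mi_memkind_is_os(memid.memkind)) {
    _mi_os_free(p, size, memid, stats);     // plain, huge, or remappable OS memory goes back to the OS
  }
  else if (memid.memkind == MI_MEM_ARENA) {
    // return the blocks to the owning arena, located via memid.mem.arena.id / block_index
  }
  else {
    // MI_MEM_NONE, MI_MEM_EXTERNAL, MI_MEM_STATIC: nothing to free
  }
}
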
// ------------------------------------------------------
// Mimalloc pages contain allocated blocks
@@ -223,6 +248,10 @@ typedef union mi_page_flags_s {
// We use the bottom 2 bits of the pointer for mi_delayed_t flags
typedef uintptr_t mi_thread_free_t;
+// Sub processes are used to keep memory separate between them (e.g. multiple interpreters in CPython)
+typedef struct mi_subproc_s mi_subproc_t;
+
+
// A page contains blocks of one specific size (`block_size`).
// Each page has three list of free blocks:
// `free` for blocks that can be allocated,
@@ -242,8 +271,6 @@ typedef uintptr_t mi_thread_free_t;
// Notes:
// - Access is optimized for `free.c:mi_free` and `alloc.c:mi_page_alloc`
// - Using `uint16_t` does not seem to slow things down
-// - The size is 10 words on 64-bit which helps the page index calculations
-// (and 12 words on 32-bit, and encoded free lists add 2 words)
// - `xthread_free` uses the bottom bits as a delayed-free flags to optimize
// concurrent frees where only the first concurrent free adds to the owning
// heap `thread_delayed_free` list (see `free.c:mi_free_block_mt`).
@@ -252,15 +279,8 @@ typedef uintptr_t mi_thread_free_t;
// the owning heap `thread_delayed_free` list. This guarantees that pages
// will be freed correctly even if only other threads free blocks.
typedef struct mi_page_s {
- // "owned" by the segment
- uint8_t segment_idx; // index in the segment `pages` array, `page == &segment->pages[page->segment_idx]`
- uint8_t segment_in_use:1; // `true` if the segment allocated this page
- uint8_t is_committed:1; // `true` if the page virtual memory is committed
- uint8_t is_zero_init:1; // `true` if the page was initially zero initialized
- uint8_t is_huge:1; // `true` if the page is in a huge segment
-
- // layout like this to optimize access in `mi_malloc` and `mi_free`
- uint16_t capacity; // number of blocks committed, must be the first field, see `segment.c:page_clear`
+ mi_memid_t memid; // provenance of the page memory
+ uint16_t capacity; // number of blocks committed (must be the first field for proper zero-initialisation)
uint16_t reserved; // number of blocks reserved in memory
mi_page_flags_t flags; // `in_full` and `has_aligned` flags (8 bits)
uint8_t free_is_zero:1; // `true` if the blocks in the free list are zero initialized
@@ -272,120 +292,54 @@ typedef struct mi_page_s {
uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`)
uint8_t heap_tag; // tag of the owning heap, used to separate heaps by object type
// padding
- size_t block_size; // size available in each block (always `>0`)
- uint8_t* page_start; // start of the page area containing the blocks
+ size_t block_size; // size available in each block (always `>0`)
#if (MI_ENCODE_FREELIST || MI_PADDING)
uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`) or padding canary
#endif
_Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads
- _Atomic(uintptr_t) xheap;
+  _Atomic(uintptr_t)         xheap;               // heap this page belongs to.
+  _Atomic(mi_threadid_t)     xthread_id;          // thread this page belongs to (= xheap->thread_id, or 0 if abandoned)
struct mi_page_s* next; // next page owned by the heap with the same `block_size`
struct mi_page_s* prev; // previous page owned by the heap with the same `block_size`
-
- #if MI_INTPTR_SIZE==4 // pad to 12 words on 32-bit
- void* padding[1];
- #endif
} mi_page_t;
+// ------------------------------------------------------
+// Object sizes
+// ------------------------------------------------------
+
+#define MI_PAGE_ALIGN (64)
+#define MI_PAGE_INFO_SIZE (MI_SIZE_SHIFT*MI_PAGE_ALIGN) // should be > sizeof(mi_page_t)
+
+// The max object sizes are checked so that we do not waste more than 12.5% internally over the page sizes.
+// (Except for large pages since huge objects are allocated in 4MiB chunks)
+#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // ~16KiB
+#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // ~128KiB
+#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/2) // ~2MiB
+#define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE)
+
+
+#if (MI_LARGE_MAX_OBJ_WSIZE >= 655360)
+#error "mimalloc internal: define more bins"
+#endif
+
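
Plugging in the page sizes quoted in the page-kind comments below (64KiB, 512KiB, 4MiB) with `MI_SIZE_SHIFT == 3` on 64-bit reproduces the bounds in the comments above; a quick standalone check (the literal sizes are assumptions of this sketch):

#include <stdio.h>
int main(void) {
  const unsigned long info   = 3*64;                       // MI_PAGE_INFO_SIZE = MI_SIZE_SHIFT*MI_PAGE_ALIGN on 64-bit
  const unsigned long small  = (64UL*1024     - info)/4;   // 16336   (~16 KiB)
  const unsigned long medium = (512UL*1024    - info)/4;   // 131024  (~128 KiB)
  const unsigned long large  = (4UL*1024*1024 - info)/2;   // 2097056 (~2 MiB)
  printf("small %lu, medium %lu, large %lu, large wsize %lu\n", small, medium, large, large/8);
  return 0;  // large/8 = 262132 < 655360, so the #error above does not trigger
}
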
// ------------------------------------------------------
-// Mimalloc segments contain mimalloc pages
+// Page kinds
// ------------------------------------------------------
typedef enum mi_page_kind_e {
- MI_PAGE_SMALL, // small blocks go into 64KiB pages inside a segment
- MI_PAGE_MEDIUM, // medium blocks go into 512KiB pages inside a segment
- MI_PAGE_LARGE, // larger blocks go into a single page spanning a whole segment
- MI_PAGE_HUGE // a huge page is a single page in a segment of variable size (but still 2MiB aligned)
+ MI_PAGE_SMALL, // small blocks go into 64KiB pages
+ MI_PAGE_MEDIUM, // medium blocks go into 512KiB pages
+ MI_PAGE_LARGE, // larger blocks go into 4MiB pages
+ MI_PAGE_SINGLETON // page containing a single block.
// used for blocks `> MI_LARGE_OBJ_SIZE_MAX` or an aligment `> MI_BLOCK_ALIGNMENT_MAX`.
} mi_page_kind_t;
-// ---------------------------------------------------------------
-// a memory id tracks the provenance of arena/OS allocated memory
-// ---------------------------------------------------------------
-
-// Memory can reside in arena's, direct OS allocated, or statically allocated. The memid keeps track of this.
-typedef enum mi_memkind_e {
- MI_MEM_NONE, // not allocated
- MI_MEM_EXTERNAL, // not owned by mimalloc but provided externally (via `mi_manage_os_memory` for example)
- MI_MEM_STATIC, // allocated in a static area and should not be freed (for arena meta data for example)
- MI_MEM_OS, // allocated from the OS
- MI_MEM_OS_HUGE, // allocated as huge OS pages (usually 1GiB, pinned to physical memory)
- MI_MEM_OS_REMAP, // allocated in a remapable area (i.e. using `mremap`)
- MI_MEM_ARENA // allocated from an arena (the usual case)
-} mi_memkind_t;
-
-static inline bool mi_memkind_is_os(mi_memkind_t memkind) {
- return (memkind >= MI_MEM_OS && memkind <= MI_MEM_OS_REMAP);
-}
-
-typedef struct mi_memid_os_info {
- void* base; // actual base address of the block (used for offset aligned allocations)
- size_t alignment; // alignment at allocation
-} mi_memid_os_info_t;
-
-typedef struct mi_memid_arena_info {
- size_t block_index; // index in the arena
- mi_arena_id_t id; // arena id (>= 1)
- bool is_exclusive; // this arena can only be used for specific arena allocations
-} mi_memid_arena_info_t;
-
-typedef struct mi_memid_s {
- union {
- mi_memid_os_info_t os; // only used for MI_MEM_OS
- mi_memid_arena_info_t arena; // only used for MI_MEM_ARENA
- } mem;
- bool is_pinned; // `true` if we cannot decommit/reset/protect in this memory (e.g. when allocated using large (2Mib) or huge (1GiB) OS pages)
- bool initially_committed;// `true` if the memory was originally allocated as committed
- bool initially_zero; // `true` if the memory was originally zero initialized
- mi_memkind_t memkind;
-} mi_memid_t;
-
-
-// ---------------------------------------------------------------
-// Segments contain mimalloc pages
-// ---------------------------------------------------------------
-typedef struct mi_subproc_s mi_subproc_t;
-
-// Segments are large allocated memory blocks (2MiB on 64 bit) from the OS.
-// Inside segments we allocated fixed size _pages_ that contain blocks.
-typedef struct mi_segment_s {
- // constant fields
- mi_memid_t memid; // memory id to track provenance
- bool allow_decommit;
- bool allow_purge;
- size_t segment_size; // for huge pages this may be different from `MI_SEGMENT_SIZE`
- mi_subproc_t* subproc; // segment belongs to sub process
-
- // segment fields
- struct mi_segment_s* next; // must be the first (non-constant) segment field -- see `segment.c:segment_init`
- struct mi_segment_s* prev;
- bool was_reclaimed; // true if it was reclaimed (used to limit reclaim-on-free reclamation)
- bool dont_free; // can be temporarily true to ensure the segment is not freed
-
- size_t abandoned; // abandoned pages (i.e. the original owning thread stopped) (`abandoned <= used`)
- size_t abandoned_visits; // count how often this segment is visited for reclaiming (to force reclaim if it is too long)
-
- size_t used; // count of pages in use (`used <= capacity`)
- size_t capacity; // count of available pages (`#free + used`)
- size_t segment_info_size;// space we are using from the first page for segment meta-data and possible guard pages.
- uintptr_t cookie; // verify addresses in secure mode: `_mi_ptr_cookie(segment) == segment->cookie`
-
- struct mi_segment_s* abandoned_os_next; // only used for abandoned segments outside arena's, and only if `mi_option_visit_abandoned` is enabled
- struct mi_segment_s* abandoned_os_prev;
-
- // layout like this to optimize access in `mi_free`
- _Atomic(mi_threadid_t) thread_id; // unique id of the thread owning this segment
- size_t page_shift; // `1 << page_shift` == the page sizes == `page->block_size * page->reserved` (unless the first page, then `-segment_info_size`).
- mi_page_kind_t page_kind; // kind of pages: small, medium, large, or huge
- mi_page_t pages[1]; // up to `MI_SMALL_PAGES_PER_SEGMENT` pages
-} mi_segment_t;
-
// ------------------------------------------------------
// Heaps
@@ -522,21 +476,18 @@ typedef struct mi_stat_counter_s {
} mi_stat_counter_t;
typedef struct mi_stats_s {
- mi_stat_count_t segments;
mi_stat_count_t pages;
mi_stat_count_t reserved;
mi_stat_count_t committed;
mi_stat_count_t reset;
mi_stat_count_t purged;
mi_stat_count_t page_committed;
- mi_stat_count_t segments_abandoned;
mi_stat_count_t pages_abandoned;
mi_stat_count_t threads;
mi_stat_count_t normal;
mi_stat_count_t huge;
mi_stat_count_t giant;
mi_stat_count_t malloc;
- mi_stat_count_t segments_cache;
mi_stat_counter_t pages_extended;
mi_stat_counter_t mmap_calls;
mi_stat_counter_t commit_calls;
@@ -581,12 +532,12 @@ void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount);
// ------------------------------------------------------
struct mi_subproc_s {
- _Atomic(size_t) abandoned_count; // count of abandoned segments for this sub-process
- _Atomic(size_t) abandoned_os_list_count; // count of abandoned segments in the os-list
- mi_lock_t abandoned_os_lock; // lock for the abandoned os segment list (outside of arena's) (this lock protect list operations)
+ _Atomic(size_t) abandoned_count; // count of abandoned pages for this sub-process
+ _Atomic(size_t) abandoned_os_list_count; // count of abandoned pages in the os-list
+  mi_lock_t         abandoned_os_lock;      // lock for the abandoned OS page list (outside of arenas); this lock protects the list operations
mi_lock_t abandoned_os_visit_lock; // ensure only one thread per subproc visits the abandoned os list
- mi_segment_t* abandoned_os_list; // doubly-linked list of abandoned segments outside of arena's (in OS allocated memory)
- mi_segment_t* abandoned_os_list_tail; // the tail-end of the list
+  mi_page_t*        abandoned_os_list;      // doubly-linked list of abandoned pages outside of arenas (in OS allocated memory)
+ mi_page_t* abandoned_os_list_tail; // the tail-end of the list
mi_memid_t memid; // provenance of this memory block
};
@@ -597,11 +548,6 @@ struct mi_subproc_s {
// Milliseconds as in `int64_t` to avoid overflows
typedef int64_t mi_msecs_t;
-// Queue of segments
-typedef struct mi_segment_queue_s {
- mi_segment_t* first;
- mi_segment_t* last;
-} mi_segment_queue_t;
// OS thread local data
typedef struct mi_os_tld_s {
@@ -609,28 +555,13 @@ typedef struct mi_os_tld_s {
mi_stats_t* stats; // points to tld stats
} mi_os_tld_t;
-// Segments thread local data
-typedef struct mi_segments_tld_s {
- mi_segment_queue_t small_free; // queue of segments with free small pages
- mi_segment_queue_t medium_free; // queue of segments with free medium pages
- mi_page_queue_t pages_purge; // queue of freed pages that are delay purged
- size_t count; // current number of segments;
- size_t peak_count; // peak number of segments
- size_t current_size; // current size of all segments
- size_t peak_size; // peak size of all segments
- size_t reclaim_count;// number of reclaimed (abandoned) segments
- mi_subproc_t* subproc; // sub-process this thread belongs to.
- mi_stats_t* stats; // points to tld stats
- mi_os_tld_t* os; // points to os tld
-} mi_segments_tld_t;
-
// Thread local data
struct mi_tld_s {
unsigned long long heartbeat; // monotonic heartbeat count
bool recurse; // true if deferred was called; used to prevent infinite recursion.
mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted)
mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates)
- mi_segments_tld_t segments; // segment tld
+ mi_subproc_t* subproc; // sub-process this thread belongs to.
mi_os_tld_t os; // os tld
mi_stats_t stats; // statistics
};
diff --git a/src/alloc.c b/src/alloc.c
index a093f108..00f6d1a4 100644
--- a/src/alloc.c
+++ b/src/alloc.c
@@ -82,7 +82,7 @@ extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_
#if (MI_STAT>0)
const size_t bsize = mi_page_usable_block_size(page);
- if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
+ if (bsize <= MI_LARGE_MAX_OBJ_SIZE) {
mi_heap_stat_increase(heap, normal, bsize);
mi_heap_stat_counter_increase(heap, normal_count, 1);
#if (MI_STAT>1)
diff --git a/src/xarena.c b/src/arena-old.c
similarity index 53%
rename from src/xarena.c
rename to src/arena-old.c
index 42943f84..8ca5aaf3 100644
--- a/src/xarena.c
+++ b/src/arena-old.c
@@ -21,834 +21,46 @@ The arena allocation needs to be thread safe and we use an atomic bitmap to allo
#include "mimalloc.h"
#include "mimalloc/internal.h"
-#include "xbitmap.h"
+#include "mimalloc/atomic.h"
+#include "bitmap.h"
/* -----------------------------------------------------------
Arena allocation
----------------------------------------------------------- */
-#define MI_ARENA_BLOCK_SIZE (MI_SMALL_PAGE_SIZE) // 64KiB
-#define MI_ARENA_BLOCK_ALIGN (MI_ARENA_BLOCK_SIZE) // 64KiB
-#define MI_ARENA_BIN_COUNT (MI_BIN_COUNT)
-
-#define MI_ARENA_MIN_OBJ_SIZE MI_ARENA_BLOCK_SIZE
-#define MI_ARENA_MAX_OBJ_SIZE (MI_BITMAP_CHUNK_BITS * MI_ARENA_BLOCK_SIZE) // for now, cannot cross chunk boundaries
-
// A memory arena descriptor
typedef struct mi_arena_s {
mi_arena_id_t id; // arena id; 0 for non-specific
mi_memid_t memid; // memid of the memory area
- // _Atomic(uint8_t*) start; // the start of the memory area
- // size_t meta_size; // size of the arena structure itself (including its bitmaps)
- // mi_memid_t meta_memid; // memid of the arena structure itself (OS or static allocation)
+ _Atomic(uint8_t*)start; // the start of the memory area
size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`)
+ size_t field_count; // number of bitmap fields (where `field_count * MI_BITMAP_FIELD_BITS >= block_count`)
+ size_t meta_size; // size of the arena structure itself (including its bitmaps)
+ mi_memid_t meta_memid; // memid of the arena structure itself (OS or static allocation)
int numa_node; // associated NUMA node
bool exclusive; // only allow allocations if specifically for this arena
bool is_large; // memory area consists of large- or huge OS pages (always committed)
mi_lock_t abandoned_visit_lock; // lock is only used when abandoned segments are being visited
- _Atomic(mi_msecs_t) purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`.
-
- mi_bitmap_t blocks_free; // is the block free?
- mi_bitmap_t blocks_committed; // is the block committed? (i.e. accessible)
- mi_bitmap_t blocks_purge; // can the block be purged? (block in purge => block in free)
- mi_bitmap_t blocks_dirty; // is the block potentially non-zero?
- mi_bitmap_t blocks_abandoned[MI_BIN_COUNT]; // abandoned pages per size bin (a set bit means the start of the page)
- // the full queue contains abandoned full pages
+ _Atomic(size_t)search_idx; // optimization to start the search for free blocks
+ _Atomic(mi_msecs_t)purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`.
+ mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero?
+ mi_bitmap_field_t* blocks_committed; // are the blocks committed? (can be NULL for memory that cannot be decommitted)
+ mi_bitmap_field_t* blocks_purge; // blocks that can be (reset) decommitted. (can be NULL for memory that cannot be (reset) decommitted)
+ mi_bitmap_field_t* blocks_abandoned; // blocks that start with an abandoned segment. (This crosses API's but it is convenient to have here)
+ mi_bitmap_field_t blocks_inuse[1]; // in-place bitmap of in-use blocks (of size `field_count`)
+ // do not add further fields here as the dirty, committed, purged, and abandoned bitmaps follow the inuse bitmap fields.
} mi_arena_t;
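
The struct above ends in a variable-length `blocks_inuse` bitmap, and the trailing comment says the dirty, committed, purge, and abandoned bitmaps follow it in the same allocation. A hedged sketch of how those pointers are presumably wired up at arena creation (illustrative only; the function name is made up):

// hypothetical: point the auxiliary bitmaps into the space that follows blocks_inuse
static void my_arena_wire_bitmaps(mi_arena_t* arena, size_t field_count) {
  mi_bitmap_field_t* base  = &arena->blocks_inuse[0];
  arena->blocks_dirty      = base + 1*field_count;
  arena->blocks_committed  = base + 2*field_count;
  arena->blocks_purge      = base + 3*field_count;
  arena->blocks_abandoned  = base + 4*field_count;
}
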
-#define MI_MAX_ARENAS (1024) // Limited for now (and takes up .bss)
+
+#define MI_ARENA_BLOCK_SIZE (MI_SEGMENT_SIZE) // 64MiB (must be at least MI_SEGMENT_ALIGN)
+#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2) // 32MiB
+#define MI_MAX_ARENAS (132) // Limited as the reservation exponentially increases (and takes up .bss)
// The available arenas
static mi_decl_cache_align _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS];
static mi_decl_cache_align _Atomic(size_t) mi_arena_count; // = 0
-
-/* -----------------------------------------------------------
- Arena id's
- id = arena_index + 1
------------------------------------------------------------ */
-
-size_t mi_arena_id_index(mi_arena_id_t id) {
- return (size_t)(id <= 0 ? MI_MAX_ARENAS : id - 1);
-}
-
-static mi_arena_id_t mi_arena_id_create(size_t arena_index) {
- mi_assert_internal(arena_index < MI_MAX_ARENAS);
- return (int)arena_index + 1;
-}
-
-mi_arena_id_t _mi_arena_id_none(void) {
- return 0;
-}
-
-static bool mi_arena_id_is_suitable(mi_arena_id_t arena_id, bool arena_is_exclusive, mi_arena_id_t req_arena_id) {
- return ((!arena_is_exclusive && req_arena_id == _mi_arena_id_none()) ||
- (arena_id == req_arena_id));
-}
-
-bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id) {
- if (memid.memkind == MI_MEM_ARENA) {
- return mi_arena_id_is_suitable(memid.mem.arena.id, memid.mem.arena.is_exclusive, request_arena_id);
- }
- else {
- return mi_arena_id_is_suitable(_mi_arena_id_none(), false, request_arena_id);
- }
-}
-
-size_t mi_arena_get_count(void) {
- return mi_atomic_load_relaxed(&mi_arena_count);
-}
-
-mi_arena_t* mi_arena_from_index(size_t idx) {
- mi_assert_internal(idx < mi_arena_get_count());
- return mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[idx]);
-}
-
-
-
-/* -----------------------------------------------------------
- Util
------------------------------------------------------------ */
-
-// Blocks needed for a given byte size
-static size_t mi_block_count_of_size(size_t size) {
- return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE);
-}
-
-// Byte size of a number of blocks
-static size_t mi_size_of_blocks(size_t bcount) {
- return (bcount * MI_ARENA_BLOCK_SIZE);
-}
-
-// Size of an arena
-static size_t mi_arena_size(mi_arena_t* arena) {
- return mi_size_of_blocks(arena->block_count);
-}
-
-static size_t mi_arena_info_blocks(void) {
- const size_t os_page_size = _mi_os_page_size();
- const size_t info_size = _mi_align_up(sizeof(mi_arena_t), os_page_size) + os_page_size; // + guard page
- const size_t info_blocks = mi_block_count_of_size(info_size);
- return info_blocks;
-}
-
-
-// Start of the arena memory area
-static uint8_t* mi_arena_start(mi_arena_t* arena) {
- return ((uint8_t*)arena);
-}
-
-// Start of a block
-void* mi_arena_block_start(mi_arena_t* arena, size_t block_index) {
- return (mi_arena_start(arena) + mi_size_of_blocks(block_index));
-}
-
-// Arena area
-void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) {
- if (size != NULL) *size = 0;
- const size_t arena_index = mi_arena_id_index(arena_id);
- if (arena_index >= MI_MAX_ARENAS) return NULL;
- mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]);
- if (arena == NULL) return NULL;
- if (size != NULL) { *size = mi_size_of_blocks(arena->block_count); }
- return mi_arena_start(arena);
-}
-
-
-// Create an arena memid
-static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, size_t block_index) {
- mi_memid_t memid = _mi_memid_create(MI_MEM_ARENA);
- memid.mem.arena.id = id;
- memid.mem.arena.block_index = block_index;
- memid.mem.arena.is_exclusive = is_exclusive;
- return memid;
-}
-
-// returns if the arena is exclusive
-bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, size_t* block_index) {
- mi_assert_internal(memid.memkind == MI_MEM_ARENA);
- *arena_index = mi_arena_id_index(memid.mem.arena.id);
- *block_index = memid.mem.arena.block_index;
- return memid.mem.arena.is_exclusive;
-}
-
-
-
-/* -----------------------------------------------------------
- Arena Allocation
------------------------------------------------------------ */
-
-static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t arena_index, size_t needed_bcount,
- bool commit, size_t tseq, mi_memid_t* memid, mi_os_tld_t* tld)
-{
- MI_UNUSED(arena_index);
- mi_assert_internal(mi_arena_id_index(arena->id) == arena_index);
-
- size_t block_index;
- if (!mi_bitmap_try_find_and_clearN(&arena->blocks_free, tseq, needed_bcount, &block_index)) return NULL;
-
- // claimed it!
- void* p = mi_arena_block_start(arena, block_index);
- *memid = mi_memid_create_arena(arena->id, arena->exclusive, block_index);
- memid->is_pinned = arena->memid.is_pinned;
-
- // set the dirty bits
- if (arena->memid.initially_zero) {
- memid->initially_zero = mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_dirty, block_index, needed_bcount, NULL);
- }
-
- // set commit state
- if (commit) {
- // commit requested, but the range may not be committed as a whole: ensure it is committed now
- memid->initially_committed = true;
-
- bool all_already_committed;
- mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount, &all_already_committed);
- if (!all_already_committed) {
- bool commit_zero = false;
- if (!_mi_os_commit(p, mi_size_of_blocks(needed_bcount), &commit_zero, tld->stats)) {
- memid->initially_committed = false;
- }
- else {
- if (commit_zero) { memid->initially_zero = true; }
- }
- }
- }
- else {
- // no need to commit, but check if already fully committed
- memid->initially_committed = mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount);
- }
-
- return p;
-}
-
-// allocate in a speficic arena
-static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node,
- size_t size, size_t alignment,
- bool commit, bool allow_large, mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid, mi_os_tld_t* tld)
-{
- mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN);
- if (alignment > MI_ARENA_BLOCK_ALIGN) return NULL;
-
- const size_t bcount = mi_block_count_of_size(size);
- const size_t arena_index = mi_arena_id_index(arena_id);
- mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count));
- mi_assert_internal(size <= mi_size_of_blocks(bcount));
-
- // Check arena suitability
- mi_arena_t* arena = mi_arena_from_index(arena_index);
- if (arena == NULL) return NULL;
- if (!allow_large && arena->is_large) return NULL;
- if (!mi_arena_id_is_suitable(arena->id, arena->exclusive, req_arena_id)) return NULL;
- if (req_arena_id == _mi_arena_id_none()) { // in not specific, check numa affinity
- const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node);
- if (match_numa_node) { if (!numa_suitable) return NULL; }
- else { if (numa_suitable) return NULL; }
- }
-
- // try to allocate
- void* p = mi_arena_try_alloc_at(arena, arena_index, bcount, commit, tseq, memid, tld);
- mi_assert_internal(p == NULL || _mi_is_aligned(p, alignment));
- return p;
-}
-
-
-// allocate from an arena with fallback to the OS
-static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, size_t alignment,
- bool commit, bool allow_large,
- mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid, mi_os_tld_t* tld)
-{
- mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN);
- if (alignment > MI_ARENA_BLOCK_ALIGN) return NULL;
-
- const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count);
- if mi_likely(max_arena == 0) return NULL;
-
- if (req_arena_id != _mi_arena_id_none()) {
- // try a specific arena if requested
- if (mi_arena_id_index(req_arena_id) < max_arena) {
- void* p = mi_arena_try_alloc_at_id(req_arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld);
- if (p != NULL) return p;
- }
- }
- else {
- // try numa affine allocation
- for (size_t i = 0; i < max_arena; i++) {
- void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld);
- if (p != NULL) return p;
- }
-
- // try from another numa node instead..
- if (numa_node >= 0) { // if numa_node was < 0 (no specific affinity requested), all arena's have been tried already
- for (size_t i = 0; i < max_arena; i++) {
- void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), false /* only proceed if not numa local */, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld);
- if (p != NULL) return p;
- }
- }
- }
- return NULL;
-}
-
-// try to reserve a fresh arena space
-static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t* arena_id)
-{
- if (_mi_preloading()) return false; // use OS only while pre loading
- if (req_arena_id != _mi_arena_id_none()) return false;
-
- const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count);
- if (arena_count > (MI_MAX_ARENAS - 4)) return false;
-
- // calc reserve
- size_t arena_reserve = mi_option_get_size(mi_option_arena_reserve);
- if (arena_reserve == 0) return false;
-
- if (!_mi_os_has_virtual_reserve()) {
- arena_reserve = arena_reserve/4; // be conservative if virtual reserve is not supported (for WASM for example)
- }
- arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_BLOCK_SIZE);
-
- if (arena_count >= 8 && arena_count <= 128) {
- // scale up the arena sizes exponentially every 8 entries (128 entries get to 589TiB)
- const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16);
- size_t reserve = 0;
- if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) {
- arena_reserve = reserve;
- }
- }
-
- // check arena bounds
- const size_t min_reserve = mi_size_of_blocks(mi_arena_info_blocks() + 1);
- const size_t max_reserve = MI_BITMAP_MAX_BITS * MI_ARENA_BLOCK_SIZE;
- if (arena_reserve < min_reserve) {
- arena_reserve = min_reserve;
- }
- else if (arena_reserve > max_reserve) {
- arena_reserve = max_reserve;
- }
-
- if (arena_reserve < req_size) return false; // should be able to at least handle the current allocation size
-
- // commit eagerly?
- bool arena_commit = false;
- if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = _mi_os_has_overcommit(); }
- else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; }
-
- return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id) == 0);
-}
-
-
-void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large,
- mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld)
-{
- mi_assert_internal(memid != NULL && tld != NULL);
- mi_assert_internal(size > 0);
- size_t tseq = _mi_thread_seq_id();
- *memid = _mi_memid_none();
-
- const int numa_node = _mi_os_numa_node(tld); // current numa node
-
- // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data)
- if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) || req_arena_id != _mi_arena_id_none()) { // is arena allocation allowed?
- if (size >= MI_ARENA_MIN_OBJ_SIZE && size <= MI_ARENA_MAX_OBJ_SIZE && alignment <= MI_ARENA_BLOCK_ALIGN && align_offset == 0) {
- void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld);
- if (p != NULL) return p;
-
- // otherwise, try to first eagerly reserve a new arena
- if (req_arena_id == _mi_arena_id_none()) {
- mi_arena_id_t arena_id = 0;
- if (mi_arena_reserve(size, allow_large, req_arena_id, &arena_id)) {
- // and try allocate in there
- mi_assert_internal(req_arena_id == _mi_arena_id_none());
- p = mi_arena_try_alloc_at_id(arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld);
- if (p != NULL) return p;
- }
- }
- }
- }
-
- // if we cannot use OS allocation, return NULL
- if (mi_option_is_enabled(mi_option_disallow_os_alloc) || req_arena_id != _mi_arena_id_none()) {
- errno = ENOMEM;
- return NULL;
- }
-
- // finally, fall back to the OS
- if (align_offset > 0) {
- return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid, tld->stats);
- }
- else {
- return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, tld->stats);
- }
-}
-
-void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld)
-{
- return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, allow_large, req_arena_id, memid, tld);
-}
-
-
-/* -----------------------------------------------------------
- Arena free
------------------------------------------------------------ */
-static void mi_arena_schedule_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats);
-static void mi_arenas_try_purge(bool force, bool visit_all, mi_stats_t* stats);
-
-void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid, mi_stats_t* stats) {
- mi_assert_internal(size > 0 && stats != NULL);
- mi_assert_internal(committed_size <= size);
- if (p==NULL) return;
- if (size==0) return;
- const bool all_committed = (committed_size == size);
-
- // need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.)
- mi_track_mem_undefined(p, size);
-
- if (mi_memkind_is_os(memid.memkind)) {
- // was a direct OS allocation, pass through
- if (!all_committed && committed_size > 0) {
- // if partially committed, adjust the committed stats (as `_mi_os_free` will increase decommit by the full size)
- _mi_stat_decrease(&_mi_stats_main.committed, committed_size);
- }
- _mi_os_free(p, size, memid, stats);
- }
- else if (memid.memkind == MI_MEM_ARENA) {
- // allocated in an arena
- size_t arena_idx;
- size_t block_idx;
- mi_arena_memid_indices(memid, &arena_idx, &block_idx);
- mi_assert_internal(arena_idx < MI_MAX_ARENAS);
- mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]);
- mi_assert_internal(arena != NULL);
- const size_t blocks = mi_block_count_of_size(size);
-
- // checks
- if (arena == NULL) {
- _mi_error_message(EINVAL, "trying to free from an invalid arena: %p, size %zu, memid: 0x%zx\n", p, size, memid);
- return;
- }
- mi_assert_internal(block_idx < arena->block_count);
- mi_assert_internal(block_idx > mi_arena_info_blocks());
- if (block_idx <= mi_arena_info_blocks() || block_idx > arena->block_count) {
- _mi_error_message(EINVAL, "trying to free from an invalid arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid);
- return;
- }
-
- // potentially decommit
- if (arena->memid.is_pinned || arena->memid.initially_committed) {
- mi_assert_internal(all_committed);
- }
- else {
- if (!all_committed) {
- // mark the entire range as no longer committed (so we recommit the full range when re-using)
- mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_committed, blocks, block_idx, NULL);
- mi_track_mem_noaccess(p, size);
- if (committed_size > 0) {
- // if partially committed, adjust the committed stats (is it will be recommitted when re-using)
- // in the delayed purge, we now need to not count a decommit if the range is not marked as committed.
- _mi_stat_decrease(&_mi_stats_main.committed, committed_size);
- }
- // note: if not all committed, it may be that the purge will reset/decommit the entire range
- // that contains already decommitted parts. Since purge consistently uses reset or decommit that
- // works (as we should never reset decommitted parts).
- }
- // (delay) purge the entire range
- mi_arena_schedule_purge(arena, block_idx, blocks, stats);
- }
-
- // and make it available to others again
- bool all_inuse = mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_free, block_idx, blocks, NULL);
- if (!all_inuse) {
- _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", p, size);
- return;
- };
- }
- else {
- // arena was none, external, or static; nothing to do
- mi_assert_internal(memid.memkind < MI_MEM_OS);
- }
-
- // purge expired decommits
- mi_arenas_try_purge(false, false, stats);
-}
-
-// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit`
-// for dynamic libraries that are unloaded and need to release all their allocated memory.
-static void mi_arenas_unsafe_destroy(void) {
- const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count);
- size_t new_max_arena = 0;
- for (size_t i = 0; i < max_arena; i++) {
- mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]);
- if (arena != NULL) {
- mi_lock_done(&arena->abandoned_visit_lock);
- if (mi_memkind_is_os(arena->memid.memkind)) {
- mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL);
- _mi_os_free(mi_arena_start(arena), mi_arena_size(arena), arena->memid, &_mi_stats_main);
- }
- }
- }
-
- // try to lower the max arena.
- size_t expected = max_arena;
- mi_atomic_cas_strong_acq_rel(&mi_arena_count, &expected, new_max_arena);
-}
-
-// Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired
-void _mi_arenas_collect(bool force_purge, mi_stats_t* stats) {
- mi_arenas_try_purge(force_purge, force_purge /* visit all? */, stats);
-}
-
-// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit`
-// for dynamic libraries that are unloaded and need to release all their allocated memory.
-void _mi_arena_unsafe_destroy_all(mi_stats_t* stats) {
- mi_arenas_unsafe_destroy();
- _mi_arenas_collect(true /* force purge */, stats); // purge non-owned arenas
-}
-
-// Is a pointer inside any of our arenas?
-bool _mi_arena_contains(const void* p) {
- const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count);
- for (size_t i = 0; i < max_arena; i++) {
- mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]);
- if (arena != NULL && mi_arena_start(arena) <= (const uint8_t*)p && mi_arena_start(arena) + mi_size_of_blocks(arena->block_count) > (const uint8_t*)p) {
- return true;
- }
- }
- return false;
-}
-
-
-/* -----------------------------------------------------------
- Add an arena.
------------------------------------------------------------ */
-
-static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) {
- mi_assert_internal(arena != NULL);
- mi_assert_internal(arena->block_count > 0);
- if (arena_id != NULL) { *arena_id = -1; }
-
- size_t i = mi_atomic_increment_acq_rel(&mi_arena_count);
- if (i >= MI_MAX_ARENAS) {
- mi_atomic_decrement_acq_rel(&mi_arena_count);
- return false;
- }
- _mi_stat_counter_increase(&stats->arena_count,1);
- arena->id = mi_arena_id_create(i);
- mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena);
- if (arena_id != NULL) { *arena_id = arena->id; }
- return true;
-}
-
-static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept
-{
- mi_assert(!is_large || memid.initially_committed && memid.is_pinned);
- mi_assert(_mi_is_aligned(start,MI_ARENA_BLOCK_SIZE));
- mi_assert(start!=NULL);
- if (start==NULL) return false;
- if (!_mi_is_aligned(start,MI_ARENA_BLOCK_SIZE)) {
- // todo: use alignment in memid to align to blocksize first?
- _mi_warning_message("cannot use OS memory since it is not aligned to %zu KiB (address %p)", MI_ARENA_BLOCK_SIZE/MI_KiB, start);
- return false;
- }
-
- if (arena_id != NULL) { *arena_id = _mi_arena_id_none(); }
-
- const size_t info_blocks = mi_arena_info_blocks();
- const size_t bcount = size / MI_ARENA_BLOCK_SIZE; // divide down
- if (bcount < info_blocks+1) {
- _mi_warning_message("cannot use OS memory since it is not large enough (size %zu KiB, minimum required is %zu KiB)", size/MI_KiB, mi_size_of_blocks(info_blocks+1)/MI_KiB);
- return false;
- }
- if (bcount > MI_BITMAP_MAX_BITS) {
- // todo: allow larger areas (either by splitting it up in arena's or having larger arena's)
- _mi_warning_message("cannot use OS memory since it is too large (size %zu MiB, maximum is %zu MiB)", size/MI_MiB, mi_size_of_blocks(MI_BITMAP_MAX_BITS)/MI_MiB);
- return false;
- }
- mi_arena_t* arena = (mi_arena_t*)start;
-
- // commit & zero if needed
- bool is_zero = memid.initially_zero;
- if (!memid.initially_committed) {
- _mi_os_commit(arena, mi_size_of_blocks(info_blocks), &is_zero, &_mi_stats_main);
- }
- if (!is_zero) {
- _mi_memzero(arena, mi_size_of_blocks(info_blocks));
- }
-
- // init
- arena->id = _mi_arena_id_none();
- arena->memid = memid;
- arena->exclusive = exclusive;
- arena->block_count = bcount;
- arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1)
- arena->is_large = is_large;
- arena->purge_expire = 0;
- mi_lock_init(&arena->abandoned_visit_lock);
-
- // init bitmaps
- mi_bitmap_init(&arena->blocks_free,true);
- mi_bitmap_init(&arena->blocks_committed,true);
- mi_bitmap_init(&arena->blocks_dirty,true);
- mi_bitmap_init(&arena->blocks_purge,true);
- for( int i = 0; i < MI_ARENA_BIN_COUNT; i++) {
- mi_bitmap_init(&arena->blocks_abandoned[i],true);
- }
-
- // reserve our meta info (and reserve blocks outside the memory area)
- mi_bitmap_unsafe_xsetN(MI_BIT_SET, &arena->blocks_free, info_blocks /* start */, arena->block_count - info_blocks);
- if (memid.initially_committed) {
- mi_bitmap_unsafe_xsetN(MI_BIT_SET, &arena->blocks_committed, 0, arena->block_count);
- }
- else {
- mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_committed, 0, info_blocks, NULL);
- }
- mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_dirty, 0, info_blocks, NULL);
-
- return mi_arena_add(arena, arena_id, &_mi_stats_main);
-}
-
-
-bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept {
- mi_memid_t memid = _mi_memid_create(MI_MEM_EXTERNAL);
- memid.initially_committed = is_committed;
- memid.initially_zero = is_zero;
- memid.is_pinned = is_large;
- return mi_manage_os_memory_ex2(start, size, is_large, numa_node, exclusive, memid, arena_id);
-}
-
-// Reserve a range of regular OS memory
-int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept {
- if (arena_id != NULL) *arena_id = _mi_arena_id_none();
- size = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); // at least one block
- mi_memid_t memid;
- void* start = _mi_os_alloc_aligned(size, MI_SEGMENT_ALIGN, commit, allow_large, &memid, &_mi_stats_main);
- if (start == NULL) return ENOMEM;
- const bool is_large = memid.is_pinned; // todo: use separate is_large field?
- if (!mi_manage_os_memory_ex2(start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) {
- _mi_os_free_ex(start, size, commit, memid, &_mi_stats_main);
- _mi_verbose_message("failed to reserve %zu KiB memory\n", _mi_divide_up(size, 1024));
- return ENOMEM;
- }
- _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size, 1024), is_large ? " (in large os pages)" : "");
- return 0;
-}
-
-
-// Manage a range of regular OS memory
-bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept {
- return mi_manage_os_memory_ex(start, size, is_committed, is_large, is_zero, numa_node, false /* exclusive? */, NULL);
-}
-
-// Reserve a range of regular OS memory
-int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept {
- return mi_reserve_os_memory_ex(size, commit, allow_large, false, NULL);
-}
-
-
-/* -----------------------------------------------------------
- Debugging
------------------------------------------------------------ */
-static size_t mi_debug_show_bfield(mi_bfield_t field, char* buf) {
- size_t bit_set_count = 0;
- for (int bit = 0; bit < MI_BFIELD_BITS; bit++) {
- bool is_set = ((((mi_bfield_t)1 << bit) & field) != 0);
- if (is_set) bit_set_count++;
- buf[bit] = (is_set ? 'x' : '.');
- }
- return bit_set_count;
-}
-
-static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t block_count, mi_bitmap_t* bitmap) {
- _mi_verbose_message("%s%s:\n", prefix, header);
- size_t bit_count = 0;
- size_t bit_set_count = 0;
- for (int i = 0; i < MI_BFIELD_BITS && bit_count < block_count; i++) {
- char buf[MI_BITMAP_CHUNK_BITS + 1];
- mi_bitmap_chunk_t* chunk = &bitmap->chunks[i];
- for (int j = 0; j < MI_BITMAP_CHUNK_FIELDS; j++) {
- if (bit_count < block_count) {
- bit_set_count += mi_debug_show_bfield(chunk->bfields[j], buf + j*MI_BFIELD_BITS);
- }
- else {
- _mi_memset(buf + j*MI_BFIELD_BITS, ' ', MI_BFIELD_BITS);
- }
- bit_count += MI_BFIELD_BITS;
- }
- buf[MI_BITMAP_CHUNK_BITS] = 0;
- _mi_verbose_message("%s %s\n", prefix, buf);
- }
- _mi_verbose_message("%s total ('x'): %zu\n", prefix, bit_set_count);
- return bit_set_count;
-}
-
-void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept {
- MI_UNUSED(show_abandoned);
- size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count);
- size_t free_total = 0;
- size_t block_total = 0;
- //size_t abandoned_total = 0;
- size_t purge_total = 0;
- for (size_t i = 0; i < max_arenas; i++) {
- mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]);
- if (arena == NULL) break;
- block_total += arena->block_count;
- _mi_verbose_message("arena %zu: %zu blocks%s\n", i, arena->block_count, (arena->memid.is_pinned ? ", pinned" : ""));
- if (show_inuse) {
- free_total += mi_debug_show_bitmap(" ", "free blocks", arena->block_count, &arena->blocks_free);
- }
- mi_debug_show_bitmap(" ", "committed blocks", arena->block_count, &arena->blocks_committed);
- // todo: abandoned blocks
- if (show_purge) {
- purge_total += mi_debug_show_bitmap(" ", "purgeable blocks", arena->block_count, &arena->blocks_purge);
- }
- }
- if (show_inuse) _mi_verbose_message("total inuse blocks : %zu\n", block_total - free_total);
- // if (show_abandoned) _mi_verbose_message("total abandoned blocks: %zu\n", abandoned_total);
- if (show_purge) _mi_verbose_message("total purgeable blocks: %zu\n", purge_total);
-}
-
-
-/* -----------------------------------------------------------
- Reserve a huge page arena.
------------------------------------------------------------ */
-// reserve at a specific numa node
-int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept {
- if (arena_id != NULL) *arena_id = -1;
- if (pages==0) return 0;
- if (numa_node < -1) numa_node = -1;
- if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count();
- size_t hsize = 0;
- size_t pages_reserved = 0;
- mi_memid_t memid;
- void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, timeout_msecs, &pages_reserved, &hsize, &memid);
- if (p==NULL || pages_reserved==0) {
- _mi_warning_message("failed to reserve %zu GiB huge pages\n", pages);
- return ENOMEM;
- }
- _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages);
-
- if (!mi_manage_os_memory_ex2(p, hsize, true, numa_node, exclusive, memid, arena_id)) {
- _mi_os_free(p, hsize, memid, &_mi_stats_main);
- return ENOMEM;
- }
- return 0;
-}
-
-int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept {
- return mi_reserve_huge_os_pages_at_ex(pages, numa_node, timeout_msecs, false, NULL);
-}
-
-// reserve huge pages evenly among the given number of numa nodes (or use the available ones as detected)
-int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept {
- if (pages == 0) return 0;
-
- // pages per numa node
- size_t numa_count = (numa_nodes > 0 ? numa_nodes : _mi_os_numa_node_count());
- if (numa_count <= 0) numa_count = 1;
- const size_t pages_per = pages / numa_count;
- const size_t pages_mod = pages % numa_count;
- const size_t timeout_per = (timeout_msecs==0 ? 0 : (timeout_msecs / numa_count) + 50);
-
- // reserve evenly among numa nodes
- for (size_t numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) {
- size_t node_pages = pages_per; // can be 0
- if (numa_node < pages_mod) node_pages++;
- int err = mi_reserve_huge_os_pages_at(node_pages, (int)numa_node, timeout_per);
- if (err) return err;
- if (pages < node_pages) {
- pages = 0;
- }
- else {
- pages -= node_pages;
- }
- }
-
- return 0;
-}
-
-int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept {
- MI_UNUSED(max_secs);
- _mi_warning_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n");
- if (pages_reserved != NULL) *pages_reserved = 0;
- int err = mi_reserve_huge_os_pages_interleave(pages, 0, (size_t)(max_secs * 1000.0));
- if (err==0 && pages_reserved!=NULL) *pages_reserved = pages;
- return err;
-}
-
-
-
-/* -----------------------------------------------------------
- Arena purge
------------------------------------------------------------ */
-
-static long mi_arena_purge_delay(void) {
- // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay
- return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult));
-}
-
-// reset or decommit in an arena and update the committed/decommit bitmaps
-// assumes we own the area (i.e. blocks_free is claimed by us)
-static void mi_arena_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats) {
- mi_assert_internal(!arena->memid.is_pinned);
- const size_t size = mi_size_of_blocks(blocks);
- void* const p = mi_arena_block_start(arena, block_idx);
- bool needs_recommit;
- if (mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_idx, blocks)) {
- // all blocks are committed, we can purge freely
- needs_recommit = _mi_os_purge(p, size, stats);
- }
- else {
- // some blocks are not committed -- this can happen when a partially committed block is freed
- // in `_mi_arena_free` and it is conservatively marked as uncommitted but still scheduled for a purge
- // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory),
- // and also undo the decommit stats (as it was already adjusted)
- mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits));
- needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, stats);
- if (needs_recommit) { _mi_stat_increase(&_mi_stats_main.committed, size); }
- }
-
- // clear the purged blocks
- mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_purge, blocks, block_idx, NULL);
-
- // update committed bitmap
- if (needs_recommit) {
- mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_committed, blocks, block_idx, NULL);
- }
-}
-
-
-// Schedule a purge. This is usually delayed to avoid repeated decommit/commit calls.
-// Note: assumes we (still) own the area as we may purge immediately
-static void mi_arena_schedule_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats) {
- const long delay = mi_arena_purge_delay();
- if (delay < 0) return; // is purging allowed at all?
-
- if (_mi_preloading() || delay == 0) {
- // decommit directly
- mi_arena_purge(arena, block_idx, blocks, stats);
- }
- else {
- // schedule decommit
- _mi_error_message(EFAULT, "purging not yet implemented\n");
- }
-}
-
-
-static void mi_arenas_try_purge(bool force, bool visit_all, mi_stats_t* stats) {
- if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled
-
- const size_t max_arena = mi_atomic_load_acquire(&mi_arena_count);
- if (max_arena == 0) return;
-
- _mi_error_message(EFAULT, "purging not yet implemented\n");
- MI_UNUSED(stats);
- MI_UNUSED(visit_all);
- MI_UNUSED(force);
-}
-
-
-#if 0
-
#define MI_IN_ARENA_C
#include "arena-abandon.c"
#undef MI_IN_ARENA_C
@@ -904,12 +116,12 @@ static size_t mi_block_count_of_size(size_t size) {
return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE);
}
-static size_t mi_size_of_blocks(size_t bcount) {
+static size_t mi_arena_block_size(size_t bcount) {
return (bcount * MI_ARENA_BLOCK_SIZE);
}
static size_t mi_arena_size(mi_arena_t* arena) {
- return mi_size_of_blocks(arena->block_count);
+ return mi_arena_block_size(arena->block_count);
}
static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, mi_bitmap_index_t bitmap_index) {
@@ -995,7 +207,7 @@ void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size) {
}
void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex) {
- return (arena->start + mi_size_of_blocks(mi_bitmap_index_bit(bindex)));
+ return (arena->start + mi_arena_block_size(mi_bitmap_index_bit(bindex)));
}
@@ -1004,7 +216,7 @@ void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex) {
----------------------------------------------------------- */
// claim the `blocks_inuse` bits
-static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, size_t block_idx, mi_stats_t* stats)
+static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats)
{
size_t idx = 0; // mi_atomic_load_relaxed(&arena->search_idx); // start from last search; ok to be relaxed as the exact start does not matter
if (_mi_bitmap_try_find_from_claim_across(arena->blocks_inuse, arena->field_count, idx, blocks, bitmap_idx, stats)) {
@@ -1056,7 +268,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar
_mi_bitmap_claim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted);
if (any_uncommitted) {
bool commit_zero = false;
- if (!_mi_os_commit(p, mi_size_of_blocks(needed_bcount), &commit_zero, tld->stats)) {
+ if (!_mi_os_commit(p, mi_arena_block_size(needed_bcount), &commit_zero, tld->stats)) {
memid->initially_committed = false;
}
else {
@@ -1081,7 +293,7 @@ static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_no
const size_t bcount = mi_block_count_of_size(size);
const size_t arena_index = mi_arena_id_index(arena_id);
mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count));
- mi_assert_internal(size <= mi_size_of_blocks(bcount));
+ mi_assert_internal(size <= mi_arena_block_size(bcount));
// Check arena suitability
mi_arena_t* arena = mi_arena_from_index(arena_index);
@@ -1227,7 +439,7 @@ void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) {
if (arena_index >= MI_MAX_ARENAS) return NULL;
mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]);
if (arena == NULL) return NULL;
- if (size != NULL) { *size = mi_size_of_blocks(arena->block_count); }
+ if (size != NULL) { *size = mi_arena_block_size(arena->block_count); }
return arena->start;
}
@@ -1247,7 +459,7 @@ static void mi_arena_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks,
mi_assert_internal(arena->blocks_committed != NULL);
mi_assert_internal(arena->blocks_purge != NULL);
mi_assert_internal(!arena->memid.is_pinned);
- const size_t size = mi_size_of_blocks(blocks);
+ const size_t size = mi_arena_block_size(blocks);
void* const p = mi_arena_block_start(arena, bitmap_idx);
bool needs_recommit;
if (_mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx)) {
@@ -1299,25 +511,25 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t bitmap_idx, size_t
// purge a range of blocks
// return true if the full range was purged.
// assumes we own the area (i.e. blocks_in_use is claimed by us)
-static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startseqx, size_t bitlen, size_t purge, mi_stats_t* stats) {
- const size_t endidx = startseqx + bitlen;
- size_t bitseqx = startseqx;
+static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startidx, size_t bitlen, size_t purge, mi_stats_t* stats) {
+ const size_t endidx = startidx + bitlen;
+ size_t bitidx = startidx;
bool all_purged = false;
- while (bitseqx < endidx) {
+ while (bitidx < endidx) {
// count consecutive ones in the purge mask
size_t count = 0;
- while (bitseqx + count < endidx && (purge & ((size_t)1 << (bitseqx + count))) != 0) {
+ while (bitidx + count < endidx && (purge & ((size_t)1 << (bitidx + count))) != 0) {
count++;
}
if (count > 0) {
// found range to be purged
- const mi_bitmap_index_t range_idx = mi_bitmap_index_create(idx, bitseqx);
+ const mi_bitmap_index_t range_idx = mi_bitmap_index_create(idx, bitidx);
mi_arena_purge(arena, range_idx, count, stats);
if (count == bitlen) {
all_purged = true;
}
}
- bitseqx += (count+1); // +1 to skip the zero bit (or end)
+ bitidx += (count+1); // +1 to skip the zero bit (or end)
}
return all_purged;
}
@@ -1339,16 +551,16 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi
for (size_t i = 0; i < arena->field_count; i++) {
size_t purge = mi_atomic_load_relaxed(&arena->blocks_purge[i]);
if (purge != 0) {
- size_t bitseqx = 0;
- while (bitseqx < MI_BITMAP_FIELD_BITS) {
+ size_t bitidx = 0;
+ while (bitidx < MI_BITMAP_FIELD_BITS) {
// find consecutive range of ones in the purge mask
size_t bitlen = 0;
- while (bitseqx + bitlen < MI_BITMAP_FIELD_BITS && (purge & ((size_t)1 << (bitseqx + bitlen))) != 0) {
+ while (bitidx + bitlen < MI_BITMAP_FIELD_BITS && (purge & ((size_t)1 << (bitidx + bitlen))) != 0) {
bitlen++;
}
// temporarily claim the purge range as "in-use" to be thread-safe with allocation
// try to claim the longest range of corresponding in_use bits
- const mi_bitmap_index_t bitmap_index = mi_bitmap_index_create(i, bitseqx);
+ const mi_bitmap_index_t bitmap_index = mi_bitmap_index_create(i, bitidx);
while( bitlen > 0 ) {
if (_mi_bitmap_try_claim(arena->blocks_inuse, arena->field_count, bitlen, bitmap_index)) {
break;
@@ -1359,15 +571,15 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi
if (bitlen > 0) {
// read purge again now that we have the in_use bits
purge = mi_atomic_load_acquire(&arena->blocks_purge[i]);
- if (!mi_arena_purge_range(arena, i, bitseqx, bitlen, purge, stats)) {
+ if (!mi_arena_purge_range(arena, i, bitidx, bitlen, purge, stats)) {
full_purge = false;
}
any_purged = true;
// release the claimed `in_use` bits again
_mi_bitmap_unclaim(arena->blocks_inuse, arena->field_count, bitlen, bitmap_index);
}
- bitseqx += (bitlen+1); // +1 to skip the zero (or end)
- } // while bitseqx
+ bitidx += (bitlen+1); // +1 to skip the zero (or end)
+ } // while bitidx
} // purge != 0
}
// if not fully purged, make sure to purge again in the future
@@ -1530,7 +742,7 @@ bool _mi_arena_contains(const void* p) {
const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count);
for (size_t i = 0; i < max_arena; i++) {
mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]);
- if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_size_of_blocks(arena->block_count) > (const uint8_t*)p) {
+ if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_arena_block_size(arena->block_count) > (const uint8_t*)p) {
return true;
}
}
@@ -1606,8 +818,8 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int
mi_assert_internal(post >= 0);
if (post > 0) {
// don't use leftover bits at the end
- mi_bitmap_index_t postseqx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post);
- _mi_bitmap_claim(arena->blocks_inuse, fields, post, postseqx, NULL);
+ mi_bitmap_index_t postidx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post);
+ _mi_bitmap_claim(arena->blocks_inuse, fields, post, postidx, NULL);
}
return mi_arena_add(arena, arena_id, &_mi_stats_main);
@@ -1774,4 +986,3 @@ int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserv
}
-#endif
\ No newline at end of file
diff --git a/src/arena.c b/src/arena.c
index 8ca5aaf3..28ad61f1 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -21,7 +21,6 @@ The arena allocation needs to be thread safe and we use an atomic bitmap to allo
#include "mimalloc.h"
#include "mimalloc/internal.h"
-#include "mimalloc/atomic.h"
#include "bitmap.h"
@@ -29,38 +28,823 @@ The arena allocation needs to be thread safe and we use an atomic bitmap to allo
Arena allocation
----------------------------------------------------------- */
+#define MI_ARENA_BIN_COUNT (MI_BIN_COUNT)
+
+
// A memory arena descriptor
typedef struct mi_arena_s {
- mi_arena_id_t id; // arena id; 0 for non-specific
mi_memid_t memid; // memid of the memory area
- _Atomic(uint8_t*)start; // the start of the memory area
+ mi_arena_id_t id; // arena id; 0 for non-specific
+
size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`)
- size_t field_count; // number of bitmap fields (where `field_count * MI_BITMAP_FIELD_BITS >= block_count`)
- size_t meta_size; // size of the arena structure itself (including its bitmaps)
- mi_memid_t meta_memid; // memid of the arena structure itself (OS or static allocation)
int numa_node; // associated NUMA node
bool exclusive; // only allow allocations if specifically for this arena
bool is_large; // memory area consists of large- or huge OS pages (always committed)
mi_lock_t abandoned_visit_lock; // lock is only used when abandoned segments are being visited
- _Atomic(size_t)search_idx; // optimization to start the search for free blocks
- _Atomic(mi_msecs_t)purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`.
- mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero?
- mi_bitmap_field_t* blocks_committed; // are the blocks committed? (can be NULL for memory that cannot be decommitted)
- mi_bitmap_field_t* blocks_purge; // blocks that can be (reset) decommitted. (can be NULL for memory that cannot be (reset) decommitted)
- mi_bitmap_field_t* blocks_abandoned; // blocks that start with an abandoned segment. (This crosses API's but it is convenient to have here)
- mi_bitmap_field_t blocks_inuse[1]; // in-place bitmap of in-use blocks (of size `field_count`)
- // do not add further fields here as the dirty, committed, purged, and abandoned bitmaps follow the inuse bitmap fields.
+ _Atomic(mi_msecs_t) purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`.
+
+ mi_bitmap_t blocks_free; // is the block free?
+ mi_bitmap_t blocks_committed; // is the block committed? (i.e. accessible)
+ mi_bitmap_t blocks_purge; // can the block be purged? (block in purge => block in free)
+ mi_bitmap_t blocks_dirty; // is the block potentially non-zero?
+ mi_bitmap_t blocks_abandoned[MI_BIN_COUNT]; // abandoned pages per size bin (a set bit means the start of the page)
+ // the full queue contains abandoned full pages
} mi_arena_t;
-
-#define MI_ARENA_BLOCK_SIZE (MI_SEGMENT_SIZE) // 64MiB (must be at least MI_SEGMENT_ALIGN)
-#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2) // 32MiB
-#define MI_MAX_ARENAS (132) // Limited as the reservation exponentially increases (and takes up .bss)
+#define MI_MAX_ARENAS (1024) // Limited for now (and takes up .bss)
// The available arenas
static mi_decl_cache_align _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS];
static mi_decl_cache_align _Atomic(size_t) mi_arena_count; // = 0
+
+/* -----------------------------------------------------------
+ Arena id's
+ id = arena_index + 1
+----------------------------------------------------------- */
+
+size_t mi_arena_id_index(mi_arena_id_t id) {
+ return (size_t)(id <= 0 ? MI_MAX_ARENAS : id - 1);
+}
+
+static mi_arena_id_t mi_arena_id_create(size_t arena_index) {
+ mi_assert_internal(arena_index < MI_MAX_ARENAS);
+ return (int)arena_index + 1;
+}
+
+mi_arena_id_t _mi_arena_id_none(void) {
+ return 0;
+}
+
+static bool mi_arena_id_is_suitable(mi_arena_id_t arena_id, bool arena_is_exclusive, mi_arena_id_t req_arena_id) {
+ return ((!arena_is_exclusive && req_arena_id == _mi_arena_id_none()) ||
+ (arena_id == req_arena_id));
+}
+
+bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id) {
+ if (memid.memkind == MI_MEM_ARENA) {
+ return mi_arena_id_is_suitable(memid.mem.arena.id, memid.mem.arena.is_exclusive, request_arena_id);
+ }
+ else {
+ return mi_arena_id_is_suitable(_mi_arena_id_none(), false, request_arena_id);
+ }
+}
+
+size_t mi_arena_get_count(void) {
+ return mi_atomic_load_relaxed(&mi_arena_count);
+}
+
+mi_arena_t* mi_arena_from_index(size_t idx) {
+ mi_assert_internal(idx < mi_arena_get_count());
+ return mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[idx]);
+}
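+// Worked example of the mapping (values are illustrative): mi_arena_id_create(0) == 1 and
+// mi_arena_id_index(1) == 0, while mi_arena_id_index(_mi_arena_id_none()) == MI_MAX_ARENAS,
+// an always-out-of-range index that never matches a valid arena slot.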
+
+
+
+/* -----------------------------------------------------------
+ Util
+----------------------------------------------------------- */
+
+
+// Size of an arena
+static size_t mi_arena_size(mi_arena_t* arena) {
+ return mi_size_of_blocks(arena->block_count);
+}
+
+static size_t mi_arena_info_blocks(void) {
+ const size_t os_page_size = _mi_os_page_size();
+ const size_t info_size = _mi_align_up(sizeof(mi_arena_t), os_page_size) + os_page_size; // + guard page
+ const size_t info_blocks = mi_block_count_of_size(info_size);
+ return info_blocks;
+}
+
+
+// Start of the arena memory area
+static uint8_t* mi_arena_start(mi_arena_t* arena) {
+ return ((uint8_t*)arena);
+}
+
+// Start of a block
+void* mi_arena_block_start(mi_arena_t* arena, size_t block_index) {
+ return (mi_arena_start(arena) + mi_size_of_blocks(block_index));
+}
+
+// Arena area
+void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) {
+ if (size != NULL) *size = 0;
+ const size_t arena_index = mi_arena_id_index(arena_id);
+ if (arena_index >= MI_MAX_ARENAS) return NULL;
+ mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]);
+ if (arena == NULL) return NULL;
+ if (size != NULL) { *size = mi_size_of_blocks(arena->block_count); }
+ return mi_arena_start(arena);
+}
+
+
+// Create an arena memid
+static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, size_t block_index) {
+ mi_memid_t memid = _mi_memid_create(MI_MEM_ARENA);
+ memid.mem.arena.id = id;
+ memid.mem.arena.block_index = block_index;
+ memid.mem.arena.is_exclusive = is_exclusive;
+ return memid;
+}
+
+// returns if the arena is exclusive
+bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, size_t* block_index) {
+ mi_assert_internal(memid.memkind == MI_MEM_ARENA);
+ *arena_index = mi_arena_id_index(memid.mem.arena.id);
+ *block_index = memid.mem.arena.block_index;
+ return memid.mem.arena.is_exclusive;
+}
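+// Round-trip sketch (illustrative only; block index 42 is an arbitrary value):
+//   mi_memid_t memid = mi_memid_create_arena(arena->id, arena->exclusive, 42);
+//   size_t arena_index, block_index;
+//   bool is_exclusive = mi_arena_memid_indices(memid, &arena_index, &block_index);
+//   // arena_index == mi_arena_id_index(arena->id) and block_index == 42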
+
+
+
+/* -----------------------------------------------------------
+ Arena Allocation
+----------------------------------------------------------- */
+
+static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t arena_index, size_t needed_bcount,
+ bool commit, size_t tseq, mi_memid_t* memid, mi_os_tld_t* tld)
+{
+ MI_UNUSED(arena_index);
+ mi_assert_internal(mi_arena_id_index(arena->id) == arena_index);
+
+ size_t block_index;
+ if (!mi_bitmap_try_find_and_clearN(&arena->blocks_free, tseq, needed_bcount, &block_index)) return NULL;
+
+ // claimed it!
+ void* p = mi_arena_block_start(arena, block_index);
+ *memid = mi_memid_create_arena(arena->id, arena->exclusive, block_index);
+ memid->is_pinned = arena->memid.is_pinned;
+
+ // set the dirty bits
+ if (arena->memid.initially_zero) {
+ memid->initially_zero = mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_dirty, block_index, needed_bcount, NULL);
+ }
+
+ // set commit state
+ if (commit) {
+ // commit requested, but the range may not be committed as a whole: ensure it is committed now
+ memid->initially_committed = true;
+
+ bool all_already_committed;
+ mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount, &all_already_committed);
+ if (!all_already_committed) {
+ bool commit_zero = false;
+ if (!_mi_os_commit(p, mi_size_of_blocks(needed_bcount), &commit_zero, tld->stats)) {
+ memid->initially_committed = false;
+ }
+ else {
+ if (commit_zero) { memid->initially_zero = true; }
+ }
+ }
+ }
+ else {
+ // no need to commit, but check if already fully committed
+ memid->initially_committed = mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount);
+ }
+
+ return p;
+}
+
+// allocate in a specific arena
+static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node,
+ size_t size, size_t alignment,
+ bool commit, bool allow_large, mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid, mi_os_tld_t* tld)
+{
+ mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN);
+ if (alignment > MI_ARENA_BLOCK_ALIGN) return NULL;
+
+ const size_t bcount = mi_block_count_of_size(size);
+ const size_t arena_index = mi_arena_id_index(arena_id);
+ mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count));
+ mi_assert_internal(size <= mi_size_of_blocks(bcount));
+
+ // Check arena suitability
+ mi_arena_t* arena = mi_arena_from_index(arena_index);
+ if (arena == NULL) return NULL;
+ if (!allow_large && arena->is_large) return NULL;
+ if (!mi_arena_id_is_suitable(arena->id, arena->exclusive, req_arena_id)) return NULL;
+  if (req_arena_id == _mi_arena_id_none()) { // if not specific, check numa affinity
+ const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node);
+ if (match_numa_node) { if (!numa_suitable) return NULL; }
+ else { if (numa_suitable) return NULL; }
+ }
+
+ // try to allocate
+ void* p = mi_arena_try_alloc_at(arena, arena_index, bcount, commit, tseq, memid, tld);
+ mi_assert_internal(p == NULL || _mi_is_aligned(p, alignment));
+ return p;
+}
+
+
+// allocate from an arena with fallback to the OS
+static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, size_t alignment,
+ bool commit, bool allow_large,
+ mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid, mi_os_tld_t* tld)
+{
+ mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN);
+ if (alignment > MI_ARENA_BLOCK_ALIGN) return NULL;
+
+ const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count);
+ if mi_likely(max_arena == 0) return NULL;
+
+ if (req_arena_id != _mi_arena_id_none()) {
+ // try a specific arena if requested
+ if (mi_arena_id_index(req_arena_id) < max_arena) {
+ void* p = mi_arena_try_alloc_at_id(req_arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld);
+ if (p != NULL) return p;
+ }
+ }
+ else {
+ // try numa affine allocation
+ for (size_t i = 0; i < max_arena; i++) {
+ void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld);
+ if (p != NULL) return p;
+ }
+
+ // try from another numa node instead..
+    if (numa_node >= 0) { // if numa_node was < 0 (no specific affinity requested), all arenas have been tried already
+ for (size_t i = 0; i < max_arena; i++) {
+ void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), false /* only proceed if not numa local */, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld);
+ if (p != NULL) return p;
+ }
+ }
+ }
+ return NULL;
+}
+
+// try to reserve a fresh arena space
+static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t* arena_id)
+{
+  if (_mi_preloading()) return false;  // use the OS only while preloading
+ if (req_arena_id != _mi_arena_id_none()) return false;
+
+ const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count);
+ if (arena_count > (MI_MAX_ARENAS - 4)) return false;
+
+ // calc reserve
+ size_t arena_reserve = mi_option_get_size(mi_option_arena_reserve);
+ if (arena_reserve == 0) return false;
+
+ if (!_mi_os_has_virtual_reserve()) {
+ arena_reserve = arena_reserve/4; // be conservative if virtual reserve is not supported (for WASM for example)
+ }
+ arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_BLOCK_SIZE);
+
+ if (arena_count >= 8 && arena_count <= 128) {
+ // scale up the arena sizes exponentially every 8 entries (128 entries get to 589TiB)
+ const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16);
+ size_t reserve = 0;
+ if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) {
+ arena_reserve = reserve;
+ }
+ }
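+  // illustration of the resulting multiplier (pure arithmetic): arena_count 8..15 gives x2,
+  // 16..23 gives x4, doubling every 8 arenas up to x65536 (2^16) at 128.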
+
+ // check arena bounds
+ const size_t min_reserve = mi_size_of_blocks(mi_arena_info_blocks() + 1);
+ const size_t max_reserve = MI_BITMAP_MAX_BITS * MI_ARENA_BLOCK_SIZE;
+ if (arena_reserve < min_reserve) {
+ arena_reserve = min_reserve;
+ }
+ else if (arena_reserve > max_reserve) {
+ arena_reserve = max_reserve;
+ }
+
+ if (arena_reserve < req_size) return false; // should be able to at least handle the current allocation size
+
+ // commit eagerly?
+ bool arena_commit = false;
+ if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = _mi_os_has_overcommit(); }
+ else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; }
+
+ return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id) == 0);
+}
+
+
+void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large,
+ mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld)
+{
+ mi_assert_internal(memid != NULL && tld != NULL);
+ mi_assert_internal(size > 0);
+ size_t tseq = _mi_thread_seq_id();
+ *memid = _mi_memid_none();
+
+ const int numa_node = _mi_os_numa_node(tld); // current numa node
+
+ // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data)
+ if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) || req_arena_id != _mi_arena_id_none()) { // is arena allocation allowed?
+ if (size >= MI_ARENA_MIN_OBJ_SIZE && size <= MI_ARENA_MAX_OBJ_SIZE && alignment <= MI_ARENA_BLOCK_ALIGN && align_offset == 0) {
+ void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld);
+ if (p != NULL) return p;
+
+ // otherwise, try to first eagerly reserve a new arena
+ if (req_arena_id == _mi_arena_id_none()) {
+ mi_arena_id_t arena_id = 0;
+ if (mi_arena_reserve(size, allow_large, req_arena_id, &arena_id)) {
+ // and try allocate in there
+ mi_assert_internal(req_arena_id == _mi_arena_id_none());
+ p = mi_arena_try_alloc_at_id(arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld);
+ if (p != NULL) return p;
+ }
+ }
+ }
+ }
+
+ // if we cannot use OS allocation, return NULL
+ if (mi_option_is_enabled(mi_option_disallow_os_alloc) || req_arena_id != _mi_arena_id_none()) {
+ errno = ENOMEM;
+ return NULL;
+ }
+
+ // finally, fall back to the OS
+ if (align_offset > 0) {
+ return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid, tld->stats);
+ }
+ else {
+ return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, tld->stats);
+ }
+}
+
+void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld)
+{
+ return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, allow_large, req_arena_id, memid, tld);
+}
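+// Usage sketch (illustrative; `tld` is the caller's os tld and the size is assumed to fall in
+// the arena object range so the request is served from an arena rather than the OS):
+//   mi_memid_t memid;
+//   void* p = _mi_arena_alloc(MI_ARENA_BLOCK_SIZE, true /*commit*/, false /*allow_large*/,
+//                             _mi_arena_id_none(), &memid, tld);
+//   // ... use p ..., and later (assuming the range stayed fully committed):
+//   _mi_arena_free(p, MI_ARENA_BLOCK_SIZE, MI_ARENA_BLOCK_SIZE, memid, tld->stats);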
+
+
+/* -----------------------------------------------------------
+ Arena free
+----------------------------------------------------------- */
+static void mi_arena_schedule_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats);
+static void mi_arenas_try_purge(bool force, bool visit_all, mi_stats_t* stats);
+
+void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid, mi_stats_t* stats) {
+ mi_assert_internal(size > 0 && stats != NULL);
+ mi_assert_internal(committed_size <= size);
+ if (p==NULL) return;
+ if (size==0) return;
+ const bool all_committed = (committed_size == size);
+
+ // need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.)
+ mi_track_mem_undefined(p, size);
+
+ if (mi_memkind_is_os(memid.memkind)) {
+ // was a direct OS allocation, pass through
+ if (!all_committed && committed_size > 0) {
+ // if partially committed, adjust the committed stats (as `_mi_os_free` will increase decommit by the full size)
+ _mi_stat_decrease(&_mi_stats_main.committed, committed_size);
+ }
+ _mi_os_free(p, size, memid, stats);
+ }
+ else if (memid.memkind == MI_MEM_ARENA) {
+ // allocated in an arena
+ size_t arena_idx;
+ size_t block_idx;
+ mi_arena_memid_indices(memid, &arena_idx, &block_idx);
+ mi_assert_internal(arena_idx < MI_MAX_ARENAS);
+ mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]);
+ mi_assert_internal(arena != NULL);
+ const size_t blocks = mi_block_count_of_size(size);
+
+ // checks
+ if (arena == NULL) {
+      _mi_error_message(EINVAL, "trying to free from an invalid arena: %p, size %zu, memid kind: %d\n", p, size, (int)memid.memkind);
+ return;
+ }
+ mi_assert_internal(block_idx < arena->block_count);
+ mi_assert_internal(block_idx > mi_arena_info_blocks());
+    if (block_idx <= mi_arena_info_blocks() || block_idx >= arena->block_count) {
+      _mi_error_message(EINVAL, "trying to free from an invalid arena block: %p, size %zu, memid kind: %d\n", p, size, (int)memid.memkind);
+ return;
+ }
+
+ // potentially decommit
+ if (arena->memid.is_pinned || arena->memid.initially_committed) {
+ mi_assert_internal(all_committed);
+ }
+ else {
+ if (!all_committed) {
+ // mark the entire range as no longer committed (so we recommit the full range when re-using)
+ mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_committed, blocks, block_idx, NULL);
+ mi_track_mem_noaccess(p, size);
+ if (committed_size > 0) {
+          // if partially committed, adjust the committed stats (as the range will be recommitted when re-used)
+          // the delayed purge must then not count a decommit for a range that is no longer marked as committed.
+ _mi_stat_decrease(&_mi_stats_main.committed, committed_size);
+ }
+ // note: if not all committed, it may be that the purge will reset/decommit the entire range
+ // that contains already decommitted parts. Since purge consistently uses reset or decommit that
+ // works (as we should never reset decommitted parts).
+ }
+ // (delay) purge the entire range
+ mi_arena_schedule_purge(arena, block_idx, blocks, stats);
+ }
+
+ // and make it available to others again
+ bool all_inuse = mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_free, block_idx, blocks, NULL);
+ if (!all_inuse) {
+ _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", p, size);
+ return;
+    }
+ }
+ else {
+ // arena was none, external, or static; nothing to do
+ mi_assert_internal(memid.memkind < MI_MEM_OS);
+ }
+
+ // purge expired decommits
+ mi_arenas_try_purge(false, false, stats);
+}
+
+// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit`
+// for dynamic libraries that are unloaded and need to release all their allocated memory.
+static void mi_arenas_unsafe_destroy(void) {
+ const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count);
+ size_t new_max_arena = 0;
+ for (size_t i = 0; i < max_arena; i++) {
+ mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]);
+ if (arena != NULL) {
+ mi_lock_done(&arena->abandoned_visit_lock);
+ if (mi_memkind_is_os(arena->memid.memkind)) {
+ mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL);
+ _mi_os_free(mi_arena_start(arena), mi_arena_size(arena), arena->memid, &_mi_stats_main);
+ }
+ }
+ }
+
+ // try to lower the max arena.
+ size_t expected = max_arena;
+ mi_atomic_cas_strong_acq_rel(&mi_arena_count, &expected, new_max_arena);
+}
+
+// Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired
+void _mi_arenas_collect(bool force_purge, mi_stats_t* stats) {
+ mi_arenas_try_purge(force_purge, force_purge /* visit all? */, stats);
+}
+
+// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit`
+// for dynamic libraries that are unloaded and need to release all their allocated memory.
+void _mi_arena_unsafe_destroy_all(mi_stats_t* stats) {
+ mi_arenas_unsafe_destroy();
+ _mi_arenas_collect(true /* force purge */, stats); // purge non-owned arenas
+}
+
+// Is a pointer inside any of our arenas?
+bool _mi_arena_contains(const void* p) {
+ const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count);
+ for (size_t i = 0; i < max_arena; i++) {
+ mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]);
+ if (arena != NULL && mi_arena_start(arena) <= (const uint8_t*)p && mi_arena_start(arena) + mi_size_of_blocks(arena->block_count) > (const uint8_t*)p) {
+ return true;
+ }
+ }
+ return false;
+}
+
+
+/* -----------------------------------------------------------
+ Add an arena.
+----------------------------------------------------------- */
+
+static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) {
+ mi_assert_internal(arena != NULL);
+ mi_assert_internal(arena->block_count > 0);
+ if (arena_id != NULL) { *arena_id = -1; }
+
+ size_t i = mi_atomic_increment_acq_rel(&mi_arena_count);
+ if (i >= MI_MAX_ARENAS) {
+ mi_atomic_decrement_acq_rel(&mi_arena_count);
+ return false;
+ }
+ _mi_stat_counter_increase(&stats->arena_count,1);
+ arena->id = mi_arena_id_create(i);
+ mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena);
+ if (arena_id != NULL) { *arena_id = arena->id; }
+ return true;
+}
+
+static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept
+{
+  mi_assert(!is_large || (memid.initially_committed && memid.is_pinned));
+ mi_assert(_mi_is_aligned(start,MI_ARENA_BLOCK_SIZE));
+ mi_assert(start!=NULL);
+ if (start==NULL) return false;
+ if (!_mi_is_aligned(start,MI_ARENA_BLOCK_SIZE)) {
+ // todo: use alignment in memid to align to blocksize first?
+ _mi_warning_message("cannot use OS memory since it is not aligned to %zu KiB (address %p)", MI_ARENA_BLOCK_SIZE/MI_KiB, start);
+ return false;
+ }
+
+ if (arena_id != NULL) { *arena_id = _mi_arena_id_none(); }
+
+ const size_t info_blocks = mi_arena_info_blocks();
+ const size_t bcount = size / MI_ARENA_BLOCK_SIZE; // divide down
+ if (bcount < info_blocks+1) {
+ _mi_warning_message("cannot use OS memory since it is not large enough (size %zu KiB, minimum required is %zu KiB)", size/MI_KiB, mi_size_of_blocks(info_blocks+1)/MI_KiB);
+ return false;
+ }
+ if (bcount > MI_BITMAP_MAX_BITS) {
+    // todo: allow larger areas (either by splitting them up into multiple arenas or by having larger arenas)
+ _mi_warning_message("cannot use OS memory since it is too large (size %zu MiB, maximum is %zu MiB)", size/MI_MiB, mi_size_of_blocks(MI_BITMAP_MAX_BITS)/MI_MiB);
+ return false;
+ }
+ mi_arena_t* arena = (mi_arena_t*)start;
+
+ // commit & zero if needed
+ bool is_zero = memid.initially_zero;
+ if (!memid.initially_committed) {
+ _mi_os_commit(arena, mi_size_of_blocks(info_blocks), &is_zero, &_mi_stats_main);
+ }
+ if (!is_zero) {
+ _mi_memzero(arena, mi_size_of_blocks(info_blocks));
+ }
+
+ // init
+ arena->id = _mi_arena_id_none();
+ arena->memid = memid;
+ arena->exclusive = exclusive;
+ arena->block_count = bcount;
+ arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1)
+ arena->is_large = is_large;
+ arena->purge_expire = 0;
+ mi_lock_init(&arena->abandoned_visit_lock);
+
+ // init bitmaps
+ mi_bitmap_init(&arena->blocks_free,true);
+ mi_bitmap_init(&arena->blocks_committed,true);
+ mi_bitmap_init(&arena->blocks_dirty,true);
+ mi_bitmap_init(&arena->blocks_purge,true);
+ for( int i = 0; i < MI_ARENA_BIN_COUNT; i++) {
+ mi_bitmap_init(&arena->blocks_abandoned[i],true);
+ }
+
+ // reserve our meta info (and reserve blocks outside the memory area)
+ mi_bitmap_unsafe_xsetN(MI_BIT_SET, &arena->blocks_free, info_blocks /* start */, arena->block_count - info_blocks);
+ if (memid.initially_committed) {
+ mi_bitmap_unsafe_xsetN(MI_BIT_SET, &arena->blocks_committed, 0, arena->block_count);
+ }
+ else {
+ mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_committed, 0, info_blocks, NULL);
+ }
+ mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_dirty, 0, info_blocks, NULL);
+
+ return mi_arena_add(arena, arena_id, &_mi_stats_main);
+}
+
+
+bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept {
+ mi_memid_t memid = _mi_memid_create(MI_MEM_EXTERNAL);
+ memid.initially_committed = is_committed;
+ memid.initially_zero = is_zero;
+ memid.is_pinned = is_large;
+ return mi_manage_os_memory_ex2(start, size, is_large, numa_node, exclusive, memid, arena_id);
+}
+
+// Reserve a range of regular OS memory
+int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept {
+ if (arena_id != NULL) *arena_id = _mi_arena_id_none();
+ size = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); // at least one block
+ mi_memid_t memid;
+ void* start = _mi_os_alloc_aligned(size, MI_ARENA_BLOCK_ALIGN, commit, allow_large, &memid, &_mi_stats_main);
+ if (start == NULL) return ENOMEM;
+ const bool is_large = memid.is_pinned; // todo: use separate is_large field?
+ if (!mi_manage_os_memory_ex2(start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) {
+ _mi_os_free_ex(start, size, commit, memid, &_mi_stats_main);
+ _mi_verbose_message("failed to reserve %zu KiB memory\n", _mi_divide_up(size, 1024));
+ return ENOMEM;
+ }
+ _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size, 1024), is_large ? " (in large os pages)" : "");
+ return 0;
+}
+
+
+// Manage a range of regular OS memory
+bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept {
+ return mi_manage_os_memory_ex(start, size, is_committed, is_large, is_zero, numa_node, false /* exclusive? */, NULL);
+}
+
+// Reserve a range of regular OS memory
+int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept {
+ return mi_reserve_os_memory_ex(size, commit, allow_large, false, NULL);
+}
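+// Example sketch (assuming the exported mi_heap_new_in_arena API): reserve a dedicated,
+// exclusive arena up front and serve a heap from it.
+//   mi_arena_id_t arena_id;
+//   if (mi_reserve_os_memory_ex(256 * MI_MiB, false /*commit*/, true /*allow_large*/,
+//                               true /*exclusive*/, &arena_id) == 0) {
+//     mi_heap_t* heap = mi_heap_new_in_arena(arena_id);
+//     void* p = mi_heap_malloc(heap, 1024);
+//     mi_free(p);
+//   }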
+
+
+/* -----------------------------------------------------------
+ Debugging
+----------------------------------------------------------- */
+static size_t mi_debug_show_bfield(mi_bfield_t field, char* buf) {
+ size_t bit_set_count = 0;
+ for (int bit = 0; bit < MI_BFIELD_BITS; bit++) {
+ bool is_set = ((((mi_bfield_t)1 << bit) & field) != 0);
+ if (is_set) bit_set_count++;
+ buf[bit] = (is_set ? 'x' : '.');
+ }
+ return bit_set_count;
+}
+
+static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t block_count, mi_bitmap_t* bitmap) {
+ _mi_verbose_message("%s%s:\n", prefix, header);
+ size_t bit_count = 0;
+ size_t bit_set_count = 0;
+ for (int i = 0; i < MI_BFIELD_BITS && bit_count < block_count; i++) {
+ char buf[MI_BITMAP_CHUNK_BITS + 1];
+ mi_bitmap_chunk_t* chunk = &bitmap->chunks[i];
+ for (int j = 0; j < MI_BITMAP_CHUNK_FIELDS; j++) {
+ if (bit_count < block_count) {
+ bit_set_count += mi_debug_show_bfield(chunk->bfields[j], buf + j*MI_BFIELD_BITS);
+ }
+ else {
+ _mi_memset(buf + j*MI_BFIELD_BITS, ' ', MI_BFIELD_BITS);
+ }
+ bit_count += MI_BFIELD_BITS;
+ }
+ buf[MI_BITMAP_CHUNK_BITS] = 0;
+ _mi_verbose_message("%s %s\n", prefix, buf);
+ }
+ _mi_verbose_message("%s total ('x'): %zu\n", prefix, bit_set_count);
+ return bit_set_count;
+}
+
+void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept {
+ MI_UNUSED(show_abandoned);
+ size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count);
+ size_t free_total = 0;
+ size_t block_total = 0;
+ //size_t abandoned_total = 0;
+ size_t purge_total = 0;
+ for (size_t i = 0; i < max_arenas; i++) {
+ mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]);
+ if (arena == NULL) break;
+ block_total += arena->block_count;
+ _mi_verbose_message("arena %zu: %zu blocks%s\n", i, arena->block_count, (arena->memid.is_pinned ? ", pinned" : ""));
+ if (show_inuse) {
+ free_total += mi_debug_show_bitmap(" ", "free blocks", arena->block_count, &arena->blocks_free);
+ }
+ mi_debug_show_bitmap(" ", "committed blocks", arena->block_count, &arena->blocks_committed);
+ // todo: abandoned blocks
+ if (show_purge) {
+ purge_total += mi_debug_show_bitmap(" ", "purgeable blocks", arena->block_count, &arena->blocks_purge);
+ }
+ }
+ if (show_inuse) _mi_verbose_message("total inuse blocks : %zu\n", block_total - free_total);
+ // if (show_abandoned) _mi_verbose_message("total abandoned blocks: %zu\n", abandoned_total);
+ if (show_purge) _mi_verbose_message("total purgeable blocks: %zu\n", purge_total);
+}
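+// Usage sketch: the listing above is only printed when verbose messages are enabled, e.g.
+//   mi_option_set(mi_option_verbose, 1);   // or set MIMALLOC_VERBOSE=1 in the environment
+//   mi_debug_show_arenas(true /*show_inuse*/, false /*show_abandoned*/, true /*show_purge*/);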
+
+
+/* -----------------------------------------------------------
+ Reserve a huge page arena.
+----------------------------------------------------------- */
+// reserve at a specific numa node
+int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept {
+ if (arena_id != NULL) *arena_id = -1;
+ if (pages==0) return 0;
+ if (numa_node < -1) numa_node = -1;
+ if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count();
+ size_t hsize = 0;
+ size_t pages_reserved = 0;
+ mi_memid_t memid;
+ void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, timeout_msecs, &pages_reserved, &hsize, &memid);
+ if (p==NULL || pages_reserved==0) {
+ _mi_warning_message("failed to reserve %zu GiB huge pages\n", pages);
+ return ENOMEM;
+ }
+ _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages);
+
+ if (!mi_manage_os_memory_ex2(p, hsize, true, numa_node, exclusive, memid, arena_id)) {
+ _mi_os_free(p, hsize, memid, &_mi_stats_main);
+ return ENOMEM;
+ }
+ return 0;
+}
+
+int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept {
+ return mi_reserve_huge_os_pages_at_ex(pages, numa_node, timeout_msecs, false, NULL);
+}
+
+// reserve huge pages evenly among the given number of numa nodes (or use the available ones as detected)
+int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept {
+ if (pages == 0) return 0;
+
+ // pages per numa node
+ size_t numa_count = (numa_nodes > 0 ? numa_nodes : _mi_os_numa_node_count());
+  if (numa_count == 0) numa_count = 1;
+ const size_t pages_per = pages / numa_count;
+ const size_t pages_mod = pages % numa_count;
+ const size_t timeout_per = (timeout_msecs==0 ? 0 : (timeout_msecs / numa_count) + 50);
+
+ // reserve evenly among numa nodes
+ for (size_t numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) {
+ size_t node_pages = pages_per; // can be 0
+ if (numa_node < pages_mod) node_pages++;
+ int err = mi_reserve_huge_os_pages_at(node_pages, (int)numa_node, timeout_per);
+ if (err) return err;
+ if (pages < node_pages) {
+ pages = 0;
+ }
+ else {
+ pages -= node_pages;
+ }
+ }
+
+ return 0;
+}
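+// Worked example: reserving 10 pages over 4 numa nodes gives pages_per = 2 and pages_mod = 2,
+// so nodes 0 and 1 reserve 3 pages each and nodes 2 and 3 reserve 2 pages each (10 pages total).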
+
+int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept {
+ MI_UNUSED(max_secs);
+ _mi_warning_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n");
+ if (pages_reserved != NULL) *pages_reserved = 0;
+ int err = mi_reserve_huge_os_pages_interleave(pages, 0, (size_t)(max_secs * 1000.0));
+ if (err==0 && pages_reserved!=NULL) *pages_reserved = pages;
+ return err;
+}
+
+
+
+/* -----------------------------------------------------------
+ Abandoned pages
+----------------------------------------------------------- */
+
+void mi_arena_page_abandon(mi_page_t* page) {
+ mi_assert_internal(mi_page_is_abandoned(page));
+  if (mi_page_is_full(page)) { /* todo: add to the abandoned full-page queue (see the blocks_abandoned comment above) */ }
+}
+
+
+
+/* -----------------------------------------------------------
+ Arena purge
+----------------------------------------------------------- */
+
+static long mi_arena_purge_delay(void) {
+ // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay
+ return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult));
+}
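+// For example, a purge_delay of 10 (ms) combined with an arena_purge_mult of 10 yields a
+// 100 ms delay, while a negative purge_delay disables arena purging altogether.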
+
+// reset or decommit in an arena and update the committed/decommit bitmaps
+// assumes we own the area (i.e. blocks_free is claimed by us)
+static void mi_arena_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats) {
+ mi_assert_internal(!arena->memid.is_pinned);
+ const size_t size = mi_size_of_blocks(blocks);
+ void* const p = mi_arena_block_start(arena, block_idx);
+ bool needs_recommit;
+ if (mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_idx, blocks)) {
+ // all blocks are committed, we can purge freely
+ needs_recommit = _mi_os_purge(p, size, stats);
+ }
+ else {
+ // some blocks are not committed -- this can happen when a partially committed block is freed
+ // in `_mi_arena_free` and it is conservatively marked as uncommitted but still scheduled for a purge
+ // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory),
+ // and also undo the decommit stats (as it was already adjusted)
+ mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits));
+ needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, stats);
+ if (needs_recommit) { _mi_stat_increase(&_mi_stats_main.committed, size); }
+ }
+
+ // clear the purged blocks
+ mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_purge, blocks, block_idx, NULL);
+
+ // update committed bitmap
+ if (needs_recommit) {
+ mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_committed, blocks, block_idx, NULL);
+ }
+}
+
+
+// Schedule a purge. This is usually delayed to avoid repeated decommit/commit calls.
+// Note: assumes we (still) own the area as we may purge immediately
+static void mi_arena_schedule_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats) {
+ const long delay = mi_arena_purge_delay();
+ if (delay < 0) return; // is purging allowed at all?
+
+ if (_mi_preloading() || delay == 0) {
+ // decommit directly
+ mi_arena_purge(arena, block_idx, blocks, stats);
+ }
+ else {
+ // schedule decommit
+ _mi_error_message(EFAULT, "purging not yet implemented\n");
+ }
+}
+
+
+static void mi_arenas_try_purge(bool force, bool visit_all, mi_stats_t* stats) {
+ if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled
+
+ const size_t max_arena = mi_atomic_load_acquire(&mi_arena_count);
+ if (max_arena == 0) return;
+
+ _mi_error_message(EFAULT, "purging not yet implemented\n");
+ MI_UNUSED(stats);
+ MI_UNUSED(visit_all);
+ MI_UNUSED(force);
+}
+
+
+#if 0
+
#define MI_IN_ARENA_C
#include "arena-abandon.c"
#undef MI_IN_ARENA_C
@@ -116,12 +900,12 @@ static size_t mi_block_count_of_size(size_t size) {
return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE);
}
-static size_t mi_arena_block_size(size_t bcount) {
+static size_t mi_size_of_blocks(size_t bcount) {
return (bcount * MI_ARENA_BLOCK_SIZE);
}
static size_t mi_arena_size(mi_arena_t* arena) {
- return mi_arena_block_size(arena->block_count);
+ return mi_size_of_blocks(arena->block_count);
}
static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, mi_bitmap_index_t bitmap_index) {
@@ -207,7 +991,7 @@ void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size) {
}
void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex) {
- return (arena->start + mi_arena_block_size(mi_bitmap_index_bit(bindex)));
+ return (arena->start + mi_size_of_blocks(mi_bitmap_index_bit(bindex)));
}
@@ -216,7 +1000,7 @@ void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex) {
----------------------------------------------------------- */
// claim the `blocks_inuse` bits
-static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats)
+static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, size_t block_idx, mi_stats_t* stats)
{
size_t idx = 0; // mi_atomic_load_relaxed(&arena->search_idx); // start from last search; ok to be relaxed as the exact start does not matter
if (_mi_bitmap_try_find_from_claim_across(arena->blocks_inuse, arena->field_count, idx, blocks, bitmap_idx, stats)) {
@@ -268,7 +1052,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar
_mi_bitmap_claim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted);
if (any_uncommitted) {
bool commit_zero = false;
- if (!_mi_os_commit(p, mi_arena_block_size(needed_bcount), &commit_zero, tld->stats)) {
+ if (!_mi_os_commit(p, mi_size_of_blocks(needed_bcount), &commit_zero, tld->stats)) {
memid->initially_committed = false;
}
else {
@@ -293,7 +1077,7 @@ static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_no
const size_t bcount = mi_block_count_of_size(size);
const size_t arena_index = mi_arena_id_index(arena_id);
mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count));
- mi_assert_internal(size <= mi_arena_block_size(bcount));
+ mi_assert_internal(size <= mi_size_of_blocks(bcount));
// Check arena suitability
mi_arena_t* arena = mi_arena_from_index(arena_index);
@@ -439,7 +1223,7 @@ void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) {
if (arena_index >= MI_MAX_ARENAS) return NULL;
mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]);
if (arena == NULL) return NULL;
- if (size != NULL) { *size = mi_arena_block_size(arena->block_count); }
+ if (size != NULL) { *size = mi_size_of_blocks(arena->block_count); }
return arena->start;
}
@@ -459,7 +1243,7 @@ static void mi_arena_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks,
mi_assert_internal(arena->blocks_committed != NULL);
mi_assert_internal(arena->blocks_purge != NULL);
mi_assert_internal(!arena->memid.is_pinned);
- const size_t size = mi_arena_block_size(blocks);
+ const size_t size = mi_size_of_blocks(blocks);
void* const p = mi_arena_block_start(arena, bitmap_idx);
bool needs_recommit;
if (_mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx)) {
@@ -511,25 +1295,25 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t bitmap_idx, size_t
// purge a range of blocks
// return true if the full range was purged.
// assumes we own the area (i.e. blocks_in_use is claimed by us)
-static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startidx, size_t bitlen, size_t purge, mi_stats_t* stats) {
- const size_t endidx = startidx + bitlen;
- size_t bitidx = startidx;
+static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startseqx, size_t bitlen, size_t purge, mi_stats_t* stats) {
+ const size_t endidx = startseqx + bitlen;
+ size_t bitseqx = startseqx;
bool all_purged = false;
- while (bitidx < endidx) {
+ while (bitseqx < endidx) {
// count consecutive ones in the purge mask
size_t count = 0;
- while (bitidx + count < endidx && (purge & ((size_t)1 << (bitidx + count))) != 0) {
+ while (bitseqx + count < endidx && (purge & ((size_t)1 << (bitseqx + count))) != 0) {
count++;
}
if (count > 0) {
// found range to be purged
- const mi_bitmap_index_t range_idx = mi_bitmap_index_create(idx, bitidx);
+ const mi_bitmap_index_t range_idx = mi_bitmap_index_create(idx, bitseqx);
mi_arena_purge(arena, range_idx, count, stats);
if (count == bitlen) {
all_purged = true;
}
}
- bitidx += (count+1); // +1 to skip the zero bit (or end)
+ bitseqx += (count+1); // +1 to skip the zero bit (or end)
}
return all_purged;
}
@@ -551,16 +1335,16 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi
for (size_t i = 0; i < arena->field_count; i++) {
size_t purge = mi_atomic_load_relaxed(&arena->blocks_purge[i]);
if (purge != 0) {
- size_t bitidx = 0;
- while (bitidx < MI_BITMAP_FIELD_BITS) {
+ size_t bitseqx = 0;
+ while (bitseqx < MI_BITMAP_FIELD_BITS) {
// find consecutive range of ones in the purge mask
size_t bitlen = 0;
- while (bitidx + bitlen < MI_BITMAP_FIELD_BITS && (purge & ((size_t)1 << (bitidx + bitlen))) != 0) {
+ while (bitseqx + bitlen < MI_BITMAP_FIELD_BITS && (purge & ((size_t)1 << (bitseqx + bitlen))) != 0) {
bitlen++;
}
// temporarily claim the purge range as "in-use" to be thread-safe with allocation
// try to claim the longest range of corresponding in_use bits
- const mi_bitmap_index_t bitmap_index = mi_bitmap_index_create(i, bitidx);
+ const mi_bitmap_index_t bitmap_index = mi_bitmap_index_create(i, bitseqx);
while( bitlen > 0 ) {
if (_mi_bitmap_try_claim(arena->blocks_inuse, arena->field_count, bitlen, bitmap_index)) {
break;
@@ -571,15 +1355,15 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi
if (bitlen > 0) {
// read purge again now that we have the in_use bits
purge = mi_atomic_load_acquire(&arena->blocks_purge[i]);
- if (!mi_arena_purge_range(arena, i, bitidx, bitlen, purge, stats)) {
+ if (!mi_arena_purge_range(arena, i, bitseqx, bitlen, purge, stats)) {
full_purge = false;
}
any_purged = true;
// release the claimed `in_use` bits again
_mi_bitmap_unclaim(arena->blocks_inuse, arena->field_count, bitlen, bitmap_index);
}
- bitidx += (bitlen+1); // +1 to skip the zero (or end)
- } // while bitidx
+ bitseqx += (bitlen+1); // +1 to skip the zero (or end)
+ } // while bitseqx
} // purge != 0
}
// if not fully purged, make sure to purge again in the future
@@ -742,7 +1526,7 @@ bool _mi_arena_contains(const void* p) {
const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count);
for (size_t i = 0; i < max_arena; i++) {
mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]);
- if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_arena_block_size(arena->block_count) > (const uint8_t*)p) {
+ if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_size_of_blocks(arena->block_count) > (const uint8_t*)p) {
return true;
}
}
@@ -818,8 +1602,8 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int
mi_assert_internal(post >= 0);
if (post > 0) {
// don't use leftover bits at the end
- mi_bitmap_index_t postidx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post);
- _mi_bitmap_claim(arena->blocks_inuse, fields, post, postidx, NULL);
+ mi_bitmap_index_t postseqx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post);
+ _mi_bitmap_claim(arena->blocks_inuse, fields, post, postseqx, NULL);
}
return mi_arena_add(arena, arena_id, &_mi_stats_main);
@@ -986,3 +1770,4 @@ int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserv
}
+#endif
\ No newline at end of file
diff --git a/src/bitmap-old.c b/src/bitmap-old.c
new file mode 100644
index 00000000..3e6311dc
--- /dev/null
+++ b/src/bitmap-old.c
@@ -0,0 +1,419 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2019-2023 Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+/* ----------------------------------------------------------------------------
+Concurrent bitmap that can set/reset sequences of bits atomically,
+represented as an array of fields where each field is a machine word (`size_t`)
+
+There are two APIs; the standard one cannot have sequences that cross
+between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS).
+
+The `_across` postfixed functions do allow sequences that can cross over
+between the fields. (This is used in arena allocation)
+---------------------------------------------------------------------------- */
+
+#include "mimalloc.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/bits.h"
+#include "bitmap.h"
+
+/* -----------------------------------------------------------
+ Bitmap definition
+----------------------------------------------------------- */
+
+// The bit mask for a given number of blocks at a specified bit index.
+static inline size_t mi_bitmap_mask_(size_t count, size_t bitidx) {
+ mi_assert_internal(count + bitidx <= MI_BITMAP_FIELD_BITS);
+ mi_assert_internal(count > 0);
+ if (count >= MI_BITMAP_FIELD_BITS) return MI_BITMAP_FIELD_FULL;
+ if (count == 0) return 0;
+ return ((((size_t)1 << count) - 1) << bitidx);
+}
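+
+// Example (illustrative, not from the original source): with 64-bit fields,
+// mi_bitmap_mask_(3, 4) == 0x70, i.e. three consecutive bits starting at bit
+// index 4, while mi_bitmap_mask_(MI_BITMAP_FIELD_BITS, 0) takes the early
+// return and yields MI_BITMAP_FIELD_FULL without an undefined full-width shift.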
+
+
+
+/* -----------------------------------------------------------
+ Claim a bit sequence atomically
+----------------------------------------------------------- */
+
+// Try to atomically claim a sequence of `count` bits in a single
+// field at `idx` in `bitmap`. Returns `true` on success.
+bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx)
+{
+ mi_assert_internal(bitmap_idx != NULL);
+ mi_assert_internal(count <= MI_BITMAP_FIELD_BITS);
+ mi_bitmap_field_t* field = &bitmap[idx];
+ size_t map = mi_atomic_load_relaxed(field);
+ if (map==MI_BITMAP_FIELD_FULL) return false; // short cut
+
+ // search for 0-bit sequence of length count
+ const size_t mask = mi_bitmap_mask_(count, 0);
+ const size_t bitidx_max = MI_BITMAP_FIELD_BITS - count;
+
+#if MI_HAS_FAST_BITSCAN
+ size_t bitidx = mi_ctz(~map); // quickly find the first zero bit if possible
+#else
+ size_t bitidx = 0; // otherwise start at 0
+#endif
+ size_t m = (mask << bitidx); // invariant: m == mask shifted by bitidx
+
+ // scan linearly for a free range of zero bits
+ while (bitidx <= bitidx_max) {
+ const size_t mapm = (map & m);
+ if (mapm == 0) { // are the mask bits free at bitidx?
+ mi_assert_internal((m >> bitidx) == mask); // no overflow?
+ const size_t newmap = (map | m);
+ mi_assert_internal((newmap^map) >> bitidx == mask);
+ if (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)) { // TODO: use weak cas here?
+ // no success, another thread claimed concurrently.. keep going (with updated `map`)
+ continue;
+ }
+ else {
+ // success, we claimed the bits!
+ *bitmap_idx = mi_bitmap_index_create(idx, bitidx);
+ return true;
+ }
+ }
+ else {
+ // on to the next bit range
+#if MI_HAS_FAST_BITSCAN
+ mi_assert_internal(mapm != 0);
+ const size_t shift = (count == 1 ? 1 : (MI_INTPTR_BITS - mi_clz(mapm) - bitidx));
+ mi_assert_internal(shift > 0 && shift <= count);
+#else
+ const size_t shift = 1;
+#endif
+ bitidx += shift;
+ m <<= shift;
+ }
+ }
+ // no bits found
+ return false;
+}
+
+
+// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
+// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields.
+bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx) {
+ size_t idx = start_field_idx;
+ for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) {
+ if (idx >= bitmap_fields) { idx = 0; } // wrap
+ if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+
+// Set `count` bits at `bitmap_idx` to 0 atomically
+// Returns `true` if all `count` bits were 1 previously.
+bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
+ const size_t idx = mi_bitmap_index_field(bitmap_idx);
+ const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
+ const size_t mask = mi_bitmap_mask_(count, bitidx);
+ mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields);
+ // mi_assert_internal((bitmap[idx] & mask) == mask);
+ const size_t prev = mi_atomic_and_acq_rel(&bitmap[idx], ~mask);
+ return ((prev & mask) == mask);
+}
+
+
+// Set `count` bits at `bitmap_idx` to 1 atomically
+// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
+bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero) {
+ const size_t idx = mi_bitmap_index_field(bitmap_idx);
+ const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
+ const size_t mask = mi_bitmap_mask_(count, bitidx);
+ mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields);
+ //mi_assert_internal(any_zero != NULL || (bitmap[idx] & mask) == 0);
+ size_t prev = mi_atomic_or_acq_rel(&bitmap[idx], mask);
+ if (any_zero != NULL) { *any_zero = ((prev & mask) != mask); }
+ return ((prev & mask) == 0);
+}
+
+// Returns `true` if all `count` bits were 1. `any_ones` is `true` if there was at least one bit set to one.
+static bool mi_bitmap_is_claimedx(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_ones) {
+ const size_t idx = mi_bitmap_index_field(bitmap_idx);
+ const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
+ const size_t mask = mi_bitmap_mask_(count, bitidx);
+ mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields);
+ const size_t field = mi_atomic_load_relaxed(&bitmap[idx]);
+ if (any_ones != NULL) { *any_ones = ((field & mask) != 0); }
+ return ((field & mask) == mask);
+}
+
+// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically.
+// Returns `true` if successful when all previous `count` bits were 0.
+bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
+ const size_t idx = mi_bitmap_index_field(bitmap_idx);
+ const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
+ const size_t mask = mi_bitmap_mask_(count, bitidx);
+ mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields);
+ size_t expected = mi_atomic_load_relaxed(&bitmap[idx]);
+ do {
+ if ((expected & mask) != 0) return false;
+ }
+ while (!mi_atomic_cas_strong_acq_rel(&bitmap[idx], &expected, expected | mask));
+ mi_assert_internal((expected & mask) == 0);
+ return true;
+}
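+
+// Note (added for clarity): unlike `_mi_bitmap_claim` above, which sets the bits
+// unconditionally with an atomic OR, this CAS loop fails without modifying the
+// bitmap as soon as any target bit is already set, and only retries when a
+// concurrent update changed other bits in the same field.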
+
+
+bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
+ return mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, NULL);
+}
+
+bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
+ bool any_ones;
+ mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, &any_ones);
+ return any_ones;
+}
+
+
+//--------------------------------------------------------------------------
+// the `_across` functions work on bitmaps where sequences can cross over
+// between the fields. This is used in arena allocation
+//--------------------------------------------------------------------------
+
+// Try to atomically claim a sequence of `count` bits starting from the field
+// at `idx` in `bitmap` and crossing into subsequent fields. Returns `true` on success.
+// Only needs to consider crossing into the next fields (see `mi_bitmap_try_find_from_claim_across`)
+static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t idx, const size_t count, const size_t retries, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats)
+{
+ mi_assert_internal(bitmap_idx != NULL);
+
+ // check initial trailing zeros
+ mi_bitmap_field_t* field = &bitmap[idx];
+ size_t map = mi_atomic_load_relaxed(field);
+ const size_t initial = mi_clz(map); // count of initial zeros starting at idx
+ mi_assert_internal(initial <= MI_BITMAP_FIELD_BITS);
+ if (initial == 0) return false;
+ if (initial >= count) return _mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx); // no need to cross fields (this case won't happen for us)
+ if (_mi_divide_up(count - initial, MI_BITMAP_FIELD_BITS) >= (bitmap_fields - idx)) return false; // not enough entries
+
+ // scan ahead
+ size_t found = initial;
+ size_t mask = 0; // mask bits for the final field
+ while(found < count) {
+ field++;
+ map = mi_atomic_load_relaxed(field);
+ const size_t mask_bits = (found + MI_BITMAP_FIELD_BITS <= count ? MI_BITMAP_FIELD_BITS : (count - found));
+ mi_assert_internal(mask_bits > 0 && mask_bits <= MI_BITMAP_FIELD_BITS);
+ mask = mi_bitmap_mask_(mask_bits, 0);
+ if ((map & mask) != 0) return false; // some part is already claimed
+ found += mask_bits;
+ }
+ mi_assert_internal(field < &bitmap[bitmap_fields]);
+
+ // we found a range of contiguous zeros up to the final field; mask contains mask in the final field
+ // now try to claim the range atomically
+ mi_bitmap_field_t* const final_field = field;
+ const size_t final_mask = mask;
+ mi_bitmap_field_t* const initial_field = &bitmap[idx];
+ const size_t initial_idx = MI_BITMAP_FIELD_BITS - initial;
+ const size_t initial_mask = mi_bitmap_mask_(initial, initial_idx);
+
+ // initial field
+ size_t newmap;
+ field = initial_field;
+ map = mi_atomic_load_relaxed(field);
+ do {
+ newmap = (map | initial_mask);
+ if ((map & initial_mask) != 0) { goto rollback; };
+ } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap));
+
+ // intermediate fields
+ while (++field < final_field) {
+ newmap = mi_bitmap_mask_(MI_BITMAP_FIELD_BITS, 0);
+ map = 0;
+ if (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)) { goto rollback; }
+ }
+
+ // final field
+ mi_assert_internal(field == final_field);
+ map = mi_atomic_load_relaxed(field);
+ do {
+ newmap = (map | final_mask);
+ if ((map & final_mask) != 0) { goto rollback; }
+ } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap));
+
+ // claimed!
+ mi_stat_counter_increase(stats->arena_crossover_count,1);
+ *bitmap_idx = mi_bitmap_index_create(idx, initial_idx);
+ return true;
+
+rollback:
+ // roll back intermediate fields
+ // (we just failed to claim `field` so decrement first)
+ while (--field > initial_field) {
+ newmap = 0;
+ map = mi_bitmap_mask_(MI_BITMAP_FIELD_BITS, 0);
+ mi_assert_internal(mi_atomic_load_relaxed(field) == map);
+ mi_atomic_store_release(field, newmap);
+ }
+ if (field == initial_field) { // (if we failed on the initial field, `field + 1 == initial_field`)
+ map = mi_atomic_load_relaxed(field);
+ do {
+ mi_assert_internal((map & initial_mask) == initial_mask);
+ newmap = (map & ~initial_mask);
+ } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap));
+ }
+ mi_stat_counter_increase(stats->arena_rollback_count,1);
+ // retry? (we make a recursive call instead of goto to be able to use const declarations)
+ if (retries <= 2) {
+ return mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, retries+1, bitmap_idx, stats);
+ }
+ else {
+ return false;
+ }
+}
+
+
+// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success.
+// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
+bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) {
+ mi_assert_internal(count > 0);
+ if (count <= 2) {
+ // we don't bother with crossover fields for small counts
+ return _mi_bitmap_try_find_from_claim(bitmap, bitmap_fields, start_field_idx, count, bitmap_idx);
+ }
+
+ // visit the fields
+ size_t idx = start_field_idx;
+ for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) {
+ if (idx >= bitmap_fields) { idx = 0; } // wrap
+ // first try to claim inside a field
+ /*
+ if (count <= MI_BITMAP_FIELD_BITS) {
+ if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) {
+ return true;
+ }
+ }
+ */
+ // if that fails, then try to claim across fields
+ if (mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, 0, bitmap_idx, stats)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+// Helper for masks across fields; returns the mid count, post_mask may be 0
+static size_t mi_bitmap_mask_across(mi_bitmap_index_t bitmap_idx, size_t bitmap_fields, size_t count, size_t* pre_mask, size_t* mid_mask, size_t* post_mask) {
+ MI_UNUSED(bitmap_fields);
+ const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
+ if mi_likely(bitidx + count <= MI_BITMAP_FIELD_BITS) {
+ *pre_mask = mi_bitmap_mask_(count, bitidx);
+ *mid_mask = 0;
+ *post_mask = 0;
+ mi_assert_internal(mi_bitmap_index_field(bitmap_idx) < bitmap_fields);
+ return 0;
+ }
+ else {
+ const size_t pre_bits = MI_BITMAP_FIELD_BITS - bitidx;
+ mi_assert_internal(pre_bits < count);
+ *pre_mask = mi_bitmap_mask_(pre_bits, bitidx);
+ count -= pre_bits;
+ const size_t mid_count = (count / MI_BITMAP_FIELD_BITS);
+ *mid_mask = MI_BITMAP_FIELD_FULL;
+ count %= MI_BITMAP_FIELD_BITS;
+ *post_mask = (count==0 ? 0 : mi_bitmap_mask_(count, 0));
+ mi_assert_internal(mi_bitmap_index_field(bitmap_idx) + mid_count + (count==0 ? 0 : 1) < bitmap_fields);
+ return mid_count;
+ }
+}
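+
+// Worked example (illustrative): with 64-bit fields, a range of count==72 bits
+// whose in-field bit index is 60 splits into pre_mask covering bits 60..63
+// (4 bits), mid_count==1 full field, and post_mask covering the lowest 4 bits
+// of the field after that.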
+
+// Set `count` bits at `bitmap_idx` to 0 atomically
+// Returns `true` if all `count` bits were 1 previously.
+bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
+ size_t idx = mi_bitmap_index_field(bitmap_idx);
+ size_t pre_mask;
+ size_t mid_mask;
+ size_t post_mask;
+ size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask);
+ bool all_one = true;
+ mi_bitmap_field_t* field = &bitmap[idx];
+ size_t prev = mi_atomic_and_acq_rel(field++, ~pre_mask); // clear first part
+ if ((prev & pre_mask) != pre_mask) all_one = false;
+ while(mid_count-- > 0) {
+ prev = mi_atomic_and_acq_rel(field++, ~mid_mask); // clear mid part
+ if ((prev & mid_mask) != mid_mask) all_one = false;
+ }
+ if (post_mask!=0) {
+ prev = mi_atomic_and_acq_rel(field, ~post_mask); // clear end part
+ if ((prev & post_mask) != post_mask) all_one = false;
+ }
+ return all_one;
+}
+
+// Set `count` bits at `bitmap_idx` to 1 atomically
+// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
+bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero) {
+ size_t idx = mi_bitmap_index_field(bitmap_idx);
+ size_t pre_mask;
+ size_t mid_mask;
+ size_t post_mask;
+ size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask);
+ bool all_zero = true;
+ bool any_zero = false;
+ _Atomic(size_t)*field = &bitmap[idx];
+ size_t prev = mi_atomic_or_acq_rel(field++, pre_mask);
+ if ((prev & pre_mask) != 0) all_zero = false;
+ if ((prev & pre_mask) != pre_mask) any_zero = true;
+ while (mid_count-- > 0) {
+ prev = mi_atomic_or_acq_rel(field++, mid_mask);
+ if ((prev & mid_mask) != 0) all_zero = false;
+ if ((prev & mid_mask) != mid_mask) any_zero = true;
+ }
+ if (post_mask!=0) {
+ prev = mi_atomic_or_acq_rel(field, post_mask);
+ if ((prev & post_mask) != 0) all_zero = false;
+ if ((prev & post_mask) != post_mask) any_zero = true;
+ }
+ if (pany_zero != NULL) { *pany_zero = any_zero; }
+ return all_zero;
+}
+
+
+// Returns `true` if all `count` bits were 1.
+// `any_ones` is `true` if there was at least one bit set to one.
+static bool mi_bitmap_is_claimedx_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_ones) {
+ size_t idx = mi_bitmap_index_field(bitmap_idx);
+ size_t pre_mask;
+ size_t mid_mask;
+ size_t post_mask;
+ size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask);
+ bool all_ones = true;
+ bool any_ones = false;
+ mi_bitmap_field_t* field = &bitmap[idx];
+ size_t prev = mi_atomic_load_relaxed(field++);
+ if ((prev & pre_mask) != pre_mask) all_ones = false;
+ if ((prev & pre_mask) != 0) any_ones = true;
+ while (mid_count-- > 0) {
+ prev = mi_atomic_load_relaxed(field++);
+ if ((prev & mid_mask) != mid_mask) all_ones = false;
+ if ((prev & mid_mask) != 0) any_ones = true;
+ }
+ if (post_mask!=0) {
+ prev = mi_atomic_load_relaxed(field);
+ if ((prev & post_mask) != post_mask) all_ones = false;
+ if ((prev & post_mask) != 0) any_ones = true;
+ }
+ if (pany_ones != NULL) { *pany_ones = any_ones; }
+ return all_ones;
+}
+
+bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
+ return mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, NULL);
+}
+
+bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
+ bool any_ones;
+ mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, &any_ones);
+ return any_ones;
+}
diff --git a/src/bitmap-old.h b/src/bitmap-old.h
new file mode 100644
index 00000000..f8898935
--- /dev/null
+++ b/src/bitmap-old.h
@@ -0,0 +1,110 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2019-2023 Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+/* ----------------------------------------------------------------------------
+Concurrent bitmap that can set/reset sequences of bits atomically,
+represented as an array of fields where each field is a machine word (`size_t`)
+
+There are two APIs; the standard one cannot have sequences that cross
+between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS).
+(this is used in region allocation)
+
+The `_across` postfixed functions do allow sequences that can cross over
+between the fields. (This is used in arena allocation)
+---------------------------------------------------------------------------- */
+#pragma once
+#ifndef MI_BITMAP_H
+#define MI_BITMAP_H
+
+/* -----------------------------------------------------------
+ Bitmap definition
+----------------------------------------------------------- */
+
+#define MI_BITMAP_FIELD_BITS (8*MI_SIZE_SIZE)
+#define MI_BITMAP_FIELD_FULL (~((size_t)0)) // all bits set
+
+// An atomic bitmap of `size_t` fields
+typedef _Atomic(size_t) mi_bitmap_field_t;
+typedef mi_bitmap_field_t* mi_bitmap_t;
+
+// A bitmap index is the index of the bit in a bitmap.
+typedef size_t mi_bitmap_index_t;
+
+// Create a bit index.
+static inline mi_bitmap_index_t mi_bitmap_index_create_ex(size_t idx, size_t bitidx) {
+ mi_assert_internal(bitidx <= MI_BITMAP_FIELD_BITS);
+ return (idx*MI_BITMAP_FIELD_BITS) + bitidx;
+}
+static inline mi_bitmap_index_t mi_bitmap_index_create(size_t idx, size_t bitidx) {
+ mi_assert_internal(bitidx < MI_BITMAP_FIELD_BITS);
+ return mi_bitmap_index_create_ex(idx,bitidx);
+}
+
+// Get the field index from a bit index.
+static inline size_t mi_bitmap_index_field(mi_bitmap_index_t bitmap_idx) {
+ return (bitmap_idx / MI_BITMAP_FIELD_BITS);
+}
+
+// Get the bit index in a bitmap field
+static inline size_t mi_bitmap_index_bit_in_field(mi_bitmap_index_t bitmap_idx) {
+ return (bitmap_idx % MI_BITMAP_FIELD_BITS);
+}
+
+// Get the full bit index
+static inline size_t mi_bitmap_index_bit(mi_bitmap_index_t bitmap_idx) {
+ return bitmap_idx;
+}
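+
+// Example (illustrative): with 64-bit fields, mi_bitmap_index_create(2, 5) == 133;
+// mi_bitmap_index_field(133) == 2 and mi_bitmap_index_bit_in_field(133) == 5
+// recover the field index and the in-field bit index again.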
+
+/* -----------------------------------------------------------
+ Claim a bit sequence atomically
+----------------------------------------------------------- */
+
+// Try to atomically claim a sequence of `count` bits in a single
+// field at `idx` in `bitmap`. Returns `true` on success.
+bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx);
+
+// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
+// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields.
+bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx);
+
+// Set `count` bits at `bitmap_idx` to 0 atomically
+// Returns `true` if all `count` bits were 1 previously.
+bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
+
+// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically.
+// Returns `true` if successful when all previous `count` bits were 0.
+bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
+
+// Set `count` bits at `bitmap_idx` to 1 atomically
+// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
+bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero);
+
+bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
+bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
+
+
+//--------------------------------------------------------------------------
+// the `_across` functions work on bitmaps where sequences can cross over
+// between the fields. This is used in arena allocation
+//--------------------------------------------------------------------------
+
+// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success.
+// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
+bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats);
+
+// Set `count` bits at `bitmap_idx` to 0 atomically
+// Returns `true` if all `count` bits were 1 previously.
+bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
+
+// Set `count` bits at `bitmap_idx` to 1 atomically
+// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
+bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero);
+
+bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
+bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
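+
+/* Usage sketch (illustrative only, not part of the original header; `bitmap`,
+   `field_count`, `needed`, and `stats` are assumed to be supplied by the caller,
+   as in the arena allocator):
+
+     mi_bitmap_index_t bitmap_idx;
+     if (_mi_bitmap_try_find_from_claim_across(bitmap, field_count, 0, needed, &bitmap_idx, stats)) {
+       // `needed` consecutive bits are now 1 (claimed), possibly spanning fields
+       // ... use the claimed range ...
+       _mi_bitmap_unclaim_across(bitmap, field_count, needed, bitmap_idx); // release again
+     }
+*/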
+
+#endif
diff --git a/src/bitmap.c b/src/bitmap.c
index 3e6311dc..463d74c7 100644
--- a/src/bitmap.c
+++ b/src/bitmap.c
@@ -1,19 +1,12 @@
/* ----------------------------------------------------------------------------
-Copyright (c) 2019-2023 Microsoft Research, Daan Leijen
+Copyright (c) 2019-2024 Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
/* ----------------------------------------------------------------------------
-Concurrent bitmap that can set/reset sequences of bits atomically,
-represented as an array of fields where each field is a machine word (`size_t`)
-
-There are two api's; the standard one cannot have sequences that cross
-between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS).
-
-The `_across` postfixed functions do allow sequences that can cross over
-between the fields. (This is used in arena allocation)
+Concurrent bitmap that can set/reset sequences of bits atomically
---------------------------------------------------------------------------- */
#include "mimalloc.h"
@@ -21,399 +14,586 @@ between the fields. (This is used in arena allocation)
#include "mimalloc/bits.h"
#include "bitmap.h"
-/* -----------------------------------------------------------
- Bitmap definition
------------------------------------------------------------ */
+/* --------------------------------------------------------------------------------
+ bfields
+-------------------------------------------------------------------------------- */
-// The bit mask for a given number of blocks at a specified bit index.
-static inline size_t mi_bitmap_mask_(size_t count, size_t bitidx) {
- mi_assert_internal(count + bitidx <= MI_BITMAP_FIELD_BITS);
- mi_assert_internal(count > 0);
- if (count >= MI_BITMAP_FIELD_BITS) return MI_BITMAP_FIELD_FULL;
- if (count == 0) return 0;
- return ((((size_t)1 << count) - 1) << bitidx);
+static inline size_t mi_bfield_ctz(mi_bfield_t x) {
+ return mi_ctz(x);
+}
+
+static inline size_t mi_bfield_clz(mi_bfield_t x) {
+ return mi_clz(x);
+}
+
+// find the least significant bit that is set (i.e. count trailing zeros)
+// return false if `x==0` (with `*idx` undefined) and true otherwise,
+// with `*idx` set to the bit index (`0 <= *idx < MI_BFIELD_BITS`).
+static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) {
+ return mi_bsf(x,idx);
+}
+
+static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) {
+ return mi_rotr(x,r);
+}
+
+// Set/clear a bit atomically. Returns `true` if the bit transitioned from 0 to 1 (or 1 to 0).
+static inline bool mi_bfield_atomic_xset(mi_bit_t set, _Atomic(mi_bfield_t)*b, size_t idx) {
+ mi_assert_internal(idx < MI_BFIELD_BITS);
+ const mi_bfield_t mask = ((mi_bfield_t)1)<> bitidx) == mask); // no overflow?
- const size_t newmap = (map | m);
- mi_assert_internal((newmap^map) >> bitidx == mask);
- if (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)) { // TODO: use weak cas here?
- // no success, another thread claimed concurrently.. keep going (with updated `map`)
- continue;
+static bool mi_bitmap_chunk_try_xset(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx ) {
+ mi_assert_internal(cidx < MI_BITMAP_CHUNK_BITS);
+ const size_t i = cidx / MI_BFIELD_BITS;
+ const size_t idx = cidx % MI_BFIELD_BITS;
+ return mi_bfield_atomic_try_xset( set, &chunk->bfields[i], idx);
+}
+
+static bool mi_bitmap_chunk_try_xset8(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t byte_idx ) {
+ mi_assert_internal(byte_idx*8 < MI_BITMAP_CHUNK_BITS);
+ const size_t i = byte_idx / MI_BFIELD_SIZE;
+ const size_t ibyte_idx = byte_idx % MI_BFIELD_SIZE;
+ return mi_bfield_atomic_try_xset8( set, &chunk->bfields[i], ibyte_idx);
+}
+
+// Set/clear a sequence of `n` bits within a chunk. Returns true if all bits transitioned from 0 to 1 (or 1 to 0)
+static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, bool* palready_xset) {
+ mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS);
+ mi_assert_internal(n>0);
+ bool all_transition = true;
+ bool all_already_xset = true;
+ size_t idx = cidx % MI_BFIELD_BITS;
+ size_t field = cidx / MI_BFIELD_BITS;
+ while (n > 0) {
+ size_t m = MI_BFIELD_BITS - idx; // m is the bits to xset in this field
+ if (m > n) { m = n; }
+ mi_assert_internal(idx + m <= MI_BFIELD_BITS);
+ mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS);
+ const size_t mask = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<<m)-1)<<idx);
+ bool already_xset;
+ all_transition = all_transition && mi_bfield_atomic_xset_mask(set, &chunk->bfields[field], mask, &already_xset);
+ all_already_xset = all_already_xset && already_xset;
+ // next field
+ field++;
+ idx = 0;
+ n -= m;
+ }
+ *palready_xset = all_already_xset;
+ return all_transition;
+}
+
+// Check if a sequence of `n` bits within a chunk are all set/cleared.
+static bool mi_bitmap_chunk_is_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) {
+ mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS);
+ mi_assert_internal(n>0);
+ bool all_xset = true;
+ size_t idx = cidx % MI_BFIELD_BITS;
+ size_t field = cidx / MI_BFIELD_BITS;
+ while (n > 0) {
+ size_t m = MI_BFIELD_BITS - idx; // m is the bits to xset in this field
+ if (m > n) { m = n; }
+ mi_assert_internal(idx + m <= MI_BFIELD_BITS);
+ mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS);
+ const size_t mask = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask);
+ // next field
+ field++;
+ idx = 0;
+ n -= m;
+ }
+ return all_xset;
+}
+
+// Try to atomically set/clear a sequence of `n` bits within a chunk. Returns true if all bits transitioned from 0 to 1 (or 1 to 0),
+// and false otherwise leaving all bit fields as is.
+static bool mi_bitmap_chunk_try_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) {
+ mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS);
+ mi_assert_internal(n>0);
+ if (n==0) return true;
+ size_t start_idx = cidx % MI_BFIELD_BITS;
+ size_t start_field = cidx / MI_BFIELD_BITS;
+ size_t end_field = MI_BITMAP_CHUNK_FIELDS;
+ size_t mask_mid = 0;
+ size_t mask_end = 0;
+
+ // first field
+ size_t field = start_field;
+ size_t m = MI_BFIELD_BITS - start_idx; // m is the bits to xset in this field
+ if (m > n) { m = n; }
+ mi_assert_internal(start_idx + m <= MI_BFIELD_BITS);
+ mi_assert_internal(start_field < MI_BITMAP_CHUNK_FIELDS);
+ const size_t mask_start = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<<m)-1)<<start_idx);
+ if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_start)) return false;
+
+ // done?
+ n -= m;
+ if (n==0) return true;
+
+ // continue with mid fields and last field: if these fail we need to recover by unsetting previous fields
+
+ // mid fields
+ while (n >= MI_BFIELD_BITS) {
+ field++;
+ mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS);
+ mask_mid = ~MI_ZU(0);
+ if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_mid)) goto restore;
+ n -= MI_BFIELD_BITS;
+ }
+
+ // last field
+ if (n > 0) {
+ mi_assert_internal(n < MI_BFIELD_BITS);
+ field++;
+ mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS);
+ end_field = field;
+ mask_end = (MI_ZU(1)<<n)-1;
+ if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_end)) goto restore;
+ }
+
+ return true;
+
+restore:
+ // field is on the field that failed to set atomically; we need to restore all previous fields
+ mi_assert_internal(field > start_field);
+ while( field > start_field) {
+ field--;
+ const size_t mask = (field == start_field ? mask_start : (field == end_field ? mask_end : mask_mid));
+ bool already_xset;
+ mi_bfield_atomic_xset_mask(!set, &chunk->bfields[field], mask, &already_xset);
+ }
+ return false;
+}
+
+
+// find least 1-bit in a chunk and try unset it atomically
+// set `*pidx` to the bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success.
+// todo: try neon version
+static inline bool mi_bitmap_chunk_find_and_try_clear(mi_bitmap_chunk_t* chunk, size_t* pidx) {
+ #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256)
+ while(true) {
+ const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields);
+ if (_mm256_testz_si256(vec,vec)) return false; // vec == 0 ?
+ const __m256i vcmp = _mm256_cmpeq_epi64(vec, _mm256_setzero_si256()); // (elem64 == 0 ? -1 : 0)
+ const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits in the mask will be all 1 or all 0)
+ mi_assert_internal(mask != 0);
+ const size_t chunk_idx = _tzcnt_u32(mask) / 8; // tzcnt == 0, 8, 16, or 24
+ mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS);
+ size_t cidx;
+ if (mi_bfield_find_least_bit(chunk->bfields[chunk_idx],&cidx)) { // find the bit that is set
+ if mi_likely(mi_bfield_atomic_try_xset(MI_BIT_CLEAR,&chunk->bfields[chunk_idx], cidx)) { // unset atomically
+ *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx;
+ mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS);
+ return true;
+ }
+ }
+ // try again
+ }
+ #else
+ size_t idx;
+ for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) {
+ size_t idx;
+ if mi_unlikely(mi_bfield_find_least_bit(chunk->bfields[i],&idx)) { // find least 1-bit
+ if mi_likely(mi_bfield_atomic_try_xset(MI_BIT_CLEAR,&chunk->bfields[i],idx)) { // try unset atomically
+ *pidx = (i*MI_BFIELD_BITS + idx);
+ mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS);
+ return true;
+ }
+ }
+ }
+ return false;
+ #endif
+}
+
+
+// find least byte in a chunk with all bits set, and try unset it atomically
+// set `*pidx` to its bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success.
+// todo: try neon version
+static inline bool mi_bitmap_chunk_find_and_try_clear8(mi_bitmap_chunk_t* chunk, size_t* pidx) {
+ #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256)
+ while(true) {
+ const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields);
+ const __m256i vcmp = _mm256_cmpeq_epi8(vec, _mm256_set1_epi64x(~0)); // (byte == ~0 ? -1 : 0)
+ const uint32_t mask = _mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte
+ if (mask == 0) return false;
+ const size_t i = _tzcnt_u32(mask);
+ mi_assert_internal(8*i < MI_BITMAP_CHUNK_BITS);
+ const size_t chunk_idx = i / MI_BFIELD_SIZE;
+ const size_t byte_idx = i % MI_BFIELD_SIZE;
+ if mi_likely(mi_bfield_atomic_try_xset8(MI_BIT_CLEAR,&chunk->bfields[chunk_idx],byte_idx)) { // try to unset atomically
+ *pidx = (chunk_idx*MI_BFIELD_BITS) + (byte_idx*8);
+ mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS);
+ return true;
+ }
+ // try again
+ }
+ #else
+ size_t idx;
+ for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) {
+ const mi_bfield_t x = chunk->bfields[i];
+ // has_set8 has low bit in each byte set if the byte in x == 0xFF
+ const mi_bfield_t has_set8 = ((~x - MI_BFIELD_LO_BIT8) & // high bit set if byte in x is 0xFF or < 0x7F
+ (x & MI_BFIELD_HI_BIT8)) // high bit set if byte in x is >= 0x80
+ >> 7; // shift high bit to low bit
+ size_t idx;
+ if mi_unlikely(mi_bfield_find_least_bit(has_set8,&idx)) { // find least 1-bit
+ mi_assert_internal(idx <= (MI_BFIELD_BITS - 8));
+ mi_assert_internal((idx%8)==0);
+ const size_t byte_idx = idx/8;
+ if mi_likely(mi_bfield_atomic_try_xset8(MI_BIT_CLEAR,&chunk->bfields[i],byte_idx)) { // unset the byte atomically
+ *pidx = (i*MI_BFIELD_BITS) + idx;
+ mi_assert_internal(*pidx + 8 <= MI_BITMAP_CHUNK_BITS);
+ return true;
+ }
+ // else continue
+ }
+ }
+ return false;
+ #endif
+}
+
+
+// find a sequence of `n` bits in a chunk with all `n` bits set, and try unset it atomically
+// set `*pidx` to its bit index (0 <= *pidx <= MI_BITMAP_CHUNK_BITS - n) on success.
+// todo: try avx2 and neon version
+// todo: allow spanning across bfield boundaries?
+static inline bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, size_t n, size_t* pidx) {
+ if (n == 0 || n > MI_BFIELD_BITS) return false; // TODO: allow larger?
+ const mi_bfield_t mask = (n==MI_BFIELD_BITS ? ~((mi_bfield_t)0) : (((mi_bfield_t)1) << n)-1);
+ for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) {
+ mi_bfield_t b = chunk->bfields[i];
+ size_t bshift = 0;
+ size_t idx;
+ while (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit
+ b >>= idx;
+ bshift += idx;
+ if (bshift + n >= MI_BFIELD_BITS) break;
+
+ if ((b&mask) == mask) { // found a match
+ mi_assert_internal( ((mask << bshift) >> bshift) == mask );
+ if mi_likely(mi_bfield_atomic_try_xset_mask(MI_BIT_CLEAR,&chunk->bfields[i],mask<bfields[i] >> bshift);
+ }
}
else {
- // success, we claimed the bits!
- *bitmap_idx = mi_bitmap_index_create(idx, bitidx);
- return true;
+ // advance
+ const size_t ones = mi_bfield_ctz(~b); // skip all ones (since it didn't fit the mask)
+ mi_assert_internal(ones>0);
+ bshift += ones;
+ b >>= ones;
}
}
+ }
+ return false;
+}
+
+
+// are all bits in a bitmap chunk set?
+static bool mi_bitmap_chunk_all_are_set(mi_bitmap_chunk_t* chunk) {
+ #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256)
+ const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields);
+ return _mm256_test_all_ones(vec);
+ #else
+ // written like this for vectorization
+ mi_bfield_t x = chunk->bfields[0];
+ for(int i = 1; i < MI_BITMAP_CHUNK_FIELDS; i++) {
+ x = x & chunk->bfields[i];
+ }
+ return (~x == 0);
+ #endif
+}
+
+// are all bits in a bitmap chunk clear?
+static bool mi_bitmap_chunk_all_are_clear(mi_bitmap_chunk_t* chunk) {
+ #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256)
+ const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields);
+ return _mm256_testz_si256( vec, vec );
+ #else
+ // written like this for vectorization
+ mi_bfield_t x = chunk->bfields[0];
+ for(int i = 1; i < MI_BITMAP_CHUNK_FIELDS; i++) {
+ x = x | chunk->bfields[i];
+ }
+ return (x == 0);
+ #endif
+}
+
+/* --------------------------------------------------------------------------------
+ bitmap
+-------------------------------------------------------------------------------- */
+// initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true
+void mi_bitmap_init(mi_bitmap_t* bitmap, bool already_zero) {
+ if (!already_zero) {
+ _mi_memzero_aligned(bitmap, sizeof(*bitmap));
+ }
+}
+
+// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread.
+void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) {
+ mi_assert_internal(n>0);
+ mi_assert_internal(idx + n<=MI_BITMAP_MAX_BITS);
+
+ // first chunk
+ size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS;
+ const size_t cidx = idx % MI_BITMAP_CHUNK_BITS;
+ size_t m = MI_BITMAP_CHUNK_BITS - cidx;
+ if (m > n) { m = n; }
+ bool already_xset;
+ mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], cidx, m, &already_xset);
+
+ // n can be large so use memset for efficiency for all in-between chunks
+ chunk_idx++;
+ n -= m;
+ const size_t mid_chunks = n / MI_BITMAP_CHUNK_BITS;
+ if (mid_chunks > 0) {
+ _mi_memset(&bitmap->chunks[chunk_idx], (set ? ~0 : 0), MI_BITMAP_CHUNK_BITS/8);
+ chunk_idx += mid_chunks;
+ n -= mid_chunks * MI_BITMAP_CHUNK_BITS;
+ }
+
+ // last chunk
+ if (n > 0) {
+ mi_assert_internal(n < MI_BITMAP_CHUNK_BITS);
+ mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS);
+ mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], 0, n, &already_xset);
+ }
+}
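+
+// Example (illustrative, not from the original source; assumes the `MI_BIT_SET`
+// counterpart of the `MI_BIT_CLEAR` constant used elsewhere, and caller-provided
+// `bitmap` and `block_count`): marking an initial range as available before the
+// bitmap is shared with other threads:
+//
+//   mi_bitmap_unsafe_xsetN(MI_BIT_SET, &bitmap, 0, block_count);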
+
+
+// Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0),
+// and false otherwise leaving the bitmask as is.
+bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) {
+ mi_assert_internal(idx < MI_BITMAP_MAX_BITS);
+ const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS;
+ const size_t cidx = idx % MI_BITMAP_CHUNK_BITS;
+ return mi_bitmap_chunk_try_xset( set, &bitmap->chunks[chunk_idx], cidx);
+}
+
+// Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0)
+// and false otherwise leaving the bitmask as is.
+bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) {
+ mi_assert_internal(idx < MI_BITMAP_MAX_BITS);
+ mi_assert_internal(idx%8 == 0);
+ const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS;
+ const size_t byte_idx = (idx % MI_BITMAP_CHUNK_BITS)/8;
+ return mi_bitmap_chunk_try_xset8( set, &bitmap->chunks[chunk_idx],byte_idx);
+}
+
+// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's)
+// and false otherwise leaving the bitmask as is.
+// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)!
+bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) {
+ mi_assert_internal(n>0);
+ mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS);
+ if (n==1) { return mi_bitmap_try_xset(set,bitmap,idx); }
+ if (n==8) { return mi_bitmap_try_xset8(set,bitmap,idx); }
+
+ mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS);
+ const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS;
+ const size_t cidx = idx % MI_BITMAP_CHUNK_BITS;
+ mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now)
+ if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia
+ return mi_bitmap_chunk_try_xsetN( set, &bitmap->chunks[chunk_idx], cidx, n);
+}
+
+// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's).
+// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)!
+bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* already_xset) {
+ mi_assert_internal(n>0);
+ mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS);
+ bool local_already_xset;
+ if (already_xset==NULL) { already_xset = &local_already_xset; }
+ // if (n==1) { return mi_bitmap_xset(set, bitmap, idx); }
+ // if (n==8) { return mi_bitmap_xset8(set, bitmap, idx); }
+
+ mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS);
+ const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS;
+ const size_t cidx = idx % MI_BITMAP_CHUNK_BITS;
+ mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now)
+ if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia
+ return mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n, already_xset);
+}
+
+// Is a sequence of n bits already all set/cleared?
+bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) {
+ mi_assert_internal(n>0);
+ mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS);
+ mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS);
+ const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS;
+ const size_t cidx = idx % MI_BITMAP_CHUNK_BITS;
+ mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now)
+ if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia
+ return mi_bitmap_chunk_is_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n);
+}
+
+
+#define mi_bitmap_forall_set_chunks(bitmap,start,decl_chunk_idx) \
+ { size_t _set_idx; \
+ size_t _start = start % MI_BFIELD_BITS; \
+ mi_bfield_t _any_set = mi_bfield_rotate_right(bitmap->any_set, _start); \
+ while (mi_bfield_find_least_bit(_any_set,&_set_idx)) { \
+ decl_chunk_idx = (_set_idx + _start) % MI_BFIELD_BITS;
+
+#define mi_bitmap_forall_set_chunks_end() \
+ _start += _set_idx+1; /* so chunk_idx stays valid */ \
+ _any_set >>= _set_idx; /* skip scanned bits (and avoid UB with (idx+1)) */ \
+ _any_set >>= 1; \
+ } \
+ }
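+
+// Illustrative usage of the macro pair above (it mirrors the functions below): the
+// first macro opens a loop over the chunks whose bit in `bitmap->any_set` is set,
+// and the second closes it again:
+//
+//   mi_bitmap_forall_set_chunks(bitmap, start, size_t chunk_idx)
+//   {
+//     // ... operate on bitmap->chunks[chunk_idx] ...
+//   }
+//   mi_bitmap_forall_set_chunks_end();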
+
+// Find a set bit in a bitmap and atomically unset it. Returns true on success,
+// and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`.
+// The low `MI_BFIELD_BITS` bits of `start` are used to set the start point of the search
+// (to reduce thread contention).
+bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t* pidx, size_t start) {
+ mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx)
+ {
+ size_t cidx;
+ if mi_likely(mi_bitmap_chunk_find_and_try_clear(&bitmap->chunks[chunk_idx],&cidx)) {
+ *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx;
+ mi_assert_internal(*pidx < MI_BITMAP_MAX_BITS);
+ return true;
+ }
else {
- // on to the next bit range
-#if MI_HAS_FAST_BITSCAN
- mi_assert_internal(mapm != 0);
- const size_t shift = (count == 1 ? 1 : (MI_INTPTR_BITS - mi_clz(mapm) - bitidx));
- mi_assert_internal(shift > 0 && shift <= count);
-#else
- const size_t shift = 1;
-#endif
- bitidx += shift;
- m <<= shift;
- }
- }
- // no bits found
- return false;
-}
-
-
-// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
-// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields.
-bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx) {
- size_t idx = start_field_idx;
- for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) {
- if (idx >= bitmap_fields) { idx = 0; } // wrap
- if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) {
- return true;
- }
- }
- return false;
-}
-
-
-// Set `count` bits at `bitmap_idx` to 0 atomically
-// Returns `true` if all `count` bits were 1 previously.
-bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
- const size_t idx = mi_bitmap_index_field(bitmap_idx);
- const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
- const size_t mask = mi_bitmap_mask_(count, bitidx);
- mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields);
- // mi_assert_internal((bitmap[idx] & mask) == mask);
- const size_t prev = mi_atomic_and_acq_rel(&bitmap[idx], ~mask);
- return ((prev & mask) == mask);
-}
-
-
-// Set `count` bits at `bitmap_idx` to 1 atomically
-// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
-bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero) {
- const size_t idx = mi_bitmap_index_field(bitmap_idx);
- const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
- const size_t mask = mi_bitmap_mask_(count, bitidx);
- mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields);
- //mi_assert_internal(any_zero != NULL || (bitmap[idx] & mask) == 0);
- size_t prev = mi_atomic_or_acq_rel(&bitmap[idx], mask);
- if (any_zero != NULL) { *any_zero = ((prev & mask) != mask); }
- return ((prev & mask) == 0);
-}
-
-// Returns `true` if all `count` bits were 1. `any_ones` is `true` if there was at least one bit set to one.
-static bool mi_bitmap_is_claimedx(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_ones) {
- const size_t idx = mi_bitmap_index_field(bitmap_idx);
- const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
- const size_t mask = mi_bitmap_mask_(count, bitidx);
- mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields);
- const size_t field = mi_atomic_load_relaxed(&bitmap[idx]);
- if (any_ones != NULL) { *any_ones = ((field & mask) != 0); }
- return ((field & mask) == mask);
-}
-
-// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically.
-// Returns `true` if successful when all previous `count` bits were 0.
-bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
- const size_t idx = mi_bitmap_index_field(bitmap_idx);
- const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
- const size_t mask = mi_bitmap_mask_(count, bitidx);
- mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields);
- size_t expected = mi_atomic_load_relaxed(&bitmap[idx]);
- do {
- if ((expected & mask) != 0) return false;
- }
- while (!mi_atomic_cas_strong_acq_rel(&bitmap[idx], &expected, expected | mask));
- mi_assert_internal((expected & mask) == 0);
- return true;
-}
-
-
-bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
- return mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, NULL);
-}
-
-bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
- bool any_ones;
- mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, &any_ones);
- return any_ones;
-}
-
-
-//--------------------------------------------------------------------------
-// the `_across` functions work on bitmaps where sequences can cross over
-// between the fields. This is used in arena allocation
-//--------------------------------------------------------------------------
-
-// Try to atomically claim a sequence of `count` bits starting from the field
-// at `idx` in `bitmap` and crossing into subsequent fields. Returns `true` on success.
-// Only needs to consider crossing into the next fields (see `mi_bitmap_try_find_from_claim_across`)
-static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t idx, const size_t count, const size_t retries, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats)
-{
- mi_assert_internal(bitmap_idx != NULL);
-
- // check initial trailing zeros
- mi_bitmap_field_t* field = &bitmap[idx];
- size_t map = mi_atomic_load_relaxed(field);
- const size_t initial = mi_clz(map); // count of initial zeros starting at idx
- mi_assert_internal(initial <= MI_BITMAP_FIELD_BITS);
- if (initial == 0) return false;
- if (initial >= count) return _mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx); // no need to cross fields (this case won't happen for us)
- if (_mi_divide_up(count - initial, MI_BITMAP_FIELD_BITS) >= (bitmap_fields - idx)) return false; // not enough entries
-
- // scan ahead
- size_t found = initial;
- size_t mask = 0; // mask bits for the final field
- while(found < count) {
- field++;
- map = mi_atomic_load_relaxed(field);
- const size_t mask_bits = (found + MI_BITMAP_FIELD_BITS <= count ? MI_BITMAP_FIELD_BITS : (count - found));
- mi_assert_internal(mask_bits > 0 && mask_bits <= MI_BITMAP_FIELD_BITS);
- mask = mi_bitmap_mask_(mask_bits, 0);
- if ((map & mask) != 0) return false; // some part is already claimed
- found += mask_bits;
- }
- mi_assert_internal(field < &bitmap[bitmap_fields]);
-
- // we found a range of contiguous zeros up to the final field; mask contains mask in the final field
- // now try to claim the range atomically
- mi_bitmap_field_t* const final_field = field;
- const size_t final_mask = mask;
- mi_bitmap_field_t* const initial_field = &bitmap[idx];
- const size_t initial_idx = MI_BITMAP_FIELD_BITS - initial;
- const size_t initial_mask = mi_bitmap_mask_(initial, initial_idx);
-
- // initial field
- size_t newmap;
- field = initial_field;
- map = mi_atomic_load_relaxed(field);
- do {
- newmap = (map | initial_mask);
- if ((map & initial_mask) != 0) { goto rollback; };
- } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap));
-
- // intermediate fields
- while (++field < final_field) {
- newmap = mi_bitmap_mask_(MI_BITMAP_FIELD_BITS, 0);
- map = 0;
- if (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)) { goto rollback; }
- }
-
- // final field
- mi_assert_internal(field == final_field);
- map = mi_atomic_load_relaxed(field);
- do {
- newmap = (map | final_mask);
- if ((map & final_mask) != 0) { goto rollback; }
- } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap));
-
- // claimed!
- mi_stat_counter_increase(stats->arena_crossover_count,1);
- *bitmap_idx = mi_bitmap_index_create(idx, initial_idx);
- return true;
-
-rollback:
- // roll back intermediate fields
- // (we just failed to claim `field` so decrement first)
- while (--field > initial_field) {
- newmap = 0;
- map = mi_bitmap_mask_(MI_BITMAP_FIELD_BITS, 0);
- mi_assert_internal(mi_atomic_load_relaxed(field) == map);
- mi_atomic_store_release(field, newmap);
- }
- if (field == initial_field) { // (if we failed on the initial field, `field + 1 == initial_field`)
- map = mi_atomic_load_relaxed(field);
- do {
- mi_assert_internal((map & initial_mask) == initial_mask);
- newmap = (map & ~initial_mask);
- } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap));
- }
- mi_stat_counter_increase(stats->arena_rollback_count,1);
- // retry? (we make a recursive call instead of goto to be able to use const declarations)
- if (retries <= 2) {
- return mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, retries+1, bitmap_idx, stats);
- }
- else {
- return false;
- }
-}
-
-
-// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success.
-// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
-bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) {
- mi_assert_internal(count > 0);
- if (count <= 2) {
- // we don't bother with crossover fields for small counts
- return _mi_bitmap_try_find_from_claim(bitmap, bitmap_fields, start_field_idx, count, bitmap_idx);
- }
-
- // visit the fields
- size_t idx = start_field_idx;
- for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) {
- if (idx >= bitmap_fields) { idx = 0; } // wrap
- // first try to claim inside a field
- /*
- if (count <= MI_BITMAP_FIELD_BITS) {
- if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) {
- return true;
+ // we may find that all are unset only on a second iteration but that is ok as
+ // _any_set is a conservative approximation.
+ if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) {
+ mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx);
}
}
- */
- // if that fails, then try to claim across fields
- if (mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, 0, bitmap_idx, stats)) {
- return true;
- }
}
+ mi_bitmap_forall_set_chunks_end();
return false;
}
-// Helper for masks across fields; returns the mid count, post_mask may be 0
-static size_t mi_bitmap_mask_across(mi_bitmap_index_t bitmap_idx, size_t bitmap_fields, size_t count, size_t* pre_mask, size_t* mid_mask, size_t* post_mask) {
- MI_UNUSED(bitmap_fields);
- const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
- if mi_likely(bitidx + count <= MI_BITMAP_FIELD_BITS) {
- *pre_mask = mi_bitmap_mask_(count, bitidx);
- *mid_mask = 0;
- *post_mask = 0;
- mi_assert_internal(mi_bitmap_index_field(bitmap_idx) < bitmap_fields);
- return 0;
- }
- else {
- const size_t pre_bits = MI_BITMAP_FIELD_BITS - bitidx;
- mi_assert_internal(pre_bits < count);
- *pre_mask = mi_bitmap_mask_(pre_bits, bitidx);
- count -= pre_bits;
- const size_t mid_count = (count / MI_BITMAP_FIELD_BITS);
- *mid_mask = MI_BITMAP_FIELD_FULL;
- count %= MI_BITMAP_FIELD_BITS;
- *post_mask = (count==0 ? 0 : mi_bitmap_mask_(count, 0));
- mi_assert_internal(mi_bitmap_index_field(bitmap_idx) + mid_count + (count==0 ? 0 : 1) < bitmap_fields);
- return mid_count;
+
+// Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero.
+// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`.
+bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t start, size_t* pidx ) {
+ mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx)
+ {
+ size_t cidx;
+ if mi_likely(mi_bitmap_chunk_find_and_try_clear8(&bitmap->chunks[chunk_idx],&cidx)) {
+ *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx;
+ mi_assert_internal(*pidx <= MI_BITMAP_MAX_BITS-8);
+ mi_assert_internal((*pidx % 8) == 0);
+ return true;
+ }
+ else {
+ if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) {
+ mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx);
+ }
+ }
}
+ mi_bitmap_forall_set_chunks_end();
+ return false;
}
-// Set `count` bits at `bitmap_idx` to 0 atomically
-// Returns `true` if all `count` bits were 1 previously.
-bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
- size_t idx = mi_bitmap_index_field(bitmap_idx);
- size_t pre_mask;
- size_t mid_mask;
- size_t post_mask;
- size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask);
- bool all_one = true;
- mi_bitmap_field_t* field = &bitmap[idx];
- size_t prev = mi_atomic_and_acq_rel(field++, ~pre_mask); // clear first part
- if ((prev & pre_mask) != pre_mask) all_one = false;
- while(mid_count-- > 0) {
- prev = mi_atomic_and_acq_rel(field++, ~mid_mask); // clear mid part
- if ((prev & mid_mask) != mid_mask) all_one = false;
+// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all.
+// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`.
+bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t start, size_t n, size_t* pidx ) {
+ // TODO: allow at least MI_BITMAP_CHUNK_BITS and probably larger
+ // TODO: allow spanning across chunk boundaries
+ if (n == 0 || n > MI_BFIELD_BITS) return false;
+ mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx)
+ {
+ size_t cidx;
+ if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx],n,&cidx)) {
+ *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx;
+ mi_assert_internal(*pidx <= MI_BITMAP_MAX_BITS-n);
+ return true;
+ }
+ else {
+ if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) {
+ mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx);
+ }
+ }
}
- if (post_mask!=0) {
- prev = mi_atomic_and_acq_rel(field, ~post_mask); // clear end part
- if ((prev & post_mask) != post_mask) all_one = false;
- }
- return all_one;
-}
-
-// Set `count` bits at `bitmap_idx` to 1 atomically
-// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
-bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero) {
- size_t idx = mi_bitmap_index_field(bitmap_idx);
- size_t pre_mask;
- size_t mid_mask;
- size_t post_mask;
- size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask);
- bool all_zero = true;
- bool any_zero = false;
- _Atomic(size_t)*field = &bitmap[idx];
- size_t prev = mi_atomic_or_acq_rel(field++, pre_mask);
- if ((prev & pre_mask) != 0) all_zero = false;
- if ((prev & pre_mask) != pre_mask) any_zero = true;
- while (mid_count-- > 0) {
- prev = mi_atomic_or_acq_rel(field++, mid_mask);
- if ((prev & mid_mask) != 0) all_zero = false;
- if ((prev & mid_mask) != mid_mask) any_zero = true;
- }
- if (post_mask!=0) {
- prev = mi_atomic_or_acq_rel(field, post_mask);
- if ((prev & post_mask) != 0) all_zero = false;
- if ((prev & post_mask) != post_mask) any_zero = true;
- }
- if (pany_zero != NULL) { *pany_zero = any_zero; }
- return all_zero;
-}
-
-
-// Returns `true` if all `count` bits were 1.
-// `any_ones` is `true` if there was at least one bit set to one.
-static bool mi_bitmap_is_claimedx_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_ones) {
- size_t idx = mi_bitmap_index_field(bitmap_idx);
- size_t pre_mask;
- size_t mid_mask;
- size_t post_mask;
- size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask);
- bool all_ones = true;
- bool any_ones = false;
- mi_bitmap_field_t* field = &bitmap[idx];
- size_t prev = mi_atomic_load_relaxed(field++);
- if ((prev & pre_mask) != pre_mask) all_ones = false;
- if ((prev & pre_mask) != 0) any_ones = true;
- while (mid_count-- > 0) {
- prev = mi_atomic_load_relaxed(field++);
- if ((prev & mid_mask) != mid_mask) all_ones = false;
- if ((prev & mid_mask) != 0) any_ones = true;
- }
- if (post_mask!=0) {
- prev = mi_atomic_load_relaxed(field);
- if ((prev & post_mask) != post_mask) all_ones = false;
- if ((prev & post_mask) != 0) any_ones = true;
- }
- if (pany_ones != NULL) { *pany_ones = any_ones; }
- return all_ones;
-}
-
-bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
- return mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, NULL);
-}
-
-bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
- bool any_ones;
- mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, &any_ones);
- return any_ones;
+ mi_bitmap_forall_set_chunks_end();
+ return false;
}
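
For reference, this is the kind of scan `mi_bitmap_chunk_find_and_try_clearN` performs inside a single bfield: skip runs of zeros with a trailing-zero count, test an `n`-bit mask, and on a mismatch skip the too-short run of ones. A standalone, non-atomic sketch assuming a 64-bit field:

```c
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

// Return true and set *pidx to the start of the lowest run of `n` consecutive
// 1-bits in `x` (1 <= n <= 64). Illustrative only; the bitmap code does the same
// scan per bfield and then clears the run atomically.
static bool find_run_of_set_bits(uint64_t x, size_t n, size_t* pidx) {
  if (n == 0 || n > 64) return false;
  const uint64_t mask = (n == 64 ? ~UINT64_C(0) : (UINT64_C(1) << n) - 1);
  uint64_t b = x;
  size_t shift = 0;
  while (b != 0) {
    const size_t zeros = (size_t)__builtin_ctzll(b);  // skip the 0-bits below the next 1-bit
    b >>= zeros;
    shift += zeros;
    if (shift + n > 64) return false;                 // a run of `n` can no longer fit
    if ((b & mask) == mask) {                         // found `n` consecutive 1-bits
      *pidx = shift;
      return true;
    }
    const size_t ones = (size_t)__builtin_ctzll(~b);  // skip the too-short run of 1-bits
    b >>= ones;
    shift += ones;
  }
  return false;
}
```

The bitmap version additionally clears the found run atomically and retries the search if that atomic update fails.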
diff --git a/src/bitmap.h b/src/bitmap.h
index f8898935..198a2902 100644
--- a/src/bitmap.h
+++ b/src/bitmap.h
@@ -6,105 +6,87 @@ terms of the MIT license. A copy of the license can be found in the file
-----------------------------------------------------------------------------*/
/* ----------------------------------------------------------------------------
-Concurrent bitmap that can set/reset sequences of bits atomically,
-represented as an array of fields where each field is a machine word (`size_t`)
-
-There are two api's; the standard one cannot have sequences that cross
-between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS).
-(this is used in region allocation)
-
-The `_across` postfixed functions do allow sequences that can cross over
-between the fields. (This is used in arena allocation)
+Concurrent bitmap that can set/reset sequences of bits atomically
---------------------------------------------------------------------------- */
#pragma once
#ifndef MI_BITMAP_H
#define MI_BITMAP_H
-/* -----------------------------------------------------------
- Bitmap definition
------------------------------------------------------------ */
+/* --------------------------------------------------------------------------------
+ Definitions
+-------------------------------------------------------------------------------- */
-#define MI_BITMAP_FIELD_BITS (8*MI_SIZE_SIZE)
-#define MI_BITMAP_FIELD_FULL (~((size_t)0)) // all bits set
+typedef size_t mi_bfield_t;
-// An atomic bitmap of `size_t` fields
-typedef _Atomic(size_t) mi_bitmap_field_t;
-typedef mi_bitmap_field_t* mi_bitmap_t;
+#define MI_BFIELD_BITS_SHIFT (MI_SIZE_SHIFT+3)
+#define MI_BFIELD_BITS (1 << MI_BFIELD_BITS_SHIFT)
+#define MI_BFIELD_SIZE (MI_BFIELD_BITS/8)
+#define MI_BFIELD_BITS_MOD_MASK (MI_BFIELD_BITS - 1)
+#define MI_BFIELD_LO_BIT8 ((~((mi_bfield_t)0))/0xFF) // 0x01010101 ..
+#define MI_BFIELD_HI_BIT8 (MI_BFIELD_LO_BIT8 << 7) // 0x80808080 ..
-// A bitmap index is the index of the bit in a bitmap.
-typedef size_t mi_bitmap_index_t;
+#define MI_BITMAP_CHUNK_FIELDS (MI_BITMAP_CHUNK_BITS / MI_BFIELD_BITS)
+#define MI_BITMAP_CHUNK_BITS_MOD_MASK (MI_BITMAP_CHUNK_BITS - 1)
-// Create a bit index.
-static inline mi_bitmap_index_t mi_bitmap_index_create_ex(size_t idx, size_t bitidx) {
- mi_assert_internal(bitidx <= MI_BITMAP_FIELD_BITS);
- return (idx*MI_BITMAP_FIELD_BITS) + bitidx;
-}
-static inline mi_bitmap_index_t mi_bitmap_index_create(size_t idx, size_t bitidx) {
- mi_assert_internal(bitidx < MI_BITMAP_FIELD_BITS);
- return mi_bitmap_index_create_ex(idx,bitidx);
-}
-
-// Get the field index from a bit index.
-static inline size_t mi_bitmap_index_field(mi_bitmap_index_t bitmap_idx) {
- return (bitmap_idx / MI_BITMAP_FIELD_BITS);
-}
-
-// Get the bit index in a bitmap field
-static inline size_t mi_bitmap_index_bit_in_field(mi_bitmap_index_t bitmap_idx) {
- return (bitmap_idx % MI_BITMAP_FIELD_BITS);
-}
-
-// Get the full bit index
-static inline size_t mi_bitmap_index_bit(mi_bitmap_index_t bitmap_idx) {
- return bitmap_idx;
-}
-
-/* -----------------------------------------------------------
- Claim a bit sequence atomically
------------------------------------------------------------ */
-
-// Try to atomically claim a sequence of `count` bits in a single
-// field at `idx` in `bitmap`. Returns `true` on success.
-bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx);
-
-// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
-// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields.
-bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx);
-
-// Set `count` bits at `bitmap_idx` to 0 atomically
-// Returns `true` if all `count` bits were 1 previously.
-bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
-
-// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically.
-// Returns `true` if successful when all previous `count` bits were 0.
-bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
-
-// Set `count` bits at `bitmap_idx` to 1 atomically
-// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
-bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero);
-
-bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
-bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
+typedef mi_decl_align(32) struct mi_bitmap_chunk_s {
+ _Atomic(mi_bfield_t) bfields[MI_BITMAP_CHUNK_FIELDS];
+} mi_bitmap_chunk_t;
-//--------------------------------------------------------------------------
-// the `_across` functions work on bitmaps where sequences can cross over
-// between the fields. This is used in arena allocation
-//--------------------------------------------------------------------------
+typedef mi_decl_align(32) struct mi_bitmap_s {
+ mi_bitmap_chunk_t chunks[MI_BFIELD_BITS];
+ _Atomic(mi_bfield_t)any_set;
+} mi_bitmap_t;
-// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success.
-// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
-bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats);
+#define MI_BITMAP_MAX_BITS (MI_BFIELD_BITS * MI_BITMAP_CHUNK_BITS) // 16k bits on 64bit, 8k bits on 32bit
-// Set `count` bits at `bitmap_idx` to 0 atomically
-// Returns `true` if all `count` bits were 1 previously.
-bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
+/* --------------------------------------------------------------------------------
+ Bitmap
+-------------------------------------------------------------------------------- */
-// Set `count` bits at `bitmap_idx` to 1 atomically
-// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
-bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero);
+typedef bool mi_bit_t;
+#define MI_BIT_SET (true)
+#define MI_BIT_CLEAR (false)
-bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
-bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
+// initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true
+void mi_bitmap_init(mi_bitmap_t* bitmap, bool already_zero);
-#endif
+// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread.
+void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n);
+
+// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 0's to 1's (or all 1's to 0's).
+// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)!
+// If `already_xset` is not NULL, it is set to true if all the bits were already all set/cleared.
+bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* already_xset);
+
+// Is a sequence of n bits already all set/cleared?
+bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n);
+
+// Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0)
+// and false otherwise leaving the bitmask as is.
+mi_decl_nodiscard bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx);
+
+// Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0)
+// and false otherwise leaving the bitmask as is.
+mi_decl_nodiscard bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx);
+
+// Try to set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's)
+// and false otherwise leaving the bitmask as is.
+// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)!
+mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n);
+
+// Find a set bit in a bitmap and atomically unset it. Returns true on success,
+// and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`.
+// The low bits of `start` (modulo `MI_BFIELD_BITS`) set the start point of the search
+// (to reduce thread contention).
+mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t* pidx, size_t start);
+
+// Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero.
+// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`.
+mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t start, size_t* pidx );
+
+// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all.
+// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`.
+mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t start, size_t n, size_t* pidx );
+
+#endif // MI_BITMAP_H
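
The `MI_BFIELD_LO_BIT8`/`MI_BFIELD_HI_BIT8` constants enable a branch-free test for a fully set byte inside a bfield (used by the non-AVX2 path of the byte search). A standalone sketch of the trick written with plain `uint64_t`, so it can be compiled and checked on its own:

```c
#include <stdint.h>
#include <stdio.h>

#define LO_BIT8  (~UINT64_C(0) / 0xFF)   // 0x0101010101010101
#define HI_BIT8  (LO_BIT8 << 7)          // 0x8080808080808080

// Nonzero iff `x` contains a 0xFF byte; the lowest set bit of the result lies
// in the lowest such byte, so ctz(result)/8 is that byte's index.
static uint64_t has_byte_all_set(uint64_t x) {
  return ((~x - LO_BIT8) & (x & HI_BIT8)) >> 7;
}

int main(void) {
  const uint64_t x = UINT64_C(0x00FF00000000FF00);
  const uint64_t h = has_byte_all_set(x);
  printf("lowest 0xFF byte at index %d\n", (int)(__builtin_ctzll(h) / 8));  // prints 1
  return 0;
}
```

False positives can only appear above the lowest 0xFF byte, so taking the least significant set bit of the result is always correct.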
diff --git a/src/free.c b/src/free.c
index f2e30b65..e1cc9276 100644
--- a/src/free.c
+++ b/src/free.c
@@ -24,7 +24,7 @@ static void mi_stat_free(const mi_page_t* page, const mi_block_t* block);
// ------------------------------------------------------
// forward declaration of multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON)
-static mi_decl_noinline void mi_free_block_mt(mi_page_t* page, mi_segment_t* segment, mi_block_t* block);
+static mi_decl_noinline void mi_free_block_mt(mi_page_t* page, mi_block_t* block);
// regular free of a (thread local) block pointer
// fast path written carefully to prevent spilling on the stack
@@ -57,7 +57,7 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool
mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p) {
mi_assert_internal(page!=NULL && p!=NULL);
- size_t diff = (uint8_t*)p - page->page_start;
+ size_t diff = (uint8_t*)p - mi_page_start(page);
size_t adjust;
if mi_likely(page->block_size_shift != 0) {
adjust = diff & (((size_t)1 << page->block_size_shift) - 1);
@@ -82,72 +82,55 @@ static inline void mi_block_check_unguard(mi_page_t* page, mi_block_t* block, vo
#endif
// free a local pointer (page parameter comes first for better codegen)
-static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept {
- MI_UNUSED(segment);
+static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, void* p) mi_attr_noexcept {
mi_block_t* const block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(page, p) : (mi_block_t*)p);
mi_block_check_unguard(page, block, p);
mi_free_block_local(page, block, true /* track stats */, true /* check for a full page */);
}
// free a pointer owned by another thread (page parameter comes first for better codegen)
-static void mi_decl_noinline mi_free_generic_mt(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept {
+static void mi_decl_noinline mi_free_generic_mt(mi_page_t* page, void* p) mi_attr_noexcept {
mi_block_t* const block = _mi_page_ptr_unalign(page, p); // don't check `has_aligned` flag to avoid a race (issue #865)
mi_block_check_unguard(page, block, p);
- mi_free_block_mt(page, segment, block);
+ mi_free_block_mt(page, block);
}
// generic free (for runtime integration)
-void mi_decl_noinline _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept {
- if (is_local) mi_free_generic_local(page,segment,p);
- else mi_free_generic_mt(page,segment,p);
+void mi_decl_noinline _mi_free_generic(mi_page_t* page, bool is_local, void* p) mi_attr_noexcept {
+ if (is_local) mi_free_generic_local(page,p);
+ else mi_free_generic_mt(page,p);
}
// Get the segment data belonging to a pointer
// This is just a single `and` in release mode but does further checks in debug mode
// (and secure mode) to see if this was a valid pointer.
-static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* msg)
+static inline mi_page_t* mi_checked_ptr_page(const void* p, const char* msg)
{
- MI_UNUSED(msg);
-
- #if (MI_DEBUG>0)
+ MI_UNUSED_RELEASE(msg);
+ #if MI_DEBUG
if mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0 && !mi_option_is_enabled(mi_option_guarded_precise)) {
_mi_error_message(EINVAL, "%s: invalid (unaligned) pointer: %p\n", msg, p);
return NULL;
}
#endif
-
- mi_segment_t* const segment = _mi_ptr_segment(p);
- if mi_unlikely(segment==NULL) return segment;
-
- #if (MI_DEBUG>0)
- if mi_unlikely(!mi_is_in_heap_region(p)) {
- _mi_warning_message("%s: pointer might not point to a valid heap region: %p\n"
- "(this may still be a valid very large allocation (over 64MiB))\n", msg, p);
- if mi_likely(_mi_ptr_cookie(segment) == segment->cookie) {
- _mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p);
- }
+ mi_page_t* const page = _mi_ptr_page(p);
+ #if MI_DEBUG
+ if (page == MI_PAGE_PTR_INVALID) {
+ _mi_error_message(EINVAL, "%s: invalid pointer: %p\n", msg, p);
}
#endif
- #if (MI_DEBUG>0 || MI_SECURE>=4)
- if mi_unlikely(_mi_ptr_cookie(segment) != segment->cookie) {
- _mi_error_message(EINVAL, "%s: pointer does not point to a valid heap space: %p\n", msg, p);
- return NULL;
- }
- #endif
-
- return segment;
+ return page;
}
// Free a block
// Fast path written carefully to prevent register spilling on the stack
void mi_free(void* p) mi_attr_noexcept
{
- mi_segment_t* const segment = mi_checked_ptr_segment(p,"mi_free");
- if mi_unlikely(segment==NULL) return;
-
- const bool is_local = (_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id));
- mi_page_t* const page = _mi_segment_page_of(segment, p);
+ mi_page_t* const page = mi_checked_ptr_page(p,"mi_free");
+ if mi_unlikely(page==NULL) return;
+
+ const bool is_local = (_mi_prim_thread_id() == mi_page_thread_id(page));
if mi_likely(is_local) { // thread-local free?
if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned)
// thread-local, aligned, and not a full page
@@ -156,12 +139,12 @@ void mi_free(void* p) mi_attr_noexcept
}
else {
// page is full or contains (inner) aligned blocks; use generic path
- mi_free_generic_local(page, segment, p);
+ mi_free_generic_local(page, p);
}
}
else {
// not thread-local; use generic path
- mi_free_generic_mt(page, segment, p);
+ mi_free_generic_mt(page, p);
}
}
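
The fast path tests `page->flags.full_aligned == 0`, i.e. "not in the full queue and no aligned blocks", with a single zero compare. A hedged sketch of that flag-packing idea; the layout below is illustrative and not necessarily mimalloc's exact definition:

```c
#include <stdbool.h>
#include <stdint.h>

// Two rarely-set conditions packed into one byte, so the common case
// ("neither is set") is a single zero test.
typedef union toy_page_flags_u {
  uint8_t full_aligned;
  struct {
    uint8_t in_full : 1;
    uint8_t has_aligned : 1;
  } x;
} toy_page_flags_t;

bool can_take_fast_path(const toy_page_flags_t* flags) {
  return (flags->full_aligned == 0);   // not full and no aligned blocks
}
```

Packing both conditions into one byte keeps the common thread-local free down to a load, one compare, and the free-list push.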
@@ -169,10 +152,8 @@ void mi_free(void* p) mi_attr_noexcept
bool _mi_free_delayed_block(mi_block_t* block) {
// get segment and page
mi_assert_internal(block!=NULL);
- const mi_segment_t* const segment = _mi_ptr_segment(block);
- mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie);
- mi_assert_internal(_mi_thread_id() == segment->thread_id);
- mi_page_t* const page = _mi_segment_page_of(segment, block);
+ mi_page_t* const page = mi_checked_ptr_page(block,"_mi_free_delayed_block");
+ mi_assert_internal(_mi_thread_id() == mi_page_thread_id(page));
// Clear the no-delayed flag so delayed freeing is used again for this page.
// This must be done before collecting the free lists on this page -- otherwise
@@ -242,20 +223,19 @@ static void mi_decl_noinline mi_free_block_delayed_mt( mi_page_t* page, mi_block
}
// Multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON)
-static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_segment_t* segment, mi_block_t* block)
+static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block)
{
- // first see if the segment was abandoned and if we can reclaim it into our thread
- if (_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0 &&
- #if MI_HUGE_PAGE_ABANDON
- segment->page_kind != MI_PAGE_HUGE &&
- #endif
- mi_atomic_load_relaxed(&segment->thread_id) == 0 && // segment is abandoned?
+ // first see if the page was abandoned and if we can reclaim it into our thread
+ if (mi_page_is_abandoned(page) &&
+ (_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0 ||
+ mi_page_is_singleton(page) // only one block, and we are free-ing it
+ ) &&
      mi_prim_get_default_heap() != (mi_heap_t*)&_mi_heap_empty) // and we did not already exit this thread (without this check, a fresh heap will be initialized (issue #944))
{
- // the segment is abandoned, try to reclaim it into our heap
- if (_mi_segment_attempt_reclaim(mi_heap_get_default(), segment)) {
- mi_assert_internal(_mi_thread_id() == mi_atomic_load_relaxed(&segment->thread_id));
- mi_assert_internal(mi_heap_get_default()->tld->segments.subproc == segment->subproc);
+ // the page is abandoned, try to reclaim it into our heap
+ if (_mi_heap_try_reclaim(mi_heap_get_default(), page)) { // TODO: avoid putting it in the full free queue
+ mi_assert_internal(_mi_thread_id() == mi_page_thread_id(page));
+ // mi_assert_internal(mi_heap_get_default()->tld->subproc == page->subproc);
mi_free(block); // recursively free as now it will be a local free in our heap
return;
}
@@ -272,17 +252,12 @@ static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_segment_t* seg
// for small size, ensure we can fit the delayed thread pointers without triggering overflow detection
_mi_padding_shrink(page, block, sizeof(mi_block_t));
- if (segment->page_kind == MI_PAGE_HUGE) {
- #if MI_HUGE_PAGE_ABANDON
- // huge page segments are always abandoned and can be freed immediately
- _mi_segment_huge_page_free(segment, page, block);
- return;
- #else
+ if (mi_page_is_huge(page)) {
+ mi_assert_internal(mi_page_is_singleton(page));
// huge pages are special as they occupy the entire segment
// as these are large we reset the memory occupied by the page so it is available to other threads
// (as the owning thread needs to actually free the memory later).
- _mi_segment_huge_page_reset(segment, page, block);
- #endif
+ _mi_os_reset(mi_page_start(page), mi_page_block_size(page), NULL); // resets conservatively
}
else {
#if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading
@@ -316,9 +291,8 @@ static size_t mi_decl_noinline mi_page_usable_aligned_size_of(const mi_page_t* p
}
static inline size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noexcept {
- const mi_segment_t* const segment = mi_checked_ptr_segment(p, msg);
- if mi_unlikely(segment==NULL) return 0;
- const mi_page_t* const page = _mi_segment_page_of(segment, p);
+ const mi_page_t* const page = mi_checked_ptr_page(p,msg);
+ if mi_unlikely(page==NULL) return 0;
if mi_likely(!mi_page_has_aligned(page)) {
const mi_block_t* block = (const mi_block_t*)p;
return mi_page_usable_size_of(page, block);
@@ -514,20 +488,20 @@ static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) {
// only maintain stats for smaller objects if requested
#if (MI_STAT>0)
static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
-#if (MI_STAT < 2)
+ #if (MI_STAT < 2)
MI_UNUSED(block);
-#endif
+ #endif
mi_heap_t* const heap = mi_heap_get_default();
const size_t bsize = mi_page_usable_block_size(page);
-#if (MI_STAT>1)
+ #if (MI_STAT>1)
const size_t usize = mi_page_usable_size_of(page, block);
mi_heap_stat_decrease(heap, malloc, usize);
-#endif
- if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
+ #endif
+ if (bsize <= MI_LARGE_MAX_OBJ_SIZE) {
mi_heap_stat_decrease(heap, normal, bsize);
-#if (MI_STAT > 1)
+ #if (MI_STAT > 1)
mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], 1);
-#endif
+ #endif
}
else {
const size_t bpsize = mi_page_block_size(page); // match stat in page.c:mi_huge_page_alloc
diff --git a/src/heap.c b/src/heap.c
index 581b3f71..e4955ba7 100644
--- a/src/heap.c
+++ b/src/heap.c
@@ -7,11 +7,8 @@ terms of the MIT license. A copy of the license can be found in the file
#include "mimalloc.h"
#include "mimalloc/internal.h"
-#include "mimalloc/atomic.h"
#include "mimalloc/prim.h" // mi_prim_get_default_heap
-#include <string.h>      // memset, memcpy
-
#if defined(_MSC_VER) && (_MSC_VER < 1920)
#pragma warning(disable:4204) // non-constant aggregate initializer
#endif
@@ -258,7 +255,7 @@ static void mi_heap_reset_pages(mi_heap_t* heap) {
mi_assert_internal(heap != NULL);
mi_assert_internal(mi_heap_is_initialized(heap));
// TODO: copy full empty heap instead?
- memset(&heap->pages_free_direct, 0, sizeof(heap->pages_free_direct));
+ _mi_memset(&heap->pages_free_direct, 0, sizeof(heap->pages_free_direct));
_mi_memcpy_aligned(&heap->pages, &_mi_heap_empty.pages, sizeof(heap->pages));
heap->thread_delayed_free = NULL;
heap->page_count = 0;
diff --git a/src/os.c b/src/os.c
index 36b167cb..83521766 100644
--- a/src/os.c
+++ b/src/os.c
@@ -59,6 +59,10 @@ size_t _mi_os_large_page_size(void) {
return (mi_os_mem_config.large_page_size != 0 ? mi_os_mem_config.large_page_size : _mi_os_page_size());
}
+size_t _mi_os_virtual_address_bits(void) {
+ return mi_os_mem_config.virtual_address_bits;
+}
+
bool _mi_os_use_large_page(size_t size, size_t alignment) {
// if we have access, check the size and alignment requirements
if (mi_os_mem_config.large_page_size == 0 || !mi_option_is_enabled(mi_option_allow_large_os_pages)) return false;
@@ -103,58 +107,10 @@ static void* mi_align_down_ptr(void* p, size_t alignment) {
return (void*)_mi_align_down((uintptr_t)p, alignment);
}
-
-/* -----------------------------------------------------------
- aligned hinting
--------------------------------------------------------------- */
-
-// On systems with enough virtual address bits, we can do efficient aligned allocation by using
-// the 2TiB to 30TiB area to allocate those. If we have at least 46 bits of virtual address
-// space (64TiB) we use this technique. (but see issue #939)
-#if (MI_INTPTR_SIZE >= 8) && !defined(MI_NO_ALIGNED_HINT)
-static mi_decl_cache_align _Atomic(uintptr_t)aligned_base;
-
-// Return a MI_SEGMENT_SIZE aligned address that is probably available.
-// If this returns NULL, the OS will determine the address but on some OS's that may not be
-// properly aligned which can be more costly as it needs to be adjusted afterwards.
-// For a size > 1GiB this always returns NULL in order to guarantee good ASLR randomization;
-// (otherwise an initial large allocation of say 2TiB has a 50% chance to include (known) addresses
-// in the middle of the 2TiB - 6TiB address range (see issue #372))
-
-#define MI_HINT_BASE ((uintptr_t)2 << 40) // 2TiB start
-#define MI_HINT_AREA ((uintptr_t)4 << 40) // upto 6TiB (since before win8 there is "only" 8TiB available to processes)
-#define MI_HINT_MAX ((uintptr_t)30 << 40) // wrap after 30TiB (area after 32TiB is used for huge OS pages)
-
-void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size)
-{
- if (try_alignment <= 1 || try_alignment > MI_SEGMENT_SIZE) return NULL;
- if (mi_os_mem_config.virtual_address_bits < 46) return NULL; // < 64TiB virtual address space
- size = _mi_align_up(size, MI_SEGMENT_SIZE);
- if (size > 1*MI_GiB) return NULL; // guarantee the chance of fixed valid address is at most 1/(MI_HINT_AREA / 1<<30) = 1/4096.
- #if (MI_SECURE>0)
- size += MI_SEGMENT_SIZE; // put in `MI_SEGMENT_SIZE` virtual gaps between hinted blocks; this splits VLA's but increases guarded areas.
- #endif
-
- uintptr_t hint = mi_atomic_add_acq_rel(&aligned_base, size);
- if (hint == 0 || hint > MI_HINT_MAX) { // wrap or initialize
- uintptr_t init = MI_HINT_BASE;
- #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of aligned allocations unless in debug mode
- uintptr_t r = _mi_heap_random_next(mi_prim_get_default_heap());
- init = init + ((MI_SEGMENT_SIZE * ((r>>17) & 0xFFFFF)) % MI_HINT_AREA); // (randomly 20 bits)*4MiB == 0 to 4TiB
- #endif
- uintptr_t expected = hint + size;
- mi_atomic_cas_strong_acq_rel(&aligned_base, &expected, init);
- hint = mi_atomic_add_acq_rel(&aligned_base, size); // this may still give 0 or > MI_HINT_MAX but that is ok, it is a hint after all
- }
- if (hint%try_alignment != 0) return NULL;
- return (void*)hint;
-}
-#else
void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size) {
MI_UNUSED(try_alignment); MI_UNUSED(size);
return NULL;
}
-#endif
/* -----------------------------------------------------------
@@ -380,12 +336,10 @@ void* _mi_os_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* stats) {
----------------------------------------------------------- */
void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t offset, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* stats) {
- mi_assert(offset <= MI_SEGMENT_SIZE);
mi_assert(offset <= size);
mi_assert((alignment % _mi_os_page_size()) == 0);
*memid = _mi_memid_none();
if (stats == NULL) stats = &_mi_stats_main;
- if (offset > MI_SEGMENT_SIZE) return NULL;
if (offset == 0) {
// regular aligned allocation
return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, stats);
@@ -605,7 +559,6 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) {
#endif
}
end = start + size;
- mi_assert_internal(end % MI_SEGMENT_SIZE == 0);
} while (!mi_atomic_cas_strong_acq_rel(&mi_huge_start, &huge_start, end));
if (total_size != NULL) *total_size = size;
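
`_mi_os_virtual_address_bits` feeds the page-map reservation in `page-map.c` below: with one map byte per arena block, the reservation is `2^(vbits - block_shift)` bytes. A quick check of that arithmetic, assuming the 64 KiB block size implied by the "2 GiB for 128 TiB address space" comment (`MI_ARENA_BLOCK_SHIFT == 16` is an assumption here):

```c
#include <stdio.h>

int main(void) {
  const unsigned vbits = 47;            // capped value used by mi_page_map_init
  const unsigned block_shift = 16;      // assumed: 64 KiB blocks, one map byte each
  const unsigned long long map_size = 1ULL << (vbits - block_shift);
  printf("page map size: %llu bytes (= %llu GiB)\n", map_size, map_size >> 30);  // 2 GiB
  return 0;
}
```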
diff --git a/src/page-map.c b/src/page-map.c
new file mode 100644
index 00000000..d3fcef79
--- /dev/null
+++ b/src/page-map.c
@@ -0,0 +1,90 @@
+/*----------------------------------------------------------------------------
+Copyright (c) 2023-2024, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+#include "mimalloc.h"
+#include "mimalloc/internal.h"
+#include "bitmap.h"
+
+mi_decl_cache_align signed char* _mi_page_map = NULL;
+static bool mi_page_map_all_committed = false;
+static size_t mi_blocks_per_commit_bit = 1;
+static mi_memid_t mi_page_map_memid;
+static mi_bitmap_t mi_page_map_commit;
+
+static bool mi_page_map_init(void) {
+ size_t vbits = _mi_os_virtual_address_bits();
+ if (vbits >= 48) vbits = 47;
+ // 1 byte per block = 2 GiB for 128 TiB address space (48 bit = 256 TiB address space)
+ // 64 KiB for 4 GiB address space (on 32-bit)
+  const size_t page_map_size = (MI_ZU(1) << (vbits - MI_ARENA_BLOCK_SHIFT));
+
+ const size_t min_commit_size = _mi_divide_up(page_map_size,MI_BITMAP_MAX_BITS);
+ mi_blocks_per_commit_bit = mi_block_count_of_size(min_commit_size);
+
+ mi_page_map_all_committed = _mi_os_has_overcommit(); // commit on-access on Linux systems
+ _mi_page_map = (int8_t*)_mi_os_alloc_aligned(page_map_size, 0, mi_page_map_all_committed, true, &mi_page_map_memid, NULL);
+ if (_mi_page_map==NULL) {
+ _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB);
+ return false;
+ }
+ if (mi_page_map_memid.initially_committed && !mi_page_map_memid.initially_zero) {
+    _mi_warning_message("the page map was committed but not zero initialized!\n");
+ _mi_memzero_aligned(_mi_page_map, page_map_size);
+ }
+ return true;
+}
+
+static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* block_count) {
+ size_t page_size;
+ *page_start = mi_page_area(page, &page_size);
+ if (page_size > MI_LARGE_PAGE_SIZE) { page_size = MI_LARGE_PAGE_SIZE; } // furthest interior pointer
+ *block_count = mi_block_count_of_size(page_size);
+ return ((uintptr_t)*page_start >> MI_ARENA_BLOCK_SHIFT);
+}
+
+
+
+void _mi_page_map_register(mi_page_t* page) {
+ if mi_unlikely(_mi_page_map == NULL) {
+ if (!mi_page_map_init()) return;
+ }
+ uint8_t* page_start;
+ size_t block_count;
+ const size_t idx = mi_page_map_get_idx(page, &page_start, &block_count);
+
+ // is the page map area that contains the page address committed?
+ if (!mi_page_map_all_committed) {
+ const size_t commit_bit_count = _mi_divide_up(block_count, mi_blocks_per_commit_bit);
+ const size_t commit_bit_idx = idx / mi_blocks_per_commit_bit;
+ for (size_t i = 0; i < commit_bit_count; i++) { // per bit to avoid crossing over bitmap chunks
+ if (mi_bitmap_is_xsetN(MI_BIT_CLEAR, &mi_page_map_commit, commit_bit_idx + i, 1)) {
+ // this may race, in which case we do multiple commits (which is ok)
+        _mi_os_commit(_mi_page_map + ((commit_bit_idx + i)*mi_blocks_per_commit_bit), mi_blocks_per_commit_bit, NULL, NULL);
+ mi_bitmap_xsetN(MI_BIT_SET, &mi_page_map_commit, commit_bit_idx + i, 1, NULL);
+ }
+ }
+ }
+
+ // set the offsets
+ for (int i = 0; i < block_count; i++) {
+ mi_assert_internal(i < 128);
+ _mi_page_map[idx + i] = (int8_t)(-i-1);
+ }
+}
+
+
+void _mi_page_map_unregister(mi_page_t* page) {
+ mi_assert_internal(_mi_page_map != NULL);
+
+ // get index and count
+ uint8_t* page_start;
+ size_t block_count;
+ const size_t idx = mi_page_map_get_idx(page, &page_start, &block_count);
+
+ // unset the offsets
+ _mi_memzero(_mi_page_map + idx, block_count);
+}
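
Given the `-i-1` offsets written by `_mi_page_map_register`, an interior pointer can be mapped back to the first registered block of its page with one table load and an add. The lookup itself (`_mi_ptr_page`) is not part of this diff, so the following is only a sketch derived from that encoding; the `toy_` names and the 64 KiB block shift are assumptions:

```c
#include <stddef.h>
#include <stdint.h>

#define TOY_BLOCK_SHIFT 16    // assumed block granularity of the page map (64 KiB)

extern int8_t* toy_page_map;  // one signed byte per block, 0 = not registered

// Map an interior pointer back to the first registered block of its page.
// With map[idx] == -i-1 for the i-th block of a page, idx + map[idx] + 1 is the
// page's first block, and 0 still means "unregistered".
void* toy_ptr_to_page(const void* p) {
  const uintptr_t idx = (uintptr_t)p >> TOY_BLOCK_SHIFT;
  const ptrdiff_t ofs = toy_page_map[idx];
  if (ofs == 0) return NULL;                          // address was never registered
  const ptrdiff_t block = (ptrdiff_t)idx + ofs + 1;   // first block of the page
  return (void*)((uintptr_t)block << TOY_BLOCK_SHIFT);
}
```

Entry value 0 doubles as "not registered", which is why the stored offsets start at -1 rather than 0.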
diff --git a/src/page.c b/src/page.c
index c681d6d0..a00ff615 100644
--- a/src/page.c
+++ b/src/page.c
@@ -59,7 +59,7 @@ static inline uint8_t* mi_page_area(const mi_page_t* page) {
static bool mi_page_list_is_valid(mi_page_t* page, mi_block_t* p) {
size_t psize;
- uint8_t* page_area = _mi_segment_page_start(_mi_page_segment(page), page, &psize);
+ uint8_t* page_area = mi_page_area(page, &psize);
mi_block_t* start = (mi_block_t*)page_area;
mi_block_t* end = (mi_block_t*)(page_area + psize);
while(p != NULL) {
@@ -83,10 +83,7 @@ static bool mi_page_is_valid_init(mi_page_t* page) {
mi_assert_internal(page->capacity <= page->reserved);
// const size_t bsize = mi_page_block_size(page);
- mi_segment_t* segment = _mi_page_segment(page);
uint8_t* start = mi_page_start(page);
- mi_assert_internal(start == _mi_segment_page_start(segment,page,NULL));
- mi_assert_internal(page->is_huge == (segment->page_kind == MI_PAGE_HUGE));
//mi_assert_internal(start + page->capacity*page->block_size == page->top);
mi_assert_internal(mi_page_list_is_valid(page,page->free));
@@ -122,15 +119,11 @@ bool _mi_page_is_valid(mi_page_t* page) {
mi_assert_internal(page->keys[0] != 0);
#endif
if (mi_page_heap(page)!=NULL) {
- mi_segment_t* segment = _mi_page_segment(page);
- mi_assert_internal(!_mi_process_is_initialized || segment->thread_id == mi_page_heap(page)->thread_id || segment->thread_id==0);
- #if MI_HUGE_PAGE_ABANDON
- if (segment->page_kind != MI_PAGE_HUGE)
- #endif
+ mi_assert_internal(!_mi_process_is_initialized || page->thread_id == mi_page_heap(page)->thread_id || page->thread_id==0);
{
mi_page_queue_t* pq = mi_page_queue_of(page);
mi_assert_internal(mi_page_queue_contains(pq, page));
- mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_block_size(page) > MI_LARGE_OBJ_SIZE_MAX || mi_page_is_in_full(page));
+ mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_block_size(page) > MI_LARGE_MAX_OBJ_SIZE || mi_page_is_in_full(page));
mi_assert_internal(mi_heap_contains_queue(mi_page_heap(page),pq));
}
}
@@ -274,16 +267,13 @@ static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size
#if !MI_HUGE_PAGE_ABANDON
mi_assert_internal(pq != NULL);
mi_assert_internal(mi_heap_contains_queue(heap, pq));
- mi_assert_internal(page_alignment > 0 || block_size > MI_LARGE_OBJ_SIZE_MAX || block_size == pq->block_size);
+ mi_assert_internal(page_alignment > 0 || block_size > MI_LARGE_MAX_OBJ_SIZE || block_size == pq->block_size);
#endif
- mi_page_t* page = _mi_segment_page_alloc(heap, block_size, page_alignment, &heap->tld->segments, &heap->tld->os);
+ mi_page_t* page = _mi_heap_page_alloc(heap, block_size, page_alignment);
if (page == NULL) {
// this may be out-of-memory, or an abandoned page was reclaimed (and in our queue)
return NULL;
}
- #if MI_HUGE_PAGE_ABANDON
- mi_assert_internal(pq==NULL || _mi_page_segment(page)->page_kind != MI_PAGE_HUGE);
- #endif
mi_assert_internal(pq!=NULL || mi_page_block_size(page) >= block_size);
// a fresh page was found, initialize it
const size_t full_block_size = (pq == NULL || mi_page_is_huge(page) ? mi_page_block_size(page) : block_size); // see also: mi_segment_huge_page_alloc
@@ -384,7 +374,6 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) {
mi_heap_t* pheap = mi_page_heap(page);
// remove from our page list
- mi_segments_tld_t* segments_tld = &pheap->tld->segments;
mi_page_queue_remove(pq, page);
// page is no longer associated with our heap
@@ -399,8 +388,8 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) {
#endif
// and abandon it
- mi_assert_internal(mi_page_heap(page) == NULL);
- _mi_segment_page_abandon(page,segments_tld);
+ mi_assert_internal(mi_page_is_abandoned(page));
+ _mi_arena_page_abandon(page,&pheap->tld);
}
// force abandon a page
@@ -411,8 +400,7 @@ void _mi_page_force_abandon(mi_page_t* page) {
// ensure this page is no longer in the heap delayed free list
_mi_heap_delayed_free_all(heap);
- // We can still access the page meta-info even if it is freed as we ensure
- // in `mi_segment_force_abandon` that the segment is not freed (yet)
+ // TODO: can we still access the page meta-info even if it is freed?
if (page->capacity == 0) return; // it may have been freed now
// and now unlink it from the page queue and abandon (or free)
@@ -433,17 +421,18 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) {
mi_assert_internal(mi_page_all_free(page));
mi_assert_internal(mi_page_thread_free_flag(page)!=MI_DELAYED_FREEING);
+ mi_heap_t* pheap = mi_page_heap(page);
+
// no more aligned blocks in here
mi_page_set_has_aligned(page, false);
// remove from the page list
// (no need to do _mi_heap_delayed_free first as all blocks are already free)
- mi_segments_tld_t* segments_tld = &mi_page_heap(page)->tld->segments;
mi_page_queue_remove(pq, page);
// and free it
mi_page_set_heap(page,NULL);
- _mi_segment_page_free(page, force, segments_tld);
+ _mi_arena_page_free(page, force, &pheap->tld);
}
#define MI_MAX_RETIRE_SIZE MI_LARGE_OBJ_SIZE_MAX // should be less than size for MI_BIN_HUGE
@@ -474,7 +463,7 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept {
if mi_likely( /* bsize < MI_MAX_RETIRE_SIZE && */ !mi_page_queue_is_special(pq)) { // not full or huge queue?
if (pq->last==page && pq->first==page) { // the only page in the queue?
mi_stat_counter_increase(_mi_stats_main.page_no_retire,1);
- page->retire_expire = (bsize <= MI_SMALL_OBJ_SIZE_MAX ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4);
+ page->retire_expire = (bsize <= MI_SMALL_MAX_OBJ_SIZE ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4);
mi_heap_t* heap = mi_page_heap(page);
mi_assert_internal(pq >= heap->pages);
const size_t index = pq - heap->pages;
@@ -639,7 +628,7 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld)
size_t page_size;
//uint8_t* page_start =
- _mi_segment_page_start(_mi_page_segment(page), page, &page_size);
+ mi_page_area(page, &page_size);
mi_stat_counter_increase(tld->stats.pages_extended, 1);
// calculate the extend count
@@ -676,15 +665,13 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld)
// Initialize a fresh page
static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi_tld_t* tld) {
mi_assert(page != NULL);
- mi_segment_t* segment = _mi_page_segment(page);
- mi_assert(segment != NULL);
mi_assert_internal(block_size > 0);
// set fields
mi_page_set_heap(page, heap);
page->block_size = block_size;
size_t page_size;
- page->page_start = _mi_segment_page_start(segment, page, &page_size);
- mi_track_mem_noaccess(page->page_start,page_size);
+ uint8_t* page_start = mi_page_area(page, &page_size);
+ mi_track_mem_noaccess(page_start,page_size);
mi_assert_internal(page_size / block_size < (1L<<16));
page->reserved = (uint16_t)(page_size / block_size);
mi_assert_internal(page->reserved > 0);
@@ -692,15 +679,15 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
page->keys[0] = _mi_heap_random_next(heap);
page->keys[1] = _mi_heap_random_next(heap);
#endif
- page->free_is_zero = page->is_zero_init;
+ page->free_is_zero = page->memid.initially_zero;
#if MI_DEBUG>2
- if (page->is_zero_init) {
+ if (page->memid.initially_zero) {
mi_track_mem_defined(page->page_start, page_size);
- mi_assert_expensive(mi_mem_is_zero(page->page_start, page_size));
+ mi_assert_expensive(mi_mem_is_zero(page_start, page_size));
}
#endif
if (block_size > 0 && _mi_is_power_of_two(block_size)) {
- page->block_size_shift = (uint8_t)(mi_ctz((uintptr_t)block_size));
+ page->block_size_shift = (uint8_t)mi_ctz(block_size);
}
else {
page->block_size_shift = 0;
@@ -734,13 +721,6 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
// search for a best next page to use for at most N pages (often cut short if immediate blocks are available)
#define MI_MAX_CANDIDATE_SEARCH (8)
-// is the page not yet used up to its reserved space?
-static bool mi_page_is_expandable(const mi_page_t* page) {
- mi_assert_internal(page != NULL);
- mi_assert_internal(page->capacity <= page->reserved);
- return (page->capacity < page->reserved);
-}
-
// Find a page with free blocks of `page->block_size`.
static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* pq, bool first_try)
@@ -907,7 +887,7 @@ static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_a
#if MI_HUGE_PAGE_ABANDON
mi_page_queue_t* pq = NULL;
#else
- mi_page_queue_t* pq = mi_page_queue(heap, MI_LARGE_OBJ_SIZE_MAX+1); // always in the huge queue regardless of the block size
+ mi_page_queue_t* pq = mi_page_queue(heap, MI_LARGE_MAX_OBJ_SIZE+1); // always in the huge queue regardless of the block size
mi_assert_internal(mi_page_queue_is_huge(pq));
#endif
mi_page_t* page = mi_page_fresh_alloc(heap, pq, block_size, page_alignment);
@@ -915,10 +895,9 @@ static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_a
mi_assert_internal(mi_page_block_size(page) >= size);
mi_assert_internal(mi_page_immediate_available(page));
mi_assert_internal(mi_page_is_huge(page));
- mi_assert_internal(_mi_page_segment(page)->page_kind == MI_PAGE_HUGE);
- mi_assert_internal(_mi_page_segment(page)->used==1);
+ mi_assert_internal(mi_page_is_singleton(page));
#if MI_HUGE_PAGE_ABANDON
- mi_assert_internal(_mi_page_segment(page)->thread_id==0); // abandoned, not in the huge queue
+ mi_assert_internal(mi_page_is_abandoned(page));
mi_page_set_heap(page, NULL);
#endif
mi_heap_stat_increase(heap, huge, mi_page_block_size(page));
@@ -933,7 +912,7 @@ static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_a
static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignment) mi_attr_noexcept {
// huge allocation?
const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size`
- if mi_unlikely(req_size > (MI_LARGE_OBJ_SIZE_MAX - MI_PADDING_SIZE) || huge_alignment > 0) {
+ if mi_unlikely(req_size > (MI_LARGE_MAX_OBJ_SIZE - MI_PADDING_SIZE) || huge_alignment > 0) {
if mi_unlikely(req_size > MI_MAX_ALLOC_SIZE) {
_mi_error_message(EOVERFLOW, "allocation request is too large (%zu bytes)\n", req_size);
return NULL;
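
`mi_page_init` above stores `block_size_shift = ctz(block_size)` for power-of-two block sizes, which lets `_mi_page_ptr_unalign` in `free.c` replace a modulo by a mask. A small standalone illustration of that equivalence:

```c
#include <assert.h>
#include <stdint.h>

// For power-of-two block sizes, `diff % block_size` equals `diff & (block_size-1)`,
// with the shift obtained once via count-trailing-zeros.
static size_t unalign_adjust(size_t diff, size_t block_size) {
  if (block_size != 0 && (block_size & (block_size - 1)) == 0) {
    const unsigned shift = (unsigned)__builtin_ctzll((unsigned long long)block_size);
    return diff & (((size_t)1 << shift) - 1);
  }
  return diff % block_size;   // generic fallback for non power-of-two sizes
}

int main(void) {
  assert(unalign_adjust(100, 32) == 100 % 32);
  assert(unalign_adjust(100, 24) == 100 % 24);
  return 0;
}
```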
diff --git a/src/static.c b/src/static.c
index 9e06ce05..b34d5d42 100644
--- a/src/static.c
+++ b/src/static.c
@@ -20,7 +20,7 @@ terms of the MIT license. A copy of the license can be found in the file
// containing the whole library. If it is linked first
// it will override all the standard library allocation
// functions (on Unix's).
-#include "alloc.c" // includes alloc-override.c
+#include "alloc.c" // includes alloc-override.c and free.c
#include "alloc-aligned.c"
#include "alloc-posix.c"
#include "arena.c"
@@ -31,6 +31,7 @@ terms of the MIT license. A copy of the license can be found in the file
#include "options.c"
#include "os.c"
#include "page.c" // includes page-queue.c
+#include "page-map.c"
#include "random.c"
#include "segment.c"
#include "segment-map.c"
diff --git a/src/xbitmap.c b/src/xbitmap.c
deleted file mode 100644
index 68525c84..00000000
--- a/src/xbitmap.c
+++ /dev/null
@@ -1,599 +0,0 @@
-/* ----------------------------------------------------------------------------
-Copyright (c) 2019-2024 Microsoft Research, Daan Leijen
-This is free software; you can redistribute it and/or modify it under the
-terms of the MIT license. A copy of the license can be found in the file
-"LICENSE" at the root of this distribution.
------------------------------------------------------------------------------*/
-
-/* ----------------------------------------------------------------------------
-Concurrent bitmap that can set/reset sequences of bits atomically
----------------------------------------------------------------------------- */
-
-#include "mimalloc.h"
-#include "mimalloc/internal.h"
-#include "mimalloc/bits.h"
-#include "xbitmap.h"
-
-/* --------------------------------------------------------------------------------
- bfields
--------------------------------------------------------------------------------- */
-
-static inline size_t mi_bfield_ctz(mi_bfield_t x) {
- return mi_ctz(x);
-}
-
-static inline size_t mi_bfield_clz(mi_bfield_t x) {
- return mi_clz(x);
-}
-
-// find the least significant bit that is set (i.e. count trailing zero's)
-// return false if `x==0` (with `*idx` undefined) and true otherwise,
-// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`).
-static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) {
- return mi_bsf(x,idx);
-}
-
-static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) {
- return mi_rotr(x,r);
-}
-
-// Set/clear a bit atomically. Returns `true` if the bit transitioned from 0 to 1 (or 1 to 0).
-static inline bool mi_bfield_atomic_xset(mi_bit_t set, _Atomic(mi_bfield_t)*b, size_t idx) {
- mi_assert_internal(idx < MI_BFIELD_BITS);
-  const mi_bfield_t mask = ((mi_bfield_t)1)<<idx;
-  if (set) {
-    const mi_bfield_t old = mi_atomic_or_acq_rel(b, mask);
-    return ((old&mask) == 0);     // did the bit transition from 0 to 1?
-  }
-  else {
-    const mi_bfield_t old = mi_atomic_and_acq_rel(b, ~mask);
-    return ((old&mask) == mask);  // did the bit transition from 1 to 0?
-  }
-}
-
-// Try to set/clear a bit within a chunk.
-static bool mi_bitmap_chunk_try_xset(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx) {
-  mi_assert_internal(cidx < MI_BITMAP_CHUNK_BITS);
-  const size_t i   = cidx / MI_BFIELD_BITS;
-  const size_t idx = cidx % MI_BFIELD_BITS;
-  return mi_bfield_atomic_try_xset(set, &chunk->bfields[i], idx);
-}
-
-static bool mi_bitmap_chunk_try_xset8(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t byte_idx ) {
- mi_assert_internal(byte_idx*8 < MI_BITMAP_CHUNK_BITS);
- const size_t i = byte_idx / MI_BFIELD_SIZE;
- const size_t ibyte_idx = byte_idx % MI_BFIELD_SIZE;
- return mi_bfield_atomic_try_xset8( set, &chunk->bfields[i], ibyte_idx);
-}
-
-// Set/clear a sequence of `n` bits within a chunk. Returns true if all bits transitioned from 0 to 1 (or 1 to 0)
-static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, bool* palready_xset) {
- mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS);
- mi_assert_internal(n>0);
- bool all_transition = true;
- bool all_already_xset = true;
- size_t idx = cidx % MI_BFIELD_BITS;
- size_t field = cidx / MI_BFIELD_BITS;
- while (n > 0) {
- size_t m = MI_BFIELD_BITS - idx; // m is the bits to xset in this field
- if (m > n) { m = n; }
- mi_assert_internal(idx + m <= MI_BFIELD_BITS);
- mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS);
-    const size_t mask = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<<m)-1) << idx);
-    bool already_xset;
-    all_transition = all_transition && mi_bfield_atomic_xset_mask(set, &chunk->bfields[field], mask, &already_xset);
- all_already_xset = all_already_xset && already_xset;
- // next field
- field++;
- idx = 0;
- n -= m;
- }
- *palready_xset = all_already_xset;
- return all_transition;
-}
-
-// Check if a sequence of `n` bits within a chunk are all set/cleared.
-static bool mi_bitmap_chunk_is_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) {
- mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS);
- mi_assert_internal(n>0);
- bool all_xset = true;
- size_t idx = cidx % MI_BFIELD_BITS;
- size_t field = cidx / MI_BFIELD_BITS;
- while (n > 0) {
- size_t m = MI_BFIELD_BITS - idx; // m is the bits to xset in this field
- if (m > n) { m = n; }
- mi_assert_internal(idx + m <= MI_BFIELD_BITS);
- mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS);
-    const size_t mask = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<<m)-1) << idx);
-    all_xset = all_xset && mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[field], mask);
- // next field
- field++;
- idx = 0;
- n -= m;
- }
- return all_xset;
-}
-
-// Try to atomically set/clear a sequence of `n` bits within a chunk. Returns true if all bits transitioned from 0 to 1 (or 1 to 0),
-// and false otherwise leaving all bit fields as is.
-static bool mi_bitmap_chunk_try_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) {
- mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS);
- mi_assert_internal(n>0);
- if (n==0) return true;
- size_t start_idx = cidx % MI_BFIELD_BITS;
- size_t start_field = cidx / MI_BFIELD_BITS;
- size_t end_field = MI_BITMAP_CHUNK_FIELDS;
- size_t mask_mid = 0;
- size_t mask_end = 0;
-
- // first field
- size_t field = start_field;
- size_t m = MI_BFIELD_BITS - start_idx; // m is the bits to xset in this field
- if (m > n) { m = n; }
- mi_assert_internal(start_idx + m <= MI_BFIELD_BITS);
- mi_assert_internal(start_field < MI_BITMAP_CHUNK_FIELDS);
-  const size_t mask_start = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<<m)-1) << start_idx);
-  if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_start)) return false;
-
- // done?
- n -= m;
- if (n==0) return true;
-
- // continue with mid fields and last field: if these fail we need to recover by unsetting previous fields
-
- // mid fields
- while (n >= MI_BFIELD_BITS) {
- field++;
- mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS);
- mask_mid = ~MI_ZU(0);
- if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_mid)) goto restore;
- n -= MI_BFIELD_BITS;
- }
-
- // last field
- if (n > 0) {
- mi_assert_internal(n < MI_BFIELD_BITS);
- field++;
- mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS);
- end_field = field;
-    mask_end = (MI_ZU(1)<<n)-1;
-    if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_end)) goto restore;
- }
-
- return true;
-
-restore:
- // field is on the field that failed to set atomically; we need to restore all previous fields
- mi_assert_internal(field > start_field);
- while( field > start_field) {
- field--;
- const size_t mask = (field == start_field ? mask_start : (field == end_field ? mask_end : mask_mid));
- bool already_xset;
- mi_bfield_atomic_xset_mask(!set, &chunk->bfields[field], mask, &already_xset);
- }
- return false;
-}
-
-
-// find least 1-bit in a chunk and try unset it atomically
-// set `*pidx` to the bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success.
-// todo: try neon version
-static inline bool mi_bitmap_chunk_find_and_try_clear(mi_bitmap_chunk_t* chunk, size_t* pidx) {
- #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256)
- while(true) {
- const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields);
- if (_mm256_testz_si256(vec,vec)) return false; // vec == 0 ?
- const __m256i vcmp = _mm256_cmpeq_epi64(vec, _mm256_setzero_si256()); // (elem64 == 0 ? -1 : 0)
- const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits in the mask will be all 1 or all 0)
- mi_assert_internal(mask != 0);
- const size_t chunk_idx = _tzcnt_u32(mask) / 8; // tzcnt == 0, 8, 16, or 24
- mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS);
- size_t cidx;
- if (mi_bfield_find_least_bit(chunk->bfields[chunk_idx],&cidx)) { // find the bit that is set
- if mi_likely(mi_bfield_atomic_try_xset(MI_BIT_CLEAR,&chunk->bfields[chunk_idx], cidx)) { // unset atomically
- *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx;
- mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS);
- return true;
- }
- }
- // try again
- }
- #else
- size_t idx;
- for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) {
- size_t idx;
- if mi_unlikely(mi_bfield_find_least_bit(chunk->bfields[i],&idx)) { // find least 1-bit
- if mi_likely(mi_bfield_atomic_try_xset(MI_BIT_CLEAR,&chunk->bfields[i],idx)) { // try unset atomically
- *pidx = (i*MI_BFIELD_BITS + idx);
- mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS);
- return true;
- }
- }
- }
- return false;
- #endif
-}
-
-
-// find least byte in a chunk with all bits set, and try unset it atomically
-// set `*pidx` to its bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success.
-// todo: try neon version
-static inline bool mi_bitmap_chunk_find_and_try_clear8(mi_bitmap_chunk_t* chunk, size_t* pidx) {
- #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256)
- while(true) {
- const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields);
- const __m256i vcmp = _mm256_cmpeq_epi8(vec, _mm256_set1_epi64x(~0)); // (byte == ~0 ? -1 : 0)
- const uint32_t mask = _mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte
- if (mask == 0) return false;
- const size_t i = _tzcnt_u32(mask);
- mi_assert_internal(8*i < MI_BITMAP_CHUNK_BITS);
- const size_t chunk_idx = i / MI_BFIELD_SIZE;
- const size_t byte_idx = i % MI_BFIELD_SIZE;
- if mi_likely(mi_bfield_atomic_try_xset8(MI_BIT_CLEAR,&chunk->bfields[chunk_idx],byte_idx)) { // try to unset atomically
- *pidx = (chunk_idx*MI_BFIELD_BITS) + (byte_idx*8);
- mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS);
- return true;
- }
- // try again
- }
- #else
- size_t idx;
- for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) {
- const mi_bfield_t x = chunk->bfields[i];
- // has_set8 has low bit in each byte set if the byte in x == 0xFF
- const mi_bfield_t has_set8 = ((~x - MI_BFIELD_LO_BIT8) & // high bit set if byte in x is 0xFF or < 0x7F
- (x & MI_BFIELD_HI_BIT8)) // high bit set if byte in x is >= 0x80
- >> 7; // shift high bit to low bit
- size_t idx;
- if mi_unlikely(mi_bfield_find_least_bit(has_set8,&idx)) { // find least 1-bit
- mi_assert_internal(idx <= (MI_BFIELD_BITS - 8));
- mi_assert_internal((idx%8)==0);
- const size_t byte_idx = idx/8;
- if mi_likely(mi_bfield_atomic_try_xset8(MI_BIT_CLEAR,&chunk->bfields[i],byte_idx)) { // unset the byte atomically
- *pidx = (i*MI_BFIELD_BITS) + idx;
- mi_assert_internal(*pidx + 8 <= MI_BITMAP_CHUNK_BITS);
- return true;
- }
- // else continue
- }
- }
- return false;
- #endif
-}
-
-
-// find a sequence of `n` bits in a chunk with all `n` bits set, and try unset it atomically
-// set `*pidx` to its bit index (0 <= *pidx <= MI_BITMAP_CHUNK_BITS - n) on success.
-// todo: try avx2 and neon version
-// todo: allow spanning across bfield boundaries?
-static inline bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, size_t n, size_t* pidx) {
- if (n == 0 || n > MI_BFIELD_BITS) return false; // TODO: allow larger?
- const mi_bfield_t mask = (n==MI_BFIELD_BITS ? ~((mi_bfield_t)0) : (((mi_bfield_t)1) << n)-1);
- for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) {
- mi_bfield_t b = chunk->bfields[i];
- size_t bshift = 0;
- size_t idx;
- while (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit
- b >>= idx;
- bshift += idx;
- if (bshift + n >= MI_BFIELD_BITS) break;
-
- if ((b&mask) == mask) { // found a match
- mi_assert_internal( ((mask << bshift) >> bshift) == mask );
-        if mi_likely(mi_bfield_atomic_try_xset_mask(MI_BIT_CLEAR, &chunk->bfields[i], mask<<bshift)) {
-          *pidx = (i*MI_BFIELD_BITS) + bshift;
-          mi_assert_internal(*pidx <= MI_BITMAP_CHUNK_BITS - n);
-          return true;
-        }
-        else {
-          // the atomic clear failed; reload the field and retry at this offset
-          b = (chunk->bfields[i] >> bshift);
- }
- }
- else {
- // advance
- const size_t ones = mi_bfield_ctz(~b); // skip all ones (since it didn't fit the mask)
- mi_assert_internal(ones>0);
- bshift += ones;
- b >>= ones;
- }
- }
- }
- return false;
-}
-
-
-// are all bits in a bitmap chunk set?
-static bool mi_bitmap_chunk_all_are_set(mi_bitmap_chunk_t* chunk) {
- #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256)
- const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields);
- return _mm256_test_all_ones(vec);
- #else
- // written like this for vectorization
- mi_bfield_t x = chunk->bfields[0];
- for(int i = 1; i < MI_BITMAP_CHUNK_FIELDS; i++) {
- x = x & chunk->bfields[i];
- }
- return (~x == 0);
- #endif
-}
-
-// are all bits in a bitmap chunk clear?
-static bool mi_bitmap_chunk_all_are_clear(mi_bitmap_chunk_t* chunk) {
- #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256)
- const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields);
- return _mm256_testz_si256( vec, vec );
- #else
- // written like this for vectorization
- mi_bfield_t x = chunk->bfields[0];
- for(int i = 1; i < MI_BITMAP_CHUNK_FIELDS; i++) {
- x = x | chunk->bfields[i];
- }
- return (x == 0);
- #endif
-}
-
-/* --------------------------------------------------------------------------------
- bitmap
--------------------------------------------------------------------------------- */
-// initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true
-void mi_bitmap_init(mi_bitmap_t* bitmap, bool already_zero) {
- if (!already_zero) {
- _mi_memzero_aligned(bitmap, sizeof(*bitmap));
- }
-}
-
-// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread.
-void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) {
- mi_assert_internal(n>0);
- mi_assert_internal(idx + n<=MI_BITMAP_MAX_BITS);
-
- // first chunk
- size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS;
- const size_t cidx = idx % MI_BITMAP_CHUNK_BITS;
- size_t m = MI_BITMAP_CHUNK_BITS - cidx;
- if (m > n) { m = n; }
- bool already_xset;
- mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], cidx, m, &already_xset);
-
- // n can be large so use memset for efficiency for all in-between chunks
- chunk_idx++;
- n -= m;
- const size_t mid_chunks = n / MI_BITMAP_CHUNK_BITS;
- if (mid_chunks > 0) {
- _mi_memset(&bitmap->chunks[chunk_idx], (set ? ~0 : 0), mid_chunks * (MI_BITMAP_CHUNK_BITS/8));
- chunk_idx += mid_chunks;
- n -= mid_chunks * MI_BITMAP_CHUNK_BITS;
- }
-
- // last chunk
- if (n > 0) {
- mi_assert_internal(n < MI_BITMAP_CHUNK_BITS);
- mi_assert_internal(chunk_idx < MI_BFIELD_BITS); // chunk_idx indexes into chunks[MI_BFIELD_BITS]
- mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], 0, n, &already_xset);
- }
-}
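
Worked example of the split performed above (assuming 256-bit chunks): setting n = 600 bits starting at idx = 300 touches a 212-bit tail of chunk 1, one full middle chunk handled by the memset path, and a 132-bit head of chunk 3. A standalone sketch that prints exactly this split:

#include <stdio.h>
#include <stddef.h>

#define CHUNK_BITS 256

int main(void) {
  size_t idx = 300, n = 600;            // example range: bits [300, 900)
  size_t chunk_idx = idx / CHUNK_BITS;  // = 1
  size_t cidx = idx % CHUNK_BITS;       // = 44
  size_t m = CHUNK_BITS - cidx;         // = 212 bits remain in the first chunk
  if (m > n) m = n;
  printf("first: chunk %zu, offset %zu, %zu bits\n", chunk_idx, cidx, m);
  chunk_idx++; n -= m;                              // n = 388
  size_t mid_chunks = n / CHUNK_BITS;               // = 1 full chunk (memset path)
  printf("middle: %zu full chunk(s) from chunk %zu\n", mid_chunks, chunk_idx);
  chunk_idx += mid_chunks; n -= mid_chunks * CHUNK_BITS;  // n = 132
  if (n > 0) printf("last: chunk %zu, %zu bits\n", chunk_idx, n);
  return 0;
}
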
-
-
-// Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0),
-// and false otherwise leaving the bitmask as is.
-bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) {
- mi_assert_internal(idx < MI_BITMAP_MAX_BITS);
- const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS;
- const size_t cidx = idx % MI_BITMAP_CHUNK_BITS;
- return mi_bitmap_chunk_try_xset( set, &bitmap->chunks[chunk_idx], cidx);
-}
-
-// Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0)
-// and false otherwise leaving the bitmask as is.
-bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) {
- mi_assert_internal(idx < MI_BITMAP_MAX_BITS);
- mi_assert_internal(idx%8 == 0);
- const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS;
- const size_t byte_idx = (idx % MI_BITMAP_CHUNK_BITS)/8;
- return mi_bitmap_chunk_try_xset8( set, &bitmap->chunks[chunk_idx],byte_idx);
-}
-
-// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's)
-// and false otherwise leaving the bitmask as is.
-// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)!
-bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) {
- mi_assert_internal(n>0);
- mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS);
- if (n==1) { return mi_bitmap_try_xset(set,bitmap,idx); }
- if (n==8) { return mi_bitmap_try_xset8(set,bitmap,idx); }
-
- mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS);
- const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS;
- const size_t cidx = idx % MI_BITMAP_CHUNK_BITS;
- mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now)
- if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia
- return mi_bitmap_chunk_try_xsetN( set, &bitmap->chunks[chunk_idx], cidx, n);
-}
-
-// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's).
-// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)!
-bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* already_xset) {
- mi_assert_internal(n>0);
- mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS);
- bool local_already_xset;
- if (already_xset==NULL) { already_xset = &local_already_xset; }
- // if (n==1) { return mi_bitmap_xset(set, bitmap, idx); }
- // if (n==8) { return mi_bitmap_xset8(set, bitmap, idx); }
-
- mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS);
- const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS;
- const size_t cidx = idx % MI_BITMAP_CHUNK_BITS;
- mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now)
- if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia
- return mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n, already_xset);
-}
-
-// Is a sequence of n bits already all set/cleared?
-bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) {
- mi_assert_internal(n>0);
- mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS);
- mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS);
- const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS;
- const size_t cidx = idx % MI_BITMAP_CHUNK_BITS;
- mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now)
- if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia
- return mi_bitmap_chunk_is_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n);
-}
-
-
-#define mi_bitmap_forall_set_chunks(bitmap,start,decl_chunk_idx) \
- { size_t _set_idx; \
- size_t _start = start % MI_BFIELD_BITS; \
- mi_bfield_t _any_set = mi_bfield_rotate_right(bitmap->any_set, _start); \
- while (mi_bfield_find_least_bit(_any_set,&_set_idx)) { \
- decl_chunk_idx = (_set_idx + _start) % MI_BFIELD_BITS;
-
-#define mi_bitmap_forall_set_chunks_end() \
- _start += _set_idx+1; /* so chunk_idx stays valid */ \
- _any_set >>= _set_idx; /* skip scanned bits (and avoid UB with (idx+1)) */ \
- _any_set >>= 1; \
- } \
- }
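
The macro pair expands to a wrap-around scan over the set bits of the per-chunk summary word `any_set`, starting at a caller-chosen bit so concurrent searchers spread out. A rough, self-contained sketch of what a use of the pair expands to, for a 64-bit summary word (names and the __builtin_ctzll builtin are assumptions, not this diff's code):

#include <stdint.h>
#include <stdio.h>

static uint64_t rotr64(uint64_t x, unsigned r) {
  r &= 63;
  return (r == 0 ? x : (x >> r) | (x << (64 - r)));
}

// Visit every set bit of `any_set`, starting the scan at bit `start` and wrapping around.
static void forall_set_chunks(uint64_t any_set, unsigned start) {
  unsigned _start = start % 64;
  uint64_t _any = rotr64(any_set, _start);
  while (_any != 0) {
    unsigned set_idx   = (unsigned)__builtin_ctzll(_any);
    unsigned chunk_idx = (set_idx + _start) % 64;
    printf("visit chunk %u\n", chunk_idx);   // the macro's user-supplied loop body goes here
    _start += set_idx + 1;                   // keep the chunk_idx mapping valid
    _any >>= set_idx; _any >>= 1;            // two shifts avoid UB when set_idx+1 == 64
  }
}
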
-
-// Find a set bit in a bitmap and atomically unset it. Returns true on success,
-// and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`.
- // The low `MI_BFIELD_BITS` bits of `start` are used to set the start point of the search
-// (to reduce thread contention).
-bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t* pidx, size_t start) {
- mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx)
- {
- size_t cidx;
- if mi_likely(mi_bitmap_chunk_find_and_try_clear(&bitmap->chunks[chunk_idx],&cidx)) {
- *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx;
- mi_assert_internal(*pidx < MI_BITMAP_MAX_BITS);
- return true;
- }
- else {
- // we may find that all are unset only on a second iteration but that is ok as
- // _any_set is a conservative approximation.
- if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) {
- mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx);
- }
- }
- }
- mi_bitmap_forall_set_chunks_end();
- return false;
-}
-
-
-// Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero.
-// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`.
-bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t start, size_t* pidx ) {
- mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx)
- {
- size_t cidx;
- if mi_likely(mi_bitmap_chunk_find_and_try_clear8(&bitmap->chunks[chunk_idx],&cidx)) {
- *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx;
- mi_assert_internal(*pidx <= MI_BITMAP_MAX_BITS-8);
- mi_assert_internal((*pidx % 8) == 0);
- return true;
- }
- else {
- if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) {
- mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx);
- }
- }
- }
- mi_bitmap_forall_set_chunks_end();
- return false;
-}
-
-// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all.
-// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`.
-bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t start, size_t n, size_t* pidx ) {
- // TODO: allow at least MI_BITMAP_CHUNK_BITS and probably larger
- // TODO: allow spanning across chunk boundaries
- if (n == 0 || n > MI_BFIELD_BITS) return false;
- mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx)
- {
- size_t cidx;
- if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx],n,&cidx)) {
- *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx;
- mi_assert_internal(*pidx <= MI_BITMAP_MAX_BITS-n);
- return true;
- }
- else {
- if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) {
- mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx);
- }
- }
- }
- mi_bitmap_forall_set_chunks_end();
- return false;
-}
diff --git a/src/xbitmap.h b/src/xbitmap.h
deleted file mode 100644
index 869db2a2..00000000
--- a/src/xbitmap.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/* ----------------------------------------------------------------------------
-Copyright (c) 2019-2023 Microsoft Research, Daan Leijen
-This is free software; you can redistribute it and/or modify it under the
-terms of the MIT license. A copy of the license can be found in the file
-"LICENSE" at the root of this distribution.
------------------------------------------------------------------------------*/
-
-/* ----------------------------------------------------------------------------
-Concurrent bitmap that can set/reset sequences of bits atomically
----------------------------------------------------------------------------- */
-#pragma once
-#ifndef MI_XBITMAP_H
-#define MI_XBITMAP_H
-
-/* --------------------------------------------------------------------------------
- Definitions
--------------------------------------------------------------------------------- */
-
-typedef size_t mi_bfield_t;
-
-#define MI_BFIELD_BITS_SHIFT (MI_SIZE_SHIFT+3)
-#define MI_BFIELD_BITS (1 << MI_BFIELD_BITS_SHIFT)
-#define MI_BFIELD_SIZE (MI_BFIELD_BITS/8)
-#define MI_BFIELD_BITS_MOD_MASK (MI_BFIELD_BITS - 1)
-#define MI_BFIELD_LO_BIT8 ((~((mi_bfield_t)0))/0xFF) // 0x01010101 ..
-#define MI_BFIELD_HI_BIT8 (MI_BFIELD_LO_BIT8 << 7) // 0x80808080 ..
-
-#define MI_BITMAP_CHUNK_BITS_SHIFT (8) // 2^8 = 256 bits per chunk
-#define MI_BITMAP_CHUNK_BITS (1 << MI_BITMAP_CHUNK_BITS_SHIFT)
-#define MI_BITMAP_CHUNK_FIELDS (MI_BITMAP_CHUNK_BITS / MI_BFIELD_BITS)
-#define MI_BITMAP_CHUNK_BITS_MOD_MASK (MI_BITMAP_CHUNK_BITS - 1)
-
-typedef mi_decl_align(32) struct mi_bitmap_chunk_s {
- _Atomic(mi_bfield_t) bfields[MI_BITMAP_CHUNK_FIELDS];
-} mi_bitmap_chunk_t;
-
-
-typedef mi_decl_align(32) struct mi_bitmap_s {
- mi_bitmap_chunk_t chunks[MI_BFIELD_BITS];
- _Atomic(mi_bfield_t) any_set;
-} mi_bitmap_t;
-
-#define MI_BITMAP_MAX_BITS (MI_BFIELD_BITS * MI_BITMAP_CHUNK_BITS) // 16k bits on 64bit, 8k bits on 32bit
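
Spelled out: on a 64-bit target MI_BFIELD_BITS = 64, so a chunk holds 256/64 = 4 bfields and the bitmap holds 64 chunks, giving 64 x 256 = 16384 (16k) bits; on a 32-bit target MI_BFIELD_BITS = 32, a chunk still holds 256 bits (now 8 bfields) but there are only 32 chunks, giving 32 x 256 = 8192 (8k) bits, matching the comment above.
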
-
-/* --------------------------------------------------------------------------------
- Bitmap
--------------------------------------------------------------------------------- */
-
-typedef bool mi_bit_t;
-#define MI_BIT_SET (true)
-#define MI_BIT_CLEAR (false)
-
-// initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true
-void mi_bitmap_init(mi_bitmap_t* bitmap, bool already_zero);
-
-// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread.
-void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n);
-
-// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 0's to 1's (or all 1's to 0's).
-// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)!
-// If `already_xset` is not NULL, it is set to true if all the bits were already all set/cleared.
-bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* already_xset);
-
-// Is a sequence of n bits already all set/cleared?
-bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n);
-
-// Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0)
-// and false otherwise leaving the bitmask as is.
-mi_decl_nodiscard bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx);
-
-// Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0)
-// and false otherwise leaving the bitmask as is.
-mi_decl_nodiscard bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx);
-
-// Try to set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's)
-// and false otherwise leaving the bitmask as is.
-// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)!
-mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n);
-
-// Find a set bit in a bitmap and atomically unset it. Returns true on success,
-// and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`.
- // The low `MI_BFIELD_BITS` bits of `start` are used to set the start point of the search
-// (to reduce thread contention).
-mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t* pidx, size_t start);
-
-// Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero.
-// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`.
-mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t start, size_t* pidx );
-
-// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all.
-// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`.
-mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t start, size_t n, size_t* pidx );
-
-#endif // MI_XBITMAP_H
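
For context on the API removed by this diff, a hedged usage sketch (signatures as declared in the header above; the free-slice framing and variable names are illustrative only, and it assumes the now-removed xbitmap.h declarations are in scope):

// One bit per free slice, e.g. inside an arena.
static mi_bitmap_t free_slices;

static void example(void) {
  mi_bitmap_init(&free_slices, false);                        // zero the bitmap
  mi_bitmap_unsafe_xsetN(MI_BIT_SET, &free_slices, 0, 1024);  // single-threaded setup: mark 1024 slices free

  size_t idx;
  if (mi_bitmap_try_find_and_clearN(&free_slices, 0 /*start hint*/, 8, &idx)) {
    // claimed slices [idx, idx+8) atomically; release them again:
    mi_bitmap_xsetN(MI_BIT_SET, &free_slices, idx, 8, NULL);
  }
}
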