From 46afcbe06cd0000eeda5400fba7eb23453237b8c Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 29 Nov 2024 14:28:34 -0800 Subject: [PATCH] wip: further progress on segment removal; arena allocation --- include/mimalloc/internal.h | 7 +- include/mimalloc/types.h | 17 +- src/arena-page.c | 20 ++ src/arena.c | 368 ++++++++++++++++++++++++++---------- src/bitmap.c | 16 +- src/bitmap.h | 6 +- src/page-map.c | 8 +- src/page.c | 56 +++--- 8 files changed, 344 insertions(+), 154 deletions(-) create mode 100644 src/arena-page.c diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 2713c0ac..d60b0c15 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -137,6 +137,9 @@ bool _mi_arena_contains(const void* p); void _mi_arenas_collect(bool force_purge, mi_stats_t* stats); void _mi_arena_unsafe_destroy_all(mi_stats_t* stats); +mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment); +void _mi_arena_page_abandon(mi_page_t* page, mi_tld_t* tld); +void _mi_arena_page_free(mi_page_t* page, mi_tld_t* tld); void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid); void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size); @@ -181,6 +184,7 @@ void _mi_deferred_free(mi_heap_t* heap, bool force); void _mi_page_free_collect(mi_page_t* page,bool force); void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page); // callback from segments +void _mi_page_init(mi_heap_t* heap, mi_page_t* page); size_t _mi_bin_size(uint8_t bin); // for stats uint8_t _mi_bin(size_t size); // for stats @@ -453,8 +457,7 @@ static inline size_t mi_page_block_size(const mi_page_t* page) { // Page start static inline uint8_t* mi_page_start(const mi_page_t* page) { - mi_assert(sizeof(mi_page_t) <= MI_PAGE_INFO_SIZE); - return (uint8_t*)page + MI_PAGE_INFO_SIZE; + return page->page_start; } static inline uint8_t* mi_page_area(const mi_page_t* page, size_t* size) { diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 98664020..591cb603 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -127,8 +127,11 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_ARENA_BLOCK_ALIGN (MI_ARENA_BLOCK_SIZE) #define MI_BITMAP_CHUNK_BITS (MI_ZU(1) << MI_BITMAP_CHUNK_BITS_SHIFT) -#define MI_ARENA_MIN_OBJ_SIZE MI_ARENA_BLOCK_SIZE -#define MI_ARENA_MAX_OBJ_SIZE (MI_BITMAP_CHUNK_BITS * MI_ARENA_BLOCK_SIZE) // for now, cannot cross chunk boundaries +#define MI_ARENA_MIN_OBJ_BLOCKS (1) +#define MI_ARENA_MAX_OBJ_BLOCKS (MI_BITMAP_CHUNK_BITS) // for now, cannot cross chunk boundaries + +#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_MIN_OBJ_BLOCKS * MI_ARENA_BLOCK_SIZE) +#define MI_ARENA_MAX_OBJ_SIZE (MI_ARENA_MAX_OBJ_BLOCKS * MI_ARENA_BLOCK_SIZE) #define MI_SMALL_PAGE_SIZE MI_ARENA_MIN_OBJ_SIZE #define MI_MEDIUM_PAGE_SIZE (8*MI_SMALL_PAGE_SIZE) // 512 KiB (=byte in the bitmap) @@ -141,7 +144,7 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_BIN_COUNT (MI_BIN_FULL+1) -// Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in dedicated orphan pages +// Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in singleton pages #define MI_BLOCK_ALIGNMENT_MAX (MI_ARENA_BLOCK_ALIGN) // We never allocate more than PTRDIFF_MAX (see also ) @@ -279,7 +282,6 @@ typedef struct mi_subproc_s mi_subproc_t; // the owning heap `thread_delayed_free` list. This guarantees that pages // will be freed correctly even if only other threads free blocks. 
typedef struct mi_page_s { - mi_memid_t memid; // provenance of the page memory uint16_t capacity; // number of blocks committed (must be the first field for proper zero-initialisation) uint16_t reserved; // number of blocks reserved in memory mi_page_flags_t flags; // `in_full` and `has_aligned` flags (8 bits) @@ -293,6 +295,7 @@ typedef struct mi_page_s { uint8_t heap_tag; // tag of the owning heap, used to separate heaps by object type // padding size_t block_size; // size available in each block (always `>0`) + uint8_t* page_start; // start of the blocks #if (MI_ENCODE_FREELIST || MI_PADDING) uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`) or padding canary @@ -304,6 +307,7 @@ typedef struct mi_page_s { struct mi_page_s* next; // next page owned by the heap with the same `block_size` struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` + mi_memid_t memid; // provenance of the page memory } mi_page_t; @@ -312,7 +316,7 @@ typedef struct mi_page_s { // ------------------------------------------------------ #define MI_PAGE_ALIGN (64) -#define MI_PAGE_INFO_SIZE (MI_SIZE_SHIFT*MI_PAGE_ALIGN) // should be > sizeof(mi_page_t) +#define MI_PAGE_INFO_SIZE (2*MI_PAGE_ALIGN) // should be > sizeof(mi_page_t) // The max object size are checked to not waste more than 12.5% internally over the page sizes. // (Except for large pages since huge objects are allocated in 4MiB chunks) @@ -532,7 +536,7 @@ void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); // ------------------------------------------------------ struct mi_subproc_s { - _Atomic(size_t) abandoned_count; // count of abandoned pages for this sub-process + _Atomic(size_t) abandoned_count[MI_BIN_COUNT]; // count of abandoned pages for this sub-process _Atomic(size_t) abandoned_os_list_count; // count of abandoned pages in the os-list mi_lock_t abandoned_os_lock; // lock for the abandoned os pages list (outside of arena's) (this lock protect list operations) mi_lock_t abandoned_os_visit_lock; // ensure only one thread per subproc visits the abandoned os list @@ -562,6 +566,7 @@ struct mi_tld_s { mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted) mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates) mi_subproc_t* subproc; // sub-process this thread belongs to. + size_t tseq; // thread sequence id mi_os_tld_t os; // os tld mi_stats_t stats; // statistics }; diff --git a/src/arena-page.c b/src/arena-page.c new file mode 100644 index 00000000..93d25dbf --- /dev/null +++ b/src/arena-page.c @@ -0,0 +1,20 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2019-2024, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. 
+-----------------------------------------------------------------------------*/ + +/* ---------------------------------------------------------------------------- + +-----------------------------------------------------------------------------*/ + +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "bitmap.h" + + +/* ----------------------------------------------------------- + Arena allocation +----------------------------------------------------------- */ + diff --git a/src/arena.c b/src/arena.c index 28ad61f1..c9f8400b 100644 --- a/src/arena.c +++ b/src/arena.c @@ -42,6 +42,7 @@ typedef struct mi_arena_s { bool is_large; // memory area consists of large- or huge OS pages (always committed) mi_lock_t abandoned_visit_lock; // lock is only used when abandoned segments are being visited _Atomic(mi_msecs_t) purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`. + mi_subproc_t* subproc; mi_bitmap_t blocks_free; // is the block free? mi_bitmap_t blocks_committed; // is the block committed? (i.e. accessible) @@ -99,6 +100,9 @@ mi_arena_t* mi_arena_from_index(size_t idx) { return mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[idx]); } +mi_arena_t* mi_arena_from_id(mi_arena_id_t id) { + return mi_arena_from_index(mi_arena_id_index(id)); +} /* ----------------------------------------------------------- @@ -164,14 +168,11 @@ bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, size_t* block Arena Allocation ----------------------------------------------------------- */ -static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t arena_index, size_t needed_bcount, - bool commit, size_t tseq, mi_memid_t* memid, mi_os_tld_t* tld) -{ - MI_UNUSED(arena_index); - mi_assert_internal(mi_arena_id_index(arena->id) == arena_index); - +static mi_decl_noinline void* mi_arena_try_alloc_at( + mi_arena_t* arena, size_t needed_bcount, bool commit, size_t tseq, mi_memid_t* memid) +{ size_t block_index; - if (!mi_bitmap_try_find_and_clearN(&arena->blocks_free, tseq, needed_bcount, &block_index)) return NULL; + if (!mi_bitmap_try_find_and_clearN(&arena->blocks_free, needed_bcount, tseq, &block_index)) return NULL; // claimed it! 
void* p = mi_arena_block_start(arena, block_index); @@ -192,7 +193,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount, &all_already_committed); if (!all_already_committed) { bool commit_zero = false; - if (!_mi_os_commit(p, mi_size_of_blocks(needed_bcount), &commit_zero, tld->stats)) { + if (!_mi_os_commit(p, mi_size_of_blocks(needed_bcount), &commit_zero, NULL)) { memid->initially_committed = false; } else { @@ -205,75 +206,14 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar memid->initially_committed = mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount); } + mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->blocks_free, block_index, needed_bcount)); + if (commit) { mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount)); } + mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_dirty, block_index, needed_bcount)); + // mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->blocks_purge, block_index, needed_bcount)); + return p; } -// allocate in a speficic arena -static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node, - size_t size, size_t alignment, - bool commit, bool allow_large, mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid, mi_os_tld_t* tld) -{ - mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN); - if (alignment > MI_ARENA_BLOCK_ALIGN) return NULL; - - const size_t bcount = mi_block_count_of_size(size); - const size_t arena_index = mi_arena_id_index(arena_id); - mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count)); - mi_assert_internal(size <= mi_size_of_blocks(bcount)); - - // Check arena suitability - mi_arena_t* arena = mi_arena_from_index(arena_index); - if (arena == NULL) return NULL; - if (!allow_large && arena->is_large) return NULL; - if (!mi_arena_id_is_suitable(arena->id, arena->exclusive, req_arena_id)) return NULL; - if (req_arena_id == _mi_arena_id_none()) { // in not specific, check numa affinity - const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node); - if (match_numa_node) { if (!numa_suitable) return NULL; } - else { if (numa_suitable) return NULL; } - } - - // try to allocate - void* p = mi_arena_try_alloc_at(arena, arena_index, bcount, commit, tseq, memid, tld); - mi_assert_internal(p == NULL || _mi_is_aligned(p, alignment)); - return p; -} - - -// allocate from an arena with fallback to the OS -static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, size_t alignment, - bool commit, bool allow_large, - mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid, mi_os_tld_t* tld) -{ - mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN); - if (alignment > MI_ARENA_BLOCK_ALIGN) return NULL; - - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); - if mi_likely(max_arena == 0) return NULL; - - if (req_arena_id != _mi_arena_id_none()) { - // try a specific arena if requested - if (mi_arena_id_index(req_arena_id) < max_arena) { - void* p = mi_arena_try_alloc_at_id(req_arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); - if (p != NULL) return p; - } - } - else { - // try numa affine allocation - for (size_t i = 0; i < max_arena; i++) { - void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), true, numa_node, size, alignment, 
commit, allow_large, req_arena_id, tseq, memid, tld); - if (p != NULL) return p; - } - - // try from another numa node instead.. - if (numa_node >= 0) { // if numa_node was < 0 (no specific affinity requested), all arena's have been tried already - for (size_t i = 0; i < max_arena; i++) { - void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), false /* only proceed if not numa local */, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); - if (p != NULL) return p; - } - } - } - return NULL; -} // try to reserve a fresh arena space static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t* arena_id) @@ -323,56 +263,286 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re } -void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, - mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) + + +/* ----------------------------------------------------------- + Arena iteration +----------------------------------------------------------- */ + +static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_arena_id, mi_subproc_t* subproc, int numa_node, bool allow_large) { + if (subproc != NULL && arena->subproc != subproc) return false; + if (!allow_large && arena->is_large) return false; + if (!mi_arena_id_is_suitable(arena->id, arena->exclusive, req_arena_id)) return false; + if (req_arena_id == _mi_arena_id_none()) { // if not specific, check numa affinity + const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node); + if (!numa_suitable) return false; + } + return true; +} + +#define MI_THREADS_PER_ARENA (16) + +#define mi_forall_arenas(req_arena_id, subproc, allow_large, tseq, var_arena_id, var_arena) \ + { \ + size_t _max_arena; \ + size_t _start; \ + if (req_arena_id == _mi_arena_id_none()) { \ + _max_arena = mi_atomic_load_relaxed(&mi_arena_count); \ + _start = (_max_arena <= 1 ? 
0 : (tseq / MI_THREADS_PER_ARENA) % _max_arena); \ + } \ + else { \ + _max_arena = 1; \ + _start = mi_arena_id_index(req_arena_id); \ + mi_assert_internal(mi_atomic_load_relaxed(&mi_arena_count) > _start); \ + } \ + for (size_t i = 0; i < _max_arena; i++) { \ + size_t _idx = i + _start; \ + if (_idx >= _max_arena) { _idx -= _max_arena; } \ + const mi_arena_id_t var_arena_id = mi_arena_id_create(_idx); \ + mi_arena_t* const var_arena = mi_arena_from_index(_idx); \ + if (mi_arena_is_suitable(var_arena,req_arena_id,subproc,-1 /* todo: numa node */,allow_large)) \ + { + +#define mi_forall_arenas_end() }}} + + +/* ----------------------------------------------------------- + Arena allocation +----------------------------------------------------------- */ + +// allocate blocks from the arenas +static mi_decl_noinline void* mi_arena_try_find_free( + size_t block_count, size_t alignment, + bool commit, bool allow_large, + mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) { - mi_assert_internal(memid != NULL && tld != NULL); - mi_assert_internal(size > 0); - size_t tseq = _mi_thread_seq_id(); - *memid = _mi_memid_none(); + mi_assert_internal(block_count <= mi_block_count_of_size(MI_ARENA_MAX_OBJ_SIZE)); + mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN); + if (alignment > MI_ARENA_BLOCK_ALIGN) return NULL; - const int numa_node = _mi_os_numa_node(tld); // current numa node + // search arena's + mi_subproc_t* const subproc = tld->subproc; + const size_t tseq = tld->tseq; + mi_forall_arenas(req_arena_id, subproc, allow_large, tseq, arena_id, arena) + { + void* p = mi_arena_try_alloc_at(arena, block_count, commit, tseq, memid); + if (p != NULL) return p; + } + mi_forall_arenas_end(); + return NULL; +} - // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) - if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) || req_arena_id != _mi_arena_id_none()) { // is arena allocation allowed? 
- if (size >= MI_ARENA_MIN_OBJ_SIZE && size <= MI_ARENA_MAX_OBJ_SIZE && alignment <= MI_ARENA_BLOCK_ALIGN && align_offset == 0) { - void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); +// Allocate blocks from the arena's -- potentially allocating a fresh arena +static mi_decl_noinline void* mi_arena_try_alloc( + size_t block_count, size_t alignment, + bool commit, bool allow_large, + mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) +{ + mi_assert(block_count <= MI_ARENA_MAX_OBJ_BLOCKS); + mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN); + + void* p = mi_arena_try_find_free(block_count, alignment, commit, allow_large, req_arena_id, memid, tld); + if (p != NULL) return p; + + // otherwise, try to first eagerly reserve a new arena + if (req_arena_id == _mi_arena_id_none()) { + mi_arena_id_t arena_id = 0; + if (mi_arena_reserve(mi_size_of_blocks(block_count), allow_large, req_arena_id, &arena_id)) { + // and try allocate in there + mi_assert_internal(req_arena_id == _mi_arena_id_none()); + p = mi_arena_try_find_free(block_count, alignment, commit, allow_large, req_arena_id, memid, tld); if (p != NULL) return p; - - // otherwise, try to first eagerly reserve a new arena - if (req_arena_id == _mi_arena_id_none()) { - mi_arena_id_t arena_id = 0; - if (mi_arena_reserve(size, allow_large, req_arena_id, &arena_id)) { - // and try allocate in there - mi_assert_internal(req_arena_id == _mi_arena_id_none()); - p = mi_arena_try_alloc_at_id(arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); - if (p != NULL) return p; - } - } } } +} +// Allocate from the OS (if allowed) +static void* mi_arena_os_alloc_aligned( + size_t size, size_t alignment, size_t align_offset, + bool commit, bool allow_large, + mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) +{ // if we cannot use OS allocation, return NULL if (mi_option_is_enabled(mi_option_disallow_os_alloc) || req_arena_id != _mi_arena_id_none()) { errno = ENOMEM; return NULL; } - // finally, fall back to the OS if (align_offset > 0) { - return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid, tld->stats); + return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid, &tld->stats); } else { - return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, tld->stats); + return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, &tld->stats); } } + +// Allocate large sized memory +void* _mi_arena_alloc_aligned( + size_t size, size_t alignment, size_t align_offset, + bool commit, bool allow_large, + mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) +{ + mi_assert_internal(memid != NULL && tld != NULL); + mi_assert_internal(size > 0); + + // *memid = _mi_memid_none(); + // const int numa_node = _mi_os_numa_node(&tld->os); // current numa node + + // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) + if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) && // is arena allocation allowed? + req_arena_id == _mi_arena_id_none() && // not a specific arena? 
+ size >= MI_ARENA_MIN_OBJ_SIZE && size <= MI_ARENA_MAX_OBJ_SIZE && // and not too small/large + alignment <= MI_ARENA_BLOCK_ALIGN && align_offset == 0) // and good alignment + { + const size_t block_count = mi_block_count_of_size(size); + void* p = mi_arena_try_alloc(block_count, alignment, commit, allow_large, req_arena_id, memid, tld); + if (p != NULL) return p; + } + + // fall back to the OS + return mi_arena_os_alloc_aligned(size, alignment, align_offset, commit, allow_large, req_arena_id, memid, tld); +} + void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) { return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); } + +/* ----------------------------------------------------------- + Arena page allocation +----------------------------------------------------------- */ + +static mi_page_t* mi_arena_page_try_find_abandoned(size_t block_count, size_t block_size, mi_arena_id_t req_arena_id, mi_tld_t* tld) +{ + const size_t bin = _mi_bin(block_size); + mi_assert_internal(bin < MI_BIN_COUNT); + + // any abandoned in our size class? + mi_subproc_t* const subproc = tld->subproc; + if (mi_atomic_load_relaxed(&subproc->abandoned_count[bin]) == 0) return NULL; + + // search arena's + const bool allow_large = true; + size_t tseq = tld->tseq; + mi_forall_arenas(req_arena_id, subproc, allow_large, tseq, arena_id, arena) + { + size_t block_index; + if (mi_bitmap_try_find_and_clear(&arena->blocks_abandoned[bin], tseq, &block_index)) { + // found an abandoned page of the right size + mi_atomic_decrement_relaxed(&subproc->abandoned_count[bin]); + mi_page_t* page = (mi_page_t*)mi_arena_block_start(arena, block_index); + mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->blocks_free, block_index, block_count)); + mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, block_count)); + mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_dirty, block_index, block_count)); + mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->blocks_purge, block_index, block_count)); + mi_assert_internal(mi_page_block_size(page) == block_size); + mi_assert_internal(!mi_page_is_full(page)); + mi_assert_internal(mi_page_is_abandoned(page)); + return page; + } + } + mi_forall_arenas_end(); + return false; +} + +static mi_page_t* mi_arena_page_alloc_fresh(size_t block_count, size_t block_size, mi_arena_id_t req_arena_id, mi_tld_t* tld) +{ + const bool allow_large = true; + const bool commit = true; + const size_t alignment = MI_ARENA_BLOCK_ALIGN; + + // try to allocate from free space in arena's + mi_memid_t memid; + mi_page_t* page = NULL; + if (_mi_option_get_fast(mi_option_disallow_arena_alloc)==0 && req_arena_id == _mi_arena_id_none()) { + page = (mi_page_t*)mi_arena_try_alloc(block_count, alignment, commit, allow_large, req_arena_id, &memid, tld); + } + + // otherwise fall back to the OS + if (page == NULL) { + page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_blocks(block_count), alignment, 0 /* align offset */, commit, allow_large, req_arena_id, &memid, tld); + } + + if (page == NULL) return NULL; + + // claimed free blocks: initialize the page partly + _mi_memzero_aligned(page, sizeof(*page)); + mi_assert(MI_PAGE_INFO_SIZE >= _mi_align_up(sizeof(*page), MI_PAGE_ALIGN)); + const size_t reserved = (mi_size_of_blocks(block_count) - MI_PAGE_INFO_SIZE) / block_size; + mi_assert_internal(reserved > 0 && reserved < UINT16_MAX); + 
page->reserved = reserved; + page->page_start = (uint8_t*)page + MI_PAGE_INFO_SIZE; + page->block_size = block_size; + page->memid = memid; + page->free_is_zero = memid.initially_zero; + if (block_size > 0 && _mi_is_power_of_two(block_size)) { + page->block_size_shift = (uint8_t)mi_ctz(block_size); + } + else { + page->block_size_shift = 0; + } + + mi_assert_internal(mi_page_block_size(page) == block_size); + mi_assert_internal(mi_page_is_abandoned(page)); + return page; +} + +// block_count: arena block count for the page +// block size : page block size +static mi_page_t* mi_arena_page_allocN(mi_heap_t* heap, size_t block_count, size_t block_size) { + const size_t req_arena_id = heap->arena_id; + mi_tld_t* const tld = heap->tld; + + // 1. look for an abandoned page + mi_page_t* page = mi_arena_page_try_find_abandoned(block_count, block_size, req_arena_id, tld); + if (page != NULL) { + _mi_page_reclaim(heap,page); + return page; + } + + // 2. find a free block, potentially allocating a new arena + page = mi_arena_page_alloc_fresh(block_count, block_size, req_arena_id, tld); + if (page != NULL) { + _mi_page_init(heap, page); + return page; + } + + return NULL; +} + + +static mi_page_t* mi_singleton_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment) { + _mi_error_message(EINVAL, "singleton page is not yet implemented\n"); + return NULL; +} + + +mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment) { + mi_page_t* page; + if mi_unlikely(page_alignment > MI_BLOCK_ALIGNMENT_MAX) { + mi_assert_internal(_mi_is_power_of_two(page_alignment)); + page = mi_singleton_page_alloc(heap, block_size, page_alignment); + } + else if (block_size <= MI_SMALL_MAX_OBJ_SIZE) { + page = mi_arena_page_allocN(heap, mi_block_count_of_size(MI_SMALL_PAGE_SIZE), block_size); + } + else if (block_size <= MI_MEDIUM_MAX_OBJ_SIZE) { + page = mi_arena_page_allocN(heap, mi_block_count_of_size(MI_MEDIUM_PAGE_SIZE), block_size); + } + else if (block_size <= MI_LARGE_MAX_OBJ_SIZE) { + page = mi_arena_page_allocN(heap, mi_block_count_of_size(MI_LARGE_PAGE_SIZE), block_size); + } + else { + page = mi_singleton_page_alloc(heap, block_size, page_alignment); + } + // mi_assert_internal(page == NULL || _mi_page_segment(page)->subproc == tld->subproc); + return page; +} + + /* ----------------------------------------------------------- Arena free ----------------------------------------------------------- */ diff --git a/src/bitmap.c b/src/bitmap.c index 463d74c7..9faa9ae9 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -512,9 +512,9 @@ bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) } -#define mi_bitmap_forall_set_chunks(bitmap,start,decl_chunk_idx) \ +#define mi_bitmap_forall_set_chunks(bitmap,tseq,decl_chunk_idx) \ { size_t _set_idx; \ - size_t _start = start % MI_BFIELD_BITS; \ + size_t _start = tseq % MI_BFIELD_BITS; \ mi_bfield_t _any_set = mi_bfield_rotate_right(bitmap->any_set, _start); \ while (mi_bfield_find_least_bit(_any_set,&_set_idx)) { \ decl_chunk_idx = (_set_idx + _start) % MI_BFIELD_BITS; @@ -530,8 +530,8 @@ bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) // and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`. // The low `MI_BFIELD_BITS` of start are used to set the start point of the search // (to reduce thread contention). 
-bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t* pidx, size_t start) { - mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx) +bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { + mi_bitmap_forall_set_chunks(bitmap,tseq,size_t chunk_idx) { size_t cidx; if mi_likely(mi_bitmap_chunk_find_and_try_clear(&bitmap->chunks[chunk_idx],&cidx)) { @@ -554,8 +554,8 @@ bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t* pidx, size_t star // Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`. -bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t start, size_t* pidx ) { - mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx) +bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx ) { + mi_bitmap_forall_set_chunks(bitmap,tseq,size_t chunk_idx) { size_t cidx; if mi_likely(mi_bitmap_chunk_find_and_try_clear8(&bitmap->chunks[chunk_idx],&cidx)) { @@ -576,11 +576,11 @@ bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t start, size_t* pi // Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. -bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t start, size_t n, size_t* pidx ) { +bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx ) { // TODO: allow at least MI_BITMAP_CHUNK_BITS and probably larger // TODO: allow spanning across chunk boundaries if (n == 0 || n > MI_BFIELD_BITS) return false; - mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx) + mi_bitmap_forall_set_chunks(bitmap,tseq,size_t chunk_idx) { size_t cidx; if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx],n,&cidx)) { diff --git a/src/bitmap.h b/src/bitmap.h index 198a2902..fcadc213 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -79,14 +79,14 @@ mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, si // and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`. // The low `MI_BFIELD_BITS` of start are used to set the start point of the search // (to reduce thread contention). -mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t* pidx, size_t start); +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`. -mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t start, size_t* pidx ); +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx ); // Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. 
-mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t start, size_t n, size_t* pidx ); +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx ); #endif // MI_XBITMAP_H diff --git a/src/page-map.c b/src/page-map.c index d3fcef79..cb527886 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -32,9 +32,13 @@ static bool mi_page_map_init(void) { return false; } if (mi_page_map_memid.initially_committed && !mi_page_map_memid.initially_zero) { - _mi_warning_message("the page map was committed on-demand but not zero initialized!\n"); + _mi_warning_message("the page map was committed but not zero initialized!\n"); _mi_memzero_aligned(_mi_page_map, page_map_size); } + // commit the first part so NULL pointers get resolved without an access violation + if (!mi_page_map_all_committed) { + _mi_os_commit(_mi_page_map, _mi_os_page_size(), NULL, NULL); + } return true; } @@ -72,7 +76,7 @@ void _mi_page_map_register(mi_page_t* page) { // set the offsets for (int i = 0; i < block_count; i++) { mi_assert_internal(i < 128); - _mi_page_map[idx + i] = (int8_t)(-i-1); + _mi_page_map[idx + i] = (signed char)(-i-1); } } diff --git a/src/page.c b/src/page.c index a00ff615..fa006085 100644 --- a/src/page.c +++ b/src/page.c @@ -119,7 +119,7 @@ bool _mi_page_is_valid(mi_page_t* page) { mi_assert_internal(page->keys[0] != 0); #endif if (mi_page_heap(page)!=NULL) { - mi_assert_internal(!_mi_process_is_initialized || page->thread_id == mi_page_heap(page)->thread_id || page->thread_id==0); + mi_assert_internal(!_mi_process_is_initialized || mi_page_thread_id(page) == mi_page_heap(page)->thread_id || mi_page_thread_id(page)==0); { mi_page_queue_t* pq = mi_page_queue_of(page); mi_assert_internal(mi_page_queue_contains(pq, page)); @@ -249,19 +249,22 @@ void _mi_page_free_collect(mi_page_t* page, bool force) { // called from segments when reclaiming abandoned pages void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) { + mi_page_set_heap(page, heap); + _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, true); // override never (after heap is set) + _mi_page_free_collect(page, false); // ensure used count is up to date + mi_assert_expensive(mi_page_is_valid_init(page)); mi_assert_internal(mi_page_heap(page) == heap); mi_assert_internal(mi_page_thread_free_flag(page) != MI_NEVER_DELAYED_FREE); - #if MI_HUGE_PAGE_ABANDON - mi_assert_internal(_mi_page_segment(page)->page_kind != MI_PAGE_HUGE); - #endif - + // TODO: push on full queue immediately if it is full? 
mi_page_queue_t* pq = mi_page_queue(heap, mi_page_block_size(page)); mi_page_queue_push(heap, pq, page); mi_assert_expensive(_mi_page_is_valid(page)); } + + // allocate a fresh page from a segment static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size_t block_size, size_t page_alignment) { #if !MI_HUGE_PAGE_ABANDON @@ -269,16 +272,12 @@ static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size mi_assert_internal(mi_heap_contains_queue(heap, pq)); mi_assert_internal(page_alignment > 0 || block_size > MI_LARGE_MAX_OBJ_SIZE || block_size == pq->block_size); #endif - mi_page_t* page = _mi_heap_page_alloc(heap, block_size, page_alignment); + mi_page_t* page = _mi_arena_page_alloc(heap, block_size, page_alignment); if (page == NULL) { // this may be out-of-memory, or an abandoned page was reclaimed (and in our queue) return NULL; } mi_assert_internal(pq!=NULL || mi_page_block_size(page) >= block_size); - // a fresh page was found, initialize it - const size_t full_block_size = (pq == NULL || mi_page_is_huge(page) ? mi_page_block_size(page) : block_size); // see also: mi_segment_huge_page_alloc - mi_assert_internal(full_block_size >= block_size); - mi_page_init(heap, page, full_block_size, heap->tld); mi_heap_stat_increase(heap, pages, 1); if (pq != NULL) { mi_page_queue_push(heap, pq, page); } mi_assert_expensive(_mi_page_is_valid(page)); @@ -389,7 +388,7 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) { // and abandon it mi_assert_internal(mi_page_is_abandoned(page)); - _mi_arena_page_abandon(page,&pheap->tld); + _mi_arena_page_abandon(page, pheap->tld); } // force abandon a page @@ -432,7 +431,7 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) { // and free it mi_page_set_heap(page,NULL); - _mi_arena_page_free(page, force, &pheap->tld); + _mi_arena_page_free(page, pheap->tld); } #define MI_MAX_RETIRE_SIZE MI_LARGE_OBJ_SIZE_MAX // should be less than size for MI_BIN_HUGE @@ -617,7 +616,7 @@ static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, co // Note: we also experimented with "bump" allocation on the first // allocations but this did not speed up any benchmark (due to an // extra test in malloc? or cache effects?) 
-static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) { +static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page) { mi_assert_expensive(mi_page_is_valid_init(page)); #if (MI_SECURE<=2) mi_assert(page->free == NULL); @@ -629,7 +628,7 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) size_t page_size; //uint8_t* page_start = mi_page_area(page, &page_size); - mi_stat_counter_increase(tld->stats.pages_extended, 1); + mi_heap_stat_counter_increase(heap, pages_extended, 1); // calculate the extend count const size_t bsize = mi_page_block_size(page); @@ -651,48 +650,37 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) // and append the extend the free list if (extend < MI_MIN_SLICES || MI_SECURE==0) { //!mi_option_is_enabled(mi_option_secure)) { - mi_page_free_list_extend(page, bsize, extend, &tld->stats ); + mi_page_free_list_extend(page, bsize, extend, &heap->tld->stats ); } else { - mi_page_free_list_extend_secure(heap, page, bsize, extend, &tld->stats); + mi_page_free_list_extend_secure(heap, page, bsize, extend, &heap->tld->stats); } // enable the new free list page->capacity += (uint16_t)extend; - mi_stat_increase(tld->stats.page_committed, extend * bsize); + mi_heap_stat_increase(heap, page_committed, extend * bsize); mi_assert_expensive(mi_page_is_valid_init(page)); } -// Initialize a fresh page -static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi_tld_t* tld) { +// Initialize a fresh page (that is already partially initialized) +void _mi_page_init(mi_heap_t* heap, mi_page_t* page) { mi_assert(page != NULL); - mi_assert_internal(block_size > 0); - // set fields mi_page_set_heap(page, heap); - page->block_size = block_size; size_t page_size; uint8_t* page_start = mi_page_area(page, &page_size); mi_track_mem_noaccess(page_start,page_size); - mi_assert_internal(page_size / block_size < (1L<<16)); - page->reserved = (uint16_t)(page_size / block_size); + mi_assert_internal(page_size / mi_page_block_size(page) < (1L<<16)); mi_assert_internal(page->reserved > 0); #if (MI_PADDING || MI_ENCODE_FREELIST) page->keys[0] = _mi_heap_random_next(heap); page->keys[1] = _mi_heap_random_next(heap); #endif - page->free_is_zero = page->memid.initially_zero; #if MI_DEBUG>2 if (page->memid.initially_zero) { mi_track_mem_defined(page->page_start, page_size); mi_assert_expensive(mi_mem_is_zero(page_start, page_size)); } #endif - if (block_size > 0 && _mi_is_power_of_two(block_size)) { - page->block_size_shift = (uint8_t)mi_ctz(block_size); - } - else { - page->block_size_shift = 0; - } - + mi_assert_internal(page->capacity == 0); mi_assert_internal(page->free == NULL); mi_assert_internal(page->used == 0); @@ -705,11 +693,11 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi mi_assert_internal(page->keys[0] != 0); mi_assert_internal(page->keys[1] != 0); #endif - mi_assert_internal(page->block_size_shift == 0 || (block_size == ((size_t)1 << page->block_size_shift))); + mi_assert_internal(page->block_size_shift == 0 || (mi_page_block_size(page) == ((size_t)1 << page->block_size_shift))); mi_assert_expensive(mi_page_is_valid_init(page)); // initialize an initial free list - mi_page_extend_free(heap,page,tld); + mi_page_extend_free(heap,page); mi_assert(mi_page_immediate_available(page)); }
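
Note (illustrative, not part of the patch): the `mi_forall_arenas` macro added in src/arena.c starts each thread's arena search at an index derived from its thread sequence id (`tld->tseq`), so that roughly `MI_THREADS_PER_ARENA` (16) threads share a preferred arena while the wrap-around loop still visits every arena once. The standalone sketch below models only that start/wrap computation; the `demo_*` names are hypothetical and exist only for this example — they are not part of the mimalloc API.

/* ----------------------------------------------------------------------------
  Standalone model of the tseq-based round-robin arena selection used by
  `mi_forall_arenas` above (sketch only; `demo_*` names are illustrative).
-----------------------------------------------------------------------------*/
#include <stddef.h>
#include <stdio.h>

#define DEMO_THREADS_PER_ARENA  16   // mirrors MI_THREADS_PER_ARENA in the patch

// visit all `arena_count` arenas once, starting at a tseq-derived index
static void demo_forall_arenas(size_t arena_count, size_t tseq) {
  const size_t start = (arena_count <= 1 ? 0 : (tseq / DEMO_THREADS_PER_ARENA) % arena_count);
  for (size_t i = 0; i < arena_count; i++) {
    size_t idx = i + start;
    if (idx >= arena_count) { idx -= arena_count; }   // wrap around, no modulo in the loop body
    printf("tseq %zu visits arena %zu\n", tseq, idx);
  }
}

int main(void) {
  // with 4 arenas: tseq 0..15 prefer arena 0, tseq 16..31 prefer arena 1, ...
  demo_forall_arenas(4, 0);
  demo_forall_arenas(4, 17);
  return 0;
}

For example, with 4 arenas a thread with tseq 17 searches arenas in the order 1, 2, 3, 0; spreading the preferred starting arena across threads this way reduces contention on the per-arena `blocks_free` bitmaps while still falling back to every other arena before reserving a new one or going to the OS.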