From 7020ed5e5230b8cdab4dea033dd472841c6e65f7 Mon Sep 17 00:00:00 2001
From: daanx <daan@effp.org>
Date: Thu, 29 Feb 2024 11:26:03 -0800
Subject: [PATCH 01/17] do not purge if purge delay is negative

---
 src/arena.c   | 96 +++++++++++++++++++++++++--------------------------
 src/segment.c | 32 ++++++++---------
 2 files changed, 64 insertions(+), 64 deletions(-)

diff --git a/src/arena.c b/src/arena.c
index 0f71e978..42aac8fc 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -13,7 +13,7 @@ threads and need to be accessed using atomic operations.
 
 Arenas are used to for huge OS page (1GiB) reservations or for reserving
 OS memory upfront which can be improve performance or is sometimes needed
-on embedded devices. We can also employ this with WASI or `sbrk` systems 
+on embedded devices. We can also employ this with WASI or `sbrk` systems
 to reserve large arenas upfront and be able to reuse the memory more effectively.
 
 The arena allocation needs to be thread safe and we use an atomic bitmap to allocate.
@@ -48,13 +48,13 @@ typedef struct mi_arena_s {
   size_t   meta_size;                     // size of the arena structure itself (including its bitmaps)
   mi_memid_t meta_memid;                  // memid of the arena structure itself (OS or static allocation)
   int      numa_node;                     // associated NUMA node
-  bool     exclusive;                     // only allow allocations if specifically for this arena  
+  bool     exclusive;                     // only allow allocations if specifically for this arena
   bool     is_large;                      // memory area consists of large- or huge OS pages (always committed)
   _Atomic(size_t) search_idx;             // optimization to start the search for free blocks
-  _Atomic(mi_msecs_t) purge_expire;       // expiration time when blocks should be decommitted from `blocks_decommit`.  
+  _Atomic(mi_msecs_t) purge_expire;       // expiration time when blocks should be decommitted from `blocks_decommit`.
   mi_bitmap_field_t* blocks_dirty;        // are the blocks potentially non-zero?
   mi_bitmap_field_t* blocks_committed;    // are the blocks committed? (can be NULL for memory that cannot be decommitted)
-  mi_bitmap_field_t* blocks_purge;        // blocks that can be (reset) decommitted. (can be NULL for memory that cannot be (reset) decommitted)  
+  mi_bitmap_field_t* blocks_purge;        // blocks that can be (reset) decommitted. (can be NULL for memory that cannot be (reset) decommitted)
   mi_bitmap_field_t  blocks_inuse[1];     // in-place bitmap of in-use blocks (of size `field_count`)
 } mi_arena_t;
 
@@ -94,13 +94,13 @@ bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_i
     return mi_arena_id_is_suitable(memid.mem.arena.id, memid.mem.arena.is_exclusive, request_arena_id);
   }
   else {
-    return mi_arena_id_is_suitable(0, false, request_arena_id);
+    return mi_arena_id_is_suitable(_mi_arena_id_none(), false, request_arena_id);
   }
 }
 
 
 /* -----------------------------------------------------------
-  Arena allocations get a (currently) 16-bit memory id where the 
+  Arena allocations get a (currently) 16-bit memory id where the
   lower 8 bits are the arena id, and the upper bits the block index.
 ----------------------------------------------------------- */
 
@@ -208,7 +208,7 @@ static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, mi_bitmap_index
 {
   size_t idx = 0; // mi_atomic_load_relaxed(&arena->search_idx);  // start from last search; ok to be relaxed as the exact start does not matter
   if (_mi_bitmap_try_find_from_claim_across(arena->blocks_inuse, arena->field_count, idx, blocks, bitmap_idx)) {
-    mi_atomic_store_relaxed(&arena->search_idx, mi_bitmap_index_field(*bitmap_idx));  // start search from found location next time around    
+    mi_atomic_store_relaxed(&arena->search_idx, mi_bitmap_index_field(*bitmap_idx));  // start search from found location next time around
     return true;
   };
   return false;
@@ -228,7 +228,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar
   mi_bitmap_index_t bitmap_index;
   if (!mi_arena_try_claim(arena, needed_bcount, &bitmap_index)) return NULL;
 
-  // claimed it! 
+  // claimed it!
   void* p = mi_arena_block_start(arena, bitmap_index);
   *memid = mi_memid_create_arena(arena->id, arena->exclusive, bitmap_index);
   memid->is_pinned = arena->memid.is_pinned;
@@ -268,21 +268,21 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar
     // no need to commit, but check if already fully committed
     memid->initially_committed = _mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index);
   }
-  
+
   return p;
 }
 
 // allocate in a speficic arena
-static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node, size_t size, size_t alignment, 
-                                       bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld ) 
+static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node, size_t size, size_t alignment,
+                                       bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld )
 {
   MI_UNUSED_RELEASE(alignment);
   mi_assert_internal(alignment <= MI_SEGMENT_ALIGN);
-  const size_t bcount = mi_block_count_of_size(size);  
+  const size_t bcount = mi_block_count_of_size(size);
   const size_t arena_index = mi_arena_id_index(arena_id);
   mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count));
   mi_assert_internal(size <= mi_arena_block_size(bcount));
-  
+
   // Check arena suitability
   mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]);
   if (arena == NULL) return NULL;
@@ -302,7 +302,7 @@ static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_no
 
 
 // allocate from an arena with fallback to the OS
-static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, size_t alignment, 
+static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, size_t alignment,
                                                   bool commit, bool allow_large,
                                                   mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld )
 {
@@ -310,9 +310,9 @@ static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, siz
   mi_assert_internal(alignment <= MI_SEGMENT_ALIGN);
   const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count);
   if mi_likely(max_arena == 0) return NULL;
-  
+
   if (req_arena_id != _mi_arena_id_none()) {
-    // try a specific arena if requested 
+    // try a specific arena if requested
     if (mi_arena_id_index(req_arena_id) < max_arena) {
       void* p = mi_arena_try_alloc_at_id(req_arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld);
       if (p != NULL) return p;
@@ -320,7 +320,7 @@ static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, siz
   }
   else {
     // try numa affine allocation
-    for (size_t i = 0; i < max_arena; i++) {    
+    for (size_t i = 0; i < max_arena; i++) {
       void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld);
       if (p != NULL) return p;
     }
@@ -348,22 +348,22 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re
   size_t arena_reserve = mi_option_get_size(mi_option_arena_reserve);
   if (arena_reserve == 0) return false;
 
-  if (!_mi_os_has_virtual_reserve()) { 
+  if (!_mi_os_has_virtual_reserve()) {
     arena_reserve = arena_reserve/4;  // be conservative if virtual reserve is not supported (for some embedded systems for example)
   }
   arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_BLOCK_SIZE);
   if (arena_count >= 8 && arena_count <= 128) {
     arena_reserve = ((size_t)1<<(arena_count/8)) * arena_reserve;  // scale up the arena sizes exponentially
-  }    
+  }
   if (arena_reserve < req_size) return false;  // should be able to at least handle the current allocation size
-      
+
   // commit eagerly?
   bool arena_commit = false;
   if (mi_option_get(mi_option_arena_eager_commit) == 2)      { arena_commit = _mi_os_has_overcommit(); }
   else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; }
 
   return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive */, arena_id) == 0);
-}    
+}
 
 
 void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large,
@@ -378,9 +378,9 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset
   // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data)
   if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN && align_offset == 0) {
     void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld);
-    if (p != NULL) return p;    
+    if (p != NULL) return p;
 
-    // otherwise, try to first eagerly reserve a new arena 
+    // otherwise, try to first eagerly reserve a new arena
     if (req_arena_id == _mi_arena_id_none()) {
       mi_arena_id_t arena_id = 0;
       if (mi_arena_reserve(size, allow_large, req_arena_id, &arena_id)) {
@@ -397,14 +397,14 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset
     errno = ENOMEM;
     return NULL;
   }
-   
+
   // finally, fall back to the OS
   if (align_offset > 0) {
     return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid, tld->stats);
   }
   else {
     return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, tld->stats);
-  }  
+  }
 }
 
 void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld)
@@ -440,22 +440,22 @@ static void mi_arena_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks,
   mi_assert_internal(arena->blocks_purge != NULL);
   mi_assert_internal(!arena->memid.is_pinned);
   const size_t size = mi_arena_block_size(blocks);
-  void* const p = mi_arena_block_start(arena, bitmap_idx); 
+  void* const p = mi_arena_block_start(arena, bitmap_idx);
   bool needs_recommit;
   if (_mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx)) {
     // all blocks are committed, we can purge freely
     needs_recommit = _mi_os_purge(p, size, stats);
   }
   else {
-    // some blocks are not committed -- this can happen when a partially committed block is freed 
+    // some blocks are not committed -- this can happen when a partially committed block is freed
     // in `_mi_arena_free` and it is conservatively marked as uncommitted but still scheduled for a purge
-    // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory), 
+    // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory),
     // and also undo the decommit stats (as it was already adjusted)
     mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits));
     needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, stats);
     _mi_stat_increase(&stats->committed, size);
   }
-  
+
   // clear the purged blocks
   _mi_bitmap_unclaim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx);
   // update committed bitmap
@@ -473,7 +473,7 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t bitmap_idx, size_t
 
   if (_mi_preloading() || delay == 0) {
     // decommit directly
-    mi_arena_purge(arena, bitmap_idx, blocks, stats);    
+    mi_arena_purge(arena, bitmap_idx, blocks, stats);
   }
   else {
     // schedule decommit
@@ -515,7 +515,7 @@ static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startidx,
 }
 
 // returns true if anything was purged
-static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi_stats_t* stats) 
+static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi_stats_t* stats)
 {
   if (arena->memid.is_pinned || arena->blocks_purge == NULL) return false;
   mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire);
@@ -524,10 +524,10 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi
 
   // reset expire (if not already set concurrently)
   mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire, 0);
-  
+
   // potential purges scheduled, walk through the bitmap
   bool any_purged = false;
-  bool full_purge = true;  
+  bool full_purge = true;
   for (size_t i = 0; i < arena->field_count; i++) {
     size_t purge = mi_atomic_load_relaxed(&arena->blocks_purge[i]);
     if (purge != 0) {
@@ -578,7 +578,7 @@ static void mi_arenas_try_purge( bool force, bool visit_all, mi_stats_t* stats )
 
   // allow only one thread to purge at a time
   static mi_atomic_guard_t purge_guard;
-  mi_atomic_guard(&purge_guard) 
+  mi_atomic_guard(&purge_guard)
   {
     mi_msecs_t now = _mi_clock_now();
     size_t max_purge_count = (visit_all ? max_arena : 1);
@@ -591,7 +591,7 @@ static void mi_arenas_try_purge( bool force, bool visit_all, mi_stats_t* stats )
         }
       }
     }
-  }  
+  }
 }
 
 
@@ -605,7 +605,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi
   if (p==NULL) return;
   if (size==0) return;
   const bool all_committed = (committed_size == size);
-  
+
   if (mi_memkind_is_os(memid.memkind)) {
     // was a direct OS allocation, pass through
     if (!all_committed && committed_size > 0) {
@@ -623,7 +623,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi
     mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t,&mi_arenas[arena_idx]);
     mi_assert_internal(arena != NULL);
     const size_t blocks = mi_block_count_of_size(size);
-    
+
     // checks
     if (arena == NULL) {
       _mi_error_message(EINVAL, "trying to free from non-existent arena: %p, size %zu, memid: 0x%zx\n", p, size, memid);
@@ -645,7 +645,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi
     else {
       mi_assert_internal(arena->blocks_committed != NULL);
       mi_assert_internal(arena->blocks_purge != NULL);
-      
+
       if (!all_committed) {
         // mark the entire range as no longer committed (so we recommit the full range when re-using)
         _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx);
@@ -660,9 +660,9 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi
         // works (as we should never reset decommitted parts).
       }
       // (delay) purge the entire range
-      mi_arena_schedule_purge(arena, bitmap_idx, blocks, stats);      
+      mi_arena_schedule_purge(arena, bitmap_idx, blocks, stats);
     }
-    
+
     // and make it available to others again
     bool all_inuse = _mi_bitmap_unclaim_across(arena->blocks_inuse, arena->field_count, blocks, bitmap_idx);
     if (!all_inuse) {
@@ -687,9 +687,9 @@ static void mi_arenas_unsafe_destroy(void) {
   for (size_t i = 0; i < max_arena; i++) {
     mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]);
     if (arena != NULL) {
-      if (arena->start != NULL && mi_memkind_is_os(arena->memid.memkind)) {      
+      if (arena->start != NULL && mi_memkind_is_os(arena->memid.memkind)) {
         mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL);
-        _mi_os_free(arena->start, mi_arena_size(arena), arena->memid, &_mi_stats_main); 
+        _mi_os_free(arena->start, mi_arena_size(arena), arena->memid, &_mi_stats_main);
       }
       else {
         new_max_arena = i;
@@ -712,7 +712,7 @@ void _mi_arena_collect(bool force_purge, mi_stats_t* stats) {
 // for dynamic libraries that are unloaded and need to release all their allocated memory.
 void _mi_arena_unsafe_destroy_all(mi_stats_t* stats) {
   mi_arenas_unsafe_destroy();
-  _mi_arena_collect(true /* force purge */, stats);  // purge non-owned arenas  
+  _mi_arena_collect(true /* force purge */, stats);  // purge non-owned arenas
 }
 
 // Is a pointer inside any of our arenas?
@@ -720,8 +720,8 @@ bool _mi_arena_contains(const void* p) {
   const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count);
   for (size_t i = 0; i < max_arena; i++) {
     mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]);
-    if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_arena_block_size(arena->block_count) > (const uint8_t*)p) { 
-      return true;      
+    if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_arena_block_size(arena->block_count) > (const uint8_t*)p) {
+      return true;
     }
   }
   return false;
@@ -765,7 +765,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int
   mi_memid_t meta_memid;
   mi_arena_t* arena   = (mi_arena_t*)mi_arena_meta_zalloc(asize, &meta_memid, &_mi_stats_main); // TODO: can we avoid allocating from the OS?
   if (arena == NULL) return false;
-  
+
   // already zero'd due to os_alloc
   // _mi_memzero(arena, asize);
   arena->id = _mi_arena_id_none();
@@ -782,12 +782,12 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int
   arena->search_idx   = 0;
   arena->blocks_dirty = &arena->blocks_inuse[fields]; // just after inuse bitmap
   arena->blocks_committed = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[2*fields]); // just after dirty bitmap
-  arena->blocks_purge  = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[3*fields]); // just after committed bitmap  
+  arena->blocks_purge  = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[3*fields]); // just after committed bitmap
   // initialize committed bitmap?
   if (arena->blocks_committed != NULL && arena->memid.initially_committed) {
     memset((void*)arena->blocks_committed, 0xFF, fields*sizeof(mi_bitmap_field_t)); // cast to void* to avoid atomic warning
   }
-  
+
   // and claim leftover blocks if needed (so we never allocate there)
   ptrdiff_t post = (fields * MI_BITMAP_FIELD_BITS) - bcount;
   mi_assert_internal(post >= 0);
diff --git a/src/segment.c b/src/segment.c
index 6798bb66..26862899 100644
--- a/src/segment.c
+++ b/src/segment.c
@@ -239,7 +239,7 @@ static void mi_page_purge(mi_segment_t* segment, mi_page_t* page, mi_segments_tl
   mi_assert_internal(page->used == 0);
   mi_assert_expensive(!mi_pages_purge_contains(page, tld));
   size_t psize;
-  void* start = mi_segment_raw_page_start(segment, page, &psize);  
+  void* start = mi_segment_raw_page_start(segment, page, &psize);
   const bool needs_recommit = _mi_os_purge(start, psize, tld->stats);
   if (needs_recommit) { page->is_committed = false; }
   page->used = 0;
@@ -249,7 +249,7 @@ static bool mi_page_ensure_committed(mi_segment_t* segment, mi_page_t* page, mi_
   if (page->is_committed) return true;
   mi_assert_internal(segment->allow_decommit);
   mi_assert_expensive(!mi_pages_purge_contains(page, tld));
-  
+
   size_t psize;
   uint8_t* start = mi_segment_raw_page_start(segment, page, &psize);
   bool is_zero = false;
@@ -259,8 +259,8 @@ static bool mi_page_ensure_committed(mi_segment_t* segment, mi_page_t* page, mi_
   page->is_committed = true;
   page->used = 0;
   page->is_zero_init = is_zero;
-  if (gsize > 0) { 
-    mi_segment_protect_range(start + psize, gsize, true); 
+  if (gsize > 0) {
+    mi_segment_protect_range(start + psize, gsize, true);
   }
   return true;
 }
@@ -296,7 +296,7 @@ static void mi_segment_schedule_purge(mi_segment_t* segment, mi_page_t* page, mi
     // purge immediately?
     mi_page_purge(segment, page, tld);
   }
-  else {
+  else if (mi_option_get(mi_option_purge_delay) > 0) {   // no purging if the delay is negative
     // otherwise push on the delayed page reset queue
     mi_page_queue_t* pq = &tld->pages_purge;
     // push on top
@@ -484,11 +484,11 @@ static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_se
   for (size_t i = 0; i < segment->capacity; i++) {
     mi_page_t* page = &segment->pages[i];
     if (page->is_committed)  { committed_size += page_size;  }
-    if (!page->is_committed) { fully_committed = false; }    
+    if (!page->is_committed) { fully_committed = false; }
   }
   MI_UNUSED(fully_committed);
   mi_assert_internal((fully_committed && committed_size == segment_size) || (!fully_committed && committed_size < segment_size));
-    
+
   _mi_abandoned_await_readers(); // prevent ABA issue if concurrent readers try to access our memory (that might be purged)
   _mi_arena_free(segment, segment_size, committed_size, segment->memid, tld->stats);
 }
@@ -536,9 +536,9 @@ static mi_segment_t* mi_segment_os_alloc(bool eager_delayed, size_t page_alignme
       // commit failed; we cannot touch the memory: free the segment directly and return `NULL`
       _mi_arena_free(segment, segment_size, 0, memid, tld_os->stats);
       return NULL;
-    }    
+    }
   }
-  
+
   MI_UNUSED(info_size);
   segment->memid = memid;
   segment->allow_decommit = !memid.is_pinned;
@@ -581,7 +581,7 @@ static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind,
                               tld->peak_count < (size_t)mi_option_get(mi_option_eager_commit_delay));
   const bool eager  = !eager_delayed && mi_option_is_enabled(mi_option_eager_commit);
   const bool init_commit = eager; // || (page_kind >= MI_PAGE_LARGE);
-  
+
   // Allocate the segment from the OS (segment_size can change due to alignment)
   mi_segment_t* segment = mi_segment_os_alloc(eager_delayed, page_alignment, req_arena_id, pre_size, info_size, init_commit, init_segment_size, tld, os_tld);
   if (segment == NULL) return NULL;
@@ -609,7 +609,7 @@ static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind,
   segment->segment_info_size = pre_size;
   segment->thread_id  = _mi_thread_id();
   segment->cookie = _mi_ptr_cookie(segment);
-  
+
   // set protection
   mi_segment_protect(segment, true, tld->os);
 
@@ -628,7 +628,7 @@ static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t
   // don't purge as we are freeing now
   mi_segment_remove_all_purges(segment, false /* don't force as we are about to free */, tld);
   mi_segment_remove_from_free_queue(segment, tld);
-  
+
   mi_assert_expensive(!mi_segment_queue_contains(&tld->small_free, segment));
   mi_assert_expensive(!mi_segment_queue_contains(&tld->medium_free, segment));
   mi_assert(segment->next == NULL);
@@ -655,10 +655,10 @@ static bool mi_segment_page_claim(mi_segment_t* segment, mi_page_t* page, mi_seg
 
   // check commit
   if (!mi_page_ensure_committed(segment, page, tld)) return false;
-  
+
   // set in-use before doing unreset to prevent delayed reset
   page->segment_in_use = true;
-  segment->used++;  
+  segment->used++;
   mi_assert_internal(page->segment_in_use && page->is_committed && page->used==0 && !mi_pages_purge_contains(page,tld));
   mi_assert_internal(segment->used <= segment->capacity);
   if (segment->used == segment->capacity && segment->page_kind <= MI_PAGE_MEDIUM) {
@@ -1134,7 +1134,7 @@ static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t block_s
 
 static mi_page_t* mi_segment_find_free(mi_segment_t* segment, mi_segments_tld_t* tld) {
   mi_assert_internal(mi_segment_has_free(segment));
-  mi_assert_expensive(mi_segment_is_valid(segment, tld));  
+  mi_assert_expensive(mi_segment_is_valid(segment, tld));
   for (size_t i = 0; i < segment->capacity; i++) {  // TODO: use a bitmap instead of search?
     mi_page_t* page = &segment->pages[i];
     if (!page->segment_in_use) {
@@ -1274,7 +1274,7 @@ void _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_bloc
   mi_assert_internal(page->free == NULL);
   if (segment->allow_decommit && page->is_committed) {
     size_t usize = mi_usable_size(block);
-    if (usize > sizeof(mi_block_t)) { 
+    if (usize > sizeof(mi_block_t)) {
       usize = usize - sizeof(mi_block_t);
       uint8_t* p = (uint8_t*)block + sizeof(mi_block_t);
       _mi_os_reset(p, usize, &_mi_stats_main);

From 8f353d8005c919d609e95003527c826c8c4e8310 Mon Sep 17 00:00:00 2001
From: daanx <daan@effp.org>
Date: Thu, 29 Feb 2024 12:03:28 -0800
Subject: [PATCH 02/17] set initially_zero for arena_static_zalloc

---
 src/arena.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/arena.c b/src/arena.c
index 42aac8fc..790b6ac1 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -162,6 +162,7 @@ static void* mi_arena_static_zalloc(size_t size, size_t alignment, mi_memid_t* m
 
   // success
   *memid = _mi_memid_create(MI_MEM_STATIC);
+  memid->initially_zero = true;
   const size_t start = _mi_align_up(oldtop, alignment);
   uint8_t* const p = &mi_arena_static[start];
   _mi_memzero(p, size);
@@ -179,8 +180,10 @@ static void* mi_arena_meta_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* st
   p = _mi_os_alloc(size, memid, stats);
   if (p == NULL) return NULL;
 
+  // zero the OS memory if needed
   if (!memid->initially_zero) {
     _mi_memzero_aligned(p, size);
+    memid->initially_zero = true;
   }
   return p;
 }

From 3966953b7f0f11d2ec33097c5da4356d5b7db7e8 Mon Sep 17 00:00:00 2001
From: Daan Leijen <daan@microsoft.com>
Date: Sat, 2 Mar 2024 11:50:57 -0800
Subject: [PATCH 03/17] prefer using __builtin_thread_pointer over assembly
 primitives. Fixes #851 and #852 as well.

---
 CMakeLists.txt          |  4 +--
 include/mimalloc/prim.h | 71 ++++++++++++++++++++++++++---------------
 2 files changed, 47 insertions(+), 28 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1387e0db..bd98019b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -128,7 +128,7 @@ endif()
 
 if(MI_SECURE)
   message(STATUS "Set full secure build (MI_SECURE=ON)")
-  list(APPEND mi_defines MI_SECURE=4)  
+  list(APPEND mi_defines MI_SECURE=4)
 endif()
 
 if(MI_TRACK_VALGRIND)
@@ -468,7 +468,7 @@ if (MI_BUILD_OBJECT)
     set(mimalloc-obj-static "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/mimalloc-obj.dir/src/static.c${CMAKE_C_OUTPUT_EXTENSION}")
     set(mimalloc-obj-out    "${CMAKE_CURRENT_BINARY_DIR}/${mi_basename}${CMAKE_C_OUTPUT_EXTENSION}")
     add_custom_command(OUTPUT ${mimalloc-obj-out} DEPENDS mimalloc-obj COMMAND "${CMAKE_COMMAND}" -E copy "${mimalloc-obj-static}" "${mimalloc-obj-out}")
-    add_custom_target(mimalloc-obj-target ALL DEPENDS ${mimalloc-obj-out})      
+    add_custom_target(mimalloc-obj-target ALL DEPENDS ${mimalloc-obj-out})
   endif()
 
   # the following seems to lead to cmake warnings/errors on some systems, disable for now :-(
diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h
index 9e560696..c3844d8b 100644
--- a/include/mimalloc/prim.h
+++ b/include/mimalloc/prim.h
@@ -35,10 +35,10 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config );
 
 // Free OS memory
 int _mi_prim_free(void* addr, size_t size );
-  
+
 // Allocate OS memory. Return NULL on error.
 // The `try_alignment` is just a hint and the returned pointer does not have to be aligned.
-// If `commit` is false, the virtual memory range only needs to be reserved (with no access) 
+// If `commit` is false, the virtual memory range only needs to be reserved (with no access)
 // which will later be committed explicitly using `_mi_prim_commit`.
 // `is_zero` is set to true if the memory was zero initialized (as on most OS's)
 // pre: !commit => !allow_large
@@ -82,11 +82,11 @@ mi_msecs_t _mi_prim_clock_now(void);
 typedef struct mi_process_info_s {
   mi_msecs_t  elapsed;
   mi_msecs_t  utime;
-  mi_msecs_t  stime; 
-  size_t      current_rss; 
-  size_t      peak_rss;  
+  mi_msecs_t  stime;
+  size_t      current_rss;
+  size_t      peak_rss;
   size_t      current_commit;
-  size_t      peak_commit; 
+  size_t      peak_commit;
   size_t      page_faults;
 } mi_process_info_t;
 
@@ -117,7 +117,7 @@ void _mi_prim_thread_associate_default_heap(mi_heap_t* heap);
 
 //-------------------------------------------------------------------
 // Thread id: `_mi_prim_thread_id()`
-// 
+//
 // Getting the thread id should be performant as it is called in the
 // fast path of `_mi_free` and we specialize for various platforms as
 // inlined definitions. Regular code should call `init.c:_mi_thread_id()`.
@@ -125,26 +125,14 @@ void _mi_prim_thread_associate_default_heap(mi_heap_t* heap);
 // for each thread (unequal to zero).
 //-------------------------------------------------------------------
 
-// defined in `init.c`; do not use these directly
-extern mi_decl_thread mi_heap_t* _mi_heap_default;  // default heap to allocate from
-extern bool _mi_process_is_initialized;             // has mi_process_init been called?
-
-static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept;
-
-#if defined(_WIN32)
-
-#define WIN32_LEAN_AND_MEAN
-#include <windows.h>
-static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
-  // Windows: works on Intel and ARM in both 32- and 64-bit
-  return (uintptr_t)NtCurrentTeb();
-}
-
-// We use assembly for a fast thread id on the main platforms. The TLS layout depends on
-// both the OS and libc implementation so we use specific tests for each main platform.
+// On some libc + platform combinations we can directly access a thread-local storage (TLS) slot.
+// The TLS layout depends on both the OS and libc implementation so we use specific tests for each main platform.
 // If you test on another platform and it works please send a PR :-)
 // see also https://akkadia.org/drepper/tls.pdf for more info on the TLS register.
-#elif defined(__GNUC__) && ( \
+//
+// Note: on most platforms this is not actually used anymore as we prefer `__builtin_thread_pointer()` nowadays.
+// However, we do still use it with older clang compilers and on Apple OS (as we use a TLS slot for the default heap there).
+#if defined(__GNUC__) && ( \
            (defined(__GLIBC__)   && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \
         || (defined(__APPLE__)   && (defined(__x86_64__) || defined(__aarch64__))) \
         || (defined(__BIONIC__)  && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \
@@ -152,6 +140,8 @@ static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
         || (defined(__OpenBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \
       )
 
+#define MI_HAS_TLS_SLOT
+
 static inline void* mi_prim_tls_slot(size_t slot) mi_attr_noexcept {
   void* res;
   const size_t ofs = (slot*sizeof(void*));
@@ -205,6 +195,33 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce
   #endif
 }
 
+#endif
+
+// defined in `init.c`; do not use these directly
+extern mi_decl_thread mi_heap_t* _mi_heap_default;  // default heap to allocate from
+extern bool _mi_process_is_initialized;             // has mi_process_init been called?
+
+static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept;
+
+#if defined(_WIN32)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
+  // Windows: works on Intel and ARM in both 32- and 64-bit
+  return (uintptr_t)NtCurrentTeb();
+}
+
+#elif defined(__has_builtin) && __has_builtin(__builtin_thread_pointer) && \
+      (!defined(__clang_major__) || __clang_major__ >= 14)  // older clang versions emit bad code; fall back to using the TLS slot (<https://lore.kernel.org/linux-arm-kernel/202110280952.352F66D8@keescook/T/>)
+
+static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
+  // Works on most Unix based platforms
+  return (uintptr_t)__builtin_thread_pointer();
+}
+
+#elif defined(MI_HAS_TLS_SLOT)
+
 static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
   #if defined(__BIONIC__)
     // issue #384, #495: on the Bionic libc (Android), slot 1 is the thread id
@@ -251,7 +268,6 @@ static inline mi_heap_t* mi_prim_get_default_heap(void);
 #if defined(MI_MALLOC_OVERRIDE)
 #if defined(__APPLE__) // macOS
   #define MI_TLS_SLOT               89  // seems unused?
-  // #define MI_TLS_RECURSE_GUARD 1
   // other possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89)
   // see <https://github.com/rweichler/substrate/blob/master/include/pthread_machdep.h>
 #elif defined(__OpenBSD__)
@@ -269,6 +285,9 @@ static inline mi_heap_t* mi_prim_get_default_heap(void);
 
 
 #if defined(MI_TLS_SLOT)
+# if !defined(MI_HAS_TLS_SLOT)
+#  error "trying to use a TLS slot for the default heap, but the mi_prim_tls_slot primitives are not defined
+# endif
 
 static inline mi_heap_t* mi_prim_get_default_heap(void) {
   mi_heap_t* heap = (mi_heap_t*)mi_prim_tls_slot(MI_TLS_SLOT);

From dfb5cadf33437aba9372e0d580022101deaee1d6 Mon Sep 17 00:00:00 2001
From: Daan <daanl@outlook.com>
Date: Sat, 2 Mar 2024 14:06:34 -0800
Subject: [PATCH 04/17] don't use the new __builtin_thread_pointer on macOS

---
 include/mimalloc/prim.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h
index c3844d8b..830b36c4 100644
--- a/include/mimalloc/prim.h
+++ b/include/mimalloc/prim.h
@@ -213,11 +213,12 @@ static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
 }
 
 #elif defined(__has_builtin) && __has_builtin(__builtin_thread_pointer) && \
+      (!defined(__APPLE__)) && /* on apple (M1) the wrong register is read (tpidr_el0 instead of tpidrro_el0) so fall back to TLS slot assembly (<https://github.com/microsoft/mimalloc/issues/343#issuecomment-763272369>)*/ \
       (!defined(__clang_major__) || __clang_major__ >= 14)  // older clang versions emit bad code; fall back to using the TLS slot (<https://lore.kernel.org/linux-arm-kernel/202110280952.352F66D8@keescook/T/>)
 
 static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept {
   // Works on most Unix based platforms
-  return (uintptr_t)__builtin_thread_pointer();
+  return (uintptr_t)__builtin_thread_pointer();  
 }
 
 #elif defined(MI_HAS_TLS_SLOT)

From 1f2d799ed0684e94489802502a709822581a537d Mon Sep 17 00:00:00 2001
From: Daan <daanl@outlook.com>
Date: Sat, 2 Mar 2024 14:14:59 -0800
Subject: [PATCH 05/17] possible fix for #855

---
 src/segment-map.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/segment-map.c b/src/segment-map.c
index 4c2104bd..a306ec67 100644
--- a/src/segment-map.c
+++ b/src/segment-map.c
@@ -29,6 +29,7 @@ terms of the MIT license. A copy of the license can be found in the file
 static _Atomic(uintptr_t) mi_segment_map[MI_SEGMENT_MAP_WSIZE + 1];  // 2KiB per TB with 64MiB segments
 
 static size_t mi_segment_map_index_of(const mi_segment_t* segment, size_t* bitidx) {
+  // note: segment can be invalid or NULL.
   mi_assert_internal(_mi_ptr_segment(segment + 1) == segment); // is it aligned on MI_SEGMENT_SIZE?
   if ((uintptr_t)segment >= MI_MAX_ADDRESS) {
     *bitidx = 0;
@@ -70,8 +71,7 @@ void _mi_segment_map_freed_at(const mi_segment_t* segment) {
 // Determine the segment belonging to a pointer or NULL if it is not in a valid segment.
 static mi_segment_t* _mi_segment_of(const void* p) {
   if (p == NULL) return NULL;
-  mi_segment_t* segment = _mi_ptr_segment(p);
-  mi_assert_internal(segment != NULL);
+  mi_segment_t* segment = _mi_ptr_segment(p);  // segment can be NULL  
   size_t bitidx;
   size_t index = mi_segment_map_index_of(segment, &bitidx);
   // fast path: for any pointer to valid small/medium/large object or first MI_SEGMENT_SIZE in huge

From 89afa14045b9bceac4e93cb54ea02799ebc57f45 Mon Sep 17 00:00:00 2001
From: Daan <daanl@outlook.com>
Date: Sat, 2 Mar 2024 14:25:16 -0800
Subject: [PATCH 06/17] fix build on illumos; by @dancrossnyc, issue #841

---
 src/prim/unix/prim.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c
index 54bf57b2..d99f6097 100644
--- a/src/prim/unix/prim.c
+++ b/src/prim/unix/prim.c
@@ -310,7 +310,7 @@ static void* unix_mmap(void* addr, size_t size, size_t try_alignment, int protec
       #elif defined(__sun)
       if (allow_large && _mi_os_use_large_page(size, try_alignment)) {
         struct memcntl_mha cmd = {0};
-        cmd.mha_pagesize = large_os_page_size;
+        cmd.mha_pagesize = _mi_os_large_page_size();
         cmd.mha_cmd = MHA_MAPSIZE_VA;
         if (memcntl((caddr_t)p, size, MC_HAT_ADVISE, (caddr_t)&cmd, 0, 0) == 0) {
           *is_large = true;

From bccf10e1643bb25fce0d83f72e531752a07585e8 Mon Sep 17 00:00:00 2001
From: Daan <daanl@outlook.com>
Date: Sat, 2 Mar 2024 14:49:37 -0800
Subject: [PATCH 07/17] allow random fallback on older macOS versions, issue
 #829

---
 src/prim/unix/prim.c | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c
index d99f6097..91fa6508 100644
--- a/src/prim/unix/prim.c
+++ b/src/prim/unix/prim.c
@@ -37,6 +37,7 @@ terms of the MIT license. A copy of the license can be found in the file
   #include <sys/mman.h>
   #endif
 #elif defined(__APPLE__)
+  #include <AvailabilityMacros.h>
   #include <TargetConditionals.h>
   #if !TARGET_IOS_IPHONE && !TARGET_IOS_SIMULATOR
   #include <mach/vm_statistics.h>
@@ -55,12 +56,14 @@ terms of the MIT license. A copy of the license can be found in the file
   #include <sys/syscall.h>
 #endif
 
+
 //------------------------------------------------------------------------------------
 // Use syscalls for some primitives to allow for libraries that override open/read/close etc.
 // and do allocation themselves; using syscalls prevents recursion when mimalloc is 
 // still initializing (issue #713)
 //------------------------------------------------------------------------------------
 
+
 #if defined(MI_HAS_SYSCALL_H) && defined(SYS_open) && defined(SYS_close) && defined(SYS_read) && defined(SYS_access)
 
 static int mi_prim_open(const char* fpath, int open_flags) {
@@ -76,7 +79,9 @@ static int mi_prim_access(const char *fpath, int mode) {
   return syscall(SYS_access,fpath,mode);
 }
 
-#elif !defined(__APPLE__)  // avoid unused warnings
+#elif (!defined(__APPLE__) || MAC_OS_X_VERSION_MIN_REQUIRED < 1070)  // avoid unused warnings on macOS
+
+#include <fcntl.h>
 
 static int mi_prim_open(const char* fpath, int open_flags) {
   return open(fpath,open_flags);
@@ -731,28 +736,20 @@ bool _mi_prim_getenv(const char* name, char* result, size_t result_size) {
 // Random
 //----------------------------------------------------------------
 
-#if defined(__APPLE__)
-
-#include <AvailabilityMacros.h>
-#if defined(MAC_OS_X_VERSION_10_10) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_10
+#if defined(MAC_OS_X_VERSION_10_15) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_15 && MAC_OS_X_VERSION_MIN_REQUIRED >= 1070
 #include <CommonCrypto/CommonCryptoError.h>
 #include <CommonCrypto/CommonRandom.h>
-#endif
+
 bool _mi_prim_random_buf(void* buf, size_t buf_len) {
-  #if defined(MAC_OS_X_VERSION_10_15) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_15
-    // We prefere CCRandomGenerateBytes as it returns an error code while arc4random_buf
-    // may fail silently on macOS. See PR #390, and <https://opensource.apple.com/source/Libc/Libc-1439.40.11/gen/FreeBSD/arc4random.c.auto.html>
-    return (CCRandomGenerateBytes(buf, buf_len) == kCCSuccess);
-  #else
-    // fall back on older macOS
-    arc4random_buf(buf, buf_len);
-    return true;
-  #endif
+  // We prefer CCRandomGenerateBytes as it returns an error code while arc4random_buf
+  // may fail silently on macOS. See PR #390, and <https://opensource.apple.com/source/Libc/Libc-1439.40.11/gen/FreeBSD/arc4random.c.auto.html>
+  return (CCRandomGenerateBytes(buf, buf_len) == kCCSuccess);  
 }
 
 #elif defined(__ANDROID__) || defined(__DragonFly__) || \
       defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
-      defined(__sun) 
+      defined(__sun) || \
+      (defined(MAC_OS_X_VERSION_10_10) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_10 && MAC_OS_X_VERSION_MIN_REQUIRED >= 1070)
 
 #include <stdlib.h>
 bool _mi_prim_random_buf(void* buf, size_t buf_len) {
@@ -760,7 +757,7 @@ bool _mi_prim_random_buf(void* buf, size_t buf_len) {
   return true;
 }
 
-#elif defined(__linux__) || defined(__HAIKU__)
+#elif defined(__APPLE__) || defined(__linux__) || defined(__HAIKU__)   // for old apple versions < 1070 (issue #829)
 
 #include <sys/types.h>
 #include <sys/stat.h>

From d21f60f71281fdc0dcabb77be01d4d897eb09ca0 Mon Sep 17 00:00:00 2001
From: Daan <daanl@outlook.com>
Date: Sat, 2 Mar 2024 15:00:31 -0800
Subject: [PATCH 08/17] add emscripten WASM support; this PR #822 written by
 Alon Zakai @kripken

---
 src/prim/emscripten/prim.c | 251 +++++++++++++++++++++++++++++++++++++
 src/prim/prim.c            |   3 +
 2 files changed, 254 insertions(+)
 create mode 100644 src/prim/emscripten/prim.c

diff --git a/src/prim/emscripten/prim.c b/src/prim/emscripten/prim.c
new file mode 100644
index 00000000..c0fa0f4a
--- /dev/null
+++ b/src/prim/emscripten/prim.c
@@ -0,0 +1,251 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2023, Microsoft Research, Daan Leijen, Alon Zakai
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+// This file is included in `src/prim/prim.c`
+
+#include "mimalloc.h"
+#include "mimalloc/internal.h"
+#include "mimalloc/atomic.h"
+#include "mimalloc/prim.h"
+
+// Design
+// ======
+//
+// mimalloc is built on top of emmalloc. emmalloc is a minimal allocator on top
+// of sbrk. The reason for having three layers here is that we want mimalloc to
+// be able to allocate and release system memory properly, the same way it would
+// when using VirtualAlloc on Windows or mmap on POSIX, and sbrk is too limited.
+// Specifically, sbrk can only go up and down, and not "skip" over regions, and
+// so we end up either never freeing memory to the system, or we can get stuck
+// with holes.
+//
+// At the moment wasm generally does *not* free memory back to the system: once
+// grown, we do not shrink back down (https://github.com/WebAssembly/design/issues/1397).
+// However, that is expected to improve
+// (https://github.com/WebAssembly/memory-control/blob/main/proposals/memory-control/Overview.md)
+// and so we do not want to bake those limitations in here.
+//
+// Even without that issue, we want our system allocator to handle holes, that
+// is, it should merge freed regions and allow allocating new content there of
+// the full size, etc., so that we do not waste space. That means that the
+// system allocator really does need to handle the general problem of allocating
+// and freeing variable-sized chunks of memory in a random order, like malloc/
+// free do. And so it makes sense to layer mimalloc on top of such an
+// implementation.
+//
+// emmalloc makes sense for the lower level because it is small and simple while
+// still fully handling merging of holes etc. It is not the most efficient
+// allocator, but our assumption is that mimalloc needs to be fast while the
+// system allocator underneath it is called much less frequently.
+//
+
+//---------------------------------------------
+// init
+//---------------------------------------------
+
+void _mi_prim_mem_init( mi_os_mem_config_t* config) {  // fill in the OS memory configuration for the Emscripten backend
+  config->page_size = 64*MI_KiB; // WebAssembly has a fixed page size: 64KiB
+  config->alloc_granularity = 16;
+  config->has_overcommit = false;
+  config->must_free_whole = true;  // memory goes back through `emmalloc_free`, which only takes the block pointer
+  config->has_virtual_reserve = false;
+}
+
+extern void emmalloc_free(void*);
+
+int _mi_prim_free(void* addr, size_t size) {  // hand `addr` back to emmalloc; always returns 0
+  MI_UNUSED(size);  // `emmalloc_free` takes no size argument
+  emmalloc_free(addr);
+  return 0;
+}
+
+
+//---------------------------------------------
+// Allocation
+//---------------------------------------------
+
+extern void* emmalloc_memalign(size_t, size_t);
+
+// Allocate `size` bytes through emmalloc; stores the pointer in `*addr` and returns 0, or ENOMEM (with `*addr` NULL) on failure.
+// Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned.
+int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) {
+  MI_UNUSED(allow_large); MI_UNUSED(commit);  // `try_alignment` is read (and clamped) below, so it is not marked unused
+  *is_large = false;
+  // TODO: Track the highest address ever seen; first uses of it are zeroes.
+  //       That assumes no one else uses sbrk but us (they could go up,
+  //       scribble, and then down), but we could assert on that perhaps.
+  *is_zero = false;
+  // emmalloc has some limitations on alignment size.
+  // TODO: Why does mimalloc ask for an align of 4MB? that ends up allocating
+  //       8, which wastes quite a lot for us in wasm. If that is unavoidable,
+  //       we may want to improve emmalloc to support such alignment. See also
+  //       https://github.com/emscripten-core/emscripten/issues/20645
+  #define MIN_EMMALLOC_ALIGN           8
+  #define MAX_EMMALLOC_ALIGN (1024*1024)
+  if (try_alignment < MIN_EMMALLOC_ALIGN) {
+    try_alignment = MIN_EMMALLOC_ALIGN;
+  } else if (try_alignment > MAX_EMMALLOC_ALIGN) {
+    try_alignment = MAX_EMMALLOC_ALIGN;
+  }
+  #undef MIN_EMMALLOC_ALIGN  // keep the helper macros local to this function
+  #undef MAX_EMMALLOC_ALIGN
+  void* p = emmalloc_memalign(try_alignment, size);
+  *addr = p;
+  return (p == NULL ? ENOMEM : 0);
+}
+
+
+//---------------------------------------------
+// Commit/Reset
+//---------------------------------------------
+
+int _mi_prim_commit(void* addr, size_t size, bool* is_zero) {  // no-op: nothing is done to the range; always returns 0
+  MI_UNUSED(addr); MI_UNUSED(size);
+  // See the TODO in `_mi_prim_alloc`: the memory is not reported as zeroed.
+  *is_zero = false;
+  return 0;
+}
+
+int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit) {  // no-op: the range is left as it is; always returns 0
+  MI_UNUSED(addr); MI_UNUSED(size);
+  *needs_recommit = false;  // nothing was decommitted here, so report that no recommit is needed
+  return 0;
+}
+
+int _mi_prim_reset(void* addr, size_t size) {  // no-op: the range is not touched; always returns 0
+  MI_UNUSED(addr); MI_UNUSED(size);
+  return 0;
+}
+
+int _mi_prim_protect(void* addr, size_t size, bool protect) {  // no-op: no protection is applied or removed; always returns 0
+  MI_UNUSED(addr); MI_UNUSED(size); MI_UNUSED(protect);
+  return 0;
+}
+
+
+//---------------------------------------------
+// Huge pages and NUMA nodes
+//---------------------------------------------
+
+int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr) {  // never allocates: always fails with ENOSYS
+  MI_UNUSED(hint_addr); MI_UNUSED(size); MI_UNUSED(numa_node);
+  *is_zero = true;
+  *addr = NULL;  // no memory is ever returned
+  return ENOSYS;
+}
+
+size_t _mi_prim_numa_node(void) {  // the current NUMA node is always reported as 0
+  return 0;
+}
+
+size_t _mi_prim_numa_node_count(void) {  // always reports a single NUMA node
+  return 1;
+}
+
+
+//----------------------------------------------------------------
+// Clock
+//----------------------------------------------------------------
+
+#include <emscripten/html5.h>
+
+mi_msecs_t _mi_prim_clock_now(void) {  // current time as reported by `emscripten_date_now`
+  return emscripten_date_now();  // NOTE(review): presumably wall-clock milliseconds (like JS `Date.now()`), not monotonic -- confirm
+}
+
+
+//----------------------------------------------------------------
+// Process info
+//----------------------------------------------------------------
+
+void _mi_prim_process_info(mi_process_info_t* pinfo)  // stub: gathers no process statistics
+{
+  // use defaults: nothing is written to `pinfo`
+  MI_UNUSED(pinfo);
+}
+
+
+//----------------------------------------------------------------
+// Output
+//----------------------------------------------------------------
+
+#include <emscripten/console.h>
+
+void _mi_prim_out_stderr( const char* msg) {  // emit `msg` through `emscripten_console_error`
+  emscripten_console_error(msg);
+}
+
+
+//----------------------------------------------------------------
+// Environment
+//----------------------------------------------------------------
+
+bool _mi_prim_getenv(const char* name, char* result, size_t result_size) {  // environment lookup stub
+  // For code size reasons, do not support environ customization for now.
+  MI_UNUSED(name);
+  MI_UNUSED(result);
+  MI_UNUSED(result_size);
+  return false;  // always false; `result` is never written
+}
+
+
+//----------------------------------------------------------------
+// Random
+//----------------------------------------------------------------
+
+bool _mi_prim_random_buf(void* buf, size_t buf_len) {  // request `buf_len` random bytes into `buf` via `getentropy`
+  int err = getentropy(buf, buf_len);
+  return !err;  // true only when `getentropy` returned 0
+}
+
+
+//----------------------------------------------------------------
+// Thread init/done
+//----------------------------------------------------------------
+
+#ifdef __EMSCRIPTEN_SHARED_MEMORY__
+
+// use pthread local storage keys to detect thread ending
+// (and used with MI_TLS_PTHREADS for the default heap)
+pthread_key_t _mi_heap_default_key = (pthread_key_t)(-1);
+
+static void mi_pthread_done(void* value) {  // destructor passed to `pthread_key_create` for `_mi_heap_default_key`
+  if (value!=NULL) {
+    _mi_thread_done((mi_heap_t*)value);  // the key value is treated as the thread's `mi_heap_t*`
+  }
+}
+
+void _mi_prim_thread_init_auto_done(void) {  // create the pthread key, with `mi_pthread_done` as its destructor
+  mi_assert_internal(_mi_heap_default_key == (pthread_key_t)(-1));  // the key must still have its initial (-1) value
+  pthread_key_create(&_mi_heap_default_key, &mi_pthread_done);  // NOTE(review): the return value is not checked
+}
+
+void _mi_prim_thread_done_auto_done(void) {
+  // nothing to do; NOTE(review): the key made by `pthread_key_create` is never deleted -- confirm that is intended
+}
+
+void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) {  // store `heap` as the calling thread's value for the key
+  if (_mi_heap_default_key != (pthread_key_t)(-1)) {  // skip while the key still has its initial (-1) value; note carried over from the unix backend: can happen during recursive invocation on freeBSD
+    pthread_setspecific(_mi_heap_default_key, heap);
+  }
+}
+
+#else
+
+void _mi_prim_thread_init_auto_done(void) {
+  // nothing: this is the variant built when __EMSCRIPTEN_SHARED_MEMORY__ is not defined
+}
+
+void _mi_prim_thread_done_auto_done(void) {
+  // nothing: this is the variant built when __EMSCRIPTEN_SHARED_MEMORY__ is not defined
+}
+
+void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) {  // no-op when __EMSCRIPTEN_SHARED_MEMORY__ is not defined
+  MI_UNUSED(heap);
+
+}
+#endif
diff --git a/src/prim/prim.c b/src/prim/prim.c
index 9a597d8e..3b7d3736 100644
--- a/src/prim/prim.c
+++ b/src/prim/prim.c
@@ -18,6 +18,9 @@ terms of the MIT license. A copy of the license can be found in the file
 #define MI_USE_SBRK
 #include "wasi/prim.c"     // memory-grow or sbrk (Wasm)
 
+#elif defined(__EMSCRIPTEN__)
+#include "emscripten/prim.c" // emmalloc_*, + pthread support
+
 #else
 #include "unix/prim.c"     // mmap() (Linux, macOSX, BSD, Illumnos, Haiku, DragonFly, etc.)
 

From 98abfe042cbb168309832b744bbed982d81bba6b Mon Sep 17 00:00:00 2001
From: Daan <daanl@outlook.com>
Date: Sat, 2 Mar 2024 15:08:22 -0800
Subject: [PATCH 09/17] avoid syscall on openBSD, issue #821 by @blackgnezdo

---
 src/prim/unix/prim.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c
index 91fa6508..5a1088e0 100644
--- a/src/prim/unix/prim.c
+++ b/src/prim/unix/prim.c
@@ -27,10 +27,10 @@ terms of the MIT license. A copy of the license can be found in the file
 
 #include <sys/mman.h>  // mmap
 #include <unistd.h>    // sysconf
-
+#include <fcntl.h>     // open, close, read, access
+  
 #if defined(__linux__)
   #include <features.h>
-  #include <fcntl.h>
   #if defined(__GLIBC__)
   #include <linux/mman.h> // linux mmap flags
   #else
@@ -51,7 +51,7 @@ terms of the MIT license. A copy of the license can be found in the file
   #include <sys/sysctl.h>
 #endif
 
-#if !defined(__HAIKU__) && !defined(__APPLE__) && !defined(__CYGWIN__)
+#if !defined(__HAIKU__) && !defined(__APPLE__) && !defined(__CYGWIN__) && !defined(__OpenBSD__)
   #define MI_HAS_SYSCALL_H
   #include <sys/syscall.h>
 #endif
@@ -81,8 +81,6 @@ static int mi_prim_access(const char *fpath, int mode) {
 
 #elif (!defined(__APPLE__) || MAC_OS_X_VERSION_MIN_REQUIRED < 1070)  // avoid unused warnings on macOS
 
-#include <fcntl.h>
-
 static int mi_prim_open(const char* fpath, int open_flags) {
   return open(fpath,open_flags);
 }
@@ -761,7 +759,6 @@ bool _mi_prim_random_buf(void* buf, size_t buf_len) {
 
 #include <sys/types.h>
 #include <sys/stat.h>
-#include <fcntl.h>
 #include <errno.h>
 
 bool _mi_prim_random_buf(void* buf, size_t buf_len) {

From 5634527fae4827d1ed470e093c2bac403cf82aaf Mon Sep 17 00:00:00 2001
From: Daan <daanl@outlook.com>
Date: Sat, 2 Mar 2024 15:26:42 -0800
Subject: [PATCH 10/17] add terminating quote

---
 include/mimalloc/prim.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h
index 830b36c4..1a6adcc7 100644
--- a/include/mimalloc/prim.h
+++ b/include/mimalloc/prim.h
@@ -287,7 +287,7 @@ static inline mi_heap_t* mi_prim_get_default_heap(void);
 
 #if defined(MI_TLS_SLOT)
 # if !defined(MI_HAS_TLS_SLOT)
-#  error "trying to use a TLS slot for the default heap, but the mi_prim_tls_slot primitives are not defined
+#  error "trying to use a TLS slot for the default heap, but the mi_prim_tls_slot primitives are not defined"
 # endif
 
 static inline mi_heap_t* mi_prim_get_default_heap(void) {

From b7d44378bb2840dcad4db5d921e552ddbc960361 Mon Sep 17 00:00:00 2001
From: Daan <daanl@outlook.com>
Date: Sat, 2 Mar 2024 15:32:35 -0800
Subject: [PATCH 11/17] avoid unused function warning on Solaris, PR #830 by
 @kulikjak

---
 src/prim/unix/prim.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c
index 5a1088e0..94ae7946 100644
--- a/src/prim/unix/prim.c
+++ b/src/prim/unix/prim.c
@@ -51,7 +51,7 @@ terms of the MIT license. A copy of the license can be found in the file
   #include <sys/sysctl.h>
 #endif
 
-#if !defined(__HAIKU__) && !defined(__APPLE__) && !defined(__CYGWIN__) && !defined(__OpenBSD__)
+#if !defined(__HAIKU__) && !defined(__APPLE__) && !defined(__CYGWIN__) && !defined(__OpenBSD__) && !defined(__sun)
   #define MI_HAS_SYSCALL_H
   #include <sys/syscall.h>
 #endif
@@ -79,7 +79,7 @@ static int mi_prim_access(const char *fpath, int mode) {
   return syscall(SYS_access,fpath,mode);
 }
 
-#elif (!defined(__APPLE__) || MAC_OS_X_VERSION_MIN_REQUIRED < 1070)  // avoid unused warnings on macOS
+#elif (!defined(__APPLE__) || MAC_OS_X_VERSION_MIN_REQUIRED < 1070) && !defined(__sun) // avoid unused warnings on macOS and Solaris
 
 static int mi_prim_open(const char* fpath, int open_flags) {
   return open(fpath,open_flags);

From cc4500a024cc315f368e971fab556b6c17ec14ed Mon Sep 17 00:00:00 2001
From: Daan <daanl@outlook.com>
Date: Sat, 2 Mar 2024 15:36:57 -0800
Subject: [PATCH 12/17] ensure consistent types for template deduction, PR #834
 by @dg0yt

---
 src/arena.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/arena.c b/src/arena.c
index 790b6ac1..09afd890 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -482,7 +482,7 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t bitmap_idx, size_t
     // schedule decommit
     mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire);
     if (expire != 0) {
-      mi_atomic_addi64_acq_rel(&arena->purge_expire, delay/10);  // add smallish extra delay
+      mi_atomic_addi64_acq_rel(&arena->purge_expire, (mi_msecs_t)(delay/10));  // add smallish extra delay
     }
     else {
       mi_atomic_storei64_release(&arena->purge_expire, _mi_clock_now() + delay);
@@ -526,7 +526,7 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi
   if (!force && expire > now) return false;
 
   // reset expire (if not already set concurrently)
-  mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire, 0);
+  mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire, (mi_msecs_t)0);
 
   // potential purges scheduled, walk through the bitmap
   bool any_purged = false;

From 944ec1ab8acfc38137f716d85ceb7f896a39f852 Mon Sep 17 00:00:00 2001
From: Daan <daanl@outlook.com>
Date: Sat, 2 Mar 2024 15:47:07 -0800
Subject: [PATCH 13/17] Fix error: cannot use 'throw' with exceptions disabled
 #815, by @sergio-nsk

---
 src/alloc.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/alloc.c b/src/alloc.c
index b17fdbdc..484a3e5b 100644
--- a/src/alloc.c
+++ b/src/alloc.c
@@ -908,9 +908,13 @@ static bool mi_try_new_handler(bool nothrow) {
   #endif
   if (h==NULL) {
     _mi_error_message(ENOMEM, "out of memory in 'new'");
+    #if defined(_CPPUNWIND) || defined(__cpp_exceptions)  // exceptions are not always enabled
     if (!nothrow) {
       throw std::bad_alloc();
     }
+    #else
+    MI_UNUSED(nothrow);
+    #endif
     return false;
   }
   else {

From 7b398ad9244769974cc63d88c5e2dc5525fa02a2 Mon Sep 17 00:00:00 2001
From: Daan <daanl@outlook.com>
Date: Sat, 2 Mar 2024 15:51:51 -0800
Subject: [PATCH 14/17] delete pthread key at shutdown, PR #810 by
 @jkriegshauser

---
 src/prim/unix/prim.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c
index 94ae7946..4490c058 100644
--- a/src/prim/unix/prim.c
+++ b/src/prim/unix/prim.c
@@ -829,7 +829,9 @@ void _mi_prim_thread_init_auto_done(void) {
 }
 
 void _mi_prim_thread_done_auto_done(void) {
-  // nothing to do
+  if (_mi_heap_default_key != (pthread_key_t)(-1)) {  // do not leak the key, see issue #809
+    pthread_key_delete(_mi_heap_default_key);
+  }
 }
 
 void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) {

From 3fe3d540b67f69d4bb12bfa88c993b12af0da424 Mon Sep 17 00:00:00 2001
From: Daan <daanl@outlook.com>
Date: Sat, 2 Mar 2024 15:57:54 -0800
Subject: [PATCH 15/17] Fix incorrect MAP_HUGE_1GB check #793

---
 src/prim/unix/prim.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c
index 4490c058..3e159e6c 100644
--- a/src/prim/unix/prim.c
+++ b/src/prim/unix/prim.c
@@ -279,7 +279,7 @@ static void* unix_mmap(void* addr, size_t size, size_t try_alignment, int protec
         *is_large = true;
         p = unix_mmap_prim(addr, size, try_alignment, protect_flags, lflags, lfd);
         #ifdef MAP_HUGE_1GB
-        if (p == NULL && (lflags & MAP_HUGE_1GB) != 0) {
+        if (p == NULL && (lflags & MAP_HUGE_1GB) == MAP_HUGE_1GB) {
           mi_huge_pages_available = false; // don't try huge 1GiB pages again
           _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (errno: %i)\n", errno);
           lflags = ((lflags & ~MAP_HUGE_1GB) | MAP_HUGE_2MB);

From 5d22157dc8711e547608c7f31f1560b7e814d6c4 Mon Sep 17 00:00:00 2001
From: Daan <daanl@outlook.com>
Date: Sat, 2 Mar 2024 16:41:49 -0800
Subject: [PATCH 16/17] support tls_slot for PPC #781, by @barracuda156

---
 include/mimalloc/prim.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h
index 1a6adcc7..d14b885b 100644
--- a/include/mimalloc/prim.h
+++ b/include/mimalloc/prim.h
@@ -134,7 +134,7 @@ void _mi_prim_thread_associate_default_heap(mi_heap_t* heap);
 // However, we do still use it with older clang compilers and Apple OS (as we use TLS slot for the default heap there).
 #if defined(__GNUC__) && ( \
            (defined(__GLIBC__)   && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \
-        || (defined(__APPLE__)   && (defined(__x86_64__) || defined(__aarch64__))) \
+        || (defined(__APPLE__)   && (defined(__x86_64__) || defined(__aarch64__) || defined(__POWERPC__))) \
         || (defined(__BIONIC__)  && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \
         || (defined(__FreeBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \
         || (defined(__OpenBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \
@@ -165,6 +165,9 @@ static inline void* mi_prim_tls_slot(size_t slot) mi_attr_noexcept {
     __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
     #endif
     res = tcb[slot];
+  #elif defined(__APPLE__) && defined(__POWERPC__) // ppc, issue #781
+    MI_UNUSED(ofs);
+    res = pthread_getspecific(slot);
   #endif
   return res;
 }
@@ -192,6 +195,9 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce
     __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
     #endif
     tcb[slot] = value;
+  #elif defined(__APPLE__) && defined(__POWERPC__) // ppc, issue #781
+    MI_UNUSED(ofs);
+    pthread_setspecific(slot, value);    
   #endif
 }
 

From c541a9b32e1dce320e5636c925eb6eb799a21da7 Mon Sep 17 00:00:00 2001
From: Daan <daanl@outlook.com>
Date: Sat, 2 Mar 2024 16:44:06 -0800
Subject: [PATCH 17/17] Revert setting hardcoded install paths on Haiku #788,
 by @begasus

---
 CMakeLists.txt | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index bd98019b..1a483ecd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -263,10 +263,11 @@ if(MI_USE_CXX)
   endif()
 endif()
 
-if(CMAKE_SYSTEM_NAME MATCHES "Haiku")
-   SET(CMAKE_INSTALL_LIBDIR ~/config/non-packaged/lib)
-   SET(CMAKE_INSTALL_INCLUDEDIR ~/config/non-packaged/headers)
- endif()
+# On Haiku use `-DCMAKE_INSTALL_PREFIX` instead, issue #788
+# if(CMAKE_SYSTEM_NAME MATCHES "Haiku")
+#   SET(CMAKE_INSTALL_LIBDIR ~/config/non-packaged/lib)
+#   SET(CMAKE_INSTALL_INCLUDEDIR ~/config/non-packaged/headers)
+# endif()
 
 # Compiler flags
 if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU")