From d5ed0cc71ef02b5ab986fa7ffc06b4c6e65dd622 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 9 Dec 2024 14:31:43 -0800 Subject: [PATCH] various improvements --- include/mimalloc/atomic.h | 3 + include/mimalloc/bits.h | 15 ++- include/mimalloc/types.h | 6 +- src/arena.c | 52 ++++++--- src/bitmap.c | 238 +++++++++++++++++++++----------------- src/bitmap.h | 20 +++- src/free.c | 7 +- src/init.c | 2 +- src/os.c | 13 +-- src/random.c | 19 ++- 10 files changed, 223 insertions(+), 152 deletions(-) diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index caa90cf8..3b0ff559 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -74,8 +74,11 @@ terms of the MIT license. A copy of the license can be found in the file #define mi_atomic_store_relaxed(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(relaxed)) #define mi_atomic_exchange_release(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(release)) #define mi_atomic_exchange_acq_rel(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(acq_rel)) + +#define mi_atomic_cas_weak_relaxed(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(relaxed),mi_memory_order(relaxed)) #define mi_atomic_cas_weak_release(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed)) #define mi_atomic_cas_weak_acq_rel(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire)) +#define mi_atomic_cas_strong_relaxed(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(relaxed),mi_memory_order(relaxed)) #define mi_atomic_cas_strong_release(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed)) #define mi_atomic_cas_strong_acq_rel(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire)) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index 3afac04d..e47d8a76 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -229,7 +229,7 @@ static inline bool mi_bsf(size_t x, size_t* idx) { unsigned long i; return (mi_msc_builtinz(_BitScanForward)(&i, x) ? (*idx = (size_t)i, true) : false); #else - return (x!=0 ? (*idx = mi_ctz(x), true) : false); + return (x!=0 ? (*idx = mi_ctz(x), true) : false); #endif } @@ -289,5 +289,18 @@ static inline size_t mi_rotl(size_t x, size_t r) { #endif } +static inline uint32_t mi_rotl32(uint32_t x, uint32_t r) { + #if mi_has_builtin(rotateleft32) + return mi_builtin(rotateleft32)(x,r); + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + return _lrotl(x, (int)r); + #else + // The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to + // avoid UB when `rshift==0`. See + const unsigned int rshift = (unsigned int)(r) & 31; + return ((x << rshift) | (x >> ((-rshift) & 31))); + #endif +} + #endif // MI_BITS_H diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index d507ca69..71edb397 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -334,9 +334,9 @@ typedef struct mi_page_s { // The max object size are checked to not waste more than 12.5% internally over the page sizes. 
// (Except for large pages since huge objects are allocated in 4MiB chunks) -#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // ~16KiB -#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // ~128KiB -#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/2) // ~2MiB +#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 8 KiB +#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 128 KiB +#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/2) // < 2 MiB #define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE) diff --git a/src/arena.c b/src/arena.c index ab74b988..24835f42 100644 --- a/src/arena.c +++ b/src/arena.c @@ -29,7 +29,8 @@ The arena allocation needs to be thread safe and we use an atomic bitmap to allo ----------------------------------------------------------- */ #define MI_ARENA_BIN_COUNT (MI_BIN_COUNT) - +#define MI_ARENA_MIN_SIZE (MI_BCHUNK_BITS * MI_ARENA_SLICE_SIZE) // 32 MiB (or 8 MiB on 32-bit) +#define MI_ARENA_MAX_SIZE (MI_BITMAP_MAX_BIT_COUNT * MI_ARENA_SLICE_SIZE) // A memory arena descriptor typedef struct mi_arena_s { @@ -105,7 +106,7 @@ size_t mi_arena_get_count(void) { mi_arena_t* mi_arena_from_index(size_t idx) { mi_assert_internal(idx < mi_arena_get_count()); - return mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[idx]); + return mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[idx]); } mi_arena_t* mi_arena_from_id(mi_arena_id_t id) { @@ -235,6 +236,12 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( } } } + if (memid->initially_zero) { + mi_track_mem_defined(p, slice_count * MI_ARENA_SLICE_SIZE); + } + else { + mi_track_mem_undefined(p, slice_count * MI_ARENA_SLICE_SIZE); + } } else { // no need to commit, but check if already fully committed @@ -253,7 +260,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( // try to reserve a fresh arena space static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t* arena_id) { - if (_mi_preloading()) return false; // use OS only while pre loading + // if (_mi_preloading()) return false; // use OS only while pre loading if (req_arena_id != _mi_arena_id_none()) return false; const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count); @@ -269,8 +276,8 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_SLICE_SIZE); if (arena_count >= 1 && arena_count <= 128) { - // scale up the arena sizes exponentially every 8 entries - const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16); + // scale up the arena sizes exponentially every 4 entries + const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/4, 0, 16); size_t reserve = 0; if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) { arena_reserve = reserve; @@ -278,8 +285,8 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re } // check arena bounds - const size_t min_reserve = 8 * MI_ARENA_SLICE_SIZE; // hope that fits minimal bitmaps? 
- const size_t max_reserve = MI_BITMAP_MAX_BIT_COUNT * MI_ARENA_SLICE_SIZE; // 16 GiB + const size_t min_reserve = MI_ARENA_MIN_SIZE; + const size_t max_reserve = MI_ARENA_MAX_SIZE; // 16 GiB if (arena_reserve < min_reserve) { arena_reserve = min_reserve; } @@ -294,7 +301,17 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = _mi_os_has_overcommit(); } else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; } - return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id) == 0); + // and try to reserve the arena + int err = mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id); + if (err != 0) { + // failed, try a smaller size? + const size_t small_arena_reserve = (MI_SIZE_BITS == 32 ? 128*MI_MiB : 1*MI_GiB); + if (arena_reserve > small_arena_reserve) { + // try again + err = mi_reserve_os_memory_ex(small_arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id); + } + } + return (err==0); } @@ -317,12 +334,12 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are #define mi_forall_arenas(req_arena_id, tseq, name_arena) \ { \ - const size_t _arena_count = mi_atomic_load_relaxed(&mi_arena_count); \ + const size_t _arena_count = mi_arena_get_count(); \ if (_arena_count > 0) { \ const size_t _arena_cycle = _arena_count - 1; /* first search the arenas below the last one */ \ size_t _start; \ if (req_arena_id == _mi_arena_id_none()) { \ - /* always start searching in an arena 1 below the max */ \ + /* always start searching in the arena's below the max */ \ _start = (_arena_cycle <= 1 ? 0 : (tseq % _arena_cycle)); \ } \ else { \ @@ -333,10 +350,10 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are size_t _idx; \ if (_i < _arena_cycle) { \ _idx = _i + _start; \ - if (_idx >= _arena_cycle) { _idx -= _arena_cycle; } /* adjust so we rotate */ \ + if (_idx >= _arena_cycle) { _idx -= _arena_cycle; } /* adjust so we rotate through the cycle */ \ } \ else { \ - _idx = _i; \ + _idx = _i; /* remaining arena's */ \ } \ mi_arena_t* const name_arena = mi_arena_from_index(_idx); \ if (name_arena != NULL) \ @@ -397,6 +414,9 @@ again: // did we need a specific arena? if (req_arena_id != _mi_arena_id_none()) return NULL; + // don't create arena's while preloading (todo: or should we?) + if (_mi_preloading()) return NULL; + // otherwise, try to reserve a new arena -- but one thread at a time.. (todo: allow 2 or 4 to reduce contention?) if (mi_lock_try_acquire(&mi_arena_reserve_lock)) { mi_arena_id_t arena_id = 0; @@ -917,7 +937,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi // destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` // for dynamic libraries that are unloaded and need to release all their allocated memory. static void mi_arenas_unsafe_destroy(void) { - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + const size_t max_arena = mi_arena_get_count(); size_t new_max_arena = 0; for (size_t i = 0; i < max_arena; i++) { mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); @@ -949,7 +969,7 @@ void _mi_arena_unsafe_destroy_all(void) { // Is a pointer inside any of our arenas? 
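For reference, the reservation policy in mi_arena_reserve above (scale the request up exponentially every 4 arenas, keep the unscaled size on overflow, and retry once with a smaller fixed size if the large reservation fails) is easier to follow outside the diff. Below is a minimal standalone sketch under hypothetical names; os_reserve is a stub standing in for mi_reserve_os_memory_ex and the constants only mirror the values used in this patch.

#include <stdbool.h>
#include <stddef.h>

#define MiB ((size_t)1 << 20)
#define GiB ((size_t)1 << 30)

static int os_reserve(size_t size) { (void)size; return 0; }  // stub for mi_reserve_os_memory_ex

static size_t clampz(size_t x, size_t lo, size_t hi) {
  return (x < lo ? lo : (x > hi ? hi : x));
}

static int reserve_arena(size_t base_reserve, size_t arena_count) {
  // double the reservation every 4 existing arenas, capping the exponent at 16
  const size_t multiplier = (size_t)1 << clampz(arena_count/4, 0, 16);
  size_t reserve = base_reserve;
  if (base_reserve <= ((size_t)-1)/multiplier) {   // only scale when it cannot overflow
    reserve = multiplier * base_reserve;
  }
  int err = os_reserve(reserve);
  if (err != 0) {
    // the big reservation failed; retry once with a modest fixed size
    const size_t small_reserve = (sizeof(void*) == 4 ? 128*MiB : 1*GiB);
    if (reserve > small_reserve) { err = os_reserve(small_reserve); }
  }
  return err;
}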
bool _mi_arena_contains(const void* p) { - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + const size_t max_arena = mi_arena_get_count(); for (size_t i = 0; i < max_arena; i++) { mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); if (arena != NULL && mi_arena_start(arena) <= (const uint8_t*)p && mi_arena_start(arena) + mi_size_of_slices(arena->slice_count) > (const uint8_t*)p) { @@ -1175,7 +1195,7 @@ static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bi void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept { MI_UNUSED(show_abandoned); - size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count); + size_t max_arenas = mi_arena_get_count(); size_t free_total = 0; size_t slice_total = 0; //size_t abandoned_total = 0; @@ -1331,7 +1351,7 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_ static void mi_arenas_try_purge(bool force, bool visit_all) { if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + const size_t max_arena = mi_arena_get_count(); if (max_arena == 0) return; // _mi_error_message(EFAULT, "purging not yet implemented\n"); diff --git a/src/bitmap.c b/src/bitmap.c index 45a82ba3..2f563066 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -14,6 +14,8 @@ Concurrent bitmap that can set/reset sequences of bits atomically #include "mimalloc/bits.h" #include "bitmap.h" +#define MI_USE_SIMD 0 + /* -------------------------------------------------------------------------------- bfields -------------------------------------------------------------------------------- */ @@ -34,9 +36,9 @@ static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) { return mi_bsf(x,idx); } -static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) { - return mi_rotr(x,r); -} +//static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) { +// return mi_rotr(x,r); +//} static inline mi_bfield_t mi_bfield_zero(void) { return 0; @@ -456,7 +458,7 @@ static inline bool mi_bchunk_try_clearN(mi_bchunk_t* chunk, size_t cidx, size_t // ------- mi_bchunk_try_find_and_clear --------------------------------------- -#if defined(__AVX2__) +#if MI_USE_SIMD && defined(__AVX2__) static inline __m256i mi_mm256_zero(void) { return _mm256_setzero_si256(); } @@ -471,12 +473,27 @@ static inline bool mi_mm256_is_zero( __m256i vec) { } #endif +static inline bool mi_bchunk_try_find_and_clear_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx, bool allow_allset) { + mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); + const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]); + size_t cidx; + if (!allow_allset && (~b == 0)) return false; + if (mi_bfield_find_least_bit(b, &cidx)) { // find the least bit + if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], cidx, NULL)) { // clear it atomically + *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; + mi_assert_internal(*pidx < MI_BCHUNK_BITS); + return true; + } + } + return false; +} + // Find least 1-bit in a chunk and try to clear it atomically // set `*pidx` to the bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. // This is used to find free slices and abandoned pages and should be efficient. 
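As an aside, mi_bchunk_try_find_and_clear_at above (together with mi_bfield_atomic_try_clear) is an instance of a common lock-free pattern: find a candidate set bit, claim it with an atomic read-modify-write, and retry if another thread won the race. A minimal sketch of that pattern with C11 atomics and a gcc/clang builtin; this is an illustration of the idea, not mimalloc's actual helper.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static bool try_find_and_clear_bit(_Atomic(uint64_t)* field, size_t* bit_idx) {
  uint64_t x = atomic_load_explicit(field, memory_order_relaxed);
  while (x != 0) {
    const size_t idx = (size_t)__builtin_ctzll(x);         // least significant set bit
    const uint64_t mask = (uint64_t)1 << idx;
    // atomically clear just that bit; the returned old value tells us whether we owned it
    if ((atomic_fetch_and_explicit(field, ~mask, memory_order_acq_rel) & mask) != 0) {
      *bit_idx = idx;
      return true;
    }
    x = atomic_load_explicit(field, memory_order_relaxed);  // lost the race, look again
  }
  return false;
}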
// todo: try neon version static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx) { - #if defined(__AVX2__) && (MI_BCHUNK_BITS==256) + #if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256) while (true) { const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vcmp = _mm256_cmpeq_epi64(vec, mi_mm256_zero()); // (elem64 == 0 ? 0xFF : 0) @@ -485,19 +502,10 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx if (mask==0) return false; mi_assert_internal((_tzcnt_u32(mask)%8) == 0); // tzcnt == 0, 8, 16, or 24 const size_t chunk_idx = _tzcnt_u32(mask) / 8; - mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); - const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]); - size_t cidx; - if (mi_bfield_find_least_bit(b, &cidx)) { // find the least bit - if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], cidx, NULL)) { // clear it atomically - *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; - mi_assert_internal(*pidx < MI_BCHUNK_BITS); - return true; - } - } + if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true; // try again } - #elif defined(__AVX2__) && (MI_BCHUNK_BITS==512) + #elif MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { size_t chunk_idx = 0; #if 0 @@ -528,42 +536,50 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx mi_assert_internal((_tzcnt_u64(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , .. chunk_idx = _tzcnt_u64(mask) / 8; #endif - mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); - const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]); - size_t cidx; - if (mi_bfield_find_least_bit(b, &cidx)) { // find the bit-idx that is clear - if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], cidx, NULL)) { // clear it atomically - *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; - mi_assert_internal(*pidx < MI_BCHUNK_BITS); - return true; - } - } + if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true; // try again } #else + // try first to find a field that is not all set (to reduce fragmentation) for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { - const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); - size_t idx; - if (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit - if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[i], idx, NULL)) { // try to clear it atomically - *pidx = (i*MI_BFIELD_BITS + idx); - mi_assert_internal(*pidx < MI_BCHUNK_BITS); - return true; - } - } + if (mi_bchunk_try_find_and_clear_at(chunk, i, pidx, false /* don't consider allset fields */)) return true; + } + for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { + if (mi_bchunk_try_find_and_clear_at(chunk, i, pidx, true)) return true; } return false; #endif } +static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx, bool allow_all_set) { + const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]); + if (!allow_all_set && (~b == 0)) return false; + // has_set8 has low bit in each byte set if the byte in x == 0xFF + const mi_bfield_t has_set8 = + ((~b - MI_BFIELD_LO_BIT8) & // high bit set if byte in x is 0xFF or < 0x7F + (b & MI_BFIELD_HI_BIT8)) // high bit set if byte in x is >= 0x80 + >> 7; // shift high bit to low bit + size_t idx; + if (mi_bfield_find_least_bit(has_set8, &idx)) { // find least 1-bit + mi_assert_internal(idx <= (MI_BFIELD_BITS - 8)); + mi_assert_internal((idx%8)==0); + const size_t 
byte_idx = idx/8; + if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], byte_idx, NULL)) { // unset the byte atomically + *pidx = (chunk_idx*MI_BFIELD_BITS) + idx; + mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS); + return true; + } + } + return false; +} // find least byte in a chunk with all bits set, and try unset it atomically // set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. // Used to find medium size pages in the free blocks. // todo: try neon version static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, size_t* pidx) { - #if defined(__AVX2__) && (MI_BCHUNK_BITS==512) + #if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { // since a cache-line is 64b, load all at once const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); @@ -588,24 +604,12 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s // try again } #else + // first skip allset fields to reduce fragmentation for(int i = 0; i < MI_BCHUNK_FIELDS; i++) { - const mi_bfield_t x = mi_atomic_load_relaxed(&chunk->bfields[i]); - // has_set8 has low bit in each byte set if the byte in x == 0xFF - const mi_bfield_t has_set8 = ((~x - MI_BFIELD_LO_BIT8) & // high bit set if byte in x is 0xFF or < 0x7F - (x & MI_BFIELD_HI_BIT8)) // high bit set if byte in x is >= 0x80 - >> 7; // shift high bit to low bit - size_t idx; - if (mi_bfield_find_least_bit(has_set8,&idx)) { // find least 1-bit - mi_assert_internal(idx <= (MI_BFIELD_BITS - 8)); - mi_assert_internal((idx%8)==0); - const size_t byte_idx = idx/8; - if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[i],byte_idx,NULL)) { // unset the byte atomically - *pidx = (i*MI_BFIELD_BITS) + idx; - mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS); - return true; - } - // else continue - } + if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx, false /* don't allow allset fields */)) return true; + } + for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { + if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx, true /* allow allset fields */)) return true; } return false; #endif @@ -618,7 +622,7 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s // Used to find large size pages in the free blocks. 
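The has_set8 expression used in mi_bchunk_try_find_and_clear8_at above is the classic SWAR "haszero" test, (v - LO) & ~v & HI, applied to v = ~x so that it flags 0xFF bytes of x. Borrows can only produce spurious flags above the lowest genuine match, so taking the least-significant flagged byte is exact. A small self-contained illustration, assuming the gcc/clang __builtin_ctzll builtin:

#include <stdint.h>
#include <stdio.h>

#define LO_BIT8 0x0101010101010101ULL
#define HI_BIT8 0x8080808080808080ULL

// Return the index (0..7) of the least-significant 0xFF byte of x, or -1 if there is none.
static int lowest_all_set_byte(uint64_t x) {
  // the lowest 0xFF byte always gets its high bit set; higher flags may be false positives
  const uint64_t flags = (~x - LO_BIT8) & x & HI_BIT8;
  if (flags == 0) return -1;   // no byte of x is 0xFF
  return (int)(__builtin_ctzll(flags) / 8);
}

int main(void) {
  printf("%d\n", lowest_all_set_byte(0x00FF12FF0080FF7FULL));  // byte 1 is the lowest 0xFF byte: prints 1
  return 0;
}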
// todo: try neon version static mi_decl_noinline bool mi_bchunk_try_find_and_clearX(mi_bchunk_t* chunk, size_t* pidx) { - #if defined(__AVX2__) && (MI_BCHUNK_BITS==512) + #if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { // since a cache-line is 64b, load all at once const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); @@ -747,14 +751,14 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk, } -static inline bool mi_bchunk_try_find_and_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) { - if (n==1) return mi_bchunk_try_find_and_clear(chunk, pidx); // small pages - if (n==8) return mi_bchunk_try_find_and_clear8(chunk, pidx); // medium pages - if (n==MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearX(chunk, pidx); // large pages - if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk - if (n < MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearNX(chunk, n, pidx); - return mi_bchunk_try_find_and_clearN_(chunk, n, pidx); -} +//static inline bool mi_bchunk_try_find_and_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) { +// if (n==1) return mi_bchunk_try_find_and_clear(chunk, pidx); // small pages +// if (n==8) return mi_bchunk_try_find_and_clear8(chunk, pidx); // medium pages +// if (n==MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearX(chunk, pidx); // large pages +// if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk +// if (n < MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearNX(chunk, n, pidx); +// return mi_bchunk_try_find_and_clearN_(chunk, n, pidx); +//} // ------- mi_bchunk_clear_once_set --------------------------------------- @@ -779,10 +783,10 @@ static inline bool mi_bchunk_all_are_clear(mi_bchunk_t* chunk) { // are all bits in a bitmap chunk clear? 
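A note on the clearX ("whole bfield") search above: per 64-bit field the idea boils down to a single compare-and-swap from all-ones to zero, claiming every bit at once. A minimal sketch with C11 atomics (hypothetical helper, not the mimalloc API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

// Claim all 64 bits of a field at once: succeeds only if every bit is still set.
static bool try_claim_full_field(_Atomic(uint64_t)* field) {
  uint64_t expected = UINT64_MAX;
  return atomic_compare_exchange_strong_explicit(field, &expected, (uint64_t)0,
                                                 memory_order_acq_rel, memory_order_relaxed);
}

On failure, expected is updated with the field's current contents, which a caller could inspect before falling back to one of the finer-grained searches.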
static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) { - #if defined(__AVX2__) && (MI_BCHUNK_BITS==256) + #if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256) const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); return mi_mm256_is_zero(vec); - #elif defined(__AVX2__) && (MI_BCHUNK_BITS==512) + #elif MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) // a 64b cache-line contains the entire chunk anyway so load both at once const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); @@ -796,9 +800,17 @@ static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) { bitmap chunkmap -------------------------------------------------------------------------------- */ +static void mi_bitmap_chunkmap_set_max(mi_bitmap_t* bitmap, size_t chunk_idx) { + size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_accessed); + if mi_unlikely(chunk_idx > oldmax) { + mi_atomic_cas_strong_relaxed(&bitmap->chunk_max_accessed, &oldmax, chunk_idx); + } +} + static void mi_bitmap_chunkmap_set(mi_bitmap_t* bitmap, size_t chunk_idx) { mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); mi_bchunk_set(&bitmap->chunkmap, chunk_idx); + mi_bitmap_chunkmap_set_max(bitmap, chunk_idx); } static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) { @@ -813,11 +825,7 @@ static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) mi_bchunk_set(&bitmap->chunkmap, chunk_idx); return false; } - // record the max clear - size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_clear); - do { - if mi_likely(chunk_idx <= oldmax) break; - } while (!mi_atomic_cas_weak_acq_rel(&bitmap->chunk_max_clear, &oldmax, chunk_idx)); + mi_bitmap_chunkmap_set_max(bitmap, chunk_idx); return true; } @@ -894,6 +902,9 @@ void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_bchunk_setN(&bitmap->chunks[chunk_idx], 0, n, NULL); mi_bitmap_chunkmap_set(bitmap, chunk_idx); } + + // reset max_accessed + mi_atomic_store_relaxed(&bitmap->chunk_max_accessed, 0); } @@ -1027,31 +1038,27 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n { \ /* start chunk index -- todo: can depend on the tseq to decrease contention between threads */ \ MI_UNUSED(tseq); \ - const size_t chunk_start = 0; /* (tseq % (1+chunk_hi_idx)); */ /* space out threads? */ \ - const size_t chunkmap_max_bfield = _mi_divide_up( mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS ); \ - const size_t chunkmap_hi_bfield = chunkmap_max_bfield; /* chunk_hi_idx / MI_BFIELD_BITS; */\ - const size_t chunkmap_start = chunk_start / MI_BFIELD_BITS; \ - const size_t chunkmap_start_idx = chunk_start % MI_BFIELD_BITS; \ + const size_t chunk_max_acc = 1 + mi_atomic_load_relaxed(&bitmap->chunk_max_accessed); \ + const size_t chunk_start = tseq % chunk_max_acc; /* space out threads? 
*/ \ + const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap),MI_BFIELD_BITS); \ + const size_t chunkmap_max_acc = _mi_divide_up(chunk_max_acc,MI_BFIELD_BITS); \ + const size_t chunkmap_start = chunk_start / MI_BFIELD_BITS; \ /* for each chunkmap entry `i` */ \ - for (size_t _i = 0; _i < chunkmap_max_bfield; _i++) { \ + for (size_t _i = 0; _i < chunkmap_max; _i++) { \ size_t i; \ - if (_i < chunkmap_hi_bfield) { \ - i = _i + chunkmap_start; /* first the chunks up to chunk_hi */ \ - if (i >= chunkmap_hi_bfield) { i -= chunkmap_hi_bfield; } /* rotate */ \ + if (_i < chunkmap_max_acc) { /* first the chunks up to chunk_max_accessed */ \ + i = _i + chunkmap_start; \ + if (i >= chunkmap_max_acc) { i -= chunkmap_max_acc; } /* rotate */ \ } \ - else { i = _i; } /* the rest of the chunks above chunk_hi_idx */ \ + else { i = _i; } /* the rest of the chunks above chunk_max_accessed */ \ const size_t chunk_idx0 = i*MI_BFIELD_BITS; \ mi_bfield_t cmap = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); \ - size_t cmap_idx_shift = 0; /* shift through the cmap */ \ - if (_i == 0 && chunkmap_start_idx > 0) { \ - cmap = mi_bfield_rotate_right(cmap, chunkmap_start_idx); /* rotate right for the start position (on the first iteration) */ \ - cmap_idx_shift = chunkmap_start_idx; \ - } \ + /* todo: space out threads within a chunkmap (2GiB) as well? */ \ + size_t cmap_idx_shift = 0; /* shift through the cmap */ \ size_t cmap_idx; \ while (mi_bfield_find_least_bit(cmap, &cmap_idx)) { \ /* set the chunk idx */ \ size_t name_chunk_idx = chunk_idx0 + ((cmap_idx + cmap_idx_shift) % MI_BFIELD_BITS); \ - mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); \ /* try to find and clear N bits in that chunk */ \ { @@ -1064,28 +1071,45 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n } \ }} -// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. -// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. -// (Used to find fresh free slices -- optimized for n=1, 8, and MI_BFIELD_BITS) -mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) -{ - // const size_t chunk_hi_idx = mi_atomic_load_relaxed(&bitmap->chunk_max_clear); - mi_bitmap_forall_chunks(bitmap, tseq, chunk_idx) - { - size_t cidx; - if mi_likely(mi_bchunk_try_find_and_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) { - *pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx; - mi_assert_internal(*pidx + n <= mi_bitmap_max_bits(bitmap)); - return true; - } - else { - // we may find that all are cleared only on a second iteration but that is ok as - // the chunkmap is a conservative approximation. - mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); - } - } - mi_bitmap_forall_chunks_end(); - return false; + +#define mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, NSUF, NPAR) { \ + mi_bitmap_forall_chunks(bitmap, tseq, _chunk_idx) { \ + size_t _cidx; \ + if mi_likely(mi_bchunk_try_find_and_clear##NSUF(&bitmap->chunks[_chunk_idx] NPAR, &_cidx)) { \ + *pidx = (_chunk_idx * MI_BCHUNK_BITS) + _cidx; \ + return true; \ + } \ + else { \ + /* we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation. 
*/ \ + mi_bitmap_chunkmap_try_clear(bitmap, _chunk_idx); \ + } \ + } \ + mi_bitmap_forall_chunks_end(); \ + return false; \ +} + +#define COMMA , + +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { + mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, , ); +} + +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { + mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, 8, ); +} + +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { + mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, X, ); +} + +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearNX(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) { + mi_assert_internal(n<=MI_BFIELD_BITS); + mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, NX, COMMA n); +} + +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) { + mi_assert_internal(n<=MI_BCHUNK_BITS); + mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, N_, COMMA n); } diff --git a/src/bitmap.h b/src/bitmap.h index 40c4df42..b26791cc 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -91,8 +91,8 @@ typedef mi_bchunk_t mi_bchunkmap_t; // An atomic bitmap typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bitmap_s { - _Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS) - _Atomic(size_t) chunk_max_clear; // max chunk index that was once cleared + _Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS) + _Atomic(size_t) chunk_max_accessed; // max chunk index that was once cleared or set size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc mi_bchunkmap_t chunkmap; mi_bchunk_t chunks[MI_BITMAP_DEFAULT_CHUNK_COUNT]; // usually dynamic MI_BITMAP_MAX_CHUNK_COUNT @@ -172,9 +172,23 @@ static inline bool mi_bitmap_is_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n } +// Specialized versions for common bit sequence sizes +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // 1-bit +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // 8-bits +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // MI_BFIELD_BITS +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearNX(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); // < MI_BFIELD_BITS +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); // > MI_BFIELD_BITS <= MI_BCHUNK_BITS + // Find a sequence of `n` bits in the bitmap with all bits set, and try to atomically clear all. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. 
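The mi_bitmap_forall_chunks_try_find_and_clear macro above stamps out the five mi_bitmap_try_find_and_clear* functions by pasting a name suffix (NSUF) and forwarding an optional extra argument (NPAR), with `#define COMMA ,` smuggling a comma through the macro expansion. A tiny self-contained example of the same preprocessor technique, using hypothetical names:

#include <stdio.h>

#define COMMA ,
#define MAKE_CALLER(NSUF, NPAR) \
  static int call##NSUF(int x) { return work##NSUF(x NPAR); }

static int work(int x)         { return x + 1; }
static int workN(int x, int n) { return x + n; }

MAKE_CALLER(, )           // defines call(x)  -> work(x)
MAKE_CALLER(N, COMMA 10)  // defines callN(x) -> workN(x, 10)

int main(void) {
  printf("%d %d\n", call(1), callN(1));  // prints "2 11"
  return 0;
}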
-mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); +mi_decl_nodiscard static inline bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) { + if (n==1) return mi_bitmap_try_find_and_clear(bitmap, tseq, pidx); // small pages + if (n==8) return mi_bitmap_try_find_and_clear8(bitmap, tseq, pidx); // medium pages + if (n==MI_BFIELD_BITS) return mi_bitmap_try_find_and_clearX(bitmap, tseq, pidx); // large pages + if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk + if (n < MI_BFIELD_BITS) return mi_bitmap_try_find_and_clearNX(bitmap, tseq, n, pidx); + return mi_bitmap_try_find_and_clearN_(bitmap, tseq, n, pidx); +} // Called once a bit is cleared to see if the memory slice can be claimed. diff --git a/src/free.c b/src/free.c index d45507e7..0da0332e 100644 --- a/src/free.c +++ b/src/free.c @@ -217,8 +217,11 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { } // 2. if the page is not too full, we can try to reclaim it for ourselves + // note: this seems a bad idea but it speeds up some benchmarks (like `larson`) quite a bit. if (_mi_option_get_fast(mi_option_reclaim_on_free) != 0 && - !mi_page_is_used_at_frac(page,8)) + !mi_page_is_used_at_frac(page,4) + // && !mi_page_is_abandoned_mapped(page) + ) { // the page has still some blocks in use (but not too many) // reclaim in our heap if compatible, or otherwise abandon again @@ -247,7 +250,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { } // 3. if the page is unmapped, try to reabandon so it can possibly be mapped and found for allocations - if (!mi_page_is_used_at_frac(page, 4) && // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page + if (!mi_page_is_used_at_frac(page,4) && // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page !mi_page_is_abandoned_mapped(page) && page->memid.memkind == MI_MEM_ARENA && _mi_arena_page_try_reabandon_to_mapped(page)) { diff --git a/src/init.c b/src/init.c index 2396f594..2070405d 100644 --- a/src/init.c +++ b/src/init.c @@ -96,7 +96,7 @@ const mi_page_t _mi_page_empty = { // may lead to allocation itself on some platforms) // -------------------------------------------------------- -#define MI_MEMID_STATIC {{{0}}, true /* pinned */, true /* committed */, false /* zero */, MI_MEM_STATIC } +#define MI_MEMID_STATIC {{{NULL,0}}, true /* pinned */, true /* committed */, false /* zero */, MI_MEM_STATIC } mi_decl_cache_align const mi_heap_t _mi_heap_empty = { NULL, diff --git a/src/os.c b/src/os.c index b913fb1c..55f7428e 100644 --- a/src/os.c +++ b/src/os.c @@ -203,10 +203,9 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit if (!(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0))) return NULL; size = _mi_align_up(size, _mi_os_page_size()); - // try a direct allocation if the alignment is below the default, or if larger than 1/64 fraction of the size (to avoid waste). - const bool try_direct_alloc = (alignment <= mi_os_mem_config.alloc_granularity || alignment > size/64); + // try a direct allocation if the alignment is below the default, or if larger than 1/8 fraction of the size. 
+  const bool try_direct_alloc = (alignment <= mi_os_mem_config.alloc_granularity || alignment > size/8);
-
-  // try first with a requested alignment hint (this will usually be aligned directly on Win 10+ or BSD)
   void* p = NULL;
   if (try_direct_alloc) {
     p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero);
@@ -233,8 +232,8 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit
     if (p == NULL) return NULL;
     // set p to the aligned part in the full region
-    // note: this is dangerous on Windows as VirtualFree needs the actual base pointer
-    // this is handled though by having the `base` field in the memid's
+    // note: on Windows VirtualFree needs the actual base pointer
+    // this is handled by having the `base` field in the memid.
     *base = p; // remember the base
     p = _mi_align_up_ptr(p, alignment);
@@ -361,7 +360,7 @@ static void* mi_os_page_align_areax(bool conservative, void* addr, size_t size,
   if (newsize != NULL) *newsize = 0;
   if (size == 0 || addr == NULL) return NULL;
-  // page align conservatively within the range
+  // page align conservatively within the range, or liberally straddling pages outside the range
   void* start = (conservative ? _mi_align_up_ptr(addr, _mi_os_page_size())
     : mi_align_down_ptr(addr, _mi_os_page_size()));
   void* end = (conservative ? mi_align_down_ptr((uint8_t*)addr + size, _mi_os_page_size())
@@ -472,7 +471,7 @@ bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset)
     return needs_recommit;
   }
   else {
-    if (allow_reset) { // this can sometimes be not allowed if the range is not fully committed
+    if (allow_reset) { // this is sometimes not allowed if the range is not fully committed (on Windows, we cannot reset uncommitted memory)
       _mi_os_reset(p, size);
     }
     return false; // needs no recommit
diff --git a/src/random.c b/src/random.c
index 4fc8b2f8..35e2718a 100644
--- a/src/random.c
+++ b/src/random.c
@@ -7,7 +7,6 @@ terms of the MIT license. A copy of the license can be found in the file
 #include "mimalloc.h"
 #include "mimalloc/internal.h"
 #include "mimalloc/prim.h"    // _mi_prim_random_buf
-#include <string.h>           // memset

 /* ----------------------------------------------------------------------------
 We use our own PRNG to keep predictable performance of random number generation
@@ -33,15 +32,11 @@ The implementation uses regular C code which compiles very well on modern compil
 (gcc x64 has no register spills, and clang 6+ uses SSE instructions)
 -----------------------------------------------------------------------------*/

-static inline uint32_t rotl(uint32_t x, uint32_t shift) {
-  return (x << shift) | (x >> (32 - shift));
-}
-
 static inline void qround(uint32_t x[16], size_t a, size_t b, size_t c, size_t d) {
-  x[a] += x[b]; x[d] = rotl(x[d] ^ x[a], 16);
-  x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 12);
-  x[a] += x[b]; x[d] = rotl(x[d] ^ x[a], 8);
-  x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 7);
+  x[a] += x[b]; x[d] = mi_rotl32(x[d] ^ x[a], 16);
+  x[c] += x[d]; x[b] = mi_rotl32(x[b] ^ x[c], 12);
+  x[a] += x[b]; x[d] = mi_rotl32(x[d] ^ x[a], 8);
+  x[c] += x[d]; x[b] = mi_rotl32(x[b] ^ x[c], 7);
 }

 static void chacha_block(mi_random_ctx_t* ctx)
@@ -99,7 +94,7 @@ static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t no
   // since we only use chacha for randomness (and not encryption) we
   // do not _need_ to read 32-bit values as little endian but we do anyways
   // just for being compatible :-)
-  memset(ctx, 0, sizeof(*ctx));
+  _mi_memzero(ctx, sizeof(*ctx));
   for (size_t i = 0; i < 4; i++) {
     const uint8_t* sigma = (uint8_t*)"expand 32-byte k";
     ctx->input[i] = read32(sigma,i);
   }
@@ -114,7 +109,7 @@ static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t no
 }

 static void chacha_split(mi_random_ctx_t* ctx, uint64_t nonce, mi_random_ctx_t* ctx_new) {
-  memset(ctx_new, 0, sizeof(*ctx_new));
+  _mi_memzero(ctx_new, sizeof(*ctx_new));
   _mi_memcpy(ctx_new->input, ctx->input, sizeof(ctx_new->input));
   ctx_new->input[12] = 0;
   ctx_new->input[13] = 0;
@@ -160,7 +155,7 @@ If we cannot get good randomness, we fall back to weak randomness based on a tim
 uintptr_t _mi_os_random_weak(uintptr_t extra_seed) {
   uintptr_t x = (uintptr_t)&_mi_os_random_weak ^ extra_seed; // ASLR makes the address random
-  x ^= _mi_prim_clock_now();
+  x ^= _mi_prim_clock_now();  // and do a few randomization steps
   uintptr_t max = ((x ^ (x >> 17)) & 0x0F) + 1;
   for (uintptr_t i = 0; i < max; i++) {