various improvements

daanx 2024-12-09 14:31:43 -08:00
parent 68ac94c1ba
commit d5ed0cc71e
10 changed files with 223 additions and 152 deletions

View file

@@ -74,8 +74,11 @@ terms of the MIT license. A copy of the license can be found in the file
 #define mi_atomic_store_relaxed(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(relaxed))
 #define mi_atomic_exchange_release(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(release))
 #define mi_atomic_exchange_acq_rel(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(acq_rel))
+#define mi_atomic_cas_weak_relaxed(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(relaxed),mi_memory_order(relaxed))
 #define mi_atomic_cas_weak_release(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed))
 #define mi_atomic_cas_weak_acq_rel(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire))
+#define mi_atomic_cas_strong_relaxed(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(relaxed),mi_memory_order(relaxed))
 #define mi_atomic_cas_strong_release(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed))
 #define mi_atomic_cas_strong_acq_rel(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire))
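
The new `mi_atomic_cas_weak_relaxed`/`mi_atomic_cas_strong_relaxed` variants pass `relaxed` for both the success and the failure ordering. As a rough illustration (assuming the C11 `<stdatomic.h>` backend; the helper name below is made up for the example, not a mimalloc symbol):

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

// Sketch only: roughly what a "weak CAS, relaxed/relaxed" helper expands to
// when mi_atomic(...) maps onto C11 atomics (MSVC uses a different backend).
static inline bool cas_weak_relaxed(_Atomic(size_t)* p, size_t* expected, size_t desired) {
  return atomic_compare_exchange_weak_explicit(p, expected, desired,
                                               memory_order_relaxed,   // ordering on success
                                               memory_order_relaxed);  // ordering on failure
}
```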

View file

@@ -229,7 +229,7 @@ static inline bool mi_bsf(size_t x, size_t* idx) {
   unsigned long i;
   return (mi_msc_builtinz(_BitScanForward)(&i, x) ? (*idx = (size_t)i, true) : false);
 #else
   return (x!=0 ? (*idx = mi_ctz(x), true) : false);
 #endif
 }
@@ -289,5 +289,18 @@ static inline size_t mi_rotl(size_t x, size_t r) {
 #endif
 }
+
+static inline uint32_t mi_rotl32(uint32_t x, uint32_t r) {
+#if mi_has_builtin(rotateleft32)
+  return mi_builtin(rotateleft32)(x,r);
+#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
+  return _lrotl(x, (int)r);
+#else
+  // The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to
+  // avoid UB when `rshift==0`. See <https://blog.regehr.org/archives/1063>
+  const unsigned int rshift = (unsigned int)(r) & 31;
+  return ((x << rshift) | (x >> ((-rshift) & 31)));
+#endif
+}
+
 #endif // MI_BITS_H
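
The pure-C fallback is the standard rotate idiom that avoids an undefined shift by 32 when the rotate count is zero. A small standalone check (illustrative, not part of the commit):

```c
#include <stdint.h>
#include <stdio.h>
#include <assert.h>

// Portable rotate-left; `(-r) & 31` equals `32 - r` for r in 1..31 but yields 0
// when r == 0, so the right shift is never by 32 (which would be UB in C).
static uint32_t rotl32(uint32_t x, uint32_t r) {
  r &= 31;
  return (x << r) | (x >> ((-r) & 31));
}

int main(void) {
  assert(rotl32(0x80000001u, 1) == 0x00000003u);
  assert(rotl32(0xDEADBEEFu, 0) == 0xDEADBEEFu);  // no UB for a zero shift
  printf("ok\n");
  return 0;
}
```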

View file

@@ -334,9 +334,9 @@ typedef struct mi_page_s {
 // The max object size are checked to not waste more than 12.5% internally over the page sizes.
 // (Except for large pages since huge objects are allocated in 4MiB chunks)
-#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4)   // ~16KiB
-#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // ~128KiB
-#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/2)   // ~2MiB
+#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8)   // < 8 KiB
+#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 128 KiB
+#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/2)   // < 2 MiB
 #define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE)

View file

@@ -29,7 +29,8 @@ The arena allocation needs to be thread safe and we use an atomic bitmap to allo
 ----------------------------------------------------------- */
 #define MI_ARENA_BIN_COUNT (MI_BIN_COUNT)
+#define MI_ARENA_MIN_SIZE (MI_BCHUNK_BITS * MI_ARENA_SLICE_SIZE)          // 32 MiB (or 8 MiB on 32-bit)
+#define MI_ARENA_MAX_SIZE (MI_BITMAP_MAX_BIT_COUNT * MI_ARENA_SLICE_SIZE)
 // A memory arena descriptor
 typedef struct mi_arena_s {
@@ -105,7 +106,7 @@ size_t mi_arena_get_count(void) {
 mi_arena_t* mi_arena_from_index(size_t idx) {
   mi_assert_internal(idx < mi_arena_get_count());
-  return mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[idx]);
+  return mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[idx]);
 }
 mi_arena_t* mi_arena_from_id(mi_arena_id_t id) {
@@ -235,6 +236,12 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(
       }
     }
   }
+  if (memid->initially_zero) {
+    mi_track_mem_defined(p, slice_count * MI_ARENA_SLICE_SIZE);
+  }
+  else {
+    mi_track_mem_undefined(p, slice_count * MI_ARENA_SLICE_SIZE);
+  }
 }
 else {
   // no need to commit, but check if already fully committed
@@ -253,7 +260,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(
 // try to reserve a fresh arena space
 static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t* arena_id)
 {
-  if (_mi_preloading()) return false; // use OS only while pre loading
+  // if (_mi_preloading()) return false; // use OS only while pre loading
   if (req_arena_id != _mi_arena_id_none()) return false;
   const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count);
@@ -269,8 +276,8 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re
   arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_SLICE_SIZE);
   if (arena_count >= 1 && arena_count <= 128) {
-    // scale up the arena sizes exponentially every 8 entries
-    const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16);
+    // scale up the arena sizes exponentially every 4 entries
+    const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/4, 0, 16);
     size_t reserve = 0;
     if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) {
       arena_reserve = reserve;
@@ -278,8 +285,8 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re
   }
   // check arena bounds
-  const size_t min_reserve = 8 * MI_ARENA_SLICE_SIZE; // hope that fits minimal bitmaps?
-  const size_t max_reserve = MI_BITMAP_MAX_BIT_COUNT * MI_ARENA_SLICE_SIZE; // 16 GiB
+  const size_t min_reserve = MI_ARENA_MIN_SIZE;
+  const size_t max_reserve = MI_ARENA_MAX_SIZE; // 16 GiB
   if (arena_reserve < min_reserve) {
     arena_reserve = min_reserve;
   }
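
With the scaling step changed from every 8 to every 4 arenas, the reserve size now doubles twice as often (still capped at a 2^16 multiplier and by MI_ARENA_MAX_SIZE). A small illustration of how the multiplier grows; this is a standalone sketch, not mimalloc code, with `_mi_clamp` modeled by a local helper:

```c
#include <stdio.h>
#include <stddef.h>

// Illustration only: the reservation multiplier when scaling "every 4 entries".
static size_t clamp(size_t v, size_t lo, size_t hi) {
  return (v < lo ? lo : (v > hi ? hi : v));
}

int main(void) {
  for (size_t arena_count = 1; arena_count <= 20; arena_count++) {
    const size_t multiplier = (size_t)1 << clamp(arena_count/4, 0, 16);
    printf("arena_count=%2zu -> multiplier=%zu\n", arena_count, multiplier);
  }
  // counts 1-3 give 1x, 4-7 give 2x, 8-11 give 4x, ... capped at 65536x
  return 0;
}
```
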
@@ -294,7 +301,17 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re
   if (mi_option_get(mi_option_arena_eager_commit) == 2)      { arena_commit = _mi_os_has_overcommit(); }
   else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; }
-  return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id) == 0);
+  // and try to reserve the arena
+  int err = mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id);
+  if (err != 0) {
+    // failed, try a smaller size?
+    const size_t small_arena_reserve = (MI_SIZE_BITS == 32 ? 128*MI_MiB : 1*MI_GiB);
+    if (arena_reserve > small_arena_reserve) {
+      // try again
+      err = mi_reserve_os_memory_ex(small_arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id);
+    }
+  }
+  return (err==0);
 }
@@ -317,12 +334,12 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are
 #define mi_forall_arenas(req_arena_id, tseq, name_arena) \
   { \
-  const size_t _arena_count = mi_atomic_load_relaxed(&mi_arena_count); \
+  const size_t _arena_count = mi_arena_get_count(); \
   if (_arena_count > 0) { \
     const size_t _arena_cycle = _arena_count - 1; /* first search the arenas below the last one */ \
     size_t _start; \
     if (req_arena_id == _mi_arena_id_none()) { \
-      /* always start searching in an arena 1 below the max */ \
+      /* always start searching in the arena's below the max */ \
       _start = (_arena_cycle <= 1 ? 0 : (tseq % _arena_cycle)); \
     } \
     else { \
@@ -333,10 +350,10 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are
     size_t _idx; \
     if (_i < _arena_cycle) { \
       _idx = _i + _start; \
-      if (_idx >= _arena_cycle) { _idx -= _arena_cycle; } /* adjust so we rotate */ \
+      if (_idx >= _arena_cycle) { _idx -= _arena_cycle; } /* adjust so we rotate through the cycle */ \
     } \
     else { \
-      _idx = _i; \
+      _idx = _i; /* remaining arena's */ \
     } \
     mi_arena_t* const name_arena = mi_arena_from_index(_idx); \
     if (name_arena != NULL) \
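
The rotation in `mi_forall_arenas` visits the first `_arena_cycle` arenas round-robin, starting at a thread-dependent offset, and then the remaining arenas in plain order. A tiny standalone model of that index walk (names are illustrative, not mimalloc code):

```c
#include <stdio.h>
#include <stddef.h>

// Visit `count` slots: the first `cycle` slots round-robin starting at `start`,
// then the rest in order -- mirroring the index rotation in mi_forall_arenas.
static void visit(size_t count, size_t cycle, size_t start) {
  for (size_t i = 0; i < count; i++) {
    size_t idx;
    if (i < cycle) {
      idx = i + start;
      if (idx >= cycle) idx -= cycle;   // rotate within the cycle
    }
    else {
      idx = i;                          // remaining slots above the cycle
    }
    printf("%zu ", idx);
  }
  printf("\n");
}

int main(void) {
  visit(6, 5, 2);  // prints: 2 3 4 0 1 5
  return 0;
}
```
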
@@ -397,6 +414,9 @@ again:
   // did we need a specific arena?
   if (req_arena_id != _mi_arena_id_none()) return NULL;
+  // don't create arena's while preloading (todo: or should we?)
+  if (_mi_preloading()) return NULL;
+
   // otherwise, try to reserve a new arena -- but one thread at a time.. (todo: allow 2 or 4 to reduce contention?)
   if (mi_lock_try_acquire(&mi_arena_reserve_lock)) {
     mi_arena_id_t arena_id = 0;
@@ -917,7 +937,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi
 // destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit`
 // for dynamic libraries that are unloaded and need to release all their allocated memory.
 static void mi_arenas_unsafe_destroy(void) {
-  const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count);
+  const size_t max_arena = mi_arena_get_count();
   size_t new_max_arena = 0;
   for (size_t i = 0; i < max_arena; i++) {
     mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]);
@@ -949,7 +969,7 @@ void _mi_arena_unsafe_destroy_all(void) {
 // Is a pointer inside any of our arenas?
 bool _mi_arena_contains(const void* p) {
-  const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count);
+  const size_t max_arena = mi_arena_get_count();
   for (size_t i = 0; i < max_arena; i++) {
     mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]);
     if (arena != NULL && mi_arena_start(arena) <= (const uint8_t*)p && mi_arena_start(arena) + mi_size_of_slices(arena->slice_count) > (const uint8_t*)p) {
@@ -1175,7 +1195,7 @@ static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bi
 void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept {
   MI_UNUSED(show_abandoned);
-  size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count);
+  size_t max_arenas = mi_arena_get_count();
   size_t free_total = 0;
   size_t slice_total = 0;
   //size_t abandoned_total = 0;
@@ -1331,7 +1351,7 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_
 static void mi_arenas_try_purge(bool force, bool visit_all) {
   if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled
-  const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count);
+  const size_t max_arena = mi_arena_get_count();
   if (max_arena == 0) return;
   // _mi_error_message(EFAULT, "purging not yet implemented\n");

View file

@@ -14,6 +14,8 @@ Concurrent bitmap that can set/reset sequences of bits atomically
 #include "mimalloc/bits.h"
 #include "bitmap.h"
+#define MI_USE_SIMD 0
+
 /* --------------------------------------------------------------------------------
   bfields
 -------------------------------------------------------------------------------- */
@@ -34,9 +36,9 @@ static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) {
   return mi_bsf(x,idx);
 }
-static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) {
-  return mi_rotr(x,r);
-}
+//static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) {
+//  return mi_rotr(x,r);
+//}
 static inline mi_bfield_t mi_bfield_zero(void) {
   return 0;
@@ -456,7 +458,7 @@ static inline bool mi_bchunk_try_clearN(mi_bchunk_t* chunk, size_t cidx, size_t
 // ------- mi_bchunk_try_find_and_clear ---------------------------------------
-#if defined(__AVX2__)
+#if MI_USE_SIMD && defined(__AVX2__)
 static inline __m256i mi_mm256_zero(void) {
   return _mm256_setzero_si256();
 }
@@ -471,12 +473,27 @@ static inline bool mi_mm256_is_zero( __m256i vec) {
 }
 #endif
+
+static inline bool mi_bchunk_try_find_and_clear_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx, bool allow_allset) {
+  mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS);
+  const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]);
+  size_t cidx;
+  if (!allow_allset && (~b == 0)) return false;
+  if (mi_bfield_find_least_bit(b, &cidx)) {           // find the least bit
+    if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], cidx, NULL)) {  // clear it atomically
+      *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx;
+      mi_assert_internal(*pidx < MI_BCHUNK_BITS);
+      return true;
+    }
+  }
+  return false;
+}
+
 // Find least 1-bit in a chunk and try to clear it atomically
 // set `*pidx` to the bit index (0 <= *pidx < MI_BCHUNK_BITS) on success.
 // This is used to find free slices and abandoned pages and should be efficient.
 // todo: try neon version
 static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx) {
-#if defined(__AVX2__) && (MI_BCHUNK_BITS==256)
+#if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256)
   while (true) {
     const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields);
     const __m256i vcmp = _mm256_cmpeq_epi64(vec, mi_mm256_zero()); // (elem64 == 0 ? 0xFF : 0)
@@ -485,19 +502,10 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx
     if (mask==0) return false;
     mi_assert_internal((_tzcnt_u32(mask)%8) == 0); // tzcnt == 0, 8, 16, or 24
     const size_t chunk_idx = _tzcnt_u32(mask) / 8;
-    mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS);
-    const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]);
-    size_t cidx;
-    if (mi_bfield_find_least_bit(b, &cidx)) {           // find the least bit
-      if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], cidx, NULL)) {  // clear it atomically
-        *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx;
-        mi_assert_internal(*pidx < MI_BCHUNK_BITS);
-        return true;
-      }
-    }
+    if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true;
     // try again
   }
-#elif defined(__AVX2__) && (MI_BCHUNK_BITS==512)
+#elif MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512)
   while (true) {
     size_t chunk_idx = 0;
 #if 0
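
The AVX2 fast paths (currently disabled by `MI_USE_SIMD 0`) locate the first non-zero 64-bit lane with a compare/movemask/tzcnt sequence. A standalone sketch of the same idiom, not taken from the commit (compile with `-mavx2 -mbmi`):

```c
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

// Compare all 64-bit lanes against zero, collapse the result into a byte mask,
// and use the trailing-zero count to find the first non-zero lane (0..3, or -1).
static int first_nonzero_lane(__m256i v) {
  const __m256i vcmp = _mm256_cmpeq_epi64(v, _mm256_setzero_si256()); // 0xFF..FF where a lane is zero
  const uint32_t mask = ~(uint32_t)_mm256_movemask_epi8(vcmp);        // 8 bits set per non-zero lane
  if (mask == 0) return -1;
  return (int)(_tzcnt_u32(mask) / 8);  // tzcnt is 0, 8, 16, or 24
}

int main(void) {
  const __m256i v = _mm256_set_epi64x(7, 0, 5, 0);  // lanes (low to high): 0, 5, 0, 7
  printf("%d\n", first_nonzero_lane(v));            // prints 1
  return 0;
}
```
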
@@ -528,42 +536,50 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx
     mi_assert_internal((_tzcnt_u64(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , ..
     chunk_idx = _tzcnt_u64(mask) / 8;
 #endif
-    mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS);
-    const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]);
-    size_t cidx;
-    if (mi_bfield_find_least_bit(b, &cidx)) {           // find the bit-idx that is clear
-      if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], cidx, NULL)) {  // clear it atomically
-        *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx;
-        mi_assert_internal(*pidx < MI_BCHUNK_BITS);
-        return true;
-      }
-    }
+    if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true;
     // try again
   }
 #else
+  // try first to find a field that is not all set (to reduce fragmentation)
   for (int i = 0; i < MI_BCHUNK_FIELDS; i++) {
-    const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]);
-    size_t idx;
-    if (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit
-      if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[i], idx, NULL)) { // try to clear it atomically
-        *pidx = (i*MI_BFIELD_BITS + idx);
-        mi_assert_internal(*pidx < MI_BCHUNK_BITS);
-        return true;
-      }
-    }
+    if (mi_bchunk_try_find_and_clear_at(chunk, i, pidx, false /* don't consider allset fields */)) return true;
+  }
+  for (int i = 0; i < MI_BCHUNK_FIELDS; i++) {
+    if (mi_bchunk_try_find_and_clear_at(chunk, i, pidx, true)) return true;
   }
   return false;
 #endif
 }
+
+static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx, bool allow_all_set) {
+  const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]);
+  if (!allow_all_set && (~b == 0)) return false;
+  // has_set8 has low bit in each byte set if the byte in x == 0xFF
+  const mi_bfield_t has_set8 =
+    ((~b - MI_BFIELD_LO_BIT8) &      // high bit set if byte in x is 0xFF or < 0x7F
+     (b  & MI_BFIELD_HI_BIT8))       // high bit set if byte in x is >= 0x80
+     >> 7;                           // shift high bit to low bit
+  size_t idx;
+  if (mi_bfield_find_least_bit(has_set8, &idx)) { // find least 1-bit
+    mi_assert_internal(idx <= (MI_BFIELD_BITS - 8));
+    mi_assert_internal((idx%8)==0);
+    const size_t byte_idx = idx/8;
+    if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], byte_idx, NULL)) {  // unset the byte atomically
+      *pidx = (chunk_idx*MI_BFIELD_BITS) + idx;
+      mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS);
+      return true;
+    }
+  }
+  return false;
+}
+
 // find least byte in a chunk with all bits set, and try unset it atomically
 // set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success.
 // Used to find medium size pages in the free blocks.
 // todo: try neon version
 static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, size_t* pidx) {
-#if defined(__AVX2__) && (MI_BCHUNK_BITS==512)
+#if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512)
   while (true) {
     // since a cache-line is 64b, load all at once
     const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields);
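
The `has_set8` expression in the new helper is the classic SWAR trick: complement the word and apply the usual "has-zero-byte" test, which exactly flags the bytes that are all ones. A small standalone check (illustrative constants mirroring the `MI_BFIELD_LO_BIT8`/`MI_BFIELD_HI_BIT8` masks, 64-bit fields assumed):

```c
#include <stdint.h>
#include <stdio.h>

#define LO_BIT8 UINT64_C(0x0101010101010101)  // low bit of each byte
#define HI_BIT8 UINT64_C(0x8080808080808080)  // high bit of each byte

// Returns a mask with the low bit of byte k set iff byte k of x equals 0xFF.
static uint64_t has_set8(uint64_t x) {
  return (((~x - LO_BIT8) & (x & HI_BIT8)) >> 7);
}

int main(void) {
  const uint64_t x = 0x00FF12FF345678FF;  // bytes 0, 4 and 6 are 0xFF
  printf("%016llx\n", (unsigned long long)has_set8(x));  // prints 0001000100000001
  return 0;
}
```
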
@@ -588,24 +604,12 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s
     // try again
   }
 #else
+  // first skip allset fields to reduce fragmentation
   for(int i = 0; i < MI_BCHUNK_FIELDS; i++) {
-    const mi_bfield_t x = mi_atomic_load_relaxed(&chunk->bfields[i]);
-    // has_set8 has low bit in each byte set if the byte in x == 0xFF
-    const mi_bfield_t has_set8 = ((~x - MI_BFIELD_LO_BIT8) &      // high bit set if byte in x is 0xFF or < 0x7F
-                                  (x  & MI_BFIELD_HI_BIT8))       // high bit set if byte in x is >= 0x80
-                                  >> 7;                           // shift high bit to low bit
-    size_t idx;
-    if (mi_bfield_find_least_bit(has_set8,&idx)) { // find least 1-bit
-      mi_assert_internal(idx <= (MI_BFIELD_BITS - 8));
-      mi_assert_internal((idx%8)==0);
-      const size_t byte_idx = idx/8;
-      if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[i],byte_idx,NULL)) {  // unset the byte atomically
-        *pidx = (i*MI_BFIELD_BITS) + idx;
-        mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS);
-        return true;
-      }
-      // else continue
-    }
+    if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx, false /* don't allow allset fields */)) return true;
+  }
+  for (int i = 0; i < MI_BCHUNK_FIELDS; i++) {
+    if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx, true /* allow allset fields */)) return true;
   }
   return false;
 #endif
@@ -618,7 +622,7 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s
 // Used to find large size pages in the free blocks.
 // todo: try neon version
 static mi_decl_noinline bool mi_bchunk_try_find_and_clearX(mi_bchunk_t* chunk, size_t* pidx) {
-#if defined(__AVX2__) && (MI_BCHUNK_BITS==512)
+#if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512)
   while (true) {
     // since a cache-line is 64b, load all at once
     const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields);
@@ -747,14 +751,14 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk,
 }
-static inline bool mi_bchunk_try_find_and_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) {
-  if (n==1) return mi_bchunk_try_find_and_clear(chunk, pidx);           // small pages
-  if (n==8) return mi_bchunk_try_find_and_clear8(chunk, pidx);          // medium pages
-  if (n==MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearX(chunk, pidx);  // large pages
-  if (n == 0 || n > MI_BCHUNK_BITS) return false;  // cannot be more than a chunk
-  if (n < MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearNX(chunk, n, pidx);
-  return mi_bchunk_try_find_and_clearN_(chunk, n, pidx);
-}
+//static inline bool mi_bchunk_try_find_and_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) {
+//  if (n==1) return mi_bchunk_try_find_and_clear(chunk, pidx);           // small pages
+//  if (n==8) return mi_bchunk_try_find_and_clear8(chunk, pidx);          // medium pages
+//  if (n==MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearX(chunk, pidx);  // large pages
+//  if (n == 0 || n > MI_BCHUNK_BITS) return false;  // cannot be more than a chunk
+//  if (n < MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearNX(chunk, n, pidx);
+//  return mi_bchunk_try_find_and_clearN_(chunk, n, pidx);
+//}
 
 // ------- mi_bchunk_clear_once_set ---------------------------------------
@@ -779,10 +783,10 @@ static inline bool mi_bchunk_all_are_clear(mi_bchunk_t* chunk) {
 // are all bits in a bitmap chunk clear?
 static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) {
-#if defined(__AVX2__) && (MI_BCHUNK_BITS==256)
+#if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256)
   const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields);
   return mi_mm256_is_zero(vec);
-#elif defined(__AVX2__) && (MI_BCHUNK_BITS==512)
+#elif MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512)
   // a 64b cache-line contains the entire chunk anyway so load both at once
   const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields);
   const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1);
@@ -796,9 +800,17 @@ static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) {
   bitmap chunkmap
 -------------------------------------------------------------------------------- */
+
+static void mi_bitmap_chunkmap_set_max(mi_bitmap_t* bitmap, size_t chunk_idx) {
+  size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_accessed);
+  if mi_unlikely(chunk_idx > oldmax) {
+    mi_atomic_cas_strong_relaxed(&bitmap->chunk_max_accessed, &oldmax, chunk_idx);
+  }
+}
+
 static void mi_bitmap_chunkmap_set(mi_bitmap_t* bitmap, size_t chunk_idx) {
   mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap));
   mi_bchunk_set(&bitmap->chunkmap, chunk_idx);
+  mi_bitmap_chunkmap_set_max(bitmap, chunk_idx);
 }
 
 static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) {
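
The new `mi_bitmap_chunkmap_set_max` replaces the old retry loop with a single best-effort CAS using the freshly added relaxed variant. In plain C11 atomics the pattern looks roughly like this (sketch only; losing a race merely leaves a slightly stale hint, which is fine since the value is only used to bound the search):

```c
#include <stdatomic.h>
#include <stddef.h>

// Best-effort "bump the maximum": one relaxed strong CAS, no retry loop.
static void set_max_hint(_Atomic(size_t)* max, size_t value) {
  size_t oldmax = atomic_load_explicit(max, memory_order_relaxed);
  if (value > oldmax) {
    atomic_compare_exchange_strong_explicit(max, &oldmax, value,
                                            memory_order_relaxed, memory_order_relaxed);
  }
}
```
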
@@ -813,11 +825,7 @@ static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx)
     mi_bchunk_set(&bitmap->chunkmap, chunk_idx);
     return false;
   }
-  // record the max clear
-  size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_clear);
-  do {
-    if mi_likely(chunk_idx <= oldmax) break;
-  } while (!mi_atomic_cas_weak_acq_rel(&bitmap->chunk_max_clear, &oldmax, chunk_idx));
+  mi_bitmap_chunkmap_set_max(bitmap, chunk_idx);
   return true;
 }
@@ -894,6 +902,9 @@ void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) {
     mi_bchunk_setN(&bitmap->chunks[chunk_idx], 0, n, NULL);
     mi_bitmap_chunkmap_set(bitmap, chunk_idx);
   }
+
+  // reset max_accessed
+  mi_atomic_store_relaxed(&bitmap->chunk_max_accessed, 0);
 }
@@ -1027,31 +1038,27 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n
   { \
   /* start chunk index -- todo: can depend on the tseq to decrease contention between threads */ \
   MI_UNUSED(tseq); \
-  const size_t chunk_start = 0; /* (tseq % (1+chunk_hi_idx)); */ /* space out threads? */ \
-  const size_t chunkmap_max_bfield = _mi_divide_up( mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS ); \
-  const size_t chunkmap_hi_bfield = chunkmap_max_bfield; /* chunk_hi_idx / MI_BFIELD_BITS; */\
-  const size_t chunkmap_start = chunk_start / MI_BFIELD_BITS; \
-  const size_t chunkmap_start_idx = chunk_start % MI_BFIELD_BITS; \
+  const size_t chunk_max_acc = 1 + mi_atomic_load_relaxed(&bitmap->chunk_max_accessed); \
+  const size_t chunk_start = tseq % chunk_max_acc; /* space out threads? */ \
+  const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap),MI_BFIELD_BITS); \
+  const size_t chunkmap_max_acc = _mi_divide_up(chunk_max_acc,MI_BFIELD_BITS); \
+  const size_t chunkmap_start = chunk_start / MI_BFIELD_BITS; \
  /* for each chunkmap entry `i` */ \
-  for (size_t _i = 0; _i < chunkmap_max_bfield; _i++) { \
+  for (size_t _i = 0; _i < chunkmap_max; _i++) { \
     size_t i; \
-    if (_i < chunkmap_hi_bfield) { \
-      i = _i + chunkmap_start; /* first the chunks up to chunk_hi */ \
-      if (i >= chunkmap_hi_bfield) { i -= chunkmap_hi_bfield; } /* rotate */ \
+    if (_i < chunkmap_max_acc) { /* first the chunks up to chunk_max_accessed */ \
+      i = _i + chunkmap_start; \
+      if (i >= chunkmap_max_acc) { i -= chunkmap_max_acc; } /* rotate */ \
     } \
-    else { i = _i; } /* the rest of the chunks above chunk_hi_idx */ \
+    else { i = _i; } /* the rest of the chunks above chunk_max_accessed */ \
     const size_t chunk_idx0 = i*MI_BFIELD_BITS; \
     mi_bfield_t cmap = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); \
-    size_t cmap_idx_shift = 0; /* shift through the cmap */ \
-    if (_i == 0 && chunkmap_start_idx > 0) { \
-      cmap = mi_bfield_rotate_right(cmap, chunkmap_start_idx); /* rotate right for the start position (on the first iteration) */ \
-      cmap_idx_shift = chunkmap_start_idx; \
-    } \
+    /* todo: space out threads within a chunkmap (2GiB) as well? */ \
+    size_t cmap_idx_shift = 0; /* shift through the cmap */ \
    size_t cmap_idx; \
    while (mi_bfield_find_least_bit(cmap, &cmap_idx)) { \
      /* set the chunk idx */ \
      size_t name_chunk_idx = chunk_idx0 + ((cmap_idx + cmap_idx_shift) % MI_BFIELD_BITS); \
+      mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); \
      /* try to find and clear N bits in that chunk */ \
      {
@@ -1064,28 +1071,45 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n
   } \
   }}
 
-// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all.
-// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`.
-// (Used to find fresh free slices -- optimized for n=1, 8, and MI_BFIELD_BITS)
-mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx)
-{
-  // const size_t chunk_hi_idx = mi_atomic_load_relaxed(&bitmap->chunk_max_clear);
-  mi_bitmap_forall_chunks(bitmap, tseq, chunk_idx)
-  {
-    size_t cidx;
-    if mi_likely(mi_bchunk_try_find_and_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) {
-      *pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx;
-      mi_assert_internal(*pidx + n <= mi_bitmap_max_bits(bitmap));
-      return true;
-    }
-    else {
-      // we may find that all are cleared only on a second iteration but that is ok as
-      // the chunkmap is a conservative approximation.
-      mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx);
-    }
-  }
-  mi_bitmap_forall_chunks_end();
-  return false;
-}
+#define mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, NSUF, NPAR) { \
+  mi_bitmap_forall_chunks(bitmap, tseq, _chunk_idx) { \
+    size_t _cidx; \
+    if mi_likely(mi_bchunk_try_find_and_clear##NSUF(&bitmap->chunks[_chunk_idx] NPAR, &_cidx)) { \
+      *pidx = (_chunk_idx * MI_BCHUNK_BITS) + _cidx; \
+      return true; \
+    } \
+    else { \
+      /* we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation. */ \
+      mi_bitmap_chunkmap_try_clear(bitmap, _chunk_idx); \
+    } \
+  } \
+  mi_bitmap_forall_chunks_end(); \
+  return false; \
+}
+
+#define COMMA ,
+
+mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) {
+  mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, , );
+}
+
+mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) {
+  mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, 8, );
+}
+
+mi_decl_nodiscard bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) {
+  mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, X, );
+}
+
+mi_decl_nodiscard bool mi_bitmap_try_find_and_clearNX(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) {
+  mi_assert_internal(n<=MI_BFIELD_BITS);
+  mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, NX, COMMA n);
+}
+
+mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) {
+  mi_assert_internal(n<=MI_BCHUNK_BITS);
+  mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, N_, COMMA n);
+}
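
The `NSUF`/`NPAR` parameters use token pasting to select the per-chunk helper, and `COMMA` is the usual trick for smuggling an extra argument through a macro (a bare `,` would be parsed as an argument separator). A self-contained illustration of the same technique with hypothetical names:

```c
#include <stdio.h>

// Paste a suffix onto a function name and optionally pass one extra argument.
#define COMMA ,

static int find1(int x)        { return x + 1; }
static int findN(int x, int n) { return x + n; }

#define DISPATCH(x, SUF, PAR)  find##SUF(x PAR)

int main(void) {
  printf("%d\n", DISPATCH(10, 1, ));         // expands to find1(10)
  printf("%d\n", DISPATCH(10, N, COMMA 5));  // expands to findN(10, 5)
  return 0;
}
```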

View file

@@ -91,8 +91,8 @@ typedef mi_bchunk_t mi_bchunkmap_t;
 // An atomic bitmap
 typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bitmap_s {
   _Atomic(size_t) chunk_count;        // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS)
-  _Atomic(size_t) chunk_max_clear;    // max chunk index that was once cleared
+  _Atomic(size_t) chunk_max_accessed; // max chunk index that was once cleared or set
   size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc
   mi_bchunkmap_t chunkmap;
   mi_bchunk_t chunks[MI_BITMAP_DEFAULT_CHUNK_COUNT]; // usually dynamic MI_BITMAP_MAX_CHUNK_COUNT
@@ -172,9 +172,23 @@ static inline bool mi_bitmap_is_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n
 }
 
+// Specialized versions for common bit sequence sizes
+mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx);  // 1-bit
+mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // 8-bits
+mi_decl_nodiscard bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // MI_BFIELD_BITS
+mi_decl_nodiscard bool mi_bitmap_try_find_and_clearNX(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); // < MI_BFIELD_BITS
+mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); // > MI_BFIELD_BITS <= MI_BCHUNK_BITS
+
 // Find a sequence of `n` bits in the bitmap with all bits set, and try to atomically clear all.
 // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`.
-mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx);
+mi_decl_nodiscard static inline bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) {
+  if (n==1) return mi_bitmap_try_find_and_clear(bitmap, tseq, pidx);           // small pages
+  if (n==8) return mi_bitmap_try_find_and_clear8(bitmap, tseq, pidx);          // medium pages
+  if (n==MI_BFIELD_BITS) return mi_bitmap_try_find_and_clearX(bitmap, tseq, pidx);  // large pages
+  if (n == 0 || n > MI_BCHUNK_BITS) return false;  // cannot be more than a chunk
+  if (n < MI_BFIELD_BITS) return mi_bitmap_try_find_and_clearNX(bitmap, tseq, n, pidx);
+  return mi_bitmap_try_find_and_clearN_(bitmap, tseq, n, pidx);
+}
 
 // Called once a bit is cleared to see if the memory slice can be claimed.

View file

@@ -217,8 +217,11 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) {
   }
 
   // 2. if the page is not too full, we can try to reclaim it for ourselves
+  // note: this seems a bad idea but it speeds up some benchmarks (like `larson`) quite a bit.
   if (_mi_option_get_fast(mi_option_reclaim_on_free) != 0 &&
-      !mi_page_is_used_at_frac(page,8))
+      !mi_page_is_used_at_frac(page,4)
+      // && !mi_page_is_abandoned_mapped(page)
+     )
   {
     // the page has still some blocks in use (but not too many)
     // reclaim in our heap if compatible, or otherwise abandon again
@@ -247,7 +250,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) {
   }
 
   // 3. if the page is unmapped, try to reabandon so it can possibly be mapped and found for allocations
-  if (!mi_page_is_used_at_frac(page, 4) &&  // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page
+  if (!mi_page_is_used_at_frac(page,4) &&   // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page
       !mi_page_is_abandoned_mapped(page) && page->memid.memkind == MI_MEM_ARENA &&
       _mi_arena_page_try_reabandon_to_mapped(page))
   {

View file

@@ -96,7 +96,7 @@ const mi_page_t _mi_page_empty = {
 // may lead to allocation itself on some platforms)
 // --------------------------------------------------------
-#define MI_MEMID_STATIC {{{0}}, true /* pinned */, true /* committed */, false /* zero */, MI_MEM_STATIC }
+#define MI_MEMID_STATIC {{{NULL,0}}, true /* pinned */, true /* committed */, false /* zero */, MI_MEM_STATIC }
 
 mi_decl_cache_align const mi_heap_t _mi_heap_empty = {
   NULL,

View file

@@ -203,10 +203,9 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit
   if (!(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0))) return NULL;
   size = _mi_align_up(size, _mi_os_page_size());
-  // try a direct allocation if the alignment is below the default, or if larger than 1/64 fraction of the size (to avoid waste).
-  const bool try_direct_alloc = (alignment <= mi_os_mem_config.alloc_granularity || alignment > size/64);
-  // try first with a requested alignment hint (this will usually be aligned directly on Win 10+ or BSD)
+  // try a direct allocation if the alignment is below the default, or if larger than 1/8 fraction of the size.
+  const bool try_direct_alloc = (alignment <= mi_os_mem_config.alloc_granularity || alignment > size/8);
   void* p = NULL;
   if (try_direct_alloc) {
     p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero);
@@ -233,8 +232,8 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit
   if (p == NULL) return NULL;
 
   // set p to the aligned part in the full region
-  // note: this is dangerous on Windows as VirtualFree needs the actual base pointer
-  // this is handled though by having the `base` field in the memid's
+  // note: on Windows VirtualFree needs the actual base pointer
+  // this is handledby having the `base` field in the memid.
   *base = p; // remember the base
   p = _mi_align_up_ptr(p, alignment);
@@ -361,7 +360,7 @@ static void* mi_os_page_align_areax(bool conservative, void* addr, size_t size,
   if (newsize != NULL) *newsize = 0;
   if (size == 0 || addr == NULL) return NULL;
 
-  // page align conservatively within the range
+  // page align conservatively within the range, or liberally straddling pages outside the range
   void* start = (conservative ? _mi_align_up_ptr(addr, _mi_os_page_size())
                               : mi_align_down_ptr(addr, _mi_os_page_size()));
   void* end   = (conservative ? mi_align_down_ptr((uint8_t*)addr + size, _mi_os_page_size())
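
Conservative alignment shrinks the range to the whole pages inside it, while the liberal variant grows it to every page the range touches. A quick standalone illustration (4 KiB page size assumed, not mimalloc code):

```c
#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

#define PAGE ((uintptr_t)4096)  // page size assumed for the illustration

static uintptr_t align_down(uintptr_t a) { return a & ~(PAGE - 1); }
static uintptr_t align_up(uintptr_t a)   { return align_down(a + PAGE - 1); }

int main(void) {
  const uintptr_t addr = 0x1234, size = 0x3000;
  // conservative: only pages that lie fully inside [addr, addr+size)
  printf("conservative: [0x%" PRIxPTR ", 0x%" PRIxPTR ")\n", align_up(addr), align_down(addr + size));
  // liberal: every page that the range touches
  printf("liberal:      [0x%" PRIxPTR ", 0x%" PRIxPTR ")\n", align_down(addr), align_up(addr + size));
  return 0;
}
```
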
@@ -472,7 +471,7 @@ bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset)
     return needs_recommit;
   }
   else {
-    if (allow_reset) {  // this can sometimes be not allowed if the range is not fully committed
+    if (allow_reset) {  // this can sometimes be not allowed if the range is not fully committed (on Windows, we cannot reset uncommitted memory)
       _mi_os_reset(p, size);
     }
     return false;  // needs no recommit

View file

@@ -7,7 +7,6 @@ terms of the MIT license. A copy of the license can be found in the file
 #include "mimalloc.h"
 #include "mimalloc/internal.h"
 #include "mimalloc/prim.h"    // _mi_prim_random_buf
-#include <string.h>           // memset
 
 /* ----------------------------------------------------------------------------
 We use our own PRNG to keep predictable performance of random number generation
@@ -33,15 +32,11 @@ The implementation uses regular C code which compiles very well on modern compil
 (gcc x64 has no register spills, and clang 6+ uses SSE instructions)
 -----------------------------------------------------------------------------*/
 
-static inline uint32_t rotl(uint32_t x, uint32_t shift) {
-  return (x << shift) | (x >> (32 - shift));
-}
-
 static inline void qround(uint32_t x[16], size_t a, size_t b, size_t c, size_t d) {
-  x[a] += x[b]; x[d] = rotl(x[d] ^ x[a], 16);
-  x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 12);
-  x[a] += x[b]; x[d] = rotl(x[d] ^ x[a], 8);
-  x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 7);
+  x[a] += x[b]; x[d] = mi_rotl32(x[d] ^ x[a], 16);
+  x[c] += x[d]; x[b] = mi_rotl32(x[b] ^ x[c], 12);
+  x[a] += x[b]; x[d] = mi_rotl32(x[d] ^ x[a], 8);
+  x[c] += x[d]; x[b] = mi_rotl32(x[b] ^ x[c], 7);
 }
 
 static void chacha_block(mi_random_ctx_t* ctx)
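
The quarter round itself is unchanged apart from calling the shared `mi_rotl32`; since every rotation count used here (16, 12, 8, 7) is non-zero, the output is identical to the old local `rotl`. For reference, a standalone version of the same quarter round (it can be validated against the quarter-round test vector in RFC 8439, section 2.1.1):

```c
#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

static uint32_t rotl32(uint32_t x, uint32_t r) {
  r &= 31;
  return (x << r) | (x >> ((-r) & 31));
}

// ChaCha quarter round on four words of the 16-word state.
static void qround(uint32_t x[16], size_t a, size_t b, size_t c, size_t d) {
  x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 16);
  x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 12);
  x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 8);
  x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 7);
}

int main(void) {
  uint32_t x[16] = { 0x11111111, 0x01020304, 0x9b8d6f43, 0x01234567 };  // example inputs
  qround(x, 0, 1, 2, 3);
  printf("%08x %08x %08x %08x\n", x[0], x[1], x[2], x[3]);
  return 0;
}
```
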
@@ -99,7 +94,7 @@ static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t no
   // since we only use chacha for randomness (and not encryption) we
   // do not _need_ to read 32-bit values as little endian but we do anyways
   // just for being compatible :-)
-  memset(ctx, 0, sizeof(*ctx));
+  _mi_memzero(ctx, sizeof(*ctx));
   for (size_t i = 0; i < 4; i++) {
     const uint8_t* sigma = (uint8_t*)"expand 32-byte k";
     ctx->input[i] = read32(sigma,i);
@@ -114,7 +109,7 @@ static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t no
 }
 
 static void chacha_split(mi_random_ctx_t* ctx, uint64_t nonce, mi_random_ctx_t* ctx_new) {
-  memset(ctx_new, 0, sizeof(*ctx_new));
+  _mi_memzero(ctx_new, sizeof(*ctx_new));
   _mi_memcpy(ctx_new->input, ctx->input, sizeof(ctx_new->input));
   ctx_new->input[12] = 0;
   ctx_new->input[13] = 0;
@@ -160,7 +155,7 @@ If we cannot get good randomness, we fall back to weak randomness based on a tim
 uintptr_t _mi_os_random_weak(uintptr_t extra_seed) {
   uintptr_t x = (uintptr_t)&_mi_os_random_weak ^ extra_seed;  // ASLR makes the address random
   x ^= _mi_prim_clock_now();
   // and do a few randomization steps
   uintptr_t max = ((x ^ (x >> 17)) & 0x0F) + 1;
   for (uintptr_t i = 0; i < max; i++) {